didpublications.bib
% Guest editors' introduction to the IJDAR special issue on performance evaluation.
% NOTE(review): YEAR is taken from the citation key; confirm against the published issue.
@MISC{Baird2002:pe,
  AUTHOR       = {Tapas Kanungo and Henry S. Baird and Robert M. Haralick},
  TITLE        = {Performance Evaluation: Theory, Practice, and Impact},
  HOWPUBLISHED = {Special Issue of Int'l J. on Document Analysis and Recognition},
  YEAR         = {2002},
  NOTE         = {In Press, November 2001},
  ABSTRACT     = {The document image analysis research community has been
    distinguished for over a decade by a serious and sustained
    commitment to sound methodologies for measuring the
    performance of algorithms and systems. Objective, quantitative,
    and standardized performance evaluation methods are essential
    aids in our attempts to understand the behavior of our
    systems, predict their future performance, compare rival systems,
    identify the particular strengths and weaknesses of proposed
    technologies, and track the progress of our community's research
    achievements from year to year. We feel that the time is ripe
    to offer, in a journal special issue, a selection of the strongest
    papers having, as their principal theme, performance evaluation
    theory, practice, or impact in a large-scale application.
    The manuscripts submitted were reviewed by highly qualified expert
    referees in a thorough two-stage review procedure.
    The articles that we have been happy to accept all enjoy a combination
    of originality, high technical merit, and clear relevance to the topic.
    Evaluating geometrical page-layout segmentation algorithms is a
    challenging task, in part due to the diversity of metrics that have
    been plausibly proposed for measuring the similarity of two segmentations.
    J. Hu, R. Kashi, D. Lopresti, and G. Wilfong discuss a methodology
    for evaluating systems that extract tables from document images in
    their article ``Evaluating the performance of table processing
    algorithms.'' One of their innovations, applicable to a wide variety
    of layout segmentation tasks, is to probe two
    ``table graphs'' at random, counting similarities and dissimilarities,
    to accumulate a statistical measure of match.
    In ``Large scale address recognition systems -- truthing, testing,
    tools and other evaluation issues,'' S. Setlur, A. Lawson,
    V. Govindaraju, and S. Srihari describe the methodology they used for
    evaluating, on a stunningly large scale, a USPS postal address
    recognition system. Their methodology samples a live stream of
    postal images to create ground-truthed images and evaluates the
    system using encoding rate and error rate metrics.
    When, as not infrequently happens, the number of original ground-truth
    documents available for experimentation is severely limited, performance
    evaluation results, however carefully calculated, can be inaccurate
    and misleading.
    In their article ``A statistical approach to the generation of a
    database for evaluating OCR software,'' F. S. Brundick, A. E. M. Brodeen,
    and M. S. Taylor propose a bootstrapping approach to the generation
    of sufficiently large databases of synthetic ground-truthed documents.
    These documents can then be printed and scanned to acquire test images
    each, by construction, corresponding to known ground truth.
    Contributing an interesting variation to the large literature on
    applications of string-matching algorithms
    to DIA problems, C. Fang, C. Liu, L. Pent, and X. Ding present
    a specialized algorithm, in ``Automatic performance evaluation of
    printed Chinese recognition systems,'' that assists researchers in
    the evaluation and characterization of character-segmentation errors.
    In ``An empirical measure of performance of document image segmentation
    algorithm,'' A. K. Das, S. K. Saha, and B. Chanda argue for a new
    graph-based
    evaluation metric for page-layout segmentation algorithms, and provide
    an algorithm to compute the metric.
    Statistical classifiers form an integral part of many document image
    analysis systems.
    In their article ``Performance evaluation of pattern classifiers for
    handwritten character
    recognition,'' Liu, Sako and Fujisawa compare the performance of
    well-known statistical
    classifiers as a function of training sample size, outlier resistance, and
    ambiguity rejection.
    Agreement within the research community on standardized metrics, datasets,
    and software tools is of course an essential foundation for the most
    effective use of evaluation methodology.
    S. Mao and T. Kanungo, in
    ``Software architecture of PSET: A page segmentation evaluation
    toolkit,'' describe in detail the rationale and architecture of
    public-domain
    software tools offered for use in the evaluation of a broad class of
    page-layout
    segmentation algorithms.
    We would like to thank all the authors who submitted manuscripts to this
    special issue. We regret that only a fraction of the submissions, all
    of them interesting, could be included. We owe a special debt of
    gratitude to the many able reviewers who generously commented, often
    in extraordinary detail, on the submissions. Finally, we would like to
    acknowledge the good-hearted patience of the authors and of the journal's
    editors in spite of this special issue's long gestation.}
}
% Guest editors' introduction to the CVIU special issue on document image
% understanding and retrieval (Vol. 70, No. 3).
@MISC{Baird1998:diur,
  AUTHOR       = {Junichi Kanai and Henry S. Baird},
  TITLE        = {Document Image Understanding and Retrieval},
  HOWPUBLISHED = {Special Issue of Computer Vision and Image Understanding journal, Vol. 70, No. 3},
  MONTH        = {June},
  YEAR         = {1998},
  ABSTRACT     = {Vast archives of information, handwritten and machine
    printed on paper, have accumulated over centuries. Advances in computer and
    communication technologies now offer drastically improved ways to store,
    retrieve, and distribute their contents. Billions of paper documents wait
    to be made accessible via electronic media.
    Document image understanding and retrieval research seeks to discover
    methods for automatically extracting and organizing information from
    handwritten and machine printed paper documents containing text, line
    drawings, maps, music scores, etc. Its characteristic problems
    include some of the earliest attacked by computer-vision pioneers.
    The field has long been distinguished by close and productive ties
    between the academic and commercial communities. Today, document analysis
    research supports a viable industry which, stimulated by the growing
    demand for digital archives, the proliferation of inexpensive personal
    document scanners, and the ubiquity of FAXes, is poised for rapid growth.
    But the performance of these technologies still lags far behind human
    abilities. Many technical problems, critically important on both
    theoretical and practical grounds, remain open.
    We are pleased to offer a collection of state-of-the-art papers
    touching on topics of current research interest. We begin with
    Doermann's up-to-date critical survey of the literature
    on document image retrieval, which reveals the rich interplay between the
    document analysis and information retrieval research communities.
    One example of this genre is the strikingly versatile language-independent
    text categorization system described by Bayer, Kressel, Mogg-Schneider,
    and Renz.
    Chen and Bloomberg show that English-language textual document images
    can be summarized without any resort to image pattern recognition
    (this won the Outstanding Paper award at the 1997 IAPR International
    Conference on Document Analysis and Recognition).
    Such surprising instances of non-trivial yet ``OCR-free''
    document processing may be harbingers of a new generation of
    architectures for document analysis systems.
    Document images are usually compressed before being exchanged and
    archived. It is sometimes possible to analyze compressed document
    images without fully decompressing them. Spitz demonstrates that non-trivial
    characteristics, such as skew angles and specially designed logos, can
    be extracted directly and extremely rapidly from images compressed by
    the CCITT Group III and IV methods. Kia, Doermann, Rosenfeld, and
    Chellappa provide a
    compression technique for document images which explicitly enables
    such ``compressed-domain'' processing, as one of several improvements
    to a symbolic-compression system.
    The great variety of geometric arrangements of text blocks on printed pages
    poses daunting challenges. Antonacopoulos' fast `white-space tiling'
    method copes well with an unusually wide range of skewed, non-rectangular
    layouts.
    Kise and Sato, motivated by similar goals and similarly choosing to
    analyze the white background, prove that methods based on area
    Voronoi diagrams are also effective --- an example of the continuing
    relevance of computational geometry to document analysis.
    Of course the generic problem of segmentation --- the partitioning of complex
    images into regions which we can more easily recognize or analyze further ---
    pervades the field. Hu and Yan attempt the segmentation of handwriting,
    in off-line (static) images, into individual characters.
    Hidden Markov model (HMM) techniques, having been applied with notable
    success in speech recognition, are increasingly being adapted to
    selected sub-problems in document analysis. Knerr, Augustin, Baret,
    and Price apply HMMs to word recognition in handwritten checks.
    The robust detection of `graphical primitives' such as straight lines
    and circular arcs is an inescapable subtask in graphics recognition.
    Wenyin and Dori present a painstaking study of software-engineering
    aspects arising in algorithms for this purpose. Ogier, Mullot, Labiche,
    and Lecouter give an architectural tour of a complete system for the
    knowledge-guided interpretation of city maps, putting to use some general
    principles of human visual perception. This is one of many experiments
    within our field --- still cautiously exploratory --- in the exploitation
    of cognitive science.
    These eleven papers were subjected to the exhaustive CVIU process of review
    and revision. We would like enthusiastically to thank our twenty-seven highly
    professional referees for admirable devotion to their anonymous duties,
    and the authors for responding to the referees' advice gracefully
    and thoroughly. Finally, but no less ardently, we are grateful to
    Editor-in-Chief Avi Kak for his kind invitation to us to assemble these
    papers, and to him and Karen Rado and other CVIU Editorial Office staff for their
    unfailing support and advice during our protracted labor.
    We hope that this Special Issue will stimulate greater understanding,
    mutual interest, and collaboration between the computer vision and
    document image analysis research communities. Only a decade ago these
    now divergent communities were unified. We continue to share
    our most strongly held aspiration: to build machines able to infer
    complete and highly accurate interpretations of the contents of complex
    images --- whether scenes of the 3-D physical world or the `visible
    speech' of 2-D documents.}
}
% Journal article (CVIU 70(1)): classification with compact distribution maps.
@ARTICLE{Baird1998:maps,
  AUTHOR   = {Tin Kam Ho and Henry S. Baird},
  TITLE    = {Pattern Classification with Compact Distribution Maps},
  JOURNAL  = {Computer Vision and Image Understanding},
  VOLUME   = {70},
  NUMBER   = {1},
  MONTH    = {March},
  YEAR     = {1998},
  PAGES    = {101--110},
  ABSTRACT = {A difficult problem in classification is representing
    the class-conditional distributions concisely and faithfully. We propose a
    way of mapping such distributions and its use in constructing a similarity
    metric. A classifier using this metric can achieve low error rates and useful
    confidence scores permitting reliable reject behavior. We illustrate the method
    by an application in a challenging character recognition problem with
    thousands of classes. For applications to arbitrary domains, we present a
    method to automatically construct feature transformations that are suitable
    for such mappings.}
}
% Conference paper (ICDAR 1999): how finely image-quality differences can be discriminated.
@INPROCEEDINGS{Baird1999:fine,
  AUTHOR    = {Henry S. Baird},
  TITLE     = {Document Image Quality: Making Fine Discriminations},
  BOOKTITLE = {Proceedings of the IAPR 1999 International Conference on Document Analysis and Recognition (ICDAR 1999)},
  PAGES     = {459--462},
  YEAR      = {1999},
  ADDRESS   = {Bangalore, India},
  MONTH     = {September},
  PS        = {papers/ps/Baird1999_fine.ps},
  PDF       = {papers/pdf/Baird1999_fine.pdf},
  ABSTRACT  = {We estimate empirically the smallest changes in document image quality
    that can be distinguished reliably and fully automatically by Kanungo's
    bootstrapping method [Kan96].
    Six parameters of a physics-based document-image degradation model [Bai92]
    are varied, one at a time: for each, over a range of parameter-value
    differences, two sets of synthetic images are generated pseudorandomly
    and the two sets tested for statistical equivalence using Kanungo's method.
    The rate at which Kanungo's method rejects the hypothesis that the two
    sets are drawn from the same distribution is analyzed as a function
    of parameter difference (a specialized ``power function'').
    The finest discriminations afforded by the method are given
    by the width of the power function at a low fixed reject threshold.
    The data show that remarkably fine discriminations are possible
    --- often subtler than are evident to visual inspection ---
    for all six parameters.
    As few as 25 reference images are sufficient.
    These results suggest that Kanungo's method is sufficiently sensitive
    to a wide range of physics-based image degradations to serve as
    an engineering foundation for many image-quality estimation and
    OCR engineering purposes.}
}
% Invited talk (SDIUT 1999): model-directed document recognition systems.
@INPROCEEDINGS{Baird1999:model,
  AUTHOR    = {Henry S. Baird},
  TITLE     = {Model-Directed Document Image Analysis},
  BOOKTITLE = {Proceedings of the DOD-sponsored Symposium on Document Image Understanding Technology (SDIUT 1999)},
  YEAR      = {1999},
  ADDRESS   = {Annapolis, MD},
  MONTH     = {April},
  NOTE      = {Invited published talk},
  PS        = {papers/ps/Baird1999_model.ps},
  PDF       = {papers/pdf/Baird1999_model.pdf},
  ABSTRACT  = {If current OCR engineering trends continue, then, we believe,
    so-called ``general-purpose'' systems --- that is, fully automatic and
    nonretargetable systems ---
    will leave many potential users unsatisfied, and lucrative
    application niches unfilled, for years to come.
    However, for users who care enough to volunteer some manual effort ---
    to help customize the system to their document(s) --- significantly higher
    accuracy may be achievable, without delay.
    We discuss in detail two state-of-the-art document recognition systems
    --- Lucent Technologies' Table Reader System (TRS)
    and Xerox's ``document image decoding'' (DID) research prototype ---
    which yield high accuracy by reliance on explicitly stated models
    of properties of the target document, whether iconic (known
    typefaces and image degradations), geometric (restricted classes of
    layouts), or symbolic (linguistic and pragmatic contextual constraints).
    How great are the performance advantages that can be realized by sacrificing
    automation in these ways? To what extent can the necessary customizations
    be (semi-)automated? We outline recent and planned research at Xerox PARC
    motivated by these questions.}
}
% Guest editors' introduction to the Information Retrieval special issue on
% document image retrieval (Vol. 2, Nos. 2/3).
@MISC{Baird2000:dir,
  AUTHOR       = {Henry S. Baird and Francine Chen},
  TITLE        = {Document Image Retrieval},
  HOWPUBLISHED = {Special Issue of Information Retrieval journal, Vol. 2, Nos. 2/3},
  MONTH        = {May},
  YEAR         = {2000},
  ABSTRACT     = {We are pleased to offer a selection of research papers on current
    topics at the interface between the information retrieval (IR) and
    document image analysis (DIA) fields. These two R\&D fields have
    benefited from mutual awareness and interdisciplinary collaboration
    since 1992, when they were consciously brought together for this
    purpose by Dr. Thomas Nartker and the staff of the Information
    Sciences Research Institute of the University of Nevada in an influential
    series of annual Symposia on Document Analysis and Information Retrieval
    (1992--1996, Las Vegas). Since then, similar purposes have been
    served by the gradually expanding annual SPIE Conference on Document
    Recognition and Retrieval (1992--present, San Jose). We hope that
    our special issue may also play a constructive role in stimulating
    and amplifying substantive technical interactions between our two
    communities.
    Classical IR methods commonly operate on passages of plain,
    correctly encoded text. If however that text is represented by
    images of paper documents, then fresh technical issues and
    opportunities for innovative solutions arise both for IR and DIA
    researchers. These issues and the approaches that have been used to
    resolve them are enumerated in M. Mitra and B.B. Chaudhuri's
    critical survey of the literature on information retrieval from
    document images.
    Limitations of current DIA technology can compromise text-based
    IR results. In particular, text extracted by machine vision means
    (e.g. OCR systems) is usually corrupt to some degree. Paul B. Kantor
    and Ellen M. Voorhees provide an overview of a TREC (Text REtrieval
    Conference) track that evaluates the impact of this fact on text-based
    IR performance, comparing different retrieval methods operating on page
    images to which OCR techniques have been applied.
    For DIA researchers, location and recognition of text in complex
    document images remains a central challenge. With the advent of the
    Internet have come document representations (e.g. HTML) which combine
    encoded text and text-as-image (e.g. GIF, JPEG). As rendered by browsers,
    it is often not apparent to the human reader when text-as-image is
    employed --- yet it is natural to want to search on it as easily as
    on encoded text. Web authors often incorporate text-as-image
    representations for visual appeal, but the open-ended variety in design
    that this leads to increases the challenge in locating and recognizing
    such ``text''. Daniel Lopresti and Jiangying Zhou describe one of the
    earliest sustained attempts to put text-as-image on an equal footing
    with encoded text in WWW images for retrieval purposes.
    Classical IR systems employ a number of techniques, such as the
    bag-of-words model which is of course oblivious to reading order; also,
    in domain-specific IR, custom non-stop-word lexica may be used.
    Within the DIA world, it is notoriously harder to preserve reading order
    than to isolate individual words, and it is sometimes easier to ``spot''
    whole-word images belonging to a constrained lexicon than to provide
    exhaustive character-by-character transcriptions. Thus IR methods can
    inform DIA strategies: for example, spotting words from a constrained
    lexicon for use in a bag-of-words model. William J. Williams, Eugene
    J. Zalubas, and Alfred O. Hero, III discuss research motivated in this
    way, to spot content words reliably in low-quality document images
    captured by FAX machines.
    Textual content that appears in printed pages may be provided with
    significant contextual clues, for example in the organization of the page
    layout, in typographic design choices, and in proximity to images or graphics.
    Such non-textual ``meta-data'' is often multi-dimensional and multimedia.
    We are pleased to present two papers focused on this rapidly advancing
    frontier of DIA research. Jianying Hu, Ramanujan Kashi, and Gordon Wilfong
    describe methods for the comparison and classification of documents based
    not on textual contents but on page-layout similarities. As with many IR
    methods, their image-based approach supports not only retrieval but
    categorization and information extraction. Rohini K. Srihari, Zhongfei Zhang,
    and Aibing Rao discuss the indexing and retrieval of documents in an even
    broader setting, where the documents themselves are ``multimodal,'' that is
    containing both text and images of natural scenes. In this task, it is
    the images, not the text, which are the targets of retrieval. Text is
    both located and recognized, but principally as ``collateral'' data accompanying
    the sought-for images.
    The six papers of this special issue are representative, we believe, of
    the best current research at the boundary between the IR and DIA fields,
    but they do not exhaust its potential topics. For some retrieval purposes
    it is best to operate directly on the document image --- even compressed
    forms of the image --- rather than on its textual content, for example to
    identify nearly identical documents or search for semi-textual ``terms''
    such as logos or handwritten annotations and signatures. Categorization
    of (multi-page) document images, as a prelude to retrieval or for other
    purposes, is a significant area of current work only glancingly touched
    on in these papers. The DIA community's long experience in two-dimensional
    and multi-modal content analysis deserves to find application in text-based
    IR (e.g. tabular data, HTML). And, surely, insights that arose first in
    text-based retrieval will continue to stimulate innovative image-based
    document processing.
    We are grateful for the patience of the authors during a long-protracted
    review process, for the thoughtful service of two dozen anonymous
    referees, and for David S. Doermann's generously offered advice in the
    early stages of planning for this issue. We thank the Editors of the
    IR Journal for inviting us to assemble the issue. We are confident
    that its themes will remain provocative and thus that it will be succeeded
    by other special issues, in this and other journals, similarly focused
    on the boundary between information retrieval and document image analysis.}
}
% Invited plenary talk (DAS 2000): survey of document image degradation models.
@INPROCEEDINGS{Baird2000:state,
  AUTHOR    = {Henry S. Baird},
  TITLE     = {State of the Art of Document Image Degradation Modeling},
  BOOKTITLE = {Proceedings of the 4th IAPR Workshop on Document Analysis Systems (DAS 2000)},
  YEAR      = {2000},
  ADDRESS   = {Rio de Janeiro},
  MONTH     = {December},
  NOTE      = {Invited plenary talk},
  PS        = {papers/ps/Baird2000_state.ps},
  PDF       = {papers/pdf/Baird2000_state.pdf},
  ABSTRACT  = {The literature on models of document image degradation is
    reviewed, and open problems are listed.
    In response to the unpleasant fact that the accuracy of document
    recognition algorithms falls drastically when image quality degrades
    even slightly, researchers in the last decade have intensified their
    study of explicit, quantitative, parameterized models of image defects
    that occur during printing and scanning.
    Several models have been proposed, some motivated by the physics of image
    formation and others by the surface statistics of
    image distributions.
    A wide range of techniques for estimating parameters of these models
    has been explored.
    These models, in the form of pseudo-random generators of synthetic images,
    permit, for the first time, investigations into fundamental
    properties of concrete image recognition problems including
    the Bayes error of problems and the asymptotic accuracy
    and domain of competency of classifier technologies.
    The use of massive sets of synthetic images, in the construction and
    testing of high-performance classifiers, has accelerated in the last
    few years.
    Open problems include the search for methods for comparing competing
    models and sound methodologies for the use of synthetic data in engineering.}
}
% Technical report (Caltech): report of the NSF workshop on joint source-channel coding.
@TECHREPORT{Berger99:Workshop,
  AUTHOR      = {Toby Berger and Philip Chou and Michelle Effros
                 and Nariman Farvardin and Thomas Fischer
                 and William R. Gardner and Robert M. Gray
                 and Nikil S. Jayant and Rajiv Laroia
                 and Upamanyu Madhow and Michael W. Marcellin
                 and James W. Modestino and David L. Neuhoff
                 and Alon Orlitsky and Kris Popat
                 and Kannan Ramchandran and James A. Storer
                 and Vinay Vaishampayan and Kenneth Zeger
                 and Zhen Zhang},
  TITLE       = {Workshop report: {NSF} Sponsored Workshop on Joint Source-Channel Coding},
  INSTITUTION = {California Institute of Technology},
  YEAR        = {1999},
  MONTH       = {October},
  PS          = {http://www.systems.caltech.edu/EE/Faculty/Effros/JSCC/Joint-Source-Channel-Coding-Workshop-Report-1999.ps},
  PDF         = {http://www.systems.caltech.edu/EE/Faculty/Effros/JSCC/Joint-Source-Channel-Coding-Workshop-Report-1999.pdf}
}
% Conference paper (ICDAR 2001): ICP decoding with subsampled heuristic scoring.
@INPROCEEDINGS{Bloomberg2001:subsampled,
  AUTHOR    = {Dan S. Bloomberg and Thomas P. Minka and Kris Popat},
  TITLE     = {Document Image Decoding using Iterated Complete Path Search with Subsampled Heuristic Scoring},
  BOOKTITLE = {Proceedings of the IAPR 2001 International Conference on Document Analysis and Recognition (ICDAR 2001)},
  YEAR      = {2001},
  MONTH     = {September},
  PS        = {papers/ps/Bloomberg2001_subsampled.ps},
  PDF       = {papers/pdf/Bloomberg2001_subsampled.pdf},
  ABSTRACT  = {It has been shown that the computation time of Document Image Decoding
    can be significantly reduced by employing heuristics in the search
    for the best decoding of a text line. In the
    Iterated Complete Path (ICP) method, template matches are performed only
    along the best path found by dynamic programming on each iteration.
    When the best path stabilizes, the decoding is optimal and no
    more template matches need be performed.
    In this way, only a tiny fraction of potential template matches
    must be evaluated, and the computation time is typically dominated by
    the evaluation of the initial heuristic upper-bound for each
    template at each location in the image.\smallskip The time to
    compute this bound depends on the resolution at which the
    matching scores are found. At lower resolution,
    the heuristic computation is reduced,
    but because a weaker bound is used, the number of Viterbi
    iterations is increased. We present the optimal (lowest
    upper-bound) heuristic for any degree of subsampling
    of multilevel template and/or interpolation, for use in text line
    decoding with ICP.
    The optimal degree of subsampling depends
    on image quality, but it is typically found that a small
    amount of template subsampling is effective in
    reducing the overall decoding time.}
}
% Invited plenary talk (ISMM 2000): handwriting recognition on census forms.
% "US" is braced so sentence-casing styles do not lower-case the acronym.
@INPROCEEDINGS{Breuel2000:hwrec,
  AUTHOR    = {Thomas M. Breuel},
  TITLE     = {Handwriting Recognition on {US} Census Forms},
  BOOKTITLE = {Mathematical Morphology and its applications to image and signal processing: Proceedings of the Fifth International Symposium on Mathematical Morphology (ISMM 2000)},
  YEAR      = {2000},
  MONTH     = {June},
  NOTE      = {Invited Plenary Talk}
}
% Workshop paper (DAS 2000): layout analysis over the space of segmentation parameters.
@INPROCEEDINGS{Breuel2000:segsp,
  AUTHOR    = {Thomas M. Breuel},
  TITLE     = {Layout Analysis by Exploring the Space of Segmentation Parameters},
  BOOKTITLE = {Proceedings of the 4th IAPR Workshop on Document Analysis Systems (DAS 2000)},
  YEAR      = {2000},
  MONTH     = {December},
  PS        = {papers/ps/das2000-cr.ps},
  PDF       = {papers/pdf/das2000-cr.pdf}
}
% Conference paper (ICASSP 2001): classification by probabilistic clustering.
@INPROCEEDINGS{Breuel2001:class,
  AUTHOR       = {Thomas M. Breuel},
  TITLE        = {Classification by Probabilistic Clustering},
  BOOKTITLE    = {Proceedings of the 2001 International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2001)},
  YEAR         = {2001},
  ADDRESS      = {Salt Lake City, Utah},
  MONTH        = {May},
  ORGANIZATION = {IEEE},
  PSINTERNAL   = {papers/ps/icassp-2001.ps},
  PDFINTERNAL  = {papers/pdf/icassp-2001.pdf}
}
% Conference paper (SCIA 2001): geometric matching under translation and rotation.
@INPROCEEDINGS{Breuel2001:impan,
  AUTHOR      = {Thomas M. Breuel},
  TITLE       = {Implicit Manipulation of Constraint Sets for Geometric Matching under Translation and Rotation},
  BOOKTITLE   = {Scandinavian Conference on Image Analysis (SCIA 2001)},
  YEAR        = {2001},
  ADDRESS     = {Bergen, Norway},
  MONTH       = {June},
  PSINTERNAL  = {papers/ps/rast-implicit-rotation.ps},
  PDFINTERNAL = {papers/pdf/rast-implicit-rotation.pdf}
}
% Symposium paper (SDIUT 2001): overview of recent Document Image Decoding work.
% "Xerox" and "PARC" are braced so sentence-casing styles keep their capitals.
@INPROCEEDINGS{Breuel2001:recent,
  AUTHOR      = {Thomas M. Breuel and Kris Popat},
  TITLE       = {Recent Work in the Document Image Decoding Group at {Xerox} {PARC}},
  BOOKTITLE   = {Proceedings of the DOD-sponsored Symposium on Document Image Understanding Technology (SDIUT 2001)},
  YEAR        = {2001},
  MONTH       = {April},
  PSINTERNAL  = {papers/ps/Breuel2001_recent.ps},
  PDFINTERNAL = {papers/pdf/Breuel2001_recent.pdf}
}
% Conference paper (Document Recognition and Retrieval VIII, 2001).
% The ampersand in BOOKTITLE is escaped: a bare one is a LaTeX alignment tab
% and breaks typesetting of the bibliography.
@INPROCEEDINGS{Breuel2001:samp,
  AUTHOR    = {Thomas M. Breuel},
  TITLE     = {Modeling the Sample Distribution for Clustering by {OCR}},
  BOOKTITLE = {Proceedings of IS\&T/SPIE Electronic Imaging 2001: Document Recognition and Retrieval VIII},
  YEAR      = {2001},
  MONTH     = {January},
  PS        = {papers/ps/spie-final.ps},
  PDF       = {papers/pdf/spie-final.pdf}
}
% Conference paper (ICDAR 2001): "pessimal print" as a reverse Turing test.
@INPROCEEDINGS{Coates2001:pessimal,
  AUTHOR      = {Allison L. Coates and Henry S. Baird and Richard J. Fateman},
  TITLE       = {Pessimal print: a reverse {Turing} test},
  BOOKTITLE   = {Proceedings of the IAPR 2001 International Conference on Document Analysis and Recognition (ICDAR 2001)},
  YEAR        = {2001},
  MONTH       = {September},
  PSINTERNAL  = {papers/ps/Coates2001_pessimal.ps},
  PDFINTERNAL = {papers/pdf/Coates2001_pessimal.pdf},
  ABSTRACT    = {We exploit the gap in ability between human and machine vision
    systems to craft a family of automatic challenges that tell
    human and machine users apart via graphical interfaces including
    Internet browsers.
    Turing proposed [Tur50] a method whereby human judges might validate
    ``artificial intelligence'' by failing to distinguish between human
    and machine interlocutors.
    Stimulated by the ``chat room problem'' posed by Udi Manber
    of Yahoo!, and influenced by the CAPTCHA project [BAL00] of
    Manuel Blum {\em et al.} of Carnegie--Mellon Univ.,
    we propose a variant of the Turing test using {\em pessimal print}:
    that is,
    low-quality images of machine-printed text synthesized pseudo-randomly
    over certain ranges of words, typefaces, and image degradations.
    We show experimentally that judicious choice of
    these ranges can ensure that the images are legible to human readers
    but illegible to several of the best present-day optical character
    recognition (OCR) machines.
    Our approach is motivated by a decade of research on performance evaluation
    of OCR machines [RJN96,RNN99] and on
    quantitative stochastic models of document image quality [Bai92,Kan96].
    The slow pace of evolution of OCR and other species of machine vision
    over many decades [NS96,Pav00] suggests that pessimal print will defy
    automated attack for many years. Applications include `bot' barriers and
    database rationing.}
}
% Journal article (IEEE PAMI 19(10)): large-scale simulation studies with synthetic data.
@ARTICLE{Ho1997:large,
  AUTHOR      = {Tin Kam Ho and Henry S. Baird},
  TITLE       = {Large-Scale Simulation Studies in Image Pattern Recognition},
  JOURNAL     = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  YEAR        = {1997},
  VOLUME      = {19},
  NUMBER      = {10},
  PAGES       = {1067--1079},
  MONTH       = {October},
  PSINTERNAL  = {papers/ps/Ho1997_large.ps},
  PDFINTERNAL = {papers/pdf/Ho1997_large.pdf},
  ABSTRACT    = {Many obstacles to progress in image pattern recognition
    result from the fact that the per-class distributions are often too irregular
    to be well-approximated by simple analytical functions. Simulation studies
    offer one way to circumvent these obstacles. We present three closely related
    studies of machine-printed character recognition that rely on synthetic
    data generated pseudorandomly in accordance with an explicit stochastic
    model of document image degradations. The unusually large scale of experiments
    --- involving several million samples --- that this methodology makes possible
    has allowed us to compute sharp estimates of the intrinsic difficulty
    (Bayes risk) of concrete recognition problems, as well as the asymptotic
    accuracy and domain of competency of classifiers.}
}
% Journal article (IEEE PAMI 22(11)): statistical validation of degradation models.
% NOTE(review): fourth author's surname corrected from "Stuezle" to "Stuetzle";
% confirm against the published paper.
@ARTICLE{Kanungo2000:statistical,
  AUTHOR      = {Tapas Kanungo and Robert M. Haralick and Henry S. Baird and Werner Stuetzle and David Madigan},
  TITLE       = {A Statistical, Nonparametric Methodology for Document Degradation Model Validation},
  JOURNAL     = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  YEAR        = {2000},
  VOLUME      = {22},
  NUMBER      = {11},
  PAGES       = {1209--1223},
  MONTH       = {November},
  PSINTERNAL  = {papers/ps/Kanungo2000_statistical.ps},
  PDFINTERNAL = {papers/pdf/Kanungo2000_statistical.pdf},
  ABSTRACT    = {Printing, photocopying, and scanning processes degrade the
    image quality of a document. Statistical models of these degradation
    processes are crucial for document image understanding research. Models
    allow us to predict system performance, conduct controlled experiments
    to study the breakdown points of the systems, create large multilingual data sets
    with groundtruth for training classifiers, design optimal noise removal
    algorithms, choose values for the free parameters of the algorithms,
    and so on. Although research in document understanding started many decades
    ago, only two document degradation models have been proposed thus far.
    Furthermore, no attempts have been made to statistically validate these
    models. In this paper, we present a statistical methodology that can
    be used to validate local degradation models. This method is based on a
    nonparametric, two-sample permutation test. Another standard statistical
    device---the power function---is then used to choose between algorithm
    variables such as distance functions. Since the validation and the power
    function procedures are independent of the model, they can be used to
    validate any other degradation model. A method for comparing any two
    models is also described. It uses p-values associated with the
    estimated models to select the model that is closer to the real world.}
}
@ARTICLE{Kopec1994:did,
AUTHOR = {Gary E. Kopec and Philip A. Chou},
TITLE = {Document image decoding using {Markov} source models},
JOURNAL = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
YEAR = 1994,
VOLUME = 16,
NUMBER = 6,
PAGES = {602--617},
MONTH = jun,
PS = {papers/ps/Kopec1994_did.ps},
PDF = {papers/pdf/Kopec1994_did.pdf}
}
@INPROCEEDINGS{Minka2001:ICP,
AUTHOR = {Thomas P. Minka and Dan S. Bloomberg and Kris Popat},
TITLE = {Document Image Decoding using the Iterated Complete Path Heuristic},
BOOKTITLE = {Proceedings of IS\&T/SPIE Electronic Imaging 2001: Document Recognition and Retrieval VIII},
YEAR = 2001,
MONTH = jan,
PS = {papers/ps/Minka2001_ICP.ps},
PDF = {papers/pdf/Minka2001_ICP.pdf}
}
@INPROCEEDINGS{Popat2000:adding,
AUTHOR = {Kris Popat and Dan Bloomberg and Dan Greene},
TITLE = {Adding Linguistic Constraints to Document Image Decoding},
BOOKTITLE = {Proceedings of the 4th IAPR Workshop on Document Analysis Systems (DAS 2000)},
YEAR = 2000,
MONTH = dec,
PS = {papers/ps/Popat2000_adding.ps},
PDF = {papers/pdf/Popat2000_adding.pdf}
}
@INPROCEEDINGS{Popat2000:two-stage,
AUTHOR = {Kris Popat and Dan S. Bloomberg},
TITLE = {Two-Stage Lossy/Lossless Compression of Grayscale Document Images},
BOOKTITLE = {Mathematical Morphology and its applications to image and signal processing: Proceedings of the Fifth International Symposium on Mathematical Morphology (ISMM 2000)},
YEAR = 2000,
MONTH = jun,
PS = {papers/ps/Popat2000_two-stage.ps},
PDF = {papers/pdf/Popat2000_two-stage.pdf}
}
@INPROCEEDINGS{Popat2001:graydid,
AUTHOR = {Kris Popat},
TITLE = {Decoding of text lines in grayscale document images},
BOOKTITLE = {Proceedings of the 2001 International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2001)},
YEAR = 2001,
ADDRESS = {Salt Lake City, Utah},
MONTH = may,
ORGANIZATION = {IEEE},
PS = {papers/ps/Popat2001_graydid.ps},
PDF = {papers/pdf/Popat2001_graydid.pdf}
}
@INPROCEEDINGS{Popat2001:offset,
AUTHOR = {Kris Popat},
TITLE = {Document Image Compression by Adaptive-Offset Quantization},
BOOKTITLE = {Proceedings of IS\&T/SPIE Electronic Imaging 2001: Document Recognition and Retrieval VIII},
YEAR = 2001,
MONTH = jan,
PS = {papers/ps/Popat2001_offset.ps},
PDF = {papers/pdf/Popat2001_offset.pdf}
}
@INPROCEEDINGS{Popat2001:stack,
AUTHOR = {Kris Popat and Dan Greene and Justin Romberg and Dan S. Bloomberg},
TITLE = {Adding Linguistic Constraints to Document Image Decoding:
Comparing the Iterated Complete Path and Stack Algorithms},
BOOKTITLE = {Proceedings of IS\&T/SPIE Electronic Imaging 2001: Document Recognition and Retrieval VIII},
YEAR = 2001,
MONTH = jan,
PS = {papers/ps/Popat2001_stack.ps},
PDF = {papers/pdf/Popat2001_stack.pdf}
}
@INPROCEEDINGS{sarkar:icpr2002,
AUTHOR = {P. Sarkar},
TITLE = {An Iterative Algorithm for Optimal Style-conscious Field Classification},
BOOKTITLE = {Proceedings of the sixteenth ICPR},
PUBLISHER = {IEEE Computer Society Press},
ADDRESS = {Quebec City},
YEAR = {2002},
NOTE = {Submitted for review},
HTTP = {http://parcweb.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/ICPR2002/download.html}
}
% MONTH below appends the conference days to the predefined jan macro
% using BibTeX's # string concatenation.
@INPROCEEDINGS{Sarkar2002:triage,
AUTHOR = {Prateek Sarkar and Henry S. Baird and John Henderson},
TITLE = {Triage of {OCR} Output Using `Confidence' Scores},
BOOKTITLE = {Proceedings of SPIE/IS\&T 2002 Document Recognition \& Retrieval IX Conf. (DR\&R IX)},
YEAR = 2002,
ADDRESS = {San Jose, California, USA},
MONTH = jan # "~20--25,",
NOTE = {Accepted for publication},
HTTP = {http://parcweb.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/SPIE2002/download.html},
PSINTERNAL = {papers/ps/Sarkar2002_triage.ps},
PDFINTERNAL = {papers/pdf/Sarkar2002_triage.pdf}
}
@INPROCEEDINGS{lopresti:icdar95,
AUTHOR = {Daniel Lopresti and Jiangying Zhou and George Nagy and Prateek Sarkar},
TITLE = {Spatial Sampling Effects in Optical Character Recognition},
BOOKTITLE = {Proceedings of the Third International Conference on Document Analysis and Recognition},
YEAR = {1995},
PAGES = {309--314},
HTTP = {http://www.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/ICDAR95/download.html}
}
@INCOLLECTION{nagy:GREC98,
AUTHOR = {G. Nagy and A. Samal and S. Seth and T. Fisher and E. Guthman and K. Kalafala and L. Li and P. Sarkar and Y. Xu},
TITLE = {A Prototype for Adaptive Association of Street Names with Streets on Maps},
BOOKTITLE = {Graphics Recognition: Algorithms and Systems},
EDITOR = {K. Tombre and A. Chhabra},
PUBLISHER = {Springer},
SERIES = {Lecture Notes in Computer Science},
VOLUME = {1389},
PAGES = {302--313},
YEAR = {1998}
}
@inproceedings{nagy:uppsala1999,
  author    = {G. Nagy and P. Sarkar},
  title     = {Modeling Statistical Dependence in Pattern Classification},
  booktitle = {Proceedings of the IAPR Workshop on Statistical Methods for Image Processing},
  year      = 1999,
  address   = {Uppsala}
}
@ARTICLE{sarkar:PAMI98,
AUTHOR = {P. Sarkar and G. Nagy and J. Zhou and D. Lopresti},
TITLE = {Spatial Sampling of Printed Patterns},
JOURNAL = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
VOLUME = {20},
NUMBER = {3},
PAGES = {344--351},
MONTH = mar,
YEAR = {1998},
HTTP = {http://www.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/PAMI98/download.html}
}
@INPROCEEDINGS{sarkar:icdar2001,
AUTHOR = {P. Sarkar and G. Nagy},
TITLE = {Style consistency in isogenous patterns},
BOOKTITLE = {Proceedings of the Sixth ICDAR},
ADDRESS = {Seattle, USA},
MONTH = sep,
YEAR = {2001},
PAGES = {1169--1174},
HTTP = {http://www.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/ICDAR2001/download.html}
}
@INPROCEEDINGS{sarkar:icdar99,
AUTHOR = {Prateek Sarkar and George Nagy},
TITLE = {Heeding more than the top template},
BOOKTITLE = {Proceedings of the Fifth International Conference on Document Analysis and Recognition},
ADDRESS = {Bangalore, India},
MONTH = sep,
YEAR = {1999},
HTTP = {http://www.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/ICDAR99/download.html}
}
@INPROCEEDINGS{sarkar:icpr2000,
AUTHOR = {P. Sarkar and G. Nagy},
TITLE = {Classification of style-constrained pattern-fields},
BOOKTITLE = {Proceedings of the fifteenth ICPR},
PUBLISHER = {IEEE Computer Society Press},
ADDRESS = {Barcelona},
PAGES = {859--862},
YEAR = {2000},
HTTP = {http://www.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/ICPR2000/download.html}
}
@mastersthesis{sarkar:msthesis,
  author = {Prateek Sarkar},
  title  = {Random Phase Spatial Sampling Effects in Digitized Patterns},
  year   = 1994,
  school = {Rensselaer Polytechnic Institute}
}
@phdthesis{sarkar:phdthesis,
  author  = {P. Sarkar},
  title   = {Style consistency in pattern fields},
  year    = 2000,
  school  = {Rensselaer Polytechnic Institute},
  address = {Troy, NY},
  http    = {http://www.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/PHDTHESIS/download.html}
}
@INCOLLECTION{zhou:IWVF97,
AUTHOR = {J. Y. Zhou and D. Lopresti and P. Sarkar and G. Nagy},
TITLE = {Spatial sampling effects on scanned {2-D} patterns},
BOOKTITLE = {Advances in Visual Forms Analysis},
EDITOR = {C. Arcelli and L. P. Cordella and G. S. di Baja},
PUBLISHER = {World Scientific},
ADDRESS = {Singapore},
YEAR = {1997},
HTTP = {http://www.parc.xerox.com/istl/members/psarkar/PUBLICATIONS/CAPRI97/download.html}
}
@INPROCEEDINGS{Toutanova2001:hierarchical,
AUTHOR = {Kristina Toutanova and Francine Chen and Kris Popat
and Thomas Hofmann},
TITLE = {Text classification in a hierarchical mixture model
for small training sets},
BOOKTITLE = {Proceedings of the ACM Conference on Information
and Knowledge Management (CIKM)},
YEAR = 2001,
MONTH = nov,
PS = {papers/ps/Toutanova2001_hierarchical.ps},
PDF = {papers/pdf/Toutanova2001_hierarchical.pdf},
ABSTRACT = {Documents are commonly categorized into hierarchies of topics, such as
the ones maintained by Yahoo! and the Open Directory project, in order
to facilitate browsing and other interactive forms of information
retrieval. In addition, topic hierarchies can be utilized to overcome
the sparseness problem in text categorization with a large number of
categories, which is the main focus of this paper. This paper presents
a {\em hierarchical mixture model} which extends the standard naive
Bayes classifier and previous hierarchical approaches. Improved
estimates of the term distributions are made by differentiation of
words in the hierarchy according to their level of
generality/specificity. Experiments on the Newsgroups and the
Reuters-21578 dataset indicate improved performance of the proposed
classifier in comparison to other state-of-the-art methods on datasets
with a small number of positive examples.}
}
@INPROCEEDINGS{Gaussier2002:hierarchical,
AUTHOR = {Eric Gaussier and Cyril Goutte and Kris Popat
and Francine Chen},
TITLE = {A hierarchical model for clustering and
categorising documents},
BOOKTITLE = {Proceedings of the 24th BCS-IRSG European Colloquium on
IR Research},
YEAR = 2002,
MONTH = mar,
NOTE = {to appear},
PSINTERNAL = {papers/ps/Gaussier2002_hierarchical.ps},
PDFINTERNAL = {papers/pdf/Gaussier2002_hierarchical.pdf},
ABSTRACT = {We propose a new hierarchical generative model for textual
data, where words may be generated by topic specific distributions at
any level in the hierarchy. This model is naturally well-suited to
clustering documents in preset or automatically generated hierarchies, as
well as categorising new documents in an existing hierarchy. Training
algorithms are derived for both cases, and illustrated on real data by
clustering news stories and categorising newsgroup messages. Finally,
the generative model may be used to derive a Fisher kernel expressing
similarity between documents.}
}
@INPROCEEDINGS{Kopec2002:ngram,
AUTHOR = {Gary E. Kopec and Maya R. Said and Kris Popat},
TITLE = {N-Gram Language Models for Document Image Decoding},
BOOKTITLE = {Proceedings of IS\&T/SPIE Electronic Imaging 2002: Document Recognition and Retrieval IX},
YEAR = 2002,
MONTH = jan,
PS = {papers/ps/Kopec2002_ngram.ps},
PDF = {papers/pdf/Kopec2002_ngram.pdf},
ABSTRACT = {This paper explores the problem of incorporating
linguistic constraints into document image decoding, a communication
theory approach to document recognition. Probabilistic character
n-grams (n=2--5) are used in a two-pass strategy where the decoder
first uses a very weak language model to generate a lattice of
candidate output strings. These are then re-scored in the second pass
using the full language model. Experimental results based on both
synthesized and scanned data show that this approach is capable of
improving the error rate by a factor of two to ten depending on the
quality of the data and the details of the language model used.}
}