public class DocIndexerPlainText extends DocIndexerConfig
config
wordsDone
currentLuceneDoc, documentName, docWriter, logger, MAX_DOCVALUES_LENGTH, metadataFieldValues, omitNorms, parameters
Constructor and Description |
---|
DocIndexerPlainText() |
Modifier and Type | Method and Description |
---|---|
void |
close() |
protected int |
getCharacterPosition() |
void |
index()
Index documents contained in a file.
|
void |
indexSpecificDocument(String documentExpr)
Index a specific document.
|
void |
setConfigInputFormat(ConfigInputFormat config) |
void |
setDocument(byte[] contents,
Charset defaultCharset)
Set the document to index.
|
void |
setDocument(File file,
Charset defaultCharset)
Set the document to index.
|
void |
setDocument(InputStream is,
Charset defaultCharset)
Set the document to index.
|
void |
setDocument(Reader reader)
Set the document to index.
|
protected void |
storeDocument()
Store (or finish storing) the document in the content store.
|
fromConfig, getSensitivitySetting, init, opChatFormatAgeToMonths, optTranslateFieldName, processLinkedDocument, processMetadataValue, processString, processStringMultipleValues, replaceDollarRefs
addAnnotatedField, addEndChar, addMetadataField, addStartChar, annotation, beginWord, dedupe, endDocument, endWord, getAnnotatedField, getAnnotatedFields, getAnnotation, getContentStoreName, getCurrentTokenPosition, getMainAnnotatedField, getMetadataFetcher, indexLinkedDocument, inlineTag, isStoreDocuments, propMain, propPunct, propTags, punctuation, reportCharsProcessed, reportTokensProcessed, resolveFileReference, setAddDefaultPunctuation, setCurrentAnnotatedFieldName, setPreventNextDefaultPunctuation, setStoreDocuments, shouldAddDefaultPunctuation, startDocument, storeWholeDocument, trace, traceln
addMetadataFieldsFromParameters, addMetadataToDocument, addNumericFields, addToForwardIndex, getCurrentLuceneDoc, getDocWriter, getMetadataField, getMetadataFieldTypeFromIndexerProperties, getParameter, getParameter, getParameter, getParameter, getSensitivitySetting, hasParameter, luceneTypeFromIndexMetadataType, setDocumentName, setDocWriter, setOmitNorms, setParameter, setParameters, tokenizeField, warn
public void close() throws BlackLabRuntimeException
close
in interface AutoCloseable
close
in class DocIndexer
BlackLabRuntimeException
public void setConfigInputFormat(ConfigInputFormat config)
setConfigInputFormat
in class DocIndexerConfig
public void setDocument(File file, Charset defaultCharset) throws FileNotFoundException
DocIndexer
setDocument
in class DocIndexer
file
- file to index
defaultCharset
- charset to use if no BOM found, or null for the default (utf-8)
FileNotFoundException
- if not found
public void setDocument(byte[] contents, Charset defaultCharset)
DocIndexer
setDocument
in class DocIndexer
contents
- document contents
defaultCharset
- charset to use if no BOM found, or null for the default (utf-8)
public void setDocument(InputStream is, Charset defaultCharset)
DocIndexer
setDocument
in class DocIndexer
is
- document contents
defaultCharset
- charset to use if no BOM found, or null for the default (utf-8)
public void setDocument(Reader reader)
DocIndexer
setDocument
in class DocIndexer
reader
- document
public void index() throws MalformedInputFile, PluginException, IOException
DocIndexer
index
in class DocIndexerConfig
MalformedInputFile
- if the input file wasn't valid
PluginException
- if an error occurred in a plugin
IOException
- if an I/O error occurred
public void indexSpecificDocument(String documentExpr)
DocIndexerBase
indexSpecificDocument
in class DocIndexerConfig
documentExpr
- Expression (e.g. XPath) used to find the document to index in the file
protected void storeDocument()
DocIndexerBase
storeDocument
in class DocIndexerBase
protected int getCharacterPosition()
getCharacterPosition
in class DocIndexer
Copyright © 2020 Instituut voor Nederlandse Taal (INT). All rights reserved.