public abstract class DocIndexer extends Object implements AutoCloseable
Modifier and Type | Field and Description |
---|---|
protected org.apache.lucene.document.Document |
currentLuceneDoc
The Lucene Document we're currently constructing (corresponds to the document
we're indexing)
|
protected String |
documentName
File we're currently parsing.
|
protected DocWriter |
docWriter |
protected static org.apache.logging.log4j.Logger |
logger |
static int |
MAX_DOCVALUES_LENGTH |
protected Map<String,List<String>> |
metadataFieldValues
Document metadata.
|
protected boolean |
omitNorms
Do we want to omit norms? (Default: yes)
|
protected Map<String,String> |
parameters
Parameters passed to this indexer
|
Constructor and Description |
---|
DocIndexer() |
Modifier and Type | Method and Description |
---|---|
void |
addMetadataField(String name,
String value) |
protected void |
addMetadataFieldsFromParameters()
If any metadata fields were supplied in the indexer parameters, add them now.
|
void |
addMetadataToDocument()
When all metadata values have been set, call this to add them to the Lucene document.
|
void |
addNumericFields(Collection<String> fields) |
protected void |
addToForwardIndex(AnnotatedFieldWriter field)
Add the field, with all its properties, to the forward index.
|
abstract void |
close() |
protected abstract int |
getCharacterPosition() |
org.apache.lucene.document.Document |
getCurrentLuceneDoc() |
DocWriter |
getDocWriter()
Returns our DocWriter object
|
List<String> |
getMetadataField(String name) |
FieldType |
getMetadataFieldTypeFromIndexerProperties(String fieldName)
Deprecated.
use a DocIndexerConfig-based indexer
|
String |
getParameter(String name)
Deprecated.
use a DocIndexerConfig-based indexer
|
boolean |
getParameter(String name,
boolean defaultValue)
Deprecated.
use a DocIndexerConfig-based indexer
|
int |
getParameter(String name,
int defaultValue)
Deprecated.
use a DocIndexerConfig-based indexer
|
String |
getParameter(String name,
String defaultValue)
Deprecated.
use ConfigInputFormat, IndexMetadata
|
AnnotationWriter.SensitivitySetting |
getSensitivitySetting(String annotationName)
Deprecated.
|
boolean |
hasParameter(String name)
Deprecated.
use a DocIndexerConfig-based indexer
|
abstract void |
index()
Index documents contained in a file.
|
protected org.apache.lucene.document.FieldType |
luceneTypeFromIndexMetadataType(FieldType type) |
protected String |
optTranslateFieldName(String from)
Translate a field name before adding it to the Lucene document.
|
abstract void |
reportCharsProcessed()
Report the amount of new characters processed since the last call
|
abstract void |
reportTokensProcessed()
Report the amount of new tokens processed since the last call
|
void |
setDocument(byte[] contents,
Charset cs)
Set the document to index.
|
void |
setDocument(File file,
Charset charset)
Set the document to index.
|
void |
setDocument(InputStream is,
Charset cs)
Set the document to index.
|
abstract void |
setDocument(Reader reader)
Set the document to index.
|
void |
setDocumentName(String documentName)
Set the file name of the document to index.
|
void |
setDocWriter(DocWriter docWriter)
Set the DocWriter object.
|
void |
setOmitNorms(boolean b)
Enables or disables norms.
|
void |
setParameter(String name,
String value)
Deprecated.
use a DocIndexerConfig-based indexer
|
void |
setParameters(Map<String,String> param)
Deprecated.
use a DocIndexerConfig-based indexer
|
protected boolean |
tokenizeField(String name) |
protected void |
warn(String msg) |
protected static final org.apache.logging.log4j.Logger logger
public static final int MAX_DOCVALUES_LENGTH
protected DocWriter docWriter
protected boolean omitNorms
protected String documentName
protected org.apache.lucene.document.Document currentLuceneDoc
protected Map<String,List<String>> metadataFieldValues
public abstract void close()
close
in interface AutoCloseable
public org.apache.lucene.document.Document getCurrentLuceneDoc()
public DocWriter getDocWriter()
public void setDocWriter(DocWriter docWriter)
docWriter
- our DocWriter object
public void setDocumentName(String documentName)
documentName
- name of the document
public abstract void setDocument(Reader reader)
reader
- document
public void setDocument(InputStream is, Charset cs)
is
- document contents
cs
- charset to use if no BOM found, or null for the default (utf-8)
public void setDocument(byte[] contents, Charset cs)
contents
- document contents
cs
- charset to use if no BOM found, or null for the default (utf-8)
public void setDocument(File file, Charset charset) throws FileNotFoundException
file
- file to index
charset
- charset to use if no BOM found, or null for the default (utf-8)
FileNotFoundException
- if not found
public abstract void index() throws IOException, MalformedInputFile, PluginException
MalformedInputFile
- if the input file wasn't valid
IOException
- if an I/O error occurred
PluginException
- if an error occurred in a plugin
@Deprecated public boolean hasParameter(String name)
name
- parameter name
@Deprecated public void setParameter(String name, String value)
name
- parameter name
value
- parameter value
@Deprecated public void setParameters(Map<String,String> param)
param
- the parameter names and values
@Deprecated public String getParameter(String name, String defaultValue)
name
- parameter name
defaultValue
- parameter default value
@Deprecated public String getParameter(String name)
name
- parameter name
@Deprecated public boolean getParameter(String name, boolean defaultValue)
name
- parameter name
defaultValue
- parameter default value
@Deprecated public int getParameter(String name, int defaultValue)
name
- parameter name
defaultValue
- parameter default value
protected boolean tokenizeField(String name)
@Deprecated public FieldType getMetadataFieldTypeFromIndexerProperties(String fieldName)
fieldName
- the field name
protected org.apache.lucene.document.FieldType luceneTypeFromIndexMetadataType(FieldType type)
public void setOmitNorms(boolean b)
b
- if true, doesn't store norms; if false, does store norms
public void addNumericFields(Collection<String> fields)
protected void warn(String msg)
protected String optTranslateFieldName(String from)
from
- original metadata field name
public void addMetadataToDocument()
protected void addMetadataFieldsFromParameters()
@Deprecated public AnnotationWriter.SensitivitySetting getSensitivitySetting(String annotationName)
protected void addToForwardIndex(AnnotatedFieldWriter field)
field
- field to add to the forward index
protected abstract int getCharacterPosition()
public abstract void reportCharsProcessed()
public abstract void reportTokensProcessed()
Copyright © 2020 Instituut voor Nederlandse Taal (INT). All rights reserved.