public final class LuceneUtil extends Object
Modifier and Type | Method and Description |
---|---|
static List<String> |
findTermsByPrefix(org.apache.lucene.index.IndexReader index,
String fieldName,
String prefix,
boolean sensitive)
Find terms in the index based on a prefix.
|
static List<String> |
findTermsByPrefix(org.apache.lucene.index.IndexReader index,
String fieldName,
String prefix,
boolean sensitive,
int maxResults)
Find terms in the index based on a prefix.
|
static List<String> |
getFieldTerms(org.apache.lucene.index.IndexReader index,
String fieldName)
Return the list of terms that occur in a field.
|
static List<String> |
getFieldTerms(org.apache.lucene.index.IndexReader index,
String fieldName,
int maxResults)
Return the list of terms that occur in a field.
|
static void |
getFrequenciesFromTermVector(org.apache.lucene.index.IndexReader reader,
int doc,
String luceneName,
Map<String,Integer> freq)
Add term frequencies for a single document to a frequency map.
|
static org.apache.lucene.index.IndexWriterConfig |
getIndexWriterConfig(org.apache.lucene.analysis.Analyzer analyzer,
boolean create) |
static Set<String> |
getMatchingTermsFromIndex(org.apache.lucene.index.IndexReader reader,
String luceneName,
Collection<String> searchTerms,
int maxEdits)
Get all the terms in the index with low edit distance from the supplied term.
|
static Map<String,Set<String>> |
getOldSingleFieldSubprops(org.apache.lucene.index.IndexReader index,
String fieldName)
Enumerate all the terms in the given Lucene field, collecting all the
subproperty names and values.
|
static long |
getSumTotalTermFreq(org.apache.lucene.index.IndexReader reader,
String luceneField) |
static String[] |
getWordsFromTermVector(org.apache.lucene.index.IndexReader reader,
int doc,
String luceneName,
int start,
int end)
Get all words between the specified start and end positions from the term
vector.
|
static String[] |
getWordsFromTermVector(org.apache.lucene.index.IndexReader reader,
int doc,
String luceneName,
int start,
int end,
boolean partialOk)
Get all words between the specified start and end positions from the term
vector.
|
static org.apache.lucene.search.Query |
parseLuceneQuery(String luceneQuery,
org.apache.lucene.analysis.Analyzer analyzer,
String defaultField)
Parse a query in the Lucene query language format (QueryParser supplied with
Lucene).
|
static Map<String,Integer> |
termFrequencies(org.apache.lucene.search.IndexSearcher indexSearcher,
org.apache.lucene.search.Query documentFilterQuery,
AnnotationSensitivity annotSensitivity,
Set<String> searchTerms) |
public static Set<String> getMatchingTermsFromIndex(org.apache.lucene.index.IndexReader reader, String luceneName, Collection<String> searchTerms, int maxEdits)
Parameters:
reader - the index
luceneName - the field to search in
searchTerms - search terms
maxEdits - maximum edit distance (Levenshtein algorithm) for matches (i.e. lower is more similar)
Throws:
BooleanQuery.TooManyClauses - if the expansion resulted in too many terms

public static org.apache.lucene.search.Query parseLuceneQuery(String luceneQuery, org.apache.lucene.analysis.Analyzer analyzer, String defaultField) throws org.apache.lucene.queryparser.classic.ParseException
Parameters:
luceneQuery - the query string
analyzer - analyzer to use
defaultField - default search field
Throws:
org.apache.lucene.queryparser.classic.ParseException - on syntax error

public static String[] getWordsFromTermVector(org.apache.lucene.index.IndexReader reader, int doc, String luceneName, int start, int end)
Parameters:
reader - the index
doc - doc id
luceneName - the index field from which to use the term vector
start - start position (first word we want to request)
end - end position (last word we want to request)

public static String[] getWordsFromTermVector(org.apache.lucene.index.IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk)
Parameters:
reader - the index
doc - doc id
luceneName - the index field from which to use the term vector
start - start position (first word we want to request)
end - end position (last word we want to request)
partialOk - is it okay if we're missing words in the middle, or do we need them all? (debug)

public static void getFrequenciesFromTermVector(org.apache.lucene.index.IndexReader reader, int doc, String luceneName, Map<String,Integer> freq)
Parameters:
reader - the index
doc - doc id
luceneName - the index field from which to use the term vector
freq - where to add to the token frequencies

public static List<String> getFieldTerms(org.apache.lucene.index.IndexReader index, String fieldName)
Parameters:
index - the index
fieldName - the field

public static List<String> getFieldTerms(org.apache.lucene.index.IndexReader index, String fieldName, int maxResults)
Parameters:
index - the index
fieldName - the field
maxResults - maximum number to return (or -1 for no limit)

public static List<String> findTermsByPrefix(org.apache.lucene.index.IndexReader index, String fieldName, String prefix, boolean sensitive)
Parameters:
index - the index
fieldName - the field
prefix - the prefix we're looking for
sensitive - match case-sensitively or not?

public static List<String> findTermsByPrefix(org.apache.lucene.index.IndexReader index, String fieldName, String prefix, boolean sensitive, int maxResults)
Parameters:
index - the index
fieldName - the field
prefix - the prefix we're looking for (null or empty string for all terms)
sensitive - match case-sensitively or not?
maxResults - max. number of results to return (or -1 for all)

public static Map<String,Integer> termFrequencies(org.apache.lucene.search.IndexSearcher indexSearcher, org.apache.lucene.search.Query documentFilterQuery, AnnotationSensitivity annotSensitivity, Set<String> searchTerms)
Parameters:
indexSearcher -
documentFilterQuery - document filter, or null for all documents
annotSensitivity - field to get frequencies for
searchTerms - list of terms to get frequencies for, or null for all terms

public static org.apache.lucene.index.IndexWriterConfig getIndexWriterConfig(org.apache.lucene.analysis.Analyzer analyzer, boolean create)
public static long getSumTotalTermFreq(org.apache.lucene.index.IndexReader reader, String luceneField)
public static Map<String,Set<String>> getOldSingleFieldSubprops(org.apache.lucene.index.IndexReader index, String fieldName)
Parameters:
index - our index
fieldName - field in the Lucene index to enumerate terms from

Copyright © 2020 Instituut voor Nederlandse Taal (INT). All rights reserved.