Commit fab71c40 authored by Christian Reuschling's avatar Christian Reuschling
Browse files

Migration to Tika 2.0.0 over Leechcrawler 2.0.0 | changed version to 2.8-SNAPSHOT

parent 8f0dde69
......@@ -5,7 +5,7 @@
<groupId>dfki.sds.dynaq</groupId>
<artifactId>dynaq</artifactId>
<packaging>jar</packaging>
<version>2.7</version>
<version>2.8-SNAPSHOT</version>
<name>dynaq</name>
<url>http://dynaq.opendfki.de</url>
......@@ -97,7 +97,7 @@
<dependency>
<groupId>de.dfki.sds</groupId>
<artifactId>leechcrawler</artifactId>
<version>1.26.2</version>
<version>2.0.0</version>
<exclusions>
<exclusion>
<artifactId>asm-debug-all</artifactId>
......
......@@ -22,6 +22,8 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.TikaCoreProperties;
import org.dynaq.core.DynaQDocument;
import org.dynaq.core.DynaQException;
import org.dynaq.index.LuceneIndexSet;
......@@ -402,7 +404,7 @@ public class AttributeConfig
String BUZZWORDS = "buzzwords";
String CREATOR = Metadata.CREATOR;
String CREATOR = "creator";
String DYNAQ_CATEGORY = "dynaqCategory";
......@@ -418,9 +420,11 @@ public class AttributeConfig
String MODIFIED = "modified";
String CREATED = "created";
String SIGNIFICANT_TIME = "dynaqSignificantTime";
String PAGE_COUNT = Metadata.PAGE_COUNT.getName();
String PAGE_COUNT = "pageCount";
String POSTPROCESSED = "isPostProcessed";
......@@ -429,13 +433,15 @@ public class AttributeConfig
String SOURCE_URI = Metadata.SOURCE;
String TITLE = Metadata.TITLE;
String TITLE = "title";
String ID = "leechId";
String USER_ANNOTATION = "userannotation";
String DOCUMENT_FREQUENCY_CLASS = "documentFrequencyClass";
String LANGUAGE = "language";
}
......@@ -629,7 +635,6 @@ public class AttributeConfig
Arrays.fill(fields, targetAttributeName);
try
{
document.getLuceneIndexSet();
Query targetAttributeQuery = MultiFieldQueryParser.parse(Version.LUCENE_34, queries, fields, LuceneIndexSet.getDynaQAnalyzer());
targetAttributeQuery.setBoost(boost);
stringAttributeQuery.add(targetAttributeQuery, Occur.SHOULD);
......@@ -643,7 +648,7 @@ public class AttributeConfig
}
}
Set<Term> hsTerms = new HashSet<Term>();
Set<Term> hsTerms = new HashSet<>();
stringAttributeQuery.extractTerms(hsTerms);
if(hsTerms.size() == 0) return new BooleanQuery();
......@@ -652,58 +657,6 @@ public class AttributeConfig
// private Query createTimeAttributeQuery(DynaQDocument document, String attributeName, Set<String> targetAttributeNames,
// AttributeQueryParameters parameters)
// {
//
// BooleanQuery timeAttributeQuery = new BooleanQuery();
// timeAttributeQuery.setBoost(parameters.getBoost());
//
// // For each expanded attribute
// // For each of the expanded attribute's values
// for (String value : document.getAttributeValues(attributeName))
// {
// int intValue = 0;
// try
// {
// intValue = Integer.parseInt(value);
// }
// catch (NumberFormatException e)
// {
// continue;
// }
// long minValue = Math.max(intValue - parameters.getLowerRange(), 0);
// long maxValue = Math.min(intValue + parameters.getUpperRange(), 23 * 60 + 59);
// long minHours = minValue / 60;
// long maxHours = maxValue / 60;
// long minMinutes = minValue % 60;
// long maxMinutes = maxValue % 60;
// String lowerBound = (minHours < 10 ? "0" : "") + minHours + (minMinutes < 10 ? "0" : "") + minMinutes;
// String upperBound = (maxHours < 10 ? "0" : "") + maxHours + (maxMinutes < 10 ? "0" : "") + maxMinutes;
//
// // For each target attribute
// for (String targetAttributeName : targetAttributeNames)
// {
// String type = getAttributeType(attributeName);
// if(type.equals(AttributeTypes.Time_LuceneFormat))
// {
// timeAttributeQuery.add(new TermRangeQuery(targetAttributeName, lowerBound, upperBound, true, true), Occur.SHOULD);
// }
// else if(type.equals(AttributeTypes.Time_FastIndexMapping))
// {
// timeAttributeQuery.add(new FastTimeRangeQuery(targetAttributeName, lowerBound, upperBound, true), Occur.SHOULD);
// }
// }
// }
//
// // Return expanded query.
// Logger.getLogger(AttributeConfig.class.getName()).info(timeAttributeQuery.toString());
//
// return timeAttributeQuery;
// }
/**
* Gets the whole attributes descriptions
*
......@@ -824,50 +777,10 @@ public class AttributeConfig
return AttributeTypes.String;
}
//
//
//
// String strDefaultFieldType = m_indexerConfig.getUniqueAsString("default.field.type");
//
// String strFieldType = null;
//
// MultiValueConfiguration attributeMappingsConf = m_indexerConfig.getUniqueAsConfiguration("attributeMappings");
// MultiValueConfiguration mappings4Attribute = null;
// if(attributeMappingsConf != null) mappings4Attribute = attributeMappingsConf.getFirstAsConfiguration(strAttName);
// if(mappings4Attribute != null) strFieldType = mappings4Attribute.getFirstAsString("field.type");
// if(strFieldType == null) strFieldType = strDefaultFieldType;
//
// return strFieldType.toUpperCase();
}
// /**
// * Gets a mapping indexAttributeName => configured Labels.
// *
// * @return a mapping indexAttributeName => configured Labels.
// *
// * @throws ConfigurationException
// */
// public LinkedHashMap<String, String> getIndexAttName2Labels() throws ConfigurationException
// {
// LinkedHashMap<String, String> hsAttName2Labels = new LinkedHashMap<String, String>();
//
// for (MultiValueConfiguration attDesc : getAttDescriptionInstances())
// {
// for (String attName : attDesc.getAllAsString(DescAttributes.indexAttName))
// {
// MultiValueConfiguration attConf = getAttributeConfiguration().getUniqueAsConfiguration(attName);
// String attLabel = attConf.getUniqueAsString(DescAttributes.label);
// hsAttName2Labels.put(attName, attLabel);
// }
// }
//
// return hsAttName2Labels;
// }
/**
* Gets the configured label for an index attribute
*
......@@ -908,39 +821,6 @@ public class AttributeConfig
// /**
// * Gets the configured lengths of the number attributes
// *
// * @return the configured lengths of the number attributes
// *
// * @throws ConfigurationException
// */
// public HashMap<String, Integer> getNumberAttributeLengths() throws ConfigurationException
// {
// HashMap<String, Integer> map = new HashMap<String, Integer>();
// // wir wollen ein Mapping AttributName=>NumberLength
// for (Entry<String, ConfigurationValue> configEntry : m_attributeConfig.entryList())
// {
// // alles ausser den AttributeDescriptions sind Characteristica
// if(configEntry.getKey().equals(DescAttributes.attributeDescription)) continue;
//
// String strType = configEntry.getValue().getAsConfiguration().getUniqueAsString(DescAttributes.type);
// if(!strType.equals(AttributeTypes.Number)) continue;
//
// String strAttName = configEntry.getKey();
// String strMaxNumberLength =
// configEntry.getValue().getAsConfiguration().getUnique(DescAttributes.type).getDescription()
// .getUniqueAsString(DescAttributes.maxNumberLength);
//
// map.put(strAttName, Integer.decode(strMaxNumberLength));
// }
//
//
// return map;
// }
private String[] getQueries4ExtractedBuzzwords(DynaQDocument document, String strAttName)
{
......@@ -1044,7 +924,7 @@ public class AttributeConfig
Collection<Boolean> colModifable = m_attributeConfig.getUniqueAsConfiguration(strAttName).getAllAsBoolean(AttributeConfig.DescAttributes.modifiable);
if(colModifable == null) continue;
if(colModifable.size() == 0) continue;
if(colModifable.iterator().next().booleanValue() == true) sModifiableIndexAttNames.add(strAttName);
if(colModifable.iterator().next()) sModifiableIndexAttNames.add(strAttName);
}
......@@ -1066,7 +946,7 @@ public class AttributeConfig
m_attributeConfig.getUniqueAsConfiguration(strAttName).getAllAsBoolean(AttributeConfig.DescAttributes.resultDocVisualization);
if(colResultDocImageAtt == null) continue;
if(colResultDocImageAtt.size() == 0) continue;
if(colResultDocImageAtt.iterator().next().booleanValue() == true) sResultDocumentImageAttNames.add(strAttName);
if(colResultDocImageAtt.iterator().next()) sResultDocumentImageAttNames.add(strAttName);
}
......
......@@ -2,7 +2,6 @@ package org.dynaq.documents.clustering;
import com.mchange.io.impl.SuffixFilenameFilter;
import de.dfki.inquisitor.collections.CollectionUtilz;
import de.dfki.inquisitor.collections.ThreeValuesBox;
import de.dfki.inquisitor.file.FileUtilz;
......@@ -10,6 +9,7 @@ import de.dfki.inquisitor.text.StringUtils;
import de.dfki.inquisitor.ui.SwingUtils;
import de.dfki.inquisitor.ui.color.ColorFactory;
import de.dfki.inquisitor.ui.tablelayout.TableLayoutUtil;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.language.LanguageComponents;
......@@ -380,7 +380,7 @@ public class DynaQClusterMapPanel extends JPanel implements ActionListener, RCPP
{
FileDialog dialog = new FileDialog((Frame) null, "Load blacklist text file");
dialog.setMode(FileDialog.LOAD);
dialog.setFilenameFilter(new SuffixFilenameFilter("", 0));
dialog.setFilenameFilter(new SuffixFileFilter(""));
dialog.setVisible(true);
if(dialog.getFile() != null)
m_blackListTextArea.setText(FileUtilz.file2String(dialog.getDirectory() + '/' + dialog.getFile()));
......@@ -389,7 +389,7 @@ public class DynaQClusterMapPanel extends JPanel implements ActionListener, RCPP
{
FileDialog dialog = new FileDialog((Frame) null, "Save blacklist text file");
dialog.setMode(FileDialog.SAVE);
dialog.setFilenameFilter(new SuffixFilenameFilter("", 0));
dialog.setFilenameFilter(new SuffixFileFilter(""));
dialog.setVisible(true);
if(dialog.getFile() != null)
......
......@@ -97,6 +97,9 @@ public class Indexer
}
protected static FieldConfig m_fieldConfig;
static protected HashMap<String, CrawlerContext> m_hsIndexPath2CurrentLeechContext = new HashMap<String, CrawlerContext>();
......@@ -505,6 +508,7 @@ public class Indexer
*
* @return true in the case everything worked, false in the case of an error.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
public static boolean index(String strURLOrSourceString, MultiValueHashMap<String, String> hsAttNames2Values, int iCrawlingDepth, URLFilter urlFilter,
boolean bUseIndexingHistory, LuceneIndexSet luceneIndexSet) throws Exception
{
......@@ -546,10 +550,10 @@ public class Indexer
}
toLuceneContentHandler.setStaticAttributeValuePairs(hsStaticAttValuePairs);
// TODO momentan habe ich keine Möglichkeit zB Attribute zu kopieren
// toLuceneContentHandler.setFieldAggregationMap();
// toLuceneContentHandler.setFieldCopyMap();
// toLuceneContentHandler.setFieldNames2Ignore();
toLuceneContentHandler.setFieldCopyMap(m_indexerConfig.getUniqueAsMultiValueMap("copyAttributes"));
toLuceneContentHandler.setFieldAggregationMap(m_indexerConfig.getUniqueAsMultiValueMap("aggregateAttributes"));
toLuceneContentHandler.setFieldNames2Ignore(new HashSet<>(m_indexerConfig.getUniqueAsParsedEnumString("ignoreAttributes", ",")));
// TODO currently there is no way to set this via the config
// toLuceneContentHandler.setIgnoreAllDocsWithout();
......
......@@ -10,6 +10,10 @@ package org.dynaq.util.lucene.basic;
import org.dynaq.config.AttributeConfig;
public class DynaQDefaultFieldConfig extends FieldConfig
{
......@@ -28,7 +32,7 @@ public class DynaQDefaultFieldConfig extends FieldConfig
this.fieldName2FieldType.put("isPostProcessed", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("Content-Encoding", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("content-language", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("dc:language", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put(AttributeConfig.IndexAttributes.LANGUAGE, DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("dataEntityContentFingerprint", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("dataEntityId", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("masterDataEntityId", DynamicFieldType.keywordFieldType);
......@@ -39,6 +43,7 @@ public class DynaQDefaultFieldConfig extends FieldConfig
this.fieldName2FieldType.put("authorContinent", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("country", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("source", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put(AttributeConfig.IndexAttributes.SOURCE, DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("globalSource", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("continentCode", DynamicFieldType.keywordFieldType);
this.fieldName2FieldType.put("countryCode", DynamicFieldType.keywordFieldType);
......@@ -68,6 +73,8 @@ public class DynaQDefaultFieldConfig extends FieldConfig
this.fieldName2FieldType.put("Page-Count", DynamicFieldType.integerFieldType);
this.fieldName2FieldType.put(AttributeConfig.IndexAttributes.PAGE_COUNT, DynamicFieldType.integerFieldType);
this.fieldName2FieldType.put("Word-Count", DynamicFieldType.integerFieldType);
this.fieldName2FieldType.put("documentFrequencyClass", DynamicFieldType.integerFieldType);
this.fieldName2FieldType.put("Image Count", DynamicFieldType.integerFieldType);
......@@ -76,7 +83,9 @@ public class DynaQDefaultFieldConfig extends FieldConfig
this.fieldName2FieldType.put("Character Count", DynamicFieldType.longFieldType);
this.fieldName2FieldType.put("modified", DynamicFieldType.dateFieldType);
this.fieldName2FieldType.put(AttributeConfig.IndexAttributes.MODIFIED, DynamicFieldType.dateFieldType);
this.fieldName2FieldType.put("Creation-Date", DynamicFieldType.dateFieldType);
this.fieldName2FieldType.put(AttributeConfig.IndexAttributes.CREATED, DynamicFieldType.dateFieldType);
this.fieldName2FieldType.put("Last-Modified", DynamicFieldType.dateFieldType);
this.fieldName2FieldType.put("date", DynamicFieldType.dateFieldType);
this.fieldName2FieldType.put("dynaqSignificantDate", DynamicFieldType.dateFieldType);
......
......@@ -211,7 +211,7 @@ urn:catwiesel:attribute:imdb_attributeuris.movie_also_known_as=
dynaqSignificantDate=
{
label=Modified date
label=Date
attValue2QueryMode=GetAllAsKeyword
}
......@@ -221,7 +221,7 @@ modified=
attValue2QueryMode=GetAllAsKeyword
}
Page-Count=
pageCount=
{
label=Page count
attValue2QueryMode=GetAllAsKeyword
......@@ -430,7 +430,7 @@ attributeDescription=
attributeDescription=
{
label=Date
indexAttName=modified
indexAttName=dynaqSignificantDate
searchRepresentationSection=General
guiRepresentation=RangeSliderFilter
}
......@@ -438,7 +438,7 @@ attributeDescription=
attributeDescription=
{
label=Pages
indexAttName=Page-Count
indexAttName=pageCount
searchRepresentationSection=General
guiRepresentation=RangeSliderFilter
}
......
......@@ -27,6 +27,35 @@
#
#
#
#### Copy attributes
# copyAttributes=
# {
# <sourceAttName1>=<targetAttName1>
# <sourceAttName1>=<targetAttName2>
# <sourceAttName2>=<targetAttName3>
# <sourceAttName2>=<targetAttName1>
# <sourceAttNameN>=<targetAttNameN>
# }
#
#### Aggregate attributes. This generates a new field entry whose value is copied from another, existing metadata entry.
# You can specify a list of source attributes; the first one that has an entry wins and appears as the new attribute, so the source field name list is
# in fact a prioritized list.
# aggregateAttributes=
# {
# <targetAttName1>=<sourceAttName1>
# <targetAttName1>=<sourceAttName2>
# <targetAttName1>=<sourceAttName3>
# <targetAttName2>=<sourceAttName1>
# <targetAttNameN>=<sourceAttNameN>
# }
#
#
#
#### Ignore attributes (can still be copied)
# ignoreAttributes=attName1, attName2, attNameN
#
#
#
#### Crawling: A suburl during crawling will be indexed when it matches at least one of the include patterns but none of the exclude patterns.
# In case no include patterns are specified, all suburls that don't match any of the exclude patterns are included.
# These are the default inclusion/exclusion patterns, which apply to ALL indexed urls (filesystem, web, etc.). You can specify as many as you want.
......@@ -61,6 +90,25 @@ staticAttributeValuePairs=
# testAtt4Me=blubber_HitzliPutzli
}
copyAttributes=
{
dc:title=title
dc:creator=creator
dcterms:modified=modified
dcterms:created=created
xmpTPg:NPages=pageCount
dc:language=language
}
aggregateAttributes=
{
dynaqSignificantDate=dcterms:modified
dynaqSignificantDate=modified
dynaqSignificantDate=dcterms:created
dynaqSignificantDate=created
}
ignoreAttributes=dc:title, dc:creator, dcterms:modified, dcterms:created, xmpTPg:NPages, dc:language
defaultURLFilter=
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment