Commit 3c0e7ccc authored by Marcel's avatar Marcel
Browse files

Levenshtein StringFct

parent 0b758eed
......@@ -27,6 +27,7 @@
* endOfLic */
package de.dfki.mycbr.core.similarity;
import java.io.Console;
import java.util.HashSet;
import java.util.Observable;
......@@ -41,31 +42,94 @@ import de.dfki.mycbr.core.similarity.config.MultipleConfig;
import de.dfki.mycbr.core.similarity.config.StringConfig;
/**
* Calculates the similarity of two strings
* for a given description. At the moment only syntactic equality
* is supported.
* Calculates the similarity of two strings for a given description. At the
* moment only syntactic equality is supported.
*
* @author myCBR Team
*/
*/
public class StringFct extends Observable implements ISimFct {
private StringDesc desc;
private StringConfig config;
// if config == NGRAM
private int n = 3;
private int levenshteinDelCost = 1;
private int levenshteinAddCost = 1;
private int levenshteinChangeCost = 1;
/**
* @return the levenshteinDelCost
*/
public int getLevenshteinDelCost() {
return levenshteinDelCost;
}
/**
* @param levenshteinDelCost
* the levenshteinDelCost to set
*/
public void setLevenshteinDelCost(int levenshteinDelCost) {
if (levenshteinDelCost < 0) {
throw new UnsupportedOperationException ();
} else {
this.levenshteinDelCost = levenshteinDelCost;
}
}
/**
* @return the levenshteinAddCost
*/
public int getLevenshteinAddCost() {
return levenshteinAddCost;
}
/**
* @param levenshteinAddCost
* the levenshteinAddCost to set
*/
public void setLevenshteinAddCost(int levenshteinAddCost) {
if (levenshteinAddCost < 0) {
throw new UnsupportedOperationException ();
} else {
this.levenshteinAddCost = levenshteinAddCost;
}
}
/**
* @return the levenshteinChangeCost
*/
public int getLevenshteinChangeCost() {
return levenshteinChangeCost;
}
/**
* @param levenshteinChangeCost
* the levenshteinChangeCost to set
*/
public void setLevenshteinChangeCost(int levenshteinChangeCost) {
if (levenshteinChangeCost < 0) {
throw new UnsupportedOperationException ();
} else {
this.levenshteinChangeCost = levenshteinChangeCost;
}
}
private boolean caseSensitive = true;
private String name;
private Project prj;
private MultipleConfig mc = MultipleConfig.DEFAULT_CONFIG;
/**
* Initializes this with the given description.
* @param desc the description of the
*
* @param desc
* the description of the
*/
public StringFct (Project prj, StringConfig config, StringDesc desc, String name) {
public StringFct(Project prj, StringConfig config, StringDesc desc,
String name) {
this.prj = prj;
this.desc = desc;
this.config = config;
......@@ -73,88 +137,153 @@ public class StringFct extends Observable implements ISimFct {
}
/**
* Calculates the similarity of the given attributes.
* Returns null if an error occurs.
* @return 1.00 if the given attributes are equal, invalid similarity if
* an error occurs, else 0.00
* @throws Exception
* Calculates the similarity of the given attributes. Returns null if an
* error occurs.
*
* @return 1.00 if the given attributes are equal, invalid similarity if an
* error occurs, else 0.00
* @throws Exception
*/
@Override
public Similarity calculateSimilarity(Attribute value1, Attribute value2) throws Exception {
if (value1 instanceof MultipleAttribute<?> && value2 instanceof MultipleAttribute<?>) {
return prj
.calculateMultipleAttributeSimilarity(this,((MultipleAttribute<?>)value1), (MultipleAttribute<?>)value2);
} else if (!( value1 instanceof SpecialAttribute || value2 instanceof SpecialAttribute )) {
String v1 = ((StringAttribute)value1).toString();
String v2 = ((StringAttribute)value2).toString();
public Similarity calculateSimilarity(Attribute value1, Attribute value2)
throws Exception {
if (value1 instanceof MultipleAttribute<?>
&& value2 instanceof MultipleAttribute<?>) {
return prj.calculateMultipleAttributeSimilarity(this,
((MultipleAttribute<?>) value1),
(MultipleAttribute<?>) value2);
} else if (!(value1 instanceof SpecialAttribute || value2 instanceof SpecialAttribute)) {
String v1 = ((StringAttribute) value1).toString();
String v2 = ((StringAttribute) value2).toString();
if (!caseSensitive) {
v1 = v1.toLowerCase();
v2 = v2.toLowerCase();
}
switch(config) {
case EQUALITY: return v1.equals(v2) ? Similarity.get(1.00) : Similarity.get(0.00);
case NGRAM: return nGramSimilarity(v1,v2);
default: return Similarity.INVALID_SIM;
switch (config) {
case EQUALITY:
return v1.equals(v2) ? Similarity.get(1.00) : Similarity
.get(0.00);
case NGRAM:
return nGramSimilarity(v1, v2);
case LEVENSHTEIN:
return levenshteinSimilarity(v1, v2);
default:
return Similarity.INVALID_SIM;
}
} else {
return prj.calculateSpecialSimilarity(value1, value2);
}
}
/**
* Computes the NGRAM similarity of the given strings.
* The similarity results in the ration of common NGRAMs divided
* by all occurring NGRAMs. An NGRAM is a substring of length n
* of the given string.
* @param v1 the first string
* @param v2 the second string
* Computes the NGRAM similarity of the given strings. The similarity
* results in the ration of common NGRAMs divided by all occurring NGRAMs.
* An NGRAM is a substring of length n of the given string.
*
* @param v1
* the first string
* @param v2
* the second string
* @return the NGRAM similarity of the given strings
*/
@SuppressWarnings("unchecked")
private Similarity nGramSimilarity(String v1, String v2) {
HashSet<String> ngramsV1 = new HashSet<String>();
HashSet<String> ngramsV2 = new HashSet<String>();
// get all n grams of first string
for (int i = 0; i<v1.length()-n+1; i++) {
ngramsV1.add(v1.substring(i, i+n));
for (int i = 0; i < v1.length() - n + 1; i++) {
ngramsV1.add(v1.substring(i, i + n));
}
// get all n grams of second string
for (int i = 0; i<v2.length()-n+1; i++) {
ngramsV2.add(v2.substring(i, i+n));
for (int i = 0; i < v2.length() - n + 1; i++) {
ngramsV2.add(v2.substring(i, i + n));
}
// get # all occurring NGRAMS (without repetition)
HashSet<String> union = (HashSet<String>)ngramsV1.clone();
HashSet<String> union = (HashSet<String>) ngramsV1.clone();
union.addAll(ngramsV2);
float f2 = union.size();
// get # common NGRAMS (without repetition)
ngramsV2.retainAll(ngramsV1);
float f1 = ngramsV2.size();
// compute ratio
Similarity res = Similarity.get(new Double(f1/f2));
Similarity res = Similarity.get(new Double(f1 / f2));
return res;
}
private Similarity levenshteinSimilarity(String s0, String s1) {
// Source: http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Java
int len0 = s0.length() + 1;
int len1 = s1.length() + 1;
// the array of distances
int[] cost = new int[len0];
int[] newcost = new int[len0];
// initial cost of skipping prefix in String s0
for (int i = 0; i < len0; i++) cost[i] = i;
// dynamicaly computing the array of distances
// transformation cost for each letter in s1
for (int j = 1; j < len1; j++) {
// initial cost of skipping prefix in String s1
newcost[0] = j;
// transformation cost for each letter in s0
for(int i = 1; i < len0; i++) {
// matching current letters in both strings
int match = (s0.charAt(i - 1) == s1.charAt(j - 1)) ? 0 : getLevenshteinChangeCost();
// computing cost for each transformation
int cost_replace = cost[i - 1] + match;
int cost_insert = cost[i] + getLevenshteinAddCost();
int cost_delete = newcost[i - 1] + getLevenshteinDelCost();
// keep minimum cost
newcost[i] = Math.min(Math.min(cost_insert, cost_delete), cost_replace);
}
// swap cost/newcost arrays
int[] swap = cost;
cost = newcost;
newcost = swap;
}
double maxLength = s0.length() > s1.length() ? s0.length() : s1.length();
double result = 1 - (cost[len0 - 1] / maxLength);
Similarity sim = Similarity.get(result);
return sim;
}
/**
* Returns whether this function is symmetric.
* Equality and NGRAM similarity is always symmetric.
* Returns whether this function is symmetric. Equalitym, NGRAM and Levenshtein similarity
* is always symmetric.
*
* @return true
*/
public boolean isSymmetric() {
return true;
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see de.dfki.mycbr.core.similarity.ISimFct#setSymmetric(boolean)
*/
@Override
......@@ -163,10 +292,12 @@ public class StringFct extends Observable implements ISimFct {
* Not implemented because at the moment only symmetric string
* functions are supported.
*/
public void setSymmetric(boolean symmetric) {}
public void setSymmetric(boolean symmetric) {
}
/**
* Returns the configuration of this function.
*
* @return the configuration describing how to compute similarity
*/
public StringConfig getConfig() {
......@@ -174,36 +305,42 @@ public class StringFct extends Observable implements ISimFct {
}
/**
* Returns the parameter n for computing NGRAM similarity.
* N specifies the length of the substrings to be compared.
* Returns the parameter n for computing NGRAM similarity. N specifies the
* length of the substrings to be compared.
*
* @return the parameter n
*/
public int getN() {
return n;
}
/**
* Sets the parameter n for computing NGRAM similarity.
* N specifies the length of the substrings to be compared.
* @param n the new parameter n
* Sets the parameter n for computing NGRAM similarity. N specifies the
* length of the substrings to be compared.
*
* @param n
* the new parameter n
*/
public void setN(int n) {
this.n = n;
setChanged();
notifyObservers();
}
/**
* Returns whether this function is case sensitive.
*
* @return true, if function is case sensitive, false otherwise
*/
public boolean isCaseSensitive() {
return caseSensitive;
}
/**
* Specifies whether this function is case sensitive.
* @param caseSensitive true, if function is case sensitive, false otherwise
*
* @param caseSensitive
* true, if function is case sensitive, false otherwise
*/
public void setCaseSensitive(boolean caseSensitive) {
if (caseSensitive != this.caseSensitive) {
......@@ -212,8 +349,10 @@ public class StringFct extends Observable implements ISimFct {
this.caseSensitive = caseSensitive;
notifyObservers();
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see de.dfki.mycbr.core.similarity.ISimFct#getName()
*/
@Override
......@@ -225,7 +364,9 @@ public class StringFct extends Observable implements ISimFct {
return name;
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see de.dfki.mycbr.core.similarity.ISimFct#setName(java.lang.String)
*/
@Override
......@@ -242,8 +383,10 @@ public class StringFct extends Observable implements ISimFct {
notifyObservers();
}
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see de.dfki.mycbr.core.similarity.ISimFct#getDescription()
*/
@Override
......@@ -256,7 +399,9 @@ public class StringFct extends Observable implements ISimFct {
return desc;
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see de.dfki.mycbr.core.similarity.ISimFct#getProject()
*/
@Override
......@@ -264,7 +409,9 @@ public class StringFct extends Observable implements ISimFct {
return prj;
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see de.dfki.mycbr.core.similarity.ISimFct#getMultipleConfig()
*/
@Override
......@@ -272,7 +419,9 @@ public class StringFct extends Observable implements ISimFct {
return mc;
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see de.dfki.mycbr.core.similarity.ISimFct#setMultipleConfig()
*/
@Override
......@@ -282,12 +431,17 @@ public class StringFct extends Observable implements ISimFct {
notifyObservers();
}
/* (non-Javadoc)
* @see de.dfki.mycbr.core.similarity.ISimFct#clone(de.dfki.mycbr.core.model.AttributeDesc, boolean)
/*
* (non-Javadoc)
*
* @see
* de.dfki.mycbr.core.similarity.ISimFct#clone(de.dfki.mycbr.core.model.
* AttributeDesc, boolean)
*/
@Override
public void clone(AttributeDesc descNEW, boolean active) {
if (descNEW instanceof StringDesc && !name.equals(Project.DEFAULT_FCT_NAME)) {
if (descNEW instanceof StringDesc
&& !name.equals(Project.DEFAULT_FCT_NAME)) {
StringFct f = desc.addStringFct(config, name, active);
f.caseSensitive = this.caseSensitive;
f.mc = this.mc;
......@@ -295,7 +449,9 @@ public class StringFct extends Observable implements ISimFct {
}
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see java.util.Observer#update(java.util.Observable, java.lang.Object)
*/
@Override
......
......@@ -38,5 +38,5 @@ public enum StringConfig {
/**
*
*/
EQUALITY, NGRAM;
EQUALITY, NGRAM, LEVENSHTEIN;
}
......@@ -69,12 +69,91 @@ public class StringFunctionTest extends TestCase {
try {
TestFramework frame = new TestFramework();
StringFct f = frame.dealerDesc.addStringFct(StringConfig.NGRAM, "f1", true);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Car23")).getRoundedValue() == 0.5);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Car24")).getRoundedValue() == 1.00);
} catch (Exception exp) {
assertTrue("Excpetion in StringFctTest: testCalculateSimilarityEQUALITY",false);
assertTrue("Excpetion in StringFctTest: testCalculateSimilarityNGRAM",false);
}
}
/**
* Test method for {@link de.dfki.mycbr.core.similarity.StringFct#calculateSimilarity(Attribute, Attribute)}.
*/
@Test
public void testCalculateSimilarityLEVENSHTEIN() {
try {
TestFramework frame = new TestFramework();
for (int i = 0; i < 2; i++) {
StringFct f = frame.dealerDesc.addStringFct(StringConfig.LEVENSHTEIN, "f1", true);
// Test all variations in casesens and caseinsens
f.setCaseSensitive(i == 0);
// Base functionality
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Car24")).getRoundedValue() == 1.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Car23")).getRoundedValue() == 0.8);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Car13")).getRoundedValue() == 0.6);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Bike")).getRoundedValue() == 0.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Bike24")).getRoundedValue() == 0.33);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute(""), frame.dealerDesc.getStringAttribute("Bike24")).getRoundedValue() == 0.0);
// UNICODE Test
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("ÆÇĐŸƃ®"), frame.dealerDesc.getStringAttribute("ÆÇĐŸƃ®")).getRoundedValue() == 1.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("ʤʐɰ"), frame.dealerDesc.getStringAttribute("ʤɰ")).getRoundedValue() == 0.67);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute(""), frame.dealerDesc.getStringAttribute("ʤɰ")).getRoundedValue() == 0.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("ʤʐɰ"), frame.dealerDesc.getStringAttribute("")).getRoundedValue() == 0.0);
// Test configuration feature
f.setLevenshteinAddCost(2);
f.setLevenshteinChangeCost(3);
f.setLevenshteinDelCost(2);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Car24")).getRoundedValue() == 1.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Car23")).getRoundedValue() == 0.4);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute(""), frame.dealerDesc.getStringAttribute("Car24")).getRoundedValue() == 0.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("")).getRoundedValue() == 0.0);
f.setLevenshteinChangeCost(6);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("Car23")).getRoundedValue() == 0.2);
f.setLevenshteinAddCost(0);
f.setLevenshteinChangeCost(0);
f.setLevenshteinDelCost(0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car25"), frame.dealerDesc.getStringAttribute("Car24")).getRoundedValue() == 1.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Bike123"), frame.dealerDesc.getStringAttribute("Car24")).getRoundedValue() == 1.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute(""), frame.dealerDesc.getStringAttribute("Car24")).getRoundedValue() == 0.0);
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car24"), frame.dealerDesc.getStringAttribute("")).getRoundedValue() == 0.0);
// Only positive values allowed
try {
f.setLevenshteinChangeCost(-1);
assertTrue(false);
} catch (UnsupportedOperationException ex) {
assertTrue(true);
}
try {
f.setLevenshteinAddCost(-1);
assertTrue(false);
} catch (UnsupportedOperationException ex) {
assertTrue(true);
}
try {
f.setLevenshteinDelCost(-1);
assertTrue(false);
} catch (UnsupportedOperationException ex) {
assertTrue(true);
}
assertTrue(f.calculateSimilarity(frame.dealerDesc.getStringAttribute("Car25"), frame.dealerDesc.getStringAttribute("Car24")).getRoundedValue() == 1.0);
}
} catch (Exception exp) {
assertTrue("Excpetion in StringFctTest: testCalculateSimilarityLEVENSHTEIN",false);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment