parent
1394f09d27
commit
ec9303b790
|
|
@ -0,0 +1,106 @@
|
||||||
|
# 1.4. Levenshtein distance
|
||||||
|
|
||||||
|
## Given two strings, check if they are one edit (or zero edits) away. Edits can be insertion, substitution or deletion
|
||||||
|
|
||||||
|
## First idea
|
||||||
|
|
||||||
|
Difference in length must be at most 1
|
||||||
|
|
||||||
|
```python
|
||||||
|
if abs(len(s1) - len(s2)) > 1:
|
||||||
|
return False
|
||||||
|
if s1 == s2:
|
||||||
|
return True
|
||||||
|
```
|
||||||
|
|
||||||
|
BCR is O(n) time, O(1) space.
|
||||||
|
|
||||||
|
This 'edit away' is Levenshtein distance, task is to check if Levenshtein distance <= 1.
|
||||||
|
|
||||||
|
Can we do `s1 - s2`? Or `s1.difference(s2)`? Not in python.
|
||||||
|
|
||||||
|
## Second idea
|
||||||
|
|
||||||
|
From the Internet, this is O(1) time:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def levenshtein(s1, s2):
|
||||||
|
return len("".join(s1.rsplit(s2))) <= 1
|
||||||
|
```
|
||||||
|
|
||||||
|
But this splits the first word with the second word, it doesn't work if there's a gap/addition in the middle of the string.
|
||||||
|
|
||||||
|
## Third idea
|
||||||
|
|
||||||
|
Keep a boolean `flag` that must be at most 1. Iterate through the smallest string, check for differences, if difference: flag += 1, if flag == 2, return False.
|
||||||
|
|
||||||
|
### If equal lengths
|
||||||
|
|
||||||
|
Same length, match s1[i] vs s2[i]. if difference: flag += 1, if flag == 2, return False. if iteration done, return True
|
||||||
|
|
||||||
|
### Unequal lengths
|
||||||
|
|
||||||
|
Iterate through the smallest string, match s1[i] vs s2[i]. If difference, the smallest string must have a character deletion. flag = 1, and keep matching s1[i] vs s2[i+1] (where s1 is the small string). If the smallest string is iterated and no errors, return True
|
||||||
|
|
||||||
|
> Tests:
|
||||||
|
|
||||||
|
* ("", "") -> True, correct
|
||||||
|
* ("an", "a") -> True, correct
|
||||||
|
* ("a", "an") -> True, correct
|
||||||
|
* ("ane", "ae") -> True, correct
|
||||||
|
* ("ane", "ne") -> True, correct
|
||||||
|
* ("asdfgh", "asdgh") -> True, correct
|
||||||
|
* ("asdf", "asgf") -> True, correct
|
||||||
|
* ("asdfg", "adfgh") -> False, corrrect
|
||||||
|
* ("an", "ne") -> False, correct
|
||||||
|
|
||||||
|
Tests passed, O(min(len(s1), len(s2)))) time, O(1) space. Possibly code can be compressed but I don't know how.
|
||||||
|
|
||||||
|
## Solution
|
||||||
|
|
||||||
|
First solution, 3 different cases for equal lengths, len1 < len2, len2 < len1. Seeing this, my code:
|
||||||
|
|
||||||
|
```python
|
||||||
|
elif len1 < len2: # s1 shorter than s2
|
||||||
|
for i in range(len1):
|
||||||
|
if not flag:
|
||||||
|
if s1[i] != s2[i]:
|
||||||
|
if flag:
|
||||||
|
return False
|
||||||
|
flag = True
|
||||||
|
else: # there's already one difference
|
||||||
|
if s1[i] != s2[i + 1]:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
elif len2 < len1: # s2 shorter than s1
|
||||||
|
for i in range(len2):
|
||||||
|
if not flag:
|
||||||
|
if s1[i] != s2[i]:
|
||||||
|
if flag:
|
||||||
|
return False
|
||||||
|
flag = True
|
||||||
|
else: # there's already one difference
|
||||||
|
if s1[i + 1] != s2[i]:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
```
|
||||||
|
|
||||||
|
can be merged into one:
|
||||||
|
|
||||||
|
```python
|
||||||
|
else:
|
||||||
|
index1 = 0
|
||||||
|
index2 = 0
|
||||||
|
while index1 < len1 and index2 < len2:
|
||||||
|
if s1[index1] != s2[index2]: # difference found
|
||||||
|
if flag:
|
||||||
|
return False
|
||||||
|
flag = True
|
||||||
|
|
||||||
|
## need to check for shorter string here...
|
||||||
|
```
|
||||||
|
|
||||||
|
The solution creates 2 new strings, short and long (using O(2n) space), and updates indexes accordingly (see java file).
|
||||||
|
|
||||||
|
What's better, create 2 new strings (O(n) space) and make code more compact, or use just 3 variables and 10 more lines of code? Don't know.
|
||||||
|
|
@ -0,0 +1,63 @@
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
def levenshtein(s1, s2):
|
||||||
|
if abs(len(s1) - len(s2)) > 1:
|
||||||
|
return False
|
||||||
|
if s1 == s2:
|
||||||
|
return True
|
||||||
|
|
||||||
|
len1 = len(s1)
|
||||||
|
len2 = len(s2)
|
||||||
|
flag = False
|
||||||
|
|
||||||
|
if len1 == len2: # equal lengths
|
||||||
|
for i in range(len1):
|
||||||
|
if s1[i] != s2[i]:
|
||||||
|
if flag:
|
||||||
|
return False
|
||||||
|
flag = True
|
||||||
|
return True
|
||||||
|
|
||||||
|
elif len1 < len2: # s1 shorter than s2
|
||||||
|
for i in range(len1):
|
||||||
|
if not flag:
|
||||||
|
if s1[i] != s2[i]:
|
||||||
|
if flag:
|
||||||
|
return False
|
||||||
|
flag = True
|
||||||
|
else: # there's already one difference
|
||||||
|
if s1[i] != s2[i + 1]:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
elif len2 < len1: # s2 shorter than s1
|
||||||
|
for i in range(len2):
|
||||||
|
if not flag:
|
||||||
|
if s1[i] != s2[i]:
|
||||||
|
if flag:
|
||||||
|
return False
|
||||||
|
flag = True
|
||||||
|
else: # there's already one difference
|
||||||
|
if s1[i + 1] != s2[i]:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
class Test(unittest.TestCase):
|
||||||
|
dataT = [("", ""), ("an", "a"), ("a", "an"), ("ane", "ae"),
|
||||||
|
("ane", "ne"), ("asdfgh", "asdgh"), ("asdf", "asgf")]
|
||||||
|
dataF = [("asdfg", "adfgh"), ("an", "ne"), ("asdf", "as")]
|
||||||
|
|
||||||
|
def test_unique(self):
|
||||||
|
for test in self.dataT:
|
||||||
|
res = levenshtein(*test)
|
||||||
|
self.assertTrue(res)
|
||||||
|
|
||||||
|
for test in self.dataF:
|
||||||
|
res = levenshtein(*test)
|
||||||
|
self.assertFalse(res)
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Loading…
Reference in New Issue