diff --git a/Chapter 1 Arrays and strings/1.5. levenshtein.md b/Chapter 1 Arrays and strings/1.5. levenshtein.md new file mode 100644 index 0000000..6df08d1 --- /dev/null +++ b/Chapter 1 Arrays and strings/1.5. levenshtein.md @@ -0,0 +1,106 @@ +# 1.4. Levenshtein distance + +## Given two strings, check if they are one edit (or zero edits) away. Edits can be insertion, substitution or deletion + +## First idea + +Difference in length must be at most 1 + +```python +if abs(len(s1) - len(s2)) > 1: + return False +if s1 == s2: + return True +``` + +BCR is O(n) time, O(1) space. + +This 'edit away' is Levenshtein distance, task is to check if Levenshtein distance <= 1. + +Can we do `s1 - s2`? Or `s1.difference(s2)`? Not in python. + +## Second idea + +From the Internet, this is O(1) time: + +```python +def levenshtein(s1, s2): + return len("".join(s1.rsplit(s2))) <= 1 +``` + +But this splits the first word with the second word, it doesn't work if there's a gap/addition in the middle of the string. + +## Third idea + +Keep a boolean `flag` that must be at most 1. Iterate through the smallest string, check for differences, if difference: flag += 1, if flag == 2, return False. + +### If equal lengths + +Same length, match s1[i] vs s2[i]. if difference: flag += 1, if flag == 2, return False. if iteration done, return True + +### Unequal lengths + +Iterate through the smallest string, match s1[i] vs s2[i]. If difference, the smallest string must have a character deletion. flag = 1, and keep matching s1[i] vs s2[i+1] (where s1 is the small string). If the smallest string is iterated and no errors, return True + +> Tests: + +* ("", "") -> True, correct +* ("an", "a") -> True, correct +* ("a", "an") -> True, correct +* ("ane", "ae") -> True, correct +* ("ane", "ne") -> True, correct +* ("asdfgh", "asdgh") -> True, correct +* ("asdf", "asgf") -> True, correct +* ("asdfg", "adfgh") -> False, corrrect +* ("an", "ne") -> False, correct + +Tests passed, O(min(len(s1), len(s2)))) time, O(1) space. Possibly code can be compressed but I don't know how. + +## Solution + +First solution, 3 different cases for equal lengths, len1 < len2, len2 < len1. Seeing this, my code: + +```python +elif len1 < len2: # s1 shorter than s2 + for i in range(len1): + if not flag: + if s1[i] != s2[i]: + if flag: + return False + flag = True + else: # there's already one difference + if s1[i] != s2[i + 1]: + return False + return True + +elif len2 < len1: # s2 shorter than s1 + for i in range(len2): + if not flag: + if s1[i] != s2[i]: + if flag: + return False + flag = True + else: # there's already one difference + if s1[i + 1] != s2[i]: + return False + return True +``` + +can be merged into one: + +```python +else: + index1 = 0 + index2 = 0 + while index1 < len1 and index2 < len2: + if s1[index1] != s2[index2]: # difference found + if flag: + return False + flag = True + + ## need to check for shorter string here... +``` + +The solution creates 2 new strings, short and long (using O(2n) space), and updates indexes accordingly (see java file). + +What's better, create 2 new strings (O(n) space) and make code more compact, or use just 3 variables and 10 more lines of code? Don't know. \ No newline at end of file diff --git a/Chapter 1 Arrays and strings/1.5. levenshtein.py b/Chapter 1 Arrays and strings/1.5. levenshtein.py new file mode 100644 index 0000000..955021b --- /dev/null +++ b/Chapter 1 Arrays and strings/1.5. levenshtein.py @@ -0,0 +1,63 @@ +import unittest + +def levenshtein(s1, s2): + if abs(len(s1) - len(s2)) > 1: + return False + if s1 == s2: + return True + + len1 = len(s1) + len2 = len(s2) + flag = False + + if len1 == len2: # equal lengths + for i in range(len1): + if s1[i] != s2[i]: + if flag: + return False + flag = True + return True + + elif len1 < len2: # s1 shorter than s2 + for i in range(len1): + if not flag: + if s1[i] != s2[i]: + if flag: + return False + flag = True + else: # there's already one difference + if s1[i] != s2[i + 1]: + return False + return True + + elif len2 < len1: # s2 shorter than s1 + for i in range(len2): + if not flag: + if s1[i] != s2[i]: + if flag: + return False + flag = True + else: # there's already one difference + if s1[i + 1] != s2[i]: + return False + return True + + +class Test(unittest.TestCase): + dataT = [("", ""), ("an", "a"), ("a", "an"), ("ane", "ae"), + ("ane", "ne"), ("asdfgh", "asdgh"), ("asdf", "asgf")] + dataF = [("asdfg", "adfgh"), ("an", "ne"), ("asdf", "as")] + + def test_unique(self): + for test in self.dataT: + res = levenshtein(*test) + self.assertTrue(res) + + for test in self.dataF: + res = levenshtein(*test) + self.assertFalse(res) + + return + +if __name__ == "__main__": + unittest.main()