diff --git a/17. Hard problems/17.13. re_space.md b/17. Hard problems/17.13. re_space.md new file mode 100644 index 0000000..abfa28f --- /dev/null +++ b/17. Hard problems/17.13. re_space.md @@ -0,0 +1,40 @@ +# 17.13. Re-space + +## The spaces from a sentence have been removed. Most words are present in the dictionary but some aren't. Given a dictionary (a list of strings) and the document (a string), unconcatenate the document in a way that minimizes the number of unrecognized characters. + +> example: 'jesslookedjustliketimherbrother' -> 'jess looked just like tim her brother' + +## First idea + +This is complicated... I could start by unconcatenating the sentence assuming all of its words are present in the dictionary, and then optimize. How do I unconcatenate in the first place? Call .lower() first. + +* Easiest case: 'iamalive' -> 'i am alive', only one possibility. +* Overlap: 'theylookedible' -> 'they look edible', but 'looked' and 'edible' overlap on 'ed'. + +I could do swaps (sliding-window scans) of n-grams, starting with the longest words and going downwards: + +> example: 'iamalive' + +1. swap of len(s) = 8, 'iamalive' in dict? no +2. swap = 7, 'iamaliv', 'amalive' in dict? no +3. swap = 6, 'iamali', 'amaliv', 'malive' in dict? no +4. swap = 5, 'iamal', ..., 'alive'. 'alive' found and saved. How? +5. swap = 4, nothing +6. swap = 3, nothing +7. swap = 2, 'am' found +8. swap = 1, 'i' found. + +The whole string is 'covered', done. + +> example: 'theylookedible' + +1. swap of len(s) = 14, nothing +2. swap = 6, 'looked', 'edible' saved +3. swap = 4, 'they', 'look' +4. swap = 3, 'the' +5. swap = 2, 'he' found +6. swap = 1, nothing + +How to resolve the conflict at swap = 6? We could look for 'look' and 'ible'. If we find neither, choose one of the two at random and accept 4 unrecognized characters. If we find 'look', assign 'look' and 'edible' and discard 'looked'. How to save these? Don't know.
# Design notes carried over from the accompanying write-up: the n-gram scan
# sketched there is O(n^2), and it left open how to record the matches and how
# to detect that the whole string has been covered.  The dynamic programme
# below answers both questions: it stores one decision per character position,
# so every character is accounted for by construction.
import unittest

# Default dictionary used when the caller does not supply one.
dct = ['a', 'i', 'am', 'not', 'he', 'the', 'they', 'my',
       'alive', 'darkness', 'friend', 'hello']


def re_space(s, dictionary=None):
    """Re-insert spaces into *s*, minimising unrecognized characters.

    The document is lower-cased and split into dictionary words and runs of
    unrecognized characters so that the number of unrecognized characters is
    as small as possible.  Among splits with the same number of unrecognized
    characters, the one using fewer dictionary words is preferred.

    Args:
        s: The document with its spaces removed.
        dictionary: Optional iterable of known words.  Defaults to the
            module-level ``dct``.  Words are compared case-insensitively and
            empty strings are ignored.

    Returns:
        The lower-cased document with single spaces between the pieces.
        Consecutive unrecognized characters are kept together as one piece.
        An empty document yields an empty string.
    """
    vocabulary = dct if dictionary is None else dictionary
    # A set gives O(1) membership tests inside the inner loop.
    words = {word.lower() for word in vocabulary if word}

    text = s.lower()
    n = len(text)
    if n == 0:
        return ''

    # No dictionary word is longer than this, so longer slices are skipped.
    longest = max(map(len, words), default=0)

    # best[i] is (unrecognized characters, dictionary words used) for the
    # cheapest split of text[i:]; tuples compare lexicographically, which is
    # exactly the preference order described in the docstring.
    best = [(0, 0)] * (n + 1)
    # word_end[i] is the end index of the dictionary word that opens the
    # cheapest split of text[i:], or None when text[i] stays unrecognized.
    word_end = [None] * n

    for start in range(n - 1, -1, -1):
        unknown, used = best[start + 1]
        choice = (unknown + 1, used)  # baseline: leave text[start] unrecognized
        for end in range(start + 1, min(n, start + longest) + 1):
            if text[start:end] not in words:
                continue
            unknown, used = best[end]
            candidate = (unknown, used + 1)
            if candidate < choice:
                choice = candidate
                word_end[start] = end
        best[start] = choice

    # Walk the recorded decisions left to right, gluing adjacent unrecognized
    # characters into a single piece.
    pieces = []
    run_start = None
    pos = 0
    while pos < n:
        end = word_end[pos]
        if end is None:
            if run_start is None:
                run_start = pos
            pos += 1
            continue
        if run_start is not None:
            pieces.append(text[run_start:pos])
            run_start = None
        pieces.append(text[pos:end])
        pos = end
    if run_start is not None:
        pieces.append(text[run_start:])
    return ' '.join(pieces)


class Test(unittest.TestCase):
    """Checks ``re_space`` against the default dictionary."""

    data = [('iamalive', 'i am alive'),
            ('hellodarknessmyfriend', 'hello darkness my friend'),
            ('invincibleami', 'i nv i nc i ble am i')]

    def test_re_space(self):
        for document, expected in self.data:
            with self.subTest(document=document):
                self.assertEqual(re_space(document), expected)


if __name__ == "__main__":
    unittest.main()