1.1 is unique

2019-03-28 17:28:38 +01:00 · 2019-03-28 17:28:38 +01:00 · 573db0cb29
parent df7b053449
commit 573db0cb29
4 changed files with 201 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+.vscode/
+*.exe
--- a/is_unique.md
+++ b/is_unique.md
@ -0,0 +1,87 @@
+# 1.1. Is unique
+
+## Implement an algorithm to determine if a S has all unique characters. What if you can't use additional data structures
+
+example: s = "abdceb"
+
+## 1.1.1. First idea
+
+```python
+def is_unique(s):
+    return len(s) == len(set(s)) # complexity O(n)? from set
+```
+
+Complexity of `set()`? [It uses a hash table](https://stackoverflow.com/a/44080017/4569908) with O(1) average in insertion/lookup, so O(n) for all characters.
+
+> Tests:
+
+* s = "" -> 0==0, True, correct
+* s = "a" -> 1==1, True, correct
+* s = "ab" -> 2==2, True, correct
+* s = "aa" -> 2!=1, False, correct
+
+## 1.1.2. Non-optimal / brute force idea
+
+* compare 'a', 'bdceb', no repeated characters
+* compare 'b', 'adceb', repeated character -> break, False
+
+don't need to compare to previous chars, only from i onwards. Runtime O(n<sup>2</sup>)
+
+## 1.1.3. Improved idea
+
+* compare 'a', 'bdceb', no repeated characters
+* compare 'b', 'dceb', repeated character -> break, False
+* else, compare with all, each step a smaller comparison
+
+Test same as before, runtime still O(n<sup>2</sup>). not improving
+
+## 1.1.4. Expand first idea
+
+The function `set()` uses a hash table, [equivalent to a dictionary in python](https://stackoverflow.com/questions/2061222/what-is-the-true-difference-between-a-dictionary-and-a-hash-table). Actually a dictionary is a data structure that uses a hash function to map keys to values, but python may change the implementation in the future. A dictionary is just a mapping of a key to a value.
+
+Using hash tables / dictionaries in this case might work.
+
+```python
+def is_unique(s):
+    mapping = dict()
+    for letter in s:
+        if letter in mapping: # letter already present
+            return False
+        mapping[letter] = 1 # any value, doesn't matter
+    return True
+```
+
+A hash table takes O(1) average in lookup/insertion, so O(n) for all.
+
+## Solution
+
+Always ask if the string is an ASCII string or an Unicode string. if Unicode, we'll need more storage.
+
+One solution is to create an array of booleans, where the flag at index `i` indicates whether character `i` in the alphabet is present in the string. The second time we see character `i`, return `False`.
+
+Also immediately return `False` if the string length is bigger than the number of unique characters in the alphabet, which is 128 in ASCII. But check this with the interviewer.
+
+ane: Also immediately return `True` if the string length is 0 or 1.
+
+```java
+boolean is_unique(String s) {
+    if (str.length() > 128) return false;
+    if (str.length() <= 1) return true; // my idea
+
+    boolean[] char_set = new boolean[128];
+    for (int i = 0; i < str.length(); i++) {
+        int val = str.charAt(i);
+        if(char_set[val]) return false;
+        char_set[val] = true;
+    }
+    return true;
+}
+```
+
+Time complexity is O(1), because it'll be at most 128 steps, and space complexity O(1) because we store 128 values. If the string can have any length, then O(n) runtime and O(n) space.
+
+ane: this is my hash table but without the values... there's no need to create a table if we don't need the values.
+
+We can reduce the space usage by a factor of 8 by using a bit vector. We can assume the string only uses lowercase, and just use an int. (see `is_unique_checker()` in is_unique.py)
+
+[Run C++ in vscode](https://stackoverflow.com/a/40570882/4569908)
--- a/strings/1.1.is_unique.cpp
+++ b/strings/1.1.is_unique.cpp
@ -0,0 +1,68 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <bitset>
+#include <algorithm> // for sort() 
+
+using namespace std;
+
+bool isUniqueChars(const string &str){
+    if (str.length() > 128){
+        return false;
+    }
+    vector<bool> char_set(128);
+    for (int i = 0; i < str.length(); i++){
+        int val = str[i];
+        if (char_set[val]){
+            return false;
+        }
+        char_set[val] = true;
+    }
+    return true;
+}
+
+bool isUniqueChars_bitvector(const string &str) {
+	//Reduce space usage by a factor of 8 using bitvector. 
+	//Each boolean otherwise occupies a size of 8 bits.
+	bitset<256> bits(0);
+	for (int i = 0; i < str.length(); i++) {
+		int val = str[i];
+		if (bits.test(val) > 0) {
+			return false;
+		}
+		bits.set(val);
+	}
+	return true;
+}
+bool isUniqueChars_noDS( string str) {
+	
+	sort(str.begin(), str.end()); // O(nlogn) sort from <algorithm>
+
+	bool noRepeat = true;
+	for (int i = 0 ; i < str.size() - 1; i++) {
+		if (str[i] == str[i+1]) {
+			noRepeat = false;
+			break;
+		}
+	}
+
+	return noRepeat;	
+}
+
+int main(){
+    vector<string> words = {"abcde", "hello", "apple", "kite", "padle"};
+    for (auto word : words) {
+        cout << word << string(": ") << boolalpha << isUniqueChars(word) <<endl;
+    }
+
+    cout <<endl << "Using bit vector" <<endl;
+    for (auto word : words) {
+        cout << word << string(": ") << boolalpha << isUniqueChars_bitvector(word) <<endl;
+    }
+
+    cout <<endl << "Using no Data Structures" <<endl;
+    for (auto word : words) {
+        cout << word << string(": ") << boolalpha << isUniqueChars_noDS(word) <<endl;
+    }
+    return 0;
+}
--- a/strings/1.1.is_unique.py
+++ b/strings/1.1.is_unique.py
@ -0,0 +1,44 @@
+# O(N)
+import unittest
+
+def is_unique(string):
+    # Assuming character set is ASCII (128 characters)
+    if len(string) > 128:
+        return False
+
+    char_set = [False for _ in range(128)]
+    for char in string:
+        val = ord(char)
+        if char_set[val]:
+            # Char already found in string
+            return False
+        char_set[val] = True
+
+    return True
+
+# don't understand this
+def is_unique_checker(string):
+    checker = 0
+    for char in string:
+        val = ord(char) - ord('a')
+        if (checker and (1 << val)) > 0:
+            return False
+        checker |= (1 << val)
+    return True
+
+class Test(unittest.TestCase):
+    dataT = [('abcd'), ('s4fad'), ('')]
+    dataF = [('23ds2'), ('hb 627jh=j ()')]
+
+    def test_unique(self):
+        # true check
+        for test_string in self.dataT:
+            res = is_unique(test_string)
+            self.assertTrue(res)
+        # false check
+        for test_string in self.dataF:
+            res = is_unique(test_string)
+            self.assertFalse(res)
+
+if __name__ == "__main__":
+    unittest.main()