Jane Street Interview Question
InternsCountry: United States
Interview Type: Phone Interview
we don't even need to sort the words, we can create a hash code with the alphabets count in a word then use that as the key...it will reduce the complexity to O(n x) rather than O(n x logx). A simple example of such a hash code can be for cat = a1c1t1 bat = a1b1t1 act = a1c1t1 tac = a1c1t1 and so on
<<<
Create a list of lists using below data struct:
struct ListList
{
char * string;
char * revstring;
struct ListList * next;
} * return_list;
Start iterating over given list of words, for each word do the following:
1) search the word in hash_table, if not found then
1.1) Create a node of ListList, add it the return_list and point the node.string to that word.
1.2) Mark node.revstring = NULL
1.3) Reverse the word and put into hash_table (considering key) and value as the address of node.
2) If word found in hash table then
2.1) from hash table entry get the address of node and point node.revstring to that word.
Once the iteration of given list is over, start iterating ‘return_list’ and remove & delete the nodes whose ‘node.revstring’ value is NULL.
Total complexity O(n).
>>>
The core of this algorithm is calculating two numbers of each string. In the first round, It will calculate the sum of each characters minus 'a', meanwhile the min character in the string will be found. In the second round, the sum of the squre differrence between the min character get in the first round and each character in the string. With these two sum values, it can be easliy told that if two strings are in the same group of anagrams
struct Pair
{
string name;
int sum;
int SquSum;
};
node* createNode(Pair* p)
{
node* n = (node*) malloc(sizeof(node));
n->pair = p;
return n;
}
vector<std::list<Pair*> > results;
vector<string> inputs;
void insertResult(node* n)
{
for(vector<std::list<Pair*> >::iterator it = results.begin(); it != results.end(); it++)
{
if(((*it->begin())->sum == n->pair->sum)&&((*it->begin())->SquSum==n->pair->SquSum))
{
it->push_back(n->pair);
return;
}
}
{
list<Pair*> list;
list.push_back(n->pair);
results.push_back(list);
}
}
void calculateString(string input, Pair*& pair)
{
int length = input.size();
int sum = 0, squSum = 0, min = 0;
for (int i = 0; i < length; i++)
{
min = input[min] > input[i] ? i : min;
sum += input[i] - 'a';
}
for (int i = 0; i < length; i++)
{
squSum += (input[min] - input[i]) * (input[min] - input[i]);
}
pair->sum = sum;
pair->SquSum = squSum;
pair->name = input;
cout << "The string " << input << " pair result: " << sum << " " << squSum << endl;
}
int main()
{
inputs.push_back("abc");
inputs.push_back("bcd");
inputs.push_back("cbd");
inputs.push_back("bac");
for (vector<string>::iterator it = inputs.begin(); it != inputs.end(); it++)
{
Pair* pair = (Pair*) malloc(sizeof(Pair));
calculateString(*it, pair);
insertResult(createNode(pair));
}
for(vector<std::list<Pair*> >::iterator it = results.begin(); it != results.end(); it++)
{
for(list<Pair*>::iterator iter = it->begin();iter != it->end(); iter++)
{
cout<<(*iter)->name<<" "<<(*iter)->sum<<" "<<(*iter)->SquSum<<endl;
}
cout<<endl;
}
}
Store in a HashMap.
Now override the HashCode method for this String Object. HashCode should work on the principle of the sorted value of the Anagram hence all Anagrams will give you the same HashCode value.
Now override the Equals() method and let it work on the basis of the normal String comparison.
Hence all Anagrams are stored at the same Map location in buckets.
Code in OCaml using Core.Std.Map (Tree map)
open Core.Std
let group_anagrams anagrams =
let map = List.fold anagrams
~init:(Map.empty String.comparator)
~f:(fun map word ->
let word_sorted =
(String.to_list word)
|> (List.sort ~cmp:Char.compare)
|> String.of_char_list
in
let words = match Map.find map word_sorted with
| None -> [word]
| Some words -> word :: words
in
Map.add map ~key:word_sorted ~data:words
)
in
Map.fold map ~init:[] ~f:(fun ~key ~data acc -> data :: acc)
let () =
assert (
(group_anagrams ["cat"; "act"; "bat"; "tab"; "b"])
= [["b"]; ["act"; "cat"]; ["tab"; "bat"]]
)
Perl solution, because why not? Added a cache so that sorting only happens a single time for duplicate words. I also did some other sanitizing e.g removed all whitespace and punctuation:
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
sub group {
my @words = @_;
my %anagram_map;
my %sort_cache;
foreach my $word ( @words ) {
# strip all whitespace
$word =~ s/[^0-9A-Za-z]+//g;
# force to lowercase
$word = lc $word;
# sort word unless already sorted
my $sorted_word = $sort_cache{$word} || join '', sort +split //, $word;
# add to cache unless it already has an entry
$sort_cache{$word} = $sorted_word
unless $sort_cache{$word};
# group by anagram (sorted string)
push @{$anagram_map{$sorted_word}},
$word;
}
# values of the anagram map are the grouped solutions
return [values %anagram_map];
}
print Dumper group(qw(cat bat act tab));
print Dumper group('the END of the world is nigh', 'down this hole frightened', 'b', 'rome wasn\'t built in a day', 'but laid in two years man', 'a man a plan a canal panama');
__DATA__
# results
[
[
'cat',
'act'
],
[
'bat',
'tab'
]
];
[
[
'theendoftheworldisnigh',
'downthisholefrightened'
],
[
'romewasntbuiltinaday',
'butlaidintwoyearsman'
],
[
'b'
],
[
'amanaplanacanalpanama'
]
];
Solution in python
words = ['cat','act','bat','tab']
def anagramList(words):
grouplist = []
wordDict = {}
for w in words:
wordDict[''.join(sorted(w))] = []
for w in words:
wordDict[''.join(sorted(w))].append(w)
for sl in wordDict:
grouplist.append(wordDict[sl])
return grouplist
print(anagramList(words))
class Solution {
Map<String, List<String>> map = new HashMap<>();
public List<List<String>> groupAnagrams(String[] strs) {
for (String s : strs) {
char[] temp = s.toCharArray();
Arrays.sort(temp);
String key = new String(temp);
if(!map.containsKey(key)) {
map.put(key, new ArrayList<>());
}
map.get(key).add(s);
}
return new ArrayList<List<String>>(map.values());
}
}
- Kamy December 13, 2012