now to profile
So I have my suffix array for the assignment built and tested, but now I need to profile and analyze it's performance... and it looks like I'll need to alter my binary tree to make a significant comparison, which is part of the study drill in the assignment.
The original search tree only looks for an exact match to a given key, so for it to be comparable to my suffix array, it would need to look for the same things... longest, shortest, all... which just seems crazy.... build a whole other thing to compare it to, but otherwise they are different beasts. Now if I put the suffix's in the binary tree... and use my match_loop in the tree.... I'm basically just putting the same code into my tree.... Using the tree to store my suffix's...
What can I do to make this interesting?
How about....
def test_suffix():
for i in range(0, 100):
x = generate_string()
spam = SuffixArray_Binary(x)
spam.find_longest_binary('g')
spam.find_longest_binary(x)
spam.binary_search('b')
spam.binary_search(x)
spam.find_shortest_binary(x)
spam.find_shortest_binary('q')
spam.find_all_matches(x)
spam.find_all_matches('h')
That's my test for my suffix array....
What if I modified my tree to hold all 100 of the arrays i made.... and the tree can search any one of those arrays for the longest.... shortest.... all matches... It can return something like
KEY = 'tacocat' shortest = ['cat']
KEY = 'tabicat' , shortest = ['cat']
KEY = 'catoninetales', shortest = ['catoninetales']
Then I'd add a test like this:
def test_binary_tree():
# My binary tree has a median for the root_node
eggs = BinaryTree('hjklmno')
for i in range(0, 100):
x=generate_string()
# boolean to set if this is a suffix array holding tree here (in set_suffix too the self in __init__?)
# then a boolean check would have to be set in normal set also
# Or remove normal set all together... make it a different tree. Bare bone it... remove everything.
# Yes.... remove everything. Just scratch it make a different tree.
eggs.set_suffix(x)
# test these outside the for loop, to see how it compares looking up stuff when it is
# looking through 100 keys first. It's a tree, so it shouldn't be any slower right?
eggs.search_suffix_longest('g')
eggs.search_suffix_shortest('q')
eggs.search_suffix_all('x')
Which reminds me, I didn't test it with spaces!!!!
Here's the code for the SuffixArray_Binary.py Gotta go run some tests for spaces now.
(( OH , and clean up that delete on original tree. Zed's made mine look like a horrific mess.))
class SuffixArray_Binary(object):
def __init__(self, astring):
self.alist = []
if type(astring) != str:
raise TypeError
for i in range(len(astring)):
self.alist.append(astring[i:])
def binary_search(self, astring):
""" find a matching string in the suffix's """
self.alist.sort()
length = len(self.alist) - 1
max_len = len(self.alist)
min_len = 0
count = 0
while min_len <= max_len:
midpoint = (max_len + min_len) // 2
count += 1
if midpoint > length or midpoint < 0:
break
if self.alist[midpoint] == astring:
x = self.alist[midpoint]
print(count, x)
return x
elif self.alist[midpoint] < astring:
min_len = midpoint + 1
else:
max_len = midpoint - 1
return -1
def find_shortest_binary(self, astring):
""" find the shortest string that starts with given string (astring) in suffix's"""
self.alist.sort()
length = len(self.alist) - 1
max_len = len(self.alist)
min_len = 0
least = []
stop = None
while min_len <= max_len:
midpoint = (min_len + max_len)//2
if midpoint > length or midpoint < 0:
break
else:
akey = self.alist[midpoint]
if akey == astring:
print('found exact match')
return akey, midpoint
elif akey[0] == astring[0]:
if len(akey) >= len(astring):
x = 0
match = True
y = []
for x in range(0, len(astring)):
if akey[x] == astring[x]:
y.append(akey[x])
print(y)
pass
else:
match = False
if match == True:
stop = midpoint
if stop != None:
break
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
else:
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
elif akey > astring:
max_len = midpoint - 1
else:
min_len = midpoint + 1
if stop != None:
result = self.match_loop(self.alist, stop, astring)
#print(result)
result.sort(key=len)
#print(result)
return result[0]
else:
return -1
def match_loop(self, somelist, indexpoint, astring):
i = indexpoint
matchlist = []
x = somelist[i]
while x[0] == astring[0] and i < len(somelist):
akey = somelist[i]
match = True
if len(akey)>= len(astring):
for n in range(0, len(astring)):
if akey[n] == astring[n]:
pass
else:
match = False
if match:
matchlist.append(akey)
i += 1
i = indexpoint - 1
while x[0] == astring[0] and i >= 0:
akey = somelist[i]
match = True
if len(akey)>=len(astring):
for n in range(0, len(astring)):
if akey[n] == astring[n]:
print(akey, astring)
pass
else:
match = False
if match:
matchlist.append(akey)
i -= 1
return matchlist
def find_longest_binary(self, astring):
""" find the longest string that starts with given string (astring) in suffix's"""
self.alist.sort()
length = len(self.alist) - 1
max_len = len(self.alist)
min_len = 0
least = []
stop = None
while min_len <= max_len:
midpoint = (min_len + max_len)//2
if midpoint > length or midpoint < 0:
break
else:
akey = self.alist[midpoint]
if akey[0] == astring[0]:
if len(akey) >= len(astring):
x = 0
match = True
for x in range(0, len(astring)):
if akey[x] == astring[x]:
pass
else:
match = False
if match == True:
stop = midpoint
if stop != None:
break
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
else:
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
elif akey > astring:
max_len = midpoint - 1
else:
min_len = midpoint + 1
if stop != None:
result = self.match_loop(self.alist, stop, astring)
#print(result)
result.sort(key=len, reverse=True)
#print(result)
return result[0]
else:
return -1
def find_all_matches(self, astring):
""" find the longest string that starts with given string (astring) in suffix's"""
self.alist.sort()
length = len(self.alist) - 1
max_len = len(self.alist)
min_len = 0
least = []
stop = None
while min_len <= max_len:
midpoint = (min_len + max_len)//2
if midpoint > length or midpoint < 0:
break
else:
akey = self.alist[midpoint]
if akey[0] == astring[0]:
if len(akey) >= len(astring):
x = 0
match = True
for x in range(0, len(astring)):
if akey[x] == astring[x]:
pass
else:
match = False
if match == True:
stop = midpoint
if stop != None:
break
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
else:
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
elif akey > astring:
max_len = midpoint - 1
else:
min_len = midpoint + 1
if stop != None:
result = self.match_loop(self.alist, stop, astring)
return result
else:
return -1
def dump(self):
print(self.alist)
The original search tree only looks for an exact match to a given key, so for it to be comparable to my suffix array, it would need to look for the same things... longest, shortest, all... which just seems crazy.... build a whole other thing to compare it to, but otherwise they are different beasts. Now if I put the suffix's in the binary tree... and use my match_loop in the tree.... I'm basically just putting the same code into my tree.... Using the tree to store my suffix's...
What can I do to make this interesting?
How about....
def test_suffix():
for i in range(0, 100):
x = generate_string()
spam = SuffixArray_Binary(x)
spam.find_longest_binary('g')
spam.find_longest_binary(x)
spam.binary_search('b')
spam.binary_search(x)
spam.find_shortest_binary(x)
spam.find_shortest_binary('q')
spam.find_all_matches(x)
spam.find_all_matches('h')
That's my test for my suffix array....
What if I modified my tree to hold all 100 of the arrays i made.... and the tree can search any one of those arrays for the longest.... shortest.... all matches... It can return something like
KEY = 'tacocat' shortest = ['cat']
KEY = 'tabicat' , shortest = ['cat']
KEY = 'catoninetales', shortest = ['catoninetales']
Then I'd add a test like this:
def test_binary_tree():
# My binary tree has a median for the root_node
eggs = BinaryTree('hjklmno')
for i in range(0, 100):
x=generate_string()
# boolean to set if this is a suffix array holding tree here (in set_suffix too the self in __init__?)
# then a boolean check would have to be set in normal set also
# Or remove normal set all together... make it a different tree. Bare bone it... remove everything.
# Yes.... remove everything. Just scratch it make a different tree.
eggs.set_suffix(x)
# test these outside the for loop, to see how it compares looking up stuff when it is
# looking through 100 keys first. It's a tree, so it shouldn't be any slower right?
eggs.search_suffix_longest('g')
eggs.search_suffix_shortest('q')
eggs.search_suffix_all('x')
Which reminds me, I didn't test it with spaces!!!!
Here's the code for the SuffixArray_Binary.py Gotta go run some tests for spaces now.
(( OH , and clean up that delete on original tree. Zed's made mine look like a horrific mess.))
class SuffixArray_Binary(object):
def __init__(self, astring):
self.alist = []
if type(astring) != str:
raise TypeError
for i in range(len(astring)):
self.alist.append(astring[i:])
def binary_search(self, astring):
""" find a matching string in the suffix's """
self.alist.sort()
length = len(self.alist) - 1
max_len = len(self.alist)
min_len = 0
count = 0
while min_len <= max_len:
midpoint = (max_len + min_len) // 2
count += 1
if midpoint > length or midpoint < 0:
break
if self.alist[midpoint] == astring:
x = self.alist[midpoint]
print(count, x)
return x
elif self.alist[midpoint] < astring:
min_len = midpoint + 1
else:
max_len = midpoint - 1
return -1
def find_shortest_binary(self, astring):
""" find the shortest string that starts with given string (astring) in suffix's"""
self.alist.sort()
length = len(self.alist) - 1
max_len = len(self.alist)
min_len = 0
least = []
stop = None
while min_len <= max_len:
midpoint = (min_len + max_len)//2
if midpoint > length or midpoint < 0:
break
else:
akey = self.alist[midpoint]
if akey == astring:
print('found exact match')
return akey, midpoint
elif akey[0] == astring[0]:
if len(akey) >= len(astring):
x = 0
match = True
y = []
for x in range(0, len(astring)):
if akey[x] == astring[x]:
y.append(akey[x])
print(y)
pass
else:
match = False
if match == True:
stop = midpoint
if stop != None:
break
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
else:
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
elif akey > astring:
max_len = midpoint - 1
else:
min_len = midpoint + 1
if stop != None:
result = self.match_loop(self.alist, stop, astring)
#print(result)
result.sort(key=len)
#print(result)
return result[0]
else:
return -1
def match_loop(self, somelist, indexpoint, astring):
i = indexpoint
matchlist = []
x = somelist[i]
while x[0] == astring[0] and i < len(somelist):
akey = somelist[i]
match = True
if len(akey)>= len(astring):
for n in range(0, len(astring)):
if akey[n] == astring[n]:
pass
else:
match = False
if match:
matchlist.append(akey)
i += 1
i = indexpoint - 1
while x[0] == astring[0] and i >= 0:
akey = somelist[i]
match = True
if len(akey)>=len(astring):
for n in range(0, len(astring)):
if akey[n] == astring[n]:
print(akey, astring)
pass
else:
match = False
if match:
matchlist.append(akey)
i -= 1
return matchlist
def find_longest_binary(self, astring):
""" find the longest string that starts with given string (astring) in suffix's"""
self.alist.sort()
length = len(self.alist) - 1
max_len = len(self.alist)
min_len = 0
least = []
stop = None
while min_len <= max_len:
midpoint = (min_len + max_len)//2
if midpoint > length or midpoint < 0:
break
else:
akey = self.alist[midpoint]
if akey[0] == astring[0]:
if len(akey) >= len(astring):
x = 0
match = True
for x in range(0, len(astring)):
if akey[x] == astring[x]:
pass
else:
match = False
if match == True:
stop = midpoint
if stop != None:
break
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
else:
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
elif akey > astring:
max_len = midpoint - 1
else:
min_len = midpoint + 1
if stop != None:
result = self.match_loop(self.alist, stop, astring)
#print(result)
result.sort(key=len, reverse=True)
#print(result)
return result[0]
else:
return -1
def find_all_matches(self, astring):
""" find the longest string that starts with given string (astring) in suffix's"""
self.alist.sort()
length = len(self.alist) - 1
max_len = len(self.alist)
min_len = 0
least = []
stop = None
while min_len <= max_len:
midpoint = (min_len + max_len)//2
if midpoint > length or midpoint < 0:
break
else:
akey = self.alist[midpoint]
if akey[0] == astring[0]:
if len(akey) >= len(astring):
x = 0
match = True
for x in range(0, len(astring)):
if akey[x] == astring[x]:
pass
else:
match = False
if match == True:
stop = midpoint
if stop != None:
break
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
else:
if akey > astring:
max_len = midpoint - 1
continue
else:
min_len = midpoint + 1
continue
elif akey > astring:
max_len = midpoint - 1
else:
min_len = midpoint + 1
if stop != None:
result = self.match_loop(self.alist, stop, astring)
return result
else:
return -1
def dump(self):
print(self.alist)
Comments
Post a Comment