more nltk tinkering
Found a way to (mostly) filter out the verbs I didn't want. Also came up with a few more functions for playing with nltk and its various features.
Disclaimer: I've manually tested this with print statements and such, but haven't written pytest tests for it yet.
Updated nltk0_ex.py file:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
import re
from nltk.corpus import wordnet
from random import randint
import nltk
# place script1, script2, sys.argv[] here
#script1 = sys.argv[1]
#script2 = sys.argv[1]
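# Assumed script file format (based on print(script % noun) below): each line of the
# response text file contains one %s placeholder that the chosen noun or verb gets
# substituted into, e.g. "Tell me more about %s."
# Example run (after uncommenting script1/script2 and one of the calls at the bottom):
#   python3 nltk0_ex.py responses.txt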
"""
Requires:
above imports and :
install - nltk
install - python3
In your python3 shell type these to download needed data sets:
>>>import nltk
>>>nltk.download('wordnet')
>>>nltk.download('punkt')
>>>nltk.download('averaged_perceptron_tagger')
make_noun_response() -- requires script1 as sys.argv
make_verb_response() -- requires script2 as sys.argv
action_verb_getter() -- requires script2 as sys.argv
"""
def get_script_line(arg):
    """get a random script line from the text file provided as sys.argv[1] or as defined"""
    with open(arg) as f:
        lines = f.readlines()
    count = len(lines) - 1
    if count >= 0:
        x = randint(0, count)
        return lines[x]
def run_synonyms():
""" print off a synonym from a list found in the synsets """
search_for = input(">>>")
while search_for != 'QUIT':
alist = []
for syn in wordnet.synsets(search_for):
for l in syn.lemmas():
alist.append(l.name())
if len(alist) > 0:
length = len(alist) - 1
x = randint(0, length)
#print(x)
synx = alist[x]
print(f"possible matches = {length + 1}")
print(f"Are you looking for the word similar too : {synx} ?")
else:
print("word not found in wordnet")
search_for = input(">>>")
def find_NN_VV():
""" give two lists, one of identified nouns, one of the verbs """
search_for = input("...>>>")
while search_for != 'EXIT':
nounlist = []
verblist = []
search_for = clean_string(search_for)
tokens = nltk.word_tokenize(search_for)
#print(tokens)
tags = nltk.pos_tag(tokens)
for item in tags:
if item[1][0] == 'N':
nounlist.append(item[0])
if item[1][0] == 'V':
verblist.append(item[0])
nounlist = clean_list(nounlist)
verblist = clean_list(verblist)
print("nouns = ")
print(nounlist)
print("verbs = ")
print(verblist)
search_for = input("...>>>")
def make_noun_response():
""" With script (text file) given as sys.argv[1] use a found noun in user input, to
reply in the context of that noun """
make = input(" user : ")
while make not in ['EXIT', 'QUIT']:
nounlist = []
tokens = nltk.word_tokenize(make)
tags = nltk.pos_tag(tokens)
for item in tags:
x = item[1]
            if x.startswith('NN'):
nounlist.append(item[0])
if len(nounlist) > 0:
# change it to a set to eliminate duplicates
nounlist = set(nounlist)
# change it back to list, to be able to easily index random selection
nounlist = list(nounlist)
x = randint(0, len(nounlist) - 1)
script = get_script_line(script1)
noun = nounlist[x]
print("Wiwa: ")
print(script % noun)
else:
print("Wiwa: \nI do not comprehend.")
make = input(" user : ")
def make_verb_response():
""" With script (text file) given as sys.argv[1] return a response related to an *action verb* found in user input """
make = input(" user : ")
while make not in ['EXIT', 'QUIT']:
verblist = []
tokens = nltk.word_tokenize(make)
tags = nltk.pos_tag(tokens)
        #print(tags)
        for item in tags:
            x = item[1]
            if x.startswith('VB'):
                verblist.append(item[0])
verblist2 = clean_list(verblist)
if len(verblist2) > 0:
x = randint(0, len(verblist2) - 1)
script = get_script_line(script2)
verb = verblist2[x]
print("Wiwa: ")
print(script % verb)
else:
print("Wiwa: \nI do not comprehend.")
make = input(" user : ")
def get_verb_version_def():
""" Identify the first synset item that has a .v identifier, meaning it's
the verb form of this word, print that definition found"""
getter = input("word >>>")
while getter not in ['EXIT', 'QUIT']:
syns = wordnet.synsets(getter)
        if len(syns) > 0:
            #print(syns)
            for item in syns:
                if item.pos() == 'v':
                    print('found')
                    print(item.definition())
                    break
            getter = input("word >>>")
else:
print("word not found in wordnet")
getter = input("word >>>")
def get_first_def():
""" print off the first definition in the synsets list """
getter = input("word >>>")
while getter not in ['EXIT', 'QUIT']:
syns = wordnet.synsets(getter)
if len(syns) > 0:
print(syns[0].definition())
getter = input("word >>>")
else:
print("word not found in word net")
getter = input("word >>>")
def action_verb_getter():
""" Trys to get only action verbs from user input, and ignore the other verbs """
#non-action-verbs = 'nav'
nav = ['am', 'were', 'is', 'as', 'are', 'was', 'be', 'being', 'been',
'may', 'might', 'must', 'do', 'does', 'did', 'should', 'could', 'would',
'have', 'has', 'had', 'will', 'can', 'shall', "'s", "'re", "'d" ]
make = input(" user : ")
while make not in ['EXIT', 'QUIT']:
make = clean_string(make)
verblist = []
tokens = nltk.word_tokenize(make)
tags = nltk.pos_tag(tokens)
        for item in tags:
            x = item[1]
            if x.startswith('VB'):
                verb = item[0]
                if verb not in nav:
                    verblist.append(verb)
verblist2 = clean_list(verblist)
if len(verblist2) > 0:
x = randint(0, len(verblist2) - 1)
script = get_script_line(script2)
verb = verblist2[x]
#print(verblist2)
print("Wiwa: ")
print(script % verb)
else:
print("Wiwa:")
print(f"This entry : {make}. I do not see action in it.")
make = input(" user : ")
def bool_synset(arg):
""" Return False if wordnet does not find synsets (empty list); or
return True if the wordnet found something to return(not empty list)
for wordnet.synsets(arg) """
x = wordnet.synsets(arg)
# wordnet.synsets(arg) will return an empty list for a word not found
if len(x) < 1:
return False
else:
return True
def check_word():
""" Method that uses bool_synset to see if a word is in wordnet """
check = input("word to look for >>>")
while check not in ['EXIT', 'QUIT']:
valid = bool_synset(check)
print("word found = ", valid)
check = input("word to look for >>>")
def clean_string(astring):
""" Take the string and clear out all irrelevant data.
Goal: to identify main verbs and nouns.
Only tag words that are in the wordnet corpus. """
    astring = str(astring).lower()
    # keep only letters, spaces, and question marks
    newstring = re.sub(r"[^a-zA-Z ?]+", "", astring)
stringlist = newstring.split()
cleaned = ""
for word in stringlist:
        if bool_synset(word):
cleaned = cleaned + word + " "
return cleaned
def clean_list(alist):
""" Make a list a set to eliminate duplicates, then return it to
list form for easy random accessing. """
alist = set(alist)
alist = list(alist)
return alist
#find_NN_VV()
#run_synonyms()
#make_noun_response()
#make_verb_response()
#get_first_def()
#get_verb_version_def()
#action_verb_getter()
#check_word()
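A minimal pytest sketch for the small helper functions (clean_list, bool_synset, clean_string) might look something like the following. This assumes the file can be imported as nltk0_ex and that the wordnet data set has already been downloaded; it only covers the helpers, not the interactive input loops.
# test_nltk0_ex.py -- rough sketch, not the real test file yet
import nltk0_ex

def test_clean_list_removes_duplicates():
    result = nltk0_ex.clean_list(['run', 'run', 'jump'])
    assert sorted(result) == ['jump', 'run']

def test_bool_synset_known_and_unknown_words():
    # 'dog' is in wordnet, a nonsense string is not
    assert nltk0_ex.bool_synset('dog')
    assert not nltk0_ex.bool_synset('qwertyzzz')

def test_clean_string_strips_non_wordnet_tokens():
    cleaned = nltk0_ex.clean_string("I can run 42 miles!!")
    # digits and punctuation are removed, wordnet words survive
    assert 'run' in cleaned.split()
    assert '42' not in cleaned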