from collections import defaultdict
from itertools import product
from typing import *

import nltk
from nltk.corpus import framenet, framenet15
from nltk.stem import WordNetLemmatizer
# Module-level WordNet lemmatizer shared by FrameIdentifier.__call__ below.
lemmatizer = WordNetLemmatizer()
# Hand-curated lemma overrides applied on top of the WordNet lemmatizer:
# contractions ("'s", "'ve"), irregular forms the lemmatizer misses under
# some POS tags, multi-word targets ('bit' -> 'a bit'), and spelling
# variants ('neighbour' -> 'neighbor').
manual = {
    "'s": 'be',
    "'re": 'be',
    "'ve": 'have',
    've': 'have',
    'men': 'man',
    'saw': 'see',
    'could': 'can',
    'neighbour': 'neighbor',
    'felt': 'feel',
    'fell': 'fall',
    'little': 'a little',
    'have': 'have to',
    'raping': 'rape',
    'flavor': 'flavour',
    'ca': 'can',
    'bit': 'a bit',
}
def load_framenet_corpus(version):
    """Return the NLTK FrameNet corpus reader for *version*, downloading it if needed.

    Any version string containing '1.5' or '1.7' is accepted (checked in that
    order, so '1.5' wins if a string somehow contains both).

    Raises:
        NotImplementedError: for any other version string, with the offending
            value included in the message (the original raised a bare,
            message-less NotImplementedError).
    """
    if '1.5' in version:
        nltk.download('framenet_v15')
        return framenet15
    if '1.7' in version:
        nltk.download('framenet_v17')
        return framenet
    raise NotImplementedError(f'Unsupported FrameNet version: {version!r}')
def is_word(s: str):
    """Return True if *s* consists only of letters, spaces, hyphens, and apostrophes.

    The empty string vacuously qualifies (returns True), matching the
    all()-over-empty behavior of the original implementation.
    """
    allowed = " -'"
    for ch in s:
        if not (ch.isalpha() or ch in allowed):
            return False
    return True
def lu_to_frame(version: str):
    """Build a map from lexical-unit text (POS suffix stripped) to FrameNet frame names.

    Each LU name of the form 'text.pos' is lowercased, split into words, and
    filtered down to word-like tokens (see is_word); entries that become empty
    after filtering are skipped.

    Returns:
        defaultdict(set): cleaned LU text -> set of frame names evoking it.
    """
    fn = load_framenet_corpus(version)
    fn._bad_statuses = []  # clear stale parse-status noise before iterating
    mapping = defaultdict(set)
    seen_lexicons = set()
    for frame in fn.frames():
        for lu_name in frame.lexUnit:
            # LU names are expected to contain exactly one '.' separating
            # the lexical text from its POS tag.
            assert lu_name.count('.') == 1
            text, _pos = lu_name.split('.')
            cleaned = ' '.join(w for w in text.lower().split() if is_word(w))
            if not cleaned:
                continue
            mapping[cleaned].add(frame.name)
            seen_lexicons.add(cleaned)
    fn._bad_statuses = []  # reset again after the full pass
    return mapping
class FrameIdentifier:
    """Identifies candidate FrameNet frames for a token sequence via LU lookup.

    Matches unigrams and adjacent bigrams (after lemmatization and manual
    lemma expansion) against the LU->frame map built from FrameNet 1.7.
    """

    def __init__(self):
        lf_map = lu_to_frame('1.7')
        # A few multi-word existential constructions absent from FrameNet's
        # own LU list.
        lf_map['there have'].add('Existence')
        lf_map['there must'].add('Existence')
        lf_map['be there'].add('Existence')
        self.lf_map = dict(lf_map)

    def __call__(self, tokens: List[str]) -> List[str]:
        """Return the (unordered, deduplicated) frames evoked by *tokens*.

        Single numeric tokens short-circuit to Cardinal_numbers; single
        ordinal tokens (digits + st/nd/rd/th) to Ordinal_numbers. Tokens
        containing non-word characters yield no frames at all.
        """
        if len(tokens) == 1:
            word = tokens[0]
            if word.isnumeric():
                return ['Cardinal_numbers']
            # BUGFIX: the original only matched the 'th' suffix, so '1st',
            # '2nd', '3rd', '21st' etc. fell through and returned [].
            if word[-2:] in ('st', 'nd', 'rd', 'th') and word[:-2].isnumeric():
                return ['Ordinal_numbers']
        tokens = [t.lower() for t in tokens]
        if not all(is_word(t) for t in tokens):
            return []
        # Build the set of lookup variants for each token: the token itself,
        # its lemma under every WordNet POS, and transitive expansions through
        # the `manual` table (appending while iterating deliberately feeds new
        # entries back through the lookup, e.g. 've -> have -> have to).
        variant_lists = []
        for token in tokens:
            variants = [token]
            for pos_tag in 'asrnv':
                variants.append(lemmatizer.lemmatize(token, pos_tag))
            for v in variants:
                if v in manual:
                    variants.append(manual[v])
            variant_lists.append(list(set(variants)))
        frames = set()
        # Unigram matches.
        for variants in variant_lists:
            for v in variants:
                frames.update(self.lf_map.get(v, ()))
        # Adjacent-bigram matches over the cross product of variants.
        for left, right in zip(variant_lists, variant_lists[1:]):
            for pair in product(left, right):
                frames.update(self.lf_map.get(' '.join(pair), ()))
        return list(frames)