from collections import defaultdict
from itertools import product
from typing import *

import nltk
from nltk.corpus import framenet, framenet15
from nltk.stem import WordNetLemmatizer
# Module-level WordNet lemmatizer shared by FrameIdentifier.__call__ below.
lemmatizer = WordNetLemmatizer()
# Hand-curated lemma overrides applied on top of the WordNet lemmatizer:
# contractions ("'s", "'ve"), irregular forms the lemmatizer misses under
# some POS tags, multi-word targets ('bit' -> 'a bit'), and spelling
# variants ('neighbour' -> 'neighbor').
manual = {
    "'s": 'be',
    "'re": 'be',
    "'ve": 'have',
    've': 'have',
    'men': 'man',
    'saw': 'see',
    'could': 'can',
    'neighbour': 'neighbor',
    'felt': 'feel',
    'fell': 'fall',
    'little': 'a little',
    'have': 'have to',
    'raping': 'rape',
    'flavor': 'flavour',
    'ca': 'can',
    'bit': 'a bit',
}
def load_framenet_corpus(version):
    """Return the NLTK FrameNet corpus reader for *version*, downloading it if needed.

    Any version string containing '1.5' or '1.7' is accepted (checked in that
    order, so '1.5' wins if a string somehow contains both).

    Raises:
        NotImplementedError: for any other version string, with the offending
            value included in the message (the original raised a bare,
            message-less NotImplementedError).
    """
    if '1.5' in version:
        nltk.download('framenet_v15')
        return framenet15
    if '1.7' in version:
        nltk.download('framenet_v17')
        return framenet
    raise NotImplementedError(f'Unsupported FrameNet version: {version!r}')
def is_word(s: str):
    """Return True if *s* consists only of letters, spaces, hyphens, and apostrophes.

    The empty string vacuously qualifies (returns True), matching the
    all()-over-empty behavior of the original implementation.
    """
    allowed = " -'"
    for ch in s:
        if not (ch.isalpha() or ch in allowed):
            return False
    return True
def lu_to_frame(version: str):
    """Build a map from lexical-unit text (POS suffix stripped) to FrameNet frame names.

    Each LU name of the form 'text.pos' is lowercased, split into words, and
    filtered down to word-like tokens (see is_word); entries that become empty
    after filtering are skipped.

    Returns:
        defaultdict(set): cleaned LU text -> set of frame names evoking it.
    """
    fn = load_framenet_corpus(version)
    fn._bad_statuses = []  # clear stale parse-status noise before iterating
    mapping = defaultdict(set)
    seen_lexicons = set()
    for frame in fn.frames():
        for lu_name in frame.lexUnit:
            # LU names are expected to contain exactly one '.' separating
            # the lexical text from its POS tag.
            assert lu_name.count('.') == 1
            text, _pos = lu_name.split('.')
            cleaned = ' '.join(w for w in text.lower().split() if is_word(w))
            if not cleaned:
                continue
            mapping[cleaned].add(frame.name)
            seen_lexicons.add(cleaned)
    fn._bad_statuses = []  # reset again after the full pass
    return mapping
class FrameIdentifier:
    """Identifies candidate FrameNet frames for a token sequence via LU lookup.

    Matches unigrams and adjacent bigrams (after lemmatization and manual
    lemma expansion) against the LU->frame map built from FrameNet 1.7.
    """

    def __init__(self):
        lf_map = lu_to_frame('1.7')
        # A few multi-word existential constructions absent from FrameNet's
        # own LU list.
        lf_map['there have'].add('Existence')
        lf_map['there must'].add('Existence')
        lf_map['be there'].add('Existence')
        self.lf_map = dict(lf_map)

    def __call__(self, tokens: List[str]) -> List[str]:
        """Return the (unordered, deduplicated) frames evoked by *tokens*.

        Single numeric tokens short-circuit to Cardinal_numbers; single
        ordinal tokens (digits + st/nd/rd/th) to Ordinal_numbers. Tokens
        containing non-word characters yield no frames at all.
        """
        if len(tokens) == 1:
            word = tokens[0]
            if word.isnumeric():
                return ['Cardinal_numbers']
            # BUGFIX: the original only matched the 'th' suffix, so '1st',
            # '2nd', '3rd', '21st' etc. fell through and returned [].
            if word[-2:] in ('st', 'nd', 'rd', 'th') and word[:-2].isnumeric():
                return ['Ordinal_numbers']
        tokens = [t.lower() for t in tokens]
        if not all(is_word(t) for t in tokens):
            return []
        # Build the set of lookup variants for each token: the token itself,
        # its lemma under every WordNet POS, and transitive expansions through
        # the `manual` table (appending while iterating deliberately feeds new
        # entries back through the lookup, e.g. 've -> have -> have to).
        variant_lists = []
        for token in tokens:
            variants = [token]
            for pos_tag in 'asrnv':
                variants.append(lemmatizer.lemmatize(token, pos_tag))
            for v in variants:
                if v in manual:
                    variants.append(manual[v])
            variant_lists.append(list(set(variants)))
        frames = set()
        # Unigram matches.
        for variants in variant_lists:
            for v in variants:
                frames.update(self.lf_map.get(v, ()))
        # Adjacent-bigram matches over the cross product of variants.
        for left, right in zip(variant_lists, variant_lists[1:]):
            for pair in product(left, right):
                frames.update(self.lf_map.get(' '.join(pair), ()))
        return list(frames)