import itertools
import os
import pickle
import re
import string
from collections import Counter
from time import time

import numpy as np
import pandas as pd
import streamlit as st
from anytree import Node, RenderTree, PreOrderIter
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from pathos.multiprocessing import ProcessingPool as Pool

from agent.target_extraction.product import Product
from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityDataset
from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor
from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor

# current working directory (root of the app)
directory = os.getcwd()
print(directory)

np.set_printoptions(precision=4, threshold=np.inf, suppress=True)
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
pool = Pool(1)

sentiment_lexicon = pd.read_csv(
    directory + '/NRC_Emotion_Lexicon-master/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', index_col=0)
entity_extractor_path = directory + '/models/entity_model.pt'
rel_extractor_path = directory + '/models/model.pt'


def ngrams(text, phraser):
    # text is a single tokenized sentence; ignore nested lists
    if any(isinstance(subtext, list) for subtext in text):
        return
    tags = [tag for _, tag in pos_tag(text)]
    unfiltered = [term.split('_') for term in phraser[text]]

    # re-attach the POS tag of each sub-term of every n-gram
    tagged_unfiltered = []
    n = 0
    for term in unfiltered:
        tagged_unfiltered.append([(subterm, tags[n + idx]) for idx, subterm in enumerate(term)])
        n += len(term)

    def polar_adjective(adj):
        # an adjective is polar if the NRC lexicon marks it as positive or negative
        return adj in sentiment_lexicon.index and (sentiment_lexicon.loc[adj]['positive'] == 1 or
                                                   sentiment_lexicon.loc[adj]['negative'] == 1)

    def filter_ngram(term):
        # split an n-gram back into single terms if it contains a non NN/JJ tag
        # or a polar adjective; otherwise keep it as one space-joined term
        if len(term) > 1 and (any(not re.compile('NN|JJ').match(tag) for _, tag in term)
                              or any(tag.startswith('JJ') and polar_adjective(t) for t, tag in term)):
            return [subterm for subterm, _ in term]
        return [' '.join([subterm for subterm, _ in term])]

    return [subterm for term in tagged_unfiltered for subterm in filter_ngram(term)]


def get_nouns(phrase, ngrams):
    pos_tags = pos_tag(phrase)

    def is_noun(pos_tagged):
        word, tag = pos_tagged
        return tag.startswith('NN') and word not in string.punctuation and word not in stop_words

    # true if term is not a preposition and does not include special characters
    def is_valid_term(pos_tagged):
        alpha_numeric_pat = r'^\w+$'
        word, tag = pos_tagged
        return tag != 'IN' and re.match(alpha_numeric_pat, word)

    nouns = []
    word_idx = 0
    for token in ngrams:
        if ' ' in token:
            # multi-word n-gram: keep it if it contains a noun and all words are valid
            words = token.split(' ')
            word_range = range(word_idx, word_idx + len(words))
            has_noun = any(is_noun(pos_tags[i]) for i in word_range)
            all_terms_valid = all(is_valid_term(pos_tags[i]) for i in word_range)
            if has_noun and all_terms_valid:
                nouns.append(token)
            word_idx += len(words)
        else:
            token_is_noun = is_noun(pos_tags[word_idx])
            is_valid = is_valid_term(pos_tags[word_idx])
            if len(token) > 1 and token_is_noun and is_valid:
                nouns.append(token)
            word_idx += 1
    return nouns


def entity_mentions_in_text(text, phrase, ngrams, entities):
    all_tokens = set().union(*[phrase, ngrams])
    entity_mention = None
    for entity in entities:
        n_mentions = sum(1 for token in all_tokens if entity == token.lower())
        if n_mentions > 1:
            # multiple mentions of the same entity
            return None
        if n_mentions == 1:
            if entity_mention is None:
                entity_mention = entity
            elif entity_mention in entity:
                entity_mention = entity
            elif entity not in entity_mention:
                # text cannot have more than one entity mention, unless one is a subset of the other,
                # in which case the longer one is taken
                return None
    if entity_mention is not None:
        return text, [{'text': entity_mention}]
    return None


def pair_relations_for_text(text, ngrams, aspects, syn_dict):
    def overlapping_terms(ts, t):
        if len(ts) == 0:
            return False
        return any(t in t2.split(' ') if len(t) < len(t2) else t2 in t.split(' ') for t2 in ts)

    noun_ngrams = [ngram for ngram, tag in pos_tag(ngrams) if tag.startswith('NN')]
    found_aspects = []
    for aspect in aspects:
        found_form = False
        for form in syn_dict[aspect]:
            if form in noun_ngrams:
                if len(found_aspects) > 1 or found_form or overlapping_terms(found_aspects, form):
                    # cannot have more than two aspects, two forms of the same aspect, or overlapping terms
                    return None
                found_aspects.append(form)
                found_form = True
    return (text, [{'em1Text': found_aspects[0], 'em2Text': found_aspects[1]}]) if len(found_aspects) == 2 else None
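
# Illustrative example of the helpers above (hypothetical output: the actual
# result depends on the trained Phraser and on NLTK's POS tagger):
#   phrase = word_tokenize('the battery life of this camera is great')
#   grams  = ngrams(phrase, phraser)       # e.g. [..., 'battery life', ...]
#   nouns  = get_nouns(phrase, grams)      # e.g. ['battery life', 'camera']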


class TargetExtractor:
    N_ASPECTS = 100
    # phraser
    PHRASE_THRESHOLD = 4
    # tree
    SUBFEATURE_MULT = 1.4  # for z to be a subfeature of x, matrix(z, x) > matrix(z, f) * SUBFEATURE_MULT for all other f
    COUNT_MULT = 5
    MAX_DEPTH = 2
    # word2vec
    MIN_TERM_COUNT = 100
    SYNONYM_SIMILARITY = 0.21
    SYNONYM_SIMILARITY_PRODUCT = 0.08
    WV_SIZE = 300
    WV_WINDOW = 4
    # bert
    ENTITY_PROB_THRESHOLD = 0.65

    # product is the name of the product category being analysed, e.g. 'camera'
    def __init__(self, product, file_path, text_column):
        self.product = product
        self.file_path = file_path
        self.sentiment_lexicon = sentiment_lexicon
        self.entity_extractor_path = entity_extractor_path
        self.rel_extractor_path = rel_extractor_path

        ts = time()

        print('tokenizing phrases...')
        st.write('tokenizing phrases...')
        # tokenize and normalize phrases
        texts = TargetExtractor.obtain_texts(self.file_path, text_column, n=50)
        self.sentences = list(itertools.chain.from_iterable(map(sent_tokenize, texts)))
        self.sentences = list(map(lambda s: s.replace('_', ' ').lower(), self.sentences))
        self.phrases = list(map(word_tokenize, self.sentences))

        print('obtaining n-grams...')
        st.write('obtaining n-grams...')
        # train bigram and trigram maps, then merge frequent collocations into n-grams
        bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
        trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
        phraser = Phraser(trigram)
        self.ngram_phrases = list(map(ngrams, self.phrases, itertools.repeat(phraser, len(self.phrases))))

        print('counting terms...')
        st.write('counting terms...')
        # count candidate noun terms
        self.counter = self.count_nouns()
        self.total_count = sum(self.counter.values())
        t_noun = time()
        print('Noun extraction took {} seconds'.format(t_noun - ts))
        st.write('Noun extraction took {} seconds'.format(t_noun - ts))

        print('mining aspects...')
        st.write('mining aspects...')
        # mine aspects
        self.aspects, self.counts = self.get_aspects(self.counter)
        t_feature = time()
        print('Feature extraction took {} seconds'.format(t_feature - t_noun))
        st.write('Feature extraction took {} seconds'.format(t_feature - t_noun))

        print('training word2vec model...')
        st.write('training word2vec model...')
        # train word2vec model
        self.wv = self.get_word2vec_model(TargetExtractor.WV_SIZE, window=TargetExtractor.WV_WINDOW,
                                          min_count=TargetExtractor.MIN_TERM_COUNT)

        print('extracting synonyms...')
        st.write('extracting synonyms...')
        # obtain synonyms
        self.syn_dict = self.get_syn_dict()
        # remove aspect synonyms and reorder the list based on the sum of all synonym counts
        self.aspects = [aspect for aspect in self.aspects if aspect in self.syn_dict.keys()]
        aggregated_counts = {}
        for aspect in self.aspects:
            total = 0
            for syn in self.syn_dict[aspect]:
                # synonyms that were not retained as aspects have no count of their own
                total += self.counts.get(syn, 0)
            aggregated_counts[aspect] = total
        self.counts = aggregated_counts
        print(self.counts)
        st.write(self.counts)
        self.aspects = sorted(self.aspects, key=self.counts.get, reverse=True)

        t_syn = time()
        print('Synonym extraction took {} seconds'.format(t_syn - t_feature))
        st.write('Synonym extraction took {} seconds'.format(t_syn - t_feature))

        print('extracting relatedness matrix...')
        st.write('extracting relatedness matrix...')
        self.relatedness_matrix = self.get_bert_relations()

        print('extracting aspect tree...')
        st.write('extracting aspect tree...')
        self.tree = self.get_product_tree3()
        te = time()
        print('Ontology extraction took {} seconds'.format(te - t_syn))
        st.write('Ontology extraction took {} seconds'.format(te - t_syn))
        print('Full process took {} seconds'.format(te - ts))
        st.write('Full process took {} seconds'.format(te - ts))

        print('saving...')
        st.write('saving...')
        self.save()

        print('done:')
        st.write('done:')
        print(self.aspects)
        st.write(self.aspects)
        print(self.syn_dict)
        st.write(self.syn_dict)
        print(self.relatedness_matrix)
        st.write(self.relatedness_matrix)
        print(self.tree)
        st.write(self.tree)
        print(RenderTree(self.tree))
        st.write(RenderTree(self.tree))

    def save_product_representation(self, project_dir):
        # serialize the aspect tree and synonym dictionary as a Product object
        with open(directory + project_dir + "/" + self.product + Product.FILE_EXTENSION, 'wb') as f:
            p = Product(self.tree, self.syn_dict)
            pickle.dump(p, f)

    @staticmethod
    def obtain_texts(path, col, n=None):
        print(path)
        file = pd.read_csv(path)
        file = file[~file[col].isnull()]
        if n and n < len(file.index):
            # shuffle and keep a random sample of n texts
            file = file.sample(frac=1).reset_index(drop=True)
            file = file.head(n)
        texts = [text for _, text in file[col].items() if not pd.isnull(text)]
        print('Obtained {} texts'.format(len(texts)))
        st.write('Obtained {} texts'.format(len(texts)))
        return texts

    def get_bert_relations(self):
        print(' selecting phrases for relation extraction...')
        st.write(' selecting phrases for relation extraction...')
        pair_texts = [rel for rel in map(pair_relations_for_text, self.sentences, self.ngram_phrases,
                                         itertools.repeat(self.aspects, len(self.sentences)),
                                         itertools.repeat(self.syn_dict, len(self.sentences))) if rel is not None]
        df = pd.DataFrame(pair_texts, columns=['sentText', 'relationMentions'])

        print(' extracting relations with BERT...')
        st.write(' extracting relations with BERT...')
        dataset = PairRelDataset.from_df(df)
        bert_extractor = BertRelExtractor.load_saved(self.rel_extractor_path)
        aspect_counts = np.array([self.counts[aspect] for aspect in self.aspects])
        prob_matrix, count_matrix = bert_extractor.extract_relations(len(self.aspects), self.aspect_index_map(),
                                                                     aspect_counts, dataset=dataset)
        self.relatedness_matrix = prob_matrix / aspect_counts  # normalize by aspect counts
        return self.relatedness_matrix

    def extract_synset(self):
        # find the first aspect (in count order) with word2vec synonyms among higher-ranked aspects
        for idx, aspect in enumerate(self.aspects):
            if idx == 0:
                continue
            synset = {idx}
            aspect_dependence = self.aspect_dependence(idx)
            for syn_idx in self.get_syns(aspect):
                if syn_idx < idx and syn_idx != aspect_dependence:
                    synset.add(syn_idx)
                    self.print_relations_from(aspect)
            if len(synset) > 1:
                return synset
        return None

    def get_syns(self, aspect):
        return {idx for idx, a in enumerate(self.aspects)
                if a != aspect and self.wv.relative_cosine_similarity(a, aspect) > TargetExtractor.SYNONYM_SIMILARITY}

    def aspect_index_map(self):
        # map each synonym form to the index of its canonical aspect
        return {syn: idx for idx, aspect in enumerate(self.aspects) for syn in self.syn_dict[aspect]}

    def count_nouns(self):
        nouns = itertools.chain.from_iterable(map(get_nouns, self.phrases, self.ngram_phrases))
        return Counter(nouns)

    def get_aspects(self, counter):
        # take the N_ASPECTS most common terms
        term_counts = counter.most_common()[:TargetExtractor.N_ASPECTS]
        terms = [term for term, count in term_counts]

        print(' preparing entity texts for BERT...')
        st.write(' preparing entity texts for BERT...')
        entity_texts = [t for t in map(entity_mentions_in_text, self.sentences, self.phrases, self.ngram_phrases,
                                       itertools.repeat(terms, len(self.sentences)))
                        if t is not None]
        df = pd.DataFrame(entity_texts, columns=['sentText', 'entityMentions'])

        print(' extracting entities with BERT...')
        st.write(' extracting entities with BERT...')
        dataset = EntityDataset.from_df(df)
        entity_extractor = BertEntityExtractor.load_saved(self.entity_extractor_path)
        probs = entity_extractor.extract_entity_probabilities(terms, dataset=dataset)
        aspects = [term for term in terms
                   if probs[term] is not None and probs[term] >= TargetExtractor.ENTITY_PROB_THRESHOLD]

        # bring the product itself to the front of the list
        if self.product in aspects:
            aspects.remove(self.product)
        aspects.insert(0, self.product)
        return aspects, {term: count for term, count in term_counts if term in aspects}

    def get_word2vec_model(self, size, window, min_count):
        model = Word2Vec(self.ngram_phrases, size=size, window=window, min_count=min_count).wv
        return model

    def save(self):
        with open(directory + '/content/{}_extractor_f.pickle'.format(self.product), 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load_saved(product):
        with open(directory + '/content/{}_extractor_f.pickle'.format(product), 'rb') as f:
            extractor = pickle.load(f)
        return extractor

    def closest_relative_for_idx(self, idx):
        return np.argmax(self.relatedness_matrix[idx])

    def aspect_dependence(self, idx):
        row = self.relatedness_matrix[idx]
        # two most related aspects, excluding the product root at index 0
        max_idx1, max_idx2 = row[1:].argsort()[-2:][::-1] + 1
        if max_idx1 < idx and row[max_idx1] >= row[max_idx2] * TargetExtractor.SUBFEATURE_MULT:
            return max_idx1
        else:
            return None

    def get_product_tree(self):
        root = Node(self.aspects[0])
        root.idx = 0
        for idx in range(1, len(self.aspects)):  # for each feature, in order from highest to lowest count
            dep_idx = self.aspect_dependence(idx)
            if dep_idx is not None:
                parent = next(n for n in root.descendants if n.idx == dep_idx)
            else:
                parent = root
            node = Node(self.aspects[idx], parent=parent)
            node.idx = idx
        self.node_map = {n.idx: n for n in (root,) + root.descendants}
        return root

    def aspect_dependence_with_strength(self, idx):
        row = self.relatedness_matrix[idx]
        max_idx1, max_idx2 = row[1:].argsort()[-2:][::-1] + 1
        if (row[max_idx1] >= row[max_idx2] * TargetExtractor.SUBFEATURE_MULT and
                self.counts[self.aspects[max_idx1]] * TargetExtractor.COUNT_MULT > self.counts[self.aspects[idx]]):
            return max_idx1, row[max_idx1]
        else:
            return None

    def aspect_dependence_with_strength2(self, idx):
        row = self.relatedness_matrix[idx]
        max_idx1 = np.argmax(row[1:]) + 1
        if (row[max_idx1] >= row[0] and
                self.counts[self.aspects[max_idx1]] * TargetExtractor.COUNT_MULT > self.counts[self.aspects[idx]]):
            return max_idx1, row[max_idx1]
        else:
            return None

    def get_product_tree2(self):
        root = Node(self.aspects[0])
        root.idx = 0
        deps = {idx: self.aspect_dependence_with_strength2(idx) for idx in range(1, len(self.aspects))}
        # aspects with no dependence become direct children of the root
        for no_dep_idx in {idx for idx, dep in deps.items() if dep is None}:
            node = Node(self.aspects[no_dep_idx], parent=root)
            node.idx = no_dep_idx
            del deps[no_dep_idx]
        # attach the remaining aspects in decreasing order of dependence strength
        sorted_deps = sorted(deps.items(), key=lambda x: x[1][1], reverse=True)
        for idx, (dep, _) in sorted_deps:
            n = next((n for n in root.descendants if n.idx == idx), None)
            dep_n = next((n for n in root.descendants if n.idx == dep), None)
            if dep_n is None:
                dep_n = Node(self.aspects[dep], parent=root)
                dep_n.idx = dep
            if n is not None:
                max_desc_depth = max(c.depth for c in n.descendants) if n.descendants else 0
                if dep_n not in n.descendants and dep_n.depth + max_desc_depth <= TargetExtractor.MAX_DEPTH:
                    n.parent = dep_n
            else:
                if dep_n.depth < TargetExtractor.MAX_DEPTH:
                    n = Node(self.aspects[idx], parent=dep_n)
                else:
                    n = Node(self.aspects[idx], parent=root)
            n.idx = idx
        return root

    def get_product_tree3(self):
        root = Node(self.aspects[0])
        root.idx = 0
        deps = {idx: self.aspect_dependence_with_strength2(idx) for idx in range(1, len(self.aspects))}
        # aspects with no dependence become direct children of the root
        for no_dep_idx in {idx for idx, dep in deps.items() if dep is None}:
            node = Node(self.aspects[no_dep_idx], parent=root)
            node.idx = no_dep_idx
            del deps[no_dep_idx]
        # attach the remaining aspects in decreasing order of dependence strength
        sorted_deps = sorted(deps.items(), key=lambda x: x[1][1], reverse=True)
        for idx, (dep_idx, _) in sorted_deps:
            if any(n for n in root.descendants if n.idx == idx):
                continue
            dep_n = next((n for n in root.descendants if n.idx == dep_idx), None)
            if dep_n:
                # attach under dep_n if it is shallow enough, otherwise as its sibling
                if dep_n.depth < 2:
                    n = Node(self.aspects[idx], parent=dep_n)
                else:
                    n = Node(self.aspects[idx], parent=dep_n.parent)
            else:
                dep_n = Node(self.aspects[dep_idx], parent=root)
                dep_n.idx = dep_idx
                n = Node(self.aspects[idx], parent=dep_n)
            n.idx = idx
        return root

    @staticmethod
    def print_relations(target_indices, dep_matrix, targets):
        idx_pairs = {frozenset((idx1, idx2)) for idx1 in target_indices for idx2 in target_indices if idx1 != idx2}
        for idx1, idx2 in idx_pairs:
            t1 = targets[idx1]
            t2 = targets[idx2]
            print('{} {:.4f} {}'.format(t1, dep_matrix[idx1][idx2], t2))
            print('{} {:.4f} {}'.format(' ' * len(t1), dep_matrix[idx2][idx1], ' ' * len(t2)))
            print('')
            st.write('{} {:.4f} {}'.format(t1, dep_matrix[idx1][idx2], t2))
            st.write('{} {:.4f} {}'.format(' ' * len(t1), dep_matrix[idx2][idx1], ' ' * len(t2)))
            st.write('')

    def print_relations_from(self, aspect):
        idx = self.aspects.index(aspect)
        rels = self.relatedness_matrix[idx].copy()
        print(' relations from {}:'.format(aspect))
        st.write(' relations from {}:'.format(aspect))
        for rel_idx in sorted(range(len(self.aspects)), key=lambda i: rels[i], reverse=True)[:20]:
            print(' {:.4f}'.format(rels[rel_idx]), self.aspects[rel_idx])
            st.write(' {:.4f}'.format(rels[rel_idx]), self.aspects[rel_idx])

    def get_syn_dict(self):
        all_pairs = {frozenset((t1, t2)) for t1 in self.aspects for t2 in self.aspects if t1 != t2}
        syn_pairs = {frozenset((t1, t2)) for t1, t2 in all_pairs if self.are_syns(t1, t2)}
        synset = Synset(self.aspects, syn_pairs, self.product)
        return synset.get_dict(self.counts)

    def are_syns(self, t1, t2):
        if wnl.lemmatize(t1) == wnl.lemmatize(t2):
            return True
        try:
            if self.product in [t1, t2]:
                # use a lower threshold for pairs involving the product itself
                return (self.wv.n_similarity([t1], [t2]) >= TargetExtractor.SYNONYM_SIMILARITY_PRODUCT or
                        self.wv.n_similarity([t2], [t1]) >= TargetExtractor.SYNONYM_SIMILARITY_PRODUCT)
            else:
                sim_sum = self.wv.n_similarity([t1], [t2]) + self.wv.n_similarity([t2], [t1])
                return sim_sum >= TargetExtractor.SYNONYM_SIMILARITY
        except KeyError:
            # one of the terms is not in the word2vec vocabulary
            return False


class Synset:
    def __init__(self, aspects, syn_pairs, product):
        self.vocab = aspects
        self.syn_pairs = syn_pairs
        self.product = product

    def get_dict(self, counts):
        # map the most frequent member of each group (or the product itself) to the full group
        groups = self.get_groups()
        return {max(group, key=counts.get) if self.product not in group else self.product: group for group in groups}

    def get_groups(self):
        groups = []
        for w1, w2 in self.syn_pairs:
            if not Synset.join_groups(w1, w2, groups):
                groups.append({w1, w2})
        # every word not covered by a synonym pair forms a singleton group
        for word in self.vocab:
            if not Synset.group_for(word, groups):
                groups.append({word})
        return groups

    @staticmethod
    def join_groups(w1, w2, groups):
        g1 = Synset.group_for(w1, groups)
        g2 = Synset.group_for(w2, groups)
        if g1 and g2 and g1 == g2:
            return True
        if g1:
            groups.remove(g1)
        if g2:
            groups.remove(g2)
        g1 = g1 if g1 else {w1}
        g2 = g2 if g2 else {w2}
        groups.append(g1.union(g2))
        return True

    @staticmethod
    def group_for(w, groups):
        for group in groups:
            if w in group:
                return group
        return None
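

# --- Usage sketch (illustrative only, not part of the extraction pipeline) ---
# A minimal example of how TargetExtractor might be driven once the BERT models
# and a review CSV are in place. The product name, CSV path and column name
# below are hypothetical placeholders, not files shipped with this module.
if __name__ == '__main__':
    # extractor = TargetExtractor(product='camera',
    #                             file_path=directory + '/data/camera_reviews.csv',  # hypothetical path
    #                             text_column='reviewText')                          # hypothetical column
    # extractor.save_product_representation('/content')
    pass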