import string
from random import random, sample

from utilities_language_general.morphology import inflect
from utilities_language_general.esp_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
from utilities_language_general.esp_utils import check_token_bert, fix_irregular_lemma, get_distractors_from_model_bert
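
# Sentence- and task-level containers for the BERT-based gap-fill exercise
# generator: SENTENCE extracts candidate target words from one sentence and
# TASK turns a target word into a multiple-choice item with inflected
# distractors.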


class SENTENCE:
    def __init__(self, original: str, n_sentence: int, max_num_distractors):
        self.original = original
        self.n_sentence = n_sentence
        self.max_num_distractors = max_num_distractors
        self.parsed = nlp(self.original)
        self.sentence_lemma_pos = []
        self.sentence_phrases = []
        self.target_words = []
        self.text_with_masked_task = ''

    def lemmatize_sentence(self):
        for token in self.parsed:
            lemma_pos = f'{token.lemma_}_{token.pos_}'  # e.g. 'comer_VERB'
            if token.pos_ in ('AUX', 'VERB', 'ADJ'):
                lemma_pos = fix_irregular_lemma(lemma=lemma_pos)
            self.sentence_lemma_pos.append((lemma_pos, token))

    def bind_phrases(self):
        previous_was_phrase = False
        for i in range(len(self.sentence_lemma_pos) - 1):
            phrase_candidate = f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}'
            if phrase_candidate in PHRASES and not previous_was_phrase:
                # a phrase entry is [phrase_lemma, {'original_token1': spacy.Token,
                #                                   'original_token2': spacy.Token}]
                phrase = [
                    phrase_candidate,
                    {
                        'original_token1': self.sentence_lemma_pos[i][1],
                        'original_token2': self.sentence_lemma_pos[i + 1][1]
                    }
                ]
                self.sentence_phrases.append(phrase)
                previous_was_phrase = True
            else:
                if not previous_was_phrase:
                    self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                previous_was_phrase = False
        # the last token stands alone unless it was consumed by the final phrase
        if not previous_was_phrase:
            self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
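
    # Illustrative example (phrase key assumed, not from the source): if PHRASES
    # contained 'tener_VERB_que_SCONJ', then in 'Tengo que salir.' the tokens
    # "Tengo" and "que" would be stored as one phrase entry rather than as two
    # separate tokens.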

    def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary: list = None):
        for token in self.sentence_phrases:
            if isinstance(token, list):  # token is a phrase entry
                original_token1 = token[1]['original_token1']
                original_token2 = token[1]['original_token2']
                original_token1_tags = original_token1.morph.to_dict()
                original_token2_tags = original_token2.morph.to_dict()
                tags = dict()
                if (f'{original_token1.lemma_}_{original_token1.pos_}' == 'haber_AUX'
                        and original_token2.pos_ in ('VERB', 'ADJ', 'AUX')):
                    # compound form: mood/tense/person/number come from the auxiliary
                    tags['VerbForm'] = 'Compuesto'
                    tags['Mood'] = original_token1_tags.get('Mood')
                    tags['Tense'] = original_token1_tags.get('Tense')
                    tags['Person'] = original_token1_tags.get('Person')
                    tags['Number'] = original_token1_tags.get('Number')
                    tags['Gender'] = None
                else:
                    tags = {**original_token1_tags, **original_token2_tags}
                not_ner = original_token1.ent_type == 0 and original_token2.ent_type == 0
                target_word = {
                    'masked_sentence': self.original.replace(f'{original_token1.text} {original_token2.text}',
                                                             '[MASK]'),
                    'sentence_number': self.n_sentence,
                    'sentence_text': self.original,
                    'original_text': f'{original_token1.text} {original_token2.text}',
                    'lemma': token[0],
                    'pos': ('phrase', 'phrase'),
                    'gender': tags.get('Gender'),
                    'tags': tags,
                    'position_in_sentence': self.original.find(original_token1.text),
                    'not_named_entity': not_ner,
                    'frequency_in_text': 0,
                    'in_summary': self.original in summary
                }
                self.target_words.append(target_word)
            else:  # token is a plain spaCy token
                if check_token_bert(token=token, current_minimum=target_minimum):
                    tags = token.morph.to_dict()
                    target_word = {
                        'masked_sentence': self.original.replace(token.text, '[MASK]'),
                        'sentence_number': self.n_sentence,
                        'sentence_text': self.original,
                        'original_text': token.text,
                        'lemma': token.lemma_,
                        'pos': ('simple', token.pos_),
                        'gender': tags.get('Gender'),
                        'number_children': len(list(token.children)),
                        'tags': tags,
                        'position_in_sentence': self.original.find(token.text),
                        'not_named_entity': token.ent_type == 0,
                        'frequency_in_text': frequency_dict.get(token.lemma_, 1),
                        'in_summary': self.original in summary
                    }
                    self.target_words.append(target_word)
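
    # Illustrative result (values assumed, not from the source): for
    # 'Ella come manzanas.' with the target 'come', the appended entry would
    # look roughly like
    # {'masked_sentence': 'Ella [MASK] manzanas.', 'original_text': 'come',
    #  'lemma': 'comer', 'pos': ('simple', 'VERB'), ...}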

    def search_user_target_words(self, user_target_words: set = None, frequency_dict: dict = None, summary: list = None):
        for _utw in user_target_words:
            if _utw in self.original:
                parse_utw = nlp(_utw)
                if ' ' in _utw:  # the user passed a phrase
                    first_tags = parse_utw[0].morph.to_dict()
                    second_tags = parse_utw[1].morph.to_dict()
                    tags = dict()
                    if (f'{parse_utw[0].lemma_}_{parse_utw[0].pos_}' == 'haber_AUX'
                            and parse_utw[1].pos_ in ('VERB', 'ADJ', 'AUX')):
                        tags['VerbForm'] = 'Compuesto'
                        tags['Mood'] = first_tags.get('Mood')
                        tags['Tense'] = first_tags.get('Tense')
                        tags['Person'] = first_tags.get('Person')
                        tags['Number'] = first_tags.get('Number')
                        tags['Gender'] = None
                    else:
                        tags = {**first_tags, **second_tags}
                    user_target_word_lemma = '_'.join([f'{token.lemma_}_{token.pos_}' for token in parse_utw])
                    user_target_word_pos = ('phrase', 'phrase')
                    user_target_word_tags = tags
                    not_ner = parse_utw[0].ent_type == 0 and parse_utw[1].ent_type == 0
                else:
                    user_target_word_lemma = f'{parse_utw[0].lemma_}_{parse_utw[0].pos_}'
                    user_target_word_pos = ('simple', parse_utw[0].pos_)
                    user_target_word_tags = parse_utw[0].morph.to_dict()
                    not_ner = parse_utw[0].ent_type == 0
                target_word = {
                    'masked_sentence': self.original.replace(_utw, '[MASK]'),
                    'sentence_number': self.n_sentence,
                    'sentence_text': self.original,
                    'original_text': _utw,
                    'lemma': user_target_word_lemma,
                    'pos': user_target_word_pos,
                    'gender': user_target_word_tags.get('Gender'),
                    'tags': user_target_word_tags,
                    'position_in_sentence': self.original.find(_utw),
                    'not_named_entity': not_ner,
                    'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1),
                    'in_summary': self.original in summary
                }
                self.target_words.append(target_word)

    def search_target_words(self, target_words_automatic_mode: bool, target_minimum,
                            user_target_words: set = None,
                            frequency_dict: dict = None, summary: list = None):
        if target_words_automatic_mode:
            self.search_target_words_automatically(target_minimum=target_minimum,
                                                   frequency_dict=frequency_dict, summary=summary)
        else:
            self.search_user_target_words(user_target_words=user_target_words,
                                          frequency_dict=frequency_dict, summary=summary)

    def filter_target_words(self, target_words_automatic_mode):
        # In automatic mode, discard a target word that starts fewer than
        # 5 characters after the previously kept one; rejected words are
        # reported via BAD_USER_TARGET_WORDS.
        position_difference = 5 if target_words_automatic_mode else 0
        c_position = 0
        bad_target_words = []
        for target_word in self.target_words:
            if not (target_word['position_in_sentence'] == 0
                    or abs(target_word['position_in_sentence'] - c_position) >= position_difference):
                bad_target_words.append(target_word)
            else:
                c_position = target_word['position_in_sentence']
        for btw in bad_target_words:
            BAD_USER_TARGET_WORDS.append(btw['original_text'])
            self.target_words.remove(btw)
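
# A minimal usage sketch (illustrative, not from the source); `target_minimum`
# is an assumed CEFR vocabulary set from the calling pipeline:
#
#     sent = SENTENCE('Ella ha comido una manzana.', n_sentence=0, max_num_distractors=4)
#     sent.lemmatize_sentence()
#     sent.bind_phrases()
#     sent.search_target_words(target_words_automatic_mode=True, target_minimum=target_minimum,
#                              frequency_dict={}, summary=[])
#     sent.filter_target_words(target_words_automatic_mode=True)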


class TASK:
    def __init__(self, task_data, max_num_distractors):
        # task_data is one target-word dict produced by SENTENCE, extended by
        # the caller with 'text_with_masked_task'
        self.task_data = task_data
        self.distractors = None
        self.distractors_number = 0
        self.bad_target_word = False
        self.inflected_distractors = None
        self.pos = task_data['pos']
        self.tags = task_data['tags']
        self.lemma = task_data['lemma']
        self.gender = task_data['gender']
        self.in_summary = task_data['in_summary']
        self.max_num_distractors = max_num_distractors
        self.original_text = task_data['original_text']
        self.sentence_text = task_data['sentence_text']
        self.sentence_number = task_data['sentence_number']
        self.masked_sentence = task_data['masked_sentence']
        self.frequency_in_text = task_data['frequency_in_text']
        self.position_in_sentence = task_data['position_in_sentence']
        self.text_with_masked_task = task_data['text_with_masked_task']
        self.result = ''
        self.variants = []

    def __repr__(self):
        return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])

    def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict,
                                          global_distractors, distractor_minimum, level_name, max_frequency):
        pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
        distractors_sentence = get_distractors_from_model_bert(
            model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
            level_name=level_name, lemma=self.lemma, pos=pos, gender=self.gender,
            text_with_masked_task=self.masked_sentence,
            global_distractors=global_distractors,
            distractor_minimum=distractor_minimum,
            max_num_distractors=self.max_num_distractors)
        if distractors_sentence is None or self.frequency_in_text > max_frequency:
            self.bad_target_word = True
            self.distractors = None
        else:
            # keep at most the 30 best-ranked distractor lemmas
            self.distractors = [d[0] for d in distractors_sentence[:30]]
        self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0

    def inflect_distractors(self):
        inflected_distractors = []
        if self.distractors is None:
            self.bad_target_word = True
            return
        for distractor_lemma in self.distractors:
            if distractor_lemma.count('_') > 1:
                # phrase distractor: inflect only the content lemma of a
                # 'haber_...' compound; skip any other phrase shape
                if distractor_lemma.startswith('haber_'):
                    distractor_lemma = distractor_lemma.split('_')[-2]
                    inflected = inflect(lemma=distractor_lemma, target_pos=self.pos[1], target_tags=self.tags)
                else:
                    continue
            else:
                inflected = inflect(lemma=distractor_lemma, target_pos=self.pos[1], target_tags=self.tags)
            if inflected is not None:
                inflected_distractors.append(inflected)
        # a task needs at least min(4, max_num_distractors) usable distractors
        num_distractors = min(4, self.max_num_distractors)
        if len(inflected_distractors) < num_distractors:
            self.bad_target_word = True
        else:
            self.distractors_number = num_distractors
            self.inflected_distractors = inflected_distractors
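
    # Illustrative example (feature values assumed, not from the source): for a
    # compound target like "han comido", self.tags would be
    # {'VerbForm': 'Compuesto', 'Mood': 'Ind', 'Tense': 'Pres', 'Person': '3',
    #  'Number': 'Plur', 'Gender': None}, and inflect() is expected to bend
    # each distractor lemma into that same shape.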

    def sample_distractors(self, num_distractors):
        if not self.bad_target_word:
            num_distractors = min(self.distractors_number, num_distractors) if num_distractors >= 4 else num_distractors
            self.inflected_distractors = sample(self.inflected_distractors, num_distractors)

    def compile_task(self, max_num_distractors):
        # Build the multiple-choice item: sample distractors, append the
        # correct answer, shuffle, and label the options (a), (b), (c), ...
        len_distractors = len(self.inflected_distractors)
        len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
            else max_num_distractors
        letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
        try:
            distractors = sample(self.inflected_distractors, len_variants) + [self.original_text]
        except ValueError as e:
            print(f'{e}\n{len_distractors=}\n{len_variants=}')
            distractors = self.inflected_distractors + [self.original_text]
        # shuffle the options with a random sort key, then pair them with letters
        tmp_vars = [f'{item[0]} {item[1].replace("_", " ")}'.lower()
                    for item in zip(letters, sorted(distractors, key=lambda _: random()))]
        self.variants.append((self.original_text, tmp_vars))
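
# A minimal end-to-end sketch (illustrative, not from the source); assumes
# `sent` is a processed SENTENCE and that model, scaler, classifier and
# pos_dict come from the calling pipeline:
#
#     for task_data in sent.target_words:
#         task_data['text_with_masked_task'] = sent.text_with_masked_task
#         task = TASK(task_data, max_num_distractors=4)
#         task.attach_distractors_to_target_word(model, scaler, classifier, pos_dict,
#                                                global_distractors=set(), distractor_minimum=None,
#                                                level_name='B1', max_frequency=5)
#         task.inflect_distractors()
#         if not task.bad_target_word:
#             task.sample_distractors(num_distractors=4)
#             task.compile_task(max_num_distractors=4)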