TestTaker committed · Commit 703d114 · Parent(s): c51f116

Fix bert bugs
utilities_language_bert/rus_main_workflow_bert.py
CHANGED
@@ -106,7 +106,7 @@ def main_workflow(
 
     # Get summary. May choose between round_summary_length and summary_length
     SUMMARY = summarization(current_text, num_sentences=round_summary_length)
-    logs.
+    logs.success('Нашли интересные предложения. Пригодятся!')
     progress.progress(25)
 
     for sentence in workflow:

@@ -174,7 +174,7 @@ def main_workflow(
     logs.update(label='Подобрали неправильные варианты!', state='running')
 
     for task in RESULT_TASKS:
-        task.inflect_distractors()
+        task.inflect_distractors(level_name=level)
         progress.progress(80)
     logs.update(label='Просклоняли и проспрягали неправильные варианты!', state='running')
 
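The second hunk updates the call site to match the new inflect_distractors(self, level_name) signature introduced in rus_sentence_bert.py below; either change without the other would leave the workflow broken with a TypeError. A minimal sketch of the pairing, with a stub TASK standing in for the real class:

    # Stub illustrating the paired change: inflect_distractors now requires a
    # level, so the old zero-argument call in main_workflow would raise.
    class TASK:
        def inflect_distractors(self, level_name):
            print(f'inflecting distractors for level {level_name}')

    task = TASK()
    # task.inflect_distractors()               # TypeError before the caller fix
    task.inflect_distractors(level_name='B1')  # updated call, as in the diff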
utilities_language_bert/rus_sentence_bert.py
CHANGED
@@ -150,6 +150,7 @@ class TASK:
         self.tags = task_data['tags']
         self.lemma = task_data['lemma']
         self.gender = task_data['gender']
+        self.in_summary = task_data['in_summary']
         self.max_num_distractors = max_num_distractors
         self.original_text = task_data['original_text']
         self.sentence_text = task_data['sentence_text']

@@ -180,13 +181,13 @@ class TASK:
         self.distractors = [d[0] for i, d in enumerate(distractors_sentence) if i < 15]
         self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0
 
-    def inflect_distractors(self):
+    def inflect_distractors(self, level_name):
         inflected_distractors = []
         if self.distractors is None:
             self.bad_target_word = True
             return
         for distractor_lemma in self.distractors:
-            inflected = make_inflection(text=distractor_lemma, pos=self.pos[1], tags=self.tags)
+            inflected = make_inflection(text=distractor_lemma, pos=self.pos[1], tags=self.tags, level=level_name)
             if inflected is not None:
                 inflected_distractors.append(inflected)
         num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
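The grammeme strings used elsewhere in this commit ('masc', 'femn', 'neut') follow pymorphy2 conventions, which suggests make_inflection wraps a pymorphy2 inflect call; the new level argument lets inflection vary per CEFR level. A hedged sketch of such a helper — pymorphy2 and its parse/inflect API are real, but the body and the level handling are assumptions, not the project's actual code:

    import pymorphy2  # real library; everything below it is an illustrative guess

    morph = pymorphy2.MorphAnalyzer()

    def make_inflection_sketch(text: str, tags: set, level: str):
        # Hypothetical stand-in for make_inflection(text=..., pos=..., tags=..., level=...).
        parse = morph.parse(text)[0]
        inflected = parse.inflect(tags)  # returns None when the form does not exist
        # Assumption: `level` could gate which tag sets are attempted; here it
        # is accepted but unused, mirroring only the new signature.
        return inflected.word if inflected is not None else None

    print(make_inflection_sketch('кошка', {'plur', 'gent'}, level='B1'))  # -> 'кошек'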
utilities_language_general/rus_constants.py
CHANGED
@@ -34,7 +34,7 @@ def load_spacy():
 @st.cache_resource
 def load_bert():
     with st.spinner('Загружаю языковую модель'):
-        _pipeline = pipeline(task="fill-mask", model="a-v-
+        _pipeline = pipeline(task="fill-mask", model="a-v-bely/ruBert-base-finetuned-russian-moshkov-child-corpus-pro")
     return _pipeline
 
 

@@ -113,6 +113,7 @@ COMBINE_POS = {
         'B2': {'VERB': ['AUX']},
         'C1': {'VERB': ['AUX']},
         'C2': {'VERB': ['AUX']},
+        'Без уровня': {'VERB': ['AUX']}
     },
     'phrase':
         {

@@ -122,5 +123,6 @@ COMBINE_POS = {
         'B2': {'VERB': ['AUX']},
         'C1': {'VERB': ['AUX']},
         'C2': {'VERB': ['AUX']},
+        'Без уровня': {'VERB': ['AUX']}
     },
 }
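The new 'Без уровня' ('no level') entries guard direct lookups such as COMBINE_POS['phrase'][level_name] in rus_utils.py below: when a user generated tasks without picking a CEFR level, that index raised KeyError. A small demonstration against a trimmed copy of the dict:

    # Trimmed copy of COMBINE_POS showing the effect of the added key.
    COMBINE_POS = {
        'phrase': {
            'C2': {'VERB': ['AUX']},
            'Без уровня': {'VERB': ['AUX']},  # added by this commit
        },
    }

    level_name = 'Без уровня'  # value passed when no CEFR level is selected
    # Before the fix: KeyError: 'Без уровня'. After: the lookup resolves.
    print(COMBINE_POS['phrase'][level_name].get('VERB'))  # ['AUX']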
utilities_language_general/rus_utils.py
CHANGED
@@ -41,7 +41,7 @@ def compute_frequency_dict(text: str) -> dict:
 
 
 def convert_gender(gender_spacy):
-    genders = {'Masc': 'masc', 'Fem': 'femn', 'Neut': 'neut'}
+    genders = {'Masc': 'masc', 'Fem': 'femn', 'Neut': 'neut', None: False}
     return genders[gender_spacy]
 
 

@@ -359,21 +359,23 @@ def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, le
             distractor_lemma, distractor_pos = candidate_morph.lemma_, candidate_morph.pos_
             distractor_similarity = candidate_distractor[1]
             candidate_gender = define_gender(distractor_lemma)
+            # print(distractor_lemma, candidate_gender, distractor_pos, pos)
             length_ratio = abs(len(lemma) - len(distractor_lemma))
             decision = make_decision(doc=None, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict, level=level_name,
                                      target_lemma=lemma, target_text=None, target_pos=pos, target_position=None,
                                      substitute_lemma=distractor_lemma, substitute_pos=distractor_pos, bert_score=distractor_similarity)
-
-
-
-
-
-
-
-
-
-
-
+            condition = (((distractor_pos == pos)
+                          or (COMBINE_POS['phrase'][level_name].get(pos) is not None and COMBINE_POS['phrase'][level_name].get(distractor_pos) is not None
+                              and distractor_pos in COMBINE_POS['phrase'][level_name][pos] and pos in COMBINE_POS['phrase'][level_name][distractor_pos]))
+                         and decision
+                         and distractor_lemma != lemma
+                         and (len(_distractors) < max_num_distractors + 10)
+                         and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
+                         and (length_ratio <= max_length_ratio)  # May be changed if case of phrases
+                         and (distractor_lemma not in global_distractors)
+                         and (edit_distance(lemma, distractor_lemma)  # May be changed if case of phrases
+                              / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio))
+            if condition:
                 if distractor_minimum is not None:
                     if distractor_lemma in distractor_minimum:
                         _distractors.append((distractor_lemma, candidate_distractor[1]))
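The convert_gender fix follows the same defensive pattern: judging from the diff, define_gender can come back with None for lemmas that carry no grammatical gender, and the old three-key dict then raised KeyError: None inside the candidate loop. Mapping None to False lets the gender check in the new condition simply evaluate false instead of crashing; a standalone illustration:

    # The updated mapping: None (no detectable gender) now resolves to False
    # instead of raising KeyError, so the filter rejects the candidate cleanly.
    genders = {'Masc': 'masc', 'Fem': 'femn', 'Neut': 'neut', None: False}

    for gender_spacy in ('Fem', None):
        print(genders[gender_spacy])  # 'femn', then False — no exception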
utilities_language_general/similarity_measures.py
CHANGED
@@ -185,6 +185,8 @@ def get_context_linked_words(doc, target_position, target_text):
 
 
 def compute_all_necessary_metrics(target_lemma, target_text, target_position, substitute_lemma, doc, model_type:str, model=None):
+    if model_type == 'bert':
+        return
 
     target_vector = get_vector_for_token(model, target_lemma)
     substitute_vector = get_vector_for_token(model, substitute_lemma)

@@ -246,9 +248,11 @@ def make_decision(doc, model_type, scaler, classifier, pos_dict, level, target_l
     metrics = compute_all_necessary_metrics(target_lemma=target_lemma, target_text=target_text, target_position=target_position,
                                             substitute_lemma=substitute_lemma, doc=doc, model_type=model_type, model=model)
     target_multiword, substitute_multiword = target_lemma.count('_') > 2, substitute_lemma.count('_') > 2
-    data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword] + scaler.transform([metrics]).tolist()[0]
     if model_type == 'bert':
-
+        scaled_data = scaler.transform([[bert_score]]).tolist()[0]
+    else:
+        scaled_data = scaler.transform([metrics]).tolist()[0]
+    data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword] + scaled_data
     predict = classifier.predict(data)
     return bool(predict)
 
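The make_decision change explains the early return added to compute_all_necessary_metrics: in the BERT path the metrics vector is never computed (it comes back as None), so the decision model scales the single bert_score instead. The double brackets in scaler.transform([[bert_score]]) matter because scikit-learn transformers expect a 2-D (n_samples, n_features) input. A standalone illustration with a StandardScaler — an assumption about the pickled scaler's type, though any sklearn scaler shares this interface:

    from sklearn.preprocessing import StandardScaler  # assumed scaler class

    # Toy one-feature fit standing in for the scaler the app loads from disk.
    scaler = StandardScaler().fit([[0.1], [0.5], [0.9]])

    bert_score = 0.73
    scaled_data = scaler.transform([[bert_score]]).tolist()[0]  # 2-D in, one row out
    print(scaled_data)  # single-element list, concatenated onto `data` in the diff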