Spaces:
Runtime error
Runtime error
a-v-bely
committed on
Commit
·
92aa5ff
1
Parent(s):
192a825
upd code
Browse files
utilities_language_bert/rus_sentence_bert.py
CHANGED
|
@@ -21,25 +21,7 @@ class SENTENCE:
|
|
| 21 |
self.sentence_lemma_pos.append((lemma_pos, token))
|
| 22 |
|
| 23 |
def bind_phrases(self):
|
| 24 |
-
|
| 25 |
-
for i in range(len(self.sentence_lemma_pos) - 1):
|
| 26 |
-
phrase_candidate = f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}'
|
| 27 |
-
if phrase_candidate in PHRASES and not previous_was_phrase:
|
| 28 |
-
# phrase is {phrase: {original_token1: spacy.token, original_token2: spacy.token}}
|
| 29 |
-
phrase = [
|
| 30 |
-
f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}',
|
| 31 |
-
{
|
| 32 |
-
'original_token1': self.sentence_lemma_pos[i][1],
|
| 33 |
-
'original_token2': self.sentence_lemma_pos[i + 1][1]
|
| 34 |
-
}
|
| 35 |
-
]
|
| 36 |
-
self.sentence_phrases.append(phrase)
|
| 37 |
-
previous_was_phrase = True
|
| 38 |
-
else:
|
| 39 |
-
if not previous_was_phrase:
|
| 40 |
-
self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
|
| 41 |
-
previous_was_phrase = False
|
| 42 |
-
self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
|
| 43 |
|
| 44 |
def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary: list=None):
|
| 45 |
for token in self.sentence_phrases:
|
|
|
|
| 21 |
self.sentence_lemma_pos.append((lemma_pos, token))
|
| 22 |
|
| 23 |
def bind_phrases(self):
|
| 24 |
+
self.sentence_phrases = self.parsed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary: list=None):
|
| 27 |
for token in self.sentence_phrases:
|
utilities_language_general/rus_constants.py
CHANGED
|
@@ -109,22 +109,22 @@ BAD_USER_TARGET_WORDS = []
|
|
| 109 |
COMBINE_POS = {
|
| 110 |
'simple':
|
| 111 |
{
|
| 112 |
-
'A1': {'VERB': ['AUX']},
|
| 113 |
-
'A2': {'VERB': ['AUX']},
|
| 114 |
-
'B1': {'VERB': ['AUX']},
|
| 115 |
-
'B2': {'VERB': ['AUX']},
|
| 116 |
-
'C1': {'VERB': ['AUX']},
|
| 117 |
-
'C2': {'VERB': ['AUX']},
|
| 118 |
-
'Без уровня': {'VERB': ['AUX']}
|
| 119 |
},
|
| 120 |
'phrase':
|
| 121 |
{
|
| 122 |
-
'A1': {'VERB': ['AUX']},
|
| 123 |
-
'A2': {'VERB': ['AUX']},
|
| 124 |
-
'B1': {'VERB': ['AUX']},
|
| 125 |
-
'B2': {'VERB': ['AUX']},
|
| 126 |
-
'C1': {'VERB': ['AUX']},
|
| 127 |
-
'C2': {'VERB': ['AUX']},
|
| 128 |
-
'Без уровня': {'VERB': ['AUX']}
|
| 129 |
},
|
| 130 |
}
|
|
|
|
| 109 |
COMBINE_POS = {
|
| 110 |
'simple':
|
| 111 |
{
|
| 112 |
+
'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
|
| 113 |
+
'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
|
| 114 |
+
'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
|
| 115 |
+
'B2': {'VERB': ['AUX'], '': ['VERB'], },
|
| 116 |
+
'C1': {'VERB': ['AUX'], '': ['VERB'], },
|
| 117 |
+
'C2': {'VERB': ['AUX'], '': ['VERB'], },
|
| 118 |
+
'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], }
|
| 119 |
},
|
| 120 |
'phrase':
|
| 121 |
{
|
| 122 |
+
'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
|
| 123 |
+
'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
|
| 124 |
+
'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
|
| 125 |
+
'B2': {'VERB': ['AUX'], '': ['VERB'], },
|
| 126 |
+
'C1': {'VERB': ['AUX'], '': ['VERB'], },
|
| 127 |
+
'C2': {'VERB': ['AUX'], '': ['VERB'], },
|
| 128 |
+
'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], }
|
| 129 |
},
|
| 130 |
}
|
utilities_language_general/rus_utils.py
CHANGED
|
@@ -99,14 +99,13 @@ def make_inflection(text: str, pos: str or list, tags: set, level: str) -> str o
|
|
| 99 |
return None
|
| 100 |
else:
|
| 101 |
word_form = morph.parse(text)[0].inflect(tags)
|
| 102 |
-
|
| 103 |
-
if pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
|
| 104 |
tags.discard('impf')
|
| 105 |
tags.add('perf')
|
| 106 |
word_form = morph.parse(text)[0].inflect(tags)
|
| 107 |
if word_form is not None:
|
| 108 |
return word_form.word
|
| 109 |
-
elif pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
|
| 110 |
tags.discard('perf')
|
| 111 |
tags.add('impf')
|
| 112 |
word_form = morph.parse(text)[0].inflect(tags)
|
|
@@ -209,7 +208,7 @@ def check_token_bert(token, current_minimum: set = None, stop_words=stop_list,
|
|
| 209 |
|
| 210 |
def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str,
|
| 211 |
lemma_index:int, global_distractors: set, distractor_minimum: set, level_name: str, max_num_distractors: int,
|
| 212 |
-
max_length_ratio=5, min_edit_distance_ratio=0.
|
| 213 |
distractors = []
|
| 214 |
query = lemma if '_' in lemma else f'{lemma}_{pos}'
|
| 215 |
lemma = '_'.join(lemma.split('_')[::2])
|
|
@@ -240,8 +239,7 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, ta
|
|
| 240 |
and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2') or level_name in ('A1', 'A2'))
|
| 241 |
and length_ratio <= max_length_ratio
|
| 242 |
and distractor_lemma not in global_distractors
|
| 243 |
-
and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2)
|
| 244 |
-
min_edit_distance_ratio)
|
| 245 |
if condition:
|
| 246 |
if distractor_minimum is not None:
|
| 247 |
if distractor_lemma in distractor_minimum:
|
|
@@ -290,7 +288,7 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, ta
|
|
| 290 |
|
| 291 |
def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, level_name: str, lemma: str, pos: str, gender: str,
|
| 292 |
text_with_masked_task: str, global_distractors: set, distractor_minimum: set,
|
| 293 |
-
max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.
|
| 294 |
_distractors = []
|
| 295 |
try:
|
| 296 |
if distractor_minimum:
|
|
@@ -329,7 +327,7 @@ def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, le
|
|
| 329 |
and (length_ratio <= max_length_ratio) # May be changed if case of phrases
|
| 330 |
and (distractor_lemma not in global_distractors)
|
| 331 |
and (edit_distance(lemma, distractor_lemma) # May be changed if case of phrases
|
| 332 |
-
/ ((len(lemma) + len(distractor_lemma)) / 2)
|
| 333 |
if condition:
|
| 334 |
if distractor_minimum is not None:
|
| 335 |
if distractor_lemma in distractor_minimum:
|
|
|
|
| 99 |
return None
|
| 100 |
else:
|
| 101 |
word_form = morph.parse(text)[0].inflect(tags)
|
| 102 |
+
if word_form is None and pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
|
|
|
|
| 103 |
tags.discard('impf')
|
| 104 |
tags.add('perf')
|
| 105 |
word_form = morph.parse(text)[0].inflect(tags)
|
| 106 |
if word_form is not None:
|
| 107 |
return word_form.word
|
| 108 |
+
elif word_form is None and pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
|
| 109 |
tags.discard('perf')
|
| 110 |
tags.add('impf')
|
| 111 |
word_form = morph.parse(text)[0].inflect(tags)
|
|
|
|
| 208 |
|
| 209 |
def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str,
|
| 210 |
lemma_index:int, global_distractors: set, distractor_minimum: set, level_name: str, max_num_distractors: int,
|
| 211 |
+
max_length_ratio=5, min_edit_distance_ratio=0.4):
|
| 212 |
distractors = []
|
| 213 |
query = lemma if '_' in lemma else f'{lemma}_{pos}'
|
| 214 |
lemma = '_'.join(lemma.split('_')[::2])
|
|
|
|
| 239 |
and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2') or level_name in ('A1', 'A2'))
|
| 240 |
and length_ratio <= max_length_ratio
|
| 241 |
and distractor_lemma not in global_distractors
|
| 242 |
+
and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >= min_edit_distance_ratio)
|
|
|
|
| 243 |
if condition:
|
| 244 |
if distractor_minimum is not None:
|
| 245 |
if distractor_lemma in distractor_minimum:
|
|
|
|
| 288 |
|
| 289 |
def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, level_name: str, lemma: str, pos: str, gender: str,
|
| 290 |
text_with_masked_task: str, global_distractors: set, distractor_minimum: set,
|
| 291 |
+
max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.4):
|
| 292 |
_distractors = []
|
| 293 |
try:
|
| 294 |
if distractor_minimum:
|
|
|
|
| 327 |
and (length_ratio <= max_length_ratio) # May be changed if case of phrases
|
| 328 |
and (distractor_lemma not in global_distractors)
|
| 329 |
and (edit_distance(lemma, distractor_lemma) # May be changed if case of phrases
|
| 330 |
+
/ ((len(lemma) + len(distractor_lemma)) / 2) >= min_edit_distance_ratio))
|
| 331 |
if condition:
|
| 332 |
if distractor_minimum is not None:
|
| 333 |
if distractor_lemma in distractor_minimum:
|