Spaces:
Running
Running
togokah
committed on
Commit
·
9efc4ef
1
Parent(s):
015d17f
Prepare for experiment and add morphology to bert
Browse files
pages/2_👨🏫_Начало_работы.py
CHANGED
|
@@ -128,13 +128,13 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
|
|
| 128 |
key='-TARGET_WORDS_MODE-', horizontal=True)
|
| 129 |
DISTRACTOR_MODEL = DISTRACTOR_MODEL_COL.radio(
|
| 130 |
label='**Модель для выбора неправильных вариантов**',
|
| 131 |
-
options=['
|
| 132 |
key='-DISTRACTOR_MODEL_MODE-', horizontal=True)
|
| 133 |
CEFR_NUM_DISTRACTORS_COL, UTW_COL = FORM.columns([2, 2])
|
| 134 |
with CEFR_NUM_DISTRACTORS_COL:
|
| 135 |
CEFR_TEXT_LEVEL = custom_select_box(
|
| 136 |
'Укажите уровень по CEFR:',
|
| 137 |
-
['A1', 'A2', 'B1', 'B2', 'C1', 'C2'
|
| 138 |
no_selection_label='-Выберите языковой уровень-')
|
| 139 |
st.session_state['-CEFR_TEXT_LEVEL-'] = CEFR_TEXT_LEVEL
|
| 140 |
NUMBER_DISTRACTORS = CEFR_NUM_DISTRACTORS_COL.number_input(
|
|
@@ -186,7 +186,7 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
|
|
| 186 |
PROGRESS_BAR_S = st.progress(0)
|
| 187 |
|
| 188 |
# Start generation process. Everything happens inside main_workflow func
|
| 189 |
-
if DISTRACTOR_MODEL == '
|
| 190 |
from utilities_language_bert.esp_main_workflow_bert import main_workflow
|
| 191 |
__TASK_DATA__ = main_workflow(
|
| 192 |
file=UPLOAD_FILE,
|
|
|
|
| 128 |
key='-TARGET_WORDS_MODE-', horizontal=True)
|
| 129 |
DISTRACTOR_MODEL = DISTRACTOR_MODEL_COL.radio(
|
| 130 |
label='**Модель для выбора неправильных вариантов**',
|
| 131 |
+
options=['Модель-1', 'Модель-2'],
|
| 132 |
key='-DISTRACTOR_MODEL_MODE-', horizontal=True)
|
| 133 |
CEFR_NUM_DISTRACTORS_COL, UTW_COL = FORM.columns([2, 2])
|
| 134 |
with CEFR_NUM_DISTRACTORS_COL:
|
| 135 |
CEFR_TEXT_LEVEL = custom_select_box(
|
| 136 |
'Укажите уровень по CEFR:',
|
| 137 |
+
['Без уровня', 'A1', 'A2', 'B1', 'B2', 'C1', 'C2'],
|
| 138 |
no_selection_label='-Выберите языковой уровень-')
|
| 139 |
st.session_state['-CEFR_TEXT_LEVEL-'] = CEFR_TEXT_LEVEL
|
| 140 |
NUMBER_DISTRACTORS = CEFR_NUM_DISTRACTORS_COL.number_input(
|
|
|
|
| 186 |
PROGRESS_BAR_S = st.progress(0)
|
| 187 |
|
| 188 |
# Start generation process. Everything happens inside main_workflow func
|
| 189 |
+
if DISTRACTOR_MODEL == 'Модель-2':
|
| 190 |
from utilities_language_bert.esp_main_workflow_bert import main_workflow
|
| 191 |
__TASK_DATA__ = main_workflow(
|
| 192 |
file=UPLOAD_FILE,
|
utilities_language_bert/esp_main_workflow_bert.py
CHANGED
|
@@ -122,7 +122,7 @@ def main_workflow(
|
|
| 122 |
elif level == 'C2':
|
| 123 |
target_minimum = esp_constants.c2_target_set
|
| 124 |
distractor_minimum = esp_constants.c2_distractor_set
|
| 125 |
-
elif level == '
|
| 126 |
target_minimum = None
|
| 127 |
distractor_minimum = None
|
| 128 |
else:
|
|
@@ -150,8 +150,8 @@ def main_workflow(
|
|
| 150 |
target_minimum=target_minimum,
|
| 151 |
user_target_words=USER_TARGET_WORDS,
|
| 152 |
frequency_dict=FREQ_DICT)
|
| 153 |
-
progress.progress(int(30 + (j * (
|
| 154 |
-
progress_s.progress(
|
| 155 |
DUPLICATE_TARGET_WORDS = defaultdict(list)
|
| 156 |
for sentence in workflow:
|
| 157 |
for target_word in sentence.target_words:
|
|
@@ -164,7 +164,7 @@ def main_workflow(
|
|
| 164 |
if target_word not in RESULT_TW:
|
| 165 |
global_bad_target_words.append(target_word['original_text'])
|
| 166 |
sentence.target_words.remove(target_word)
|
| 167 |
-
progress_s.progress(
|
| 168 |
logs.success('Выбрали слова-пропуски!')
|
| 169 |
|
| 170 |
for sentence in workflow:
|
|
@@ -176,7 +176,7 @@ def main_workflow(
|
|
| 176 |
|
| 177 |
for sentence in workflow:
|
| 178 |
sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
|
| 179 |
-
progress_s.progress(
|
| 180 |
|
| 181 |
RESULT_TASKS = []
|
| 182 |
for sentence in workflow:
|
|
@@ -193,9 +193,14 @@ def main_workflow(
|
|
| 193 |
f'Обработали {num}/{len(RESULT_TASKS)} целевых слов!')
|
| 194 |
logs_d.success(
|
| 195 |
f'Обработали {len(RESULT_TASKS)}/{len(RESULT_TASKS)} целевых слов!')
|
|
|
|
|
|
|
| 196 |
|
|
|
|
|
|
|
| 197 |
progress_s.progress(70)
|
| 198 |
-
logs.success('
|
|
|
|
| 199 |
for task in RESULT_TASKS:
|
| 200 |
task.sample_distractors(num_distractors=num_distractors)
|
| 201 |
progress_s.progress(75)
|
|
|
|
| 122 |
elif level == 'C2':
|
| 123 |
target_minimum = esp_constants.c2_target_set
|
| 124 |
distractor_minimum = esp_constants.c2_distractor_set
|
| 125 |
+
elif level == 'Без уровня':
|
| 126 |
target_minimum = None
|
| 127 |
distractor_minimum = None
|
| 128 |
else:
|
|
|
|
| 150 |
target_minimum=target_minimum,
|
| 151 |
user_target_words=USER_TARGET_WORDS,
|
| 152 |
frequency_dict=FREQ_DICT)
|
| 153 |
+
progress.progress(int(30 + (j * (20 / len(workflow)))))
|
| 154 |
+
progress_s.progress(50)
|
| 155 |
DUPLICATE_TARGET_WORDS = defaultdict(list)
|
| 156 |
for sentence in workflow:
|
| 157 |
for target_word in sentence.target_words:
|
|
|
|
| 164 |
if target_word not in RESULT_TW:
|
| 165 |
global_bad_target_words.append(target_word['original_text'])
|
| 166 |
sentence.target_words.remove(target_word)
|
| 167 |
+
progress_s.progress(55)
|
| 168 |
logs.success('Выбрали слова-пропуски!')
|
| 169 |
|
| 170 |
for sentence in workflow:
|
|
|
|
| 176 |
|
| 177 |
for sentence in workflow:
|
| 178 |
sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
|
| 179 |
+
progress_s.progress(60)
|
| 180 |
|
| 181 |
RESULT_TASKS = []
|
| 182 |
for sentence in workflow:
|
|
|
|
| 193 |
f'Обработали {num}/{len(RESULT_TASKS)} целевых слов!')
|
| 194 |
logs_d.success(
|
| 195 |
f'Обработали {len(RESULT_TASKS)}/{len(RESULT_TASKS)} целевых слов!')
|
| 196 |
+
progress_s.progress(65)
|
| 197 |
+
logs.success('Подобрали неправильные варианты!')
|
| 198 |
|
| 199 |
+
for task in RESULT_TASKS:
|
| 200 |
+
task.inflect_distractors()
|
| 201 |
progress_s.progress(70)
|
| 202 |
+
logs.success('Просклоняли и проспрягали неправильные варианты!')
|
| 203 |
+
|
| 204 |
for task in RESULT_TASKS:
|
| 205 |
task.sample_distractors(num_distractors=num_distractors)
|
| 206 |
progress_s.progress(75)
|
utilities_language_bert/esp_sentence_bert.py
CHANGED
|
@@ -2,6 +2,7 @@ import string
|
|
| 2 |
from random import random
|
| 3 |
from random import sample
|
| 4 |
from utilities_language_general.esp_constants import nlp
|
|
|
|
| 5 |
from utilities_language_general.esp_constants import PHRASES
|
| 6 |
from utilities_language_general.esp_utils import check_token_bert
|
| 7 |
from utilities_language_general.esp_utils import fix_irregular_lemma
|
|
@@ -169,7 +170,9 @@ class TASK:
|
|
| 169 |
self.distractors = None
|
| 170 |
self.distractors_number = 0
|
| 171 |
self.bad_target_word = False
|
|
|
|
| 172 |
self.pos = task_data['pos']
|
|
|
|
| 173 |
self.lemma = task_data['lemma']
|
| 174 |
self.gender = task_data['gender']
|
| 175 |
self.max_num_distractors = max_num_distractors
|
|
@@ -208,20 +211,40 @@ class TASK:
|
|
| 208 |
self.distractors = [d[0] for i, d in enumerate(distractors_sentence) if i < 15]
|
| 209 |
self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
def sample_distractors(self, num_distractors):
|
| 212 |
if not self.bad_target_word:
|
| 213 |
num_distractors = min(self.distractors_number, num_distractors) if num_distractors >= 4 else num_distractors
|
| 214 |
self.distractors = sample(self.distractors[:min(self.distractors_number, 10)], num_distractors)
|
| 215 |
|
| 216 |
def compile_task(self, max_num_distractors):
|
| 217 |
-
len_distractors = len(self.
|
| 218 |
len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
|
| 219 |
else max_num_distractors
|
| 220 |
letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
|
| 221 |
try:
|
| 222 |
-
distractors = sample(self.
|
| 223 |
except ValueError:
|
| 224 |
-
distractors = self.
|
| 225 |
tmp_vars = [f'{item[0]} {item[1].replace("_", " ")}'
|
| 226 |
for item in zip(letters, sorted(distractors, key=lambda _: random()))]
|
| 227 |
self.variants.append((self.original_text, tmp_vars))
|
|
|
|
| 2 |
from random import random
|
| 3 |
from random import sample
|
| 4 |
from utilities_language_general.esp_constants import nlp
|
| 5 |
+
from utilities_language_general.morphology import inflect
|
| 6 |
from utilities_language_general.esp_constants import PHRASES
|
| 7 |
from utilities_language_general.esp_utils import check_token_bert
|
| 8 |
from utilities_language_general.esp_utils import fix_irregular_lemma
|
|
|
|
| 170 |
self.distractors = None
|
| 171 |
self.distractors_number = 0
|
| 172 |
self.bad_target_word = False
|
| 173 |
+
self.inflected_distractors = None
|
| 174 |
self.pos = task_data['pos']
|
| 175 |
+
self.tags = task_data['tags']
|
| 176 |
self.lemma = task_data['lemma']
|
| 177 |
self.gender = task_data['gender']
|
| 178 |
self.max_num_distractors = max_num_distractors
|
|
|
|
| 211 |
self.distractors = [d[0] for i, d in enumerate(distractors_sentence) if i < 15]
|
| 212 |
self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0
|
| 213 |
|
| 214 |
+
def inflect_distractors(self):
|
| 215 |
+
inflected_distractors = []
|
| 216 |
+
for distractor_lemma, distractor_similarity in self.distractors:
|
| 217 |
+
if distractor_lemma.count('_') > 1:
|
| 218 |
+
if distractor_lemma.startswith('haber_'):
|
| 219 |
+
distractor_lemma = distractor_lemma.split('_')[-2]
|
| 220 |
+
inflected = inflect(lemma=distractor_lemma, target_pos=self.pos, target_tags=self.tags)
|
| 221 |
+
else:
|
| 222 |
+
continue
|
| 223 |
+
else:
|
| 224 |
+
inflected = inflect(lemma=distractor_lemma, target_pos=self.pos, target_tags=self.tags)
|
| 225 |
+
if inflected is not None:
|
| 226 |
+
inflected_distractors.append(inflected)
|
| 227 |
+
num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
|
| 228 |
+
else self.max_num_distractors
|
| 229 |
+
if len(inflected_distractors) < num_distractors:
|
| 230 |
+
self.bad_target_word = True
|
| 231 |
+
else:
|
| 232 |
+
self.inflected_distractors = inflected_distractors
|
| 233 |
+
|
| 234 |
def sample_distractors(self, num_distractors):
|
| 235 |
if not self.bad_target_word:
|
| 236 |
num_distractors = min(self.distractors_number, num_distractors) if num_distractors >= 4 else num_distractors
|
| 237 |
self.distractors = sample(self.distractors[:min(self.distractors_number, 10)], num_distractors)
|
| 238 |
|
| 239 |
def compile_task(self, max_num_distractors):
|
| 240 |
+
len_distractors = len(self.inflected_distractors)
|
| 241 |
len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
|
| 242 |
else max_num_distractors
|
| 243 |
letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
|
| 244 |
try:
|
| 245 |
+
distractors = sample(self.inflected_distractors, len_variants) + [self.original_text, ]
|
| 246 |
except ValueError:
|
| 247 |
+
distractors = self.inflected_distractors + [self.original_text, ]
|
| 248 |
tmp_vars = [f'{item[0]} {item[1].replace("_", " ")}'
|
| 249 |
for item in zip(letters, sorted(distractors, key=lambda _: random()))]
|
| 250 |
self.variants.append((self.original_text, tmp_vars))
|
utilities_language_general/esp_constants.py
CHANGED
|
@@ -23,9 +23,7 @@ def load_bert():
|
|
| 23 |
|
| 24 |
|
| 25 |
nlp = load_spacy()
|
| 26 |
-
|
| 27 |
-
all_model_path = r'ALL_annot_all_pos_spell_g_h_new_phrases_s300_cw10_mc100_w4_negative5-075_mean_e10_shr.bin.gz'
|
| 28 |
-
lit_model_path = r'LITERATURA_annot_all_pos_spell_g_h_phrases_s300_cw10_mc50_w4_negative_5-075_mean_e20_shr.bin.gz'
|
| 29 |
|
| 30 |
# Upload minimums
|
| 31 |
a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
nlp = load_spacy()
|
| 26 |
+
w2v_model_path = r'ALL_annot_all_pos_spell_g_h_new_phrases_s300_cw10_mc100_w4_negative5-075_mean_e10_shr.bin.gz'
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Upload minimums
|
| 29 |
a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
|
utilities_language_general/esp_utils.py
CHANGED
|
@@ -190,23 +190,22 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: st
|
|
| 190 |
targets=list(distractor_minimum))]
|
| 191 |
else:
|
| 192 |
bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
|
| 193 |
-
|
| 194 |
for candidate in bert_candidates:
|
| 195 |
if isinstance(candidate, list):
|
| 196 |
bert_candidates = candidate
|
| 197 |
continue
|
| 198 |
if candidate['token_str'].isalpha():
|
| 199 |
candidate_morph = nlp(candidate['token_str'])[0]
|
| 200 |
-
|
| 201 |
-
candidate['score']))
|
| 202 |
except KeyError:
|
| 203 |
return None
|
| 204 |
-
for candidate_distractor in
|
| 205 |
if '_' in candidate_distractor[0]:
|
| 206 |
-
distractor_lemma,
|
| 207 |
else:
|
| 208 |
-
|
| 209 |
-
|
| 210 |
distractor_similarity = candidate_distractor[1]
|
| 211 |
candidate_gender = get_tags(distractor_lemma).get('Gender')
|
| 212 |
length_ratio = abs(len(lemma) - len(distractor_lemma))
|
|
@@ -222,10 +221,10 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: st
|
|
| 222 |
/ ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)):
|
| 223 |
if distractor_minimum is not None:
|
| 224 |
if distractor_lemma in distractor_minimum:
|
| 225 |
-
_distractors.append((
|
| 226 |
global_distractors.add(distractor_lemma)
|
| 227 |
else:
|
| 228 |
-
_distractors.append((
|
| 229 |
num_distractors = min(4, max_num_distractors) if max_num_distractors >= 4 else max_num_distractors
|
| 230 |
if len(_distractors) < num_distractors:
|
| 231 |
return None
|
|
|
|
| 190 |
targets=list(distractor_minimum))]
|
| 191 |
else:
|
| 192 |
bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
|
| 193 |
+
candidates = []
|
| 194 |
for candidate in bert_candidates:
|
| 195 |
if isinstance(candidate, list):
|
| 196 |
bert_candidates = candidate
|
| 197 |
continue
|
| 198 |
if candidate['token_str'].isalpha():
|
| 199 |
candidate_morph = nlp(candidate['token_str'])[0]
|
| 200 |
+
candidates.append((f"{candidate_morph.lemma_}_{candidate_morph.pos_}", candidate['score']))
|
|
|
|
| 201 |
except KeyError:
|
| 202 |
return None
|
| 203 |
+
for candidate_distractor in candidates:
|
| 204 |
if '_' in candidate_distractor[0]:
|
| 205 |
+
distractor_lemma, distractor_pos = candidate_distractor[0].split('_')
|
| 206 |
else:
|
| 207 |
+
candidate_morph = nlp(candidate_distractor[0])[0]
|
| 208 |
+
distractor_lemma, distractor_pos = candidate_morph.lemma_, candidate_morph.pos_
|
| 209 |
distractor_similarity = candidate_distractor[1]
|
| 210 |
candidate_gender = get_tags(distractor_lemma).get('Gender')
|
| 211 |
length_ratio = abs(len(lemma) - len(distractor_lemma))
|
|
|
|
| 221 |
/ ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)):
|
| 222 |
if distractor_minimum is not None:
|
| 223 |
if distractor_lemma in distractor_minimum:
|
| 224 |
+
_distractors.append((distractor_lemma, candidate_distractor[1]))
|
| 225 |
global_distractors.add(distractor_lemma)
|
| 226 |
else:
|
| 227 |
+
_distractors.append((distractor_lemma, candidate_distractor[1]))
|
| 228 |
num_distractors = min(4, max_num_distractors) if max_num_distractors >= 4 else max_num_distractors
|
| 229 |
if len(_distractors) < num_distractors:
|
| 230 |
return None
|
utilities_language_w2v/esp_main_workflow_w2v.py
CHANGED
|
@@ -11,9 +11,7 @@ from utilities_language_general.esp_constants import load_w2v
|
|
| 11 |
from utilities_language_general.esp_utils import prepare_tasks
|
| 12 |
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
| 13 |
import utilities_language_general.esp_constants as esp_constants
|
| 14 |
-
from utilities_language_general.esp_constants import
|
| 15 |
-
from utilities_language_general.esp_constants import lit_model_path
|
| 16 |
-
from utilities_language_general.esp_constants import news_model_path
|
| 17 |
from utilities_language_general.esp_utils import prepare_target_words
|
| 18 |
from utilities_language_general.esp_utils import compute_frequency_dict
|
| 19 |
from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
|
|
@@ -65,12 +63,7 @@ def main_workflow(
|
|
| 65 |
MAX_FREQUENCY = 0
|
| 66 |
|
| 67 |
# Define which model is used for distractor generation
|
| 68 |
-
|
| 69 |
-
mask_filler = load_w2v(lit_model_path)
|
| 70 |
-
elif distractor_model == 'Новости':
|
| 71 |
-
mask_filler = load_w2v(news_model_path)
|
| 72 |
-
else:
|
| 73 |
-
mask_filler = load_w2v(all_model_path)
|
| 74 |
|
| 75 |
# Get input text
|
| 76 |
if file is not None:
|
|
@@ -136,7 +129,7 @@ def main_workflow(
|
|
| 136 |
elif level == 'C2':
|
| 137 |
target_minimum = esp_constants.c2_target_set
|
| 138 |
distractor_minimum = esp_constants.c2_distractor_set
|
| 139 |
-
elif level == '
|
| 140 |
target_minimum = None
|
| 141 |
distractor_minimum = None
|
| 142 |
else:
|
|
|
|
| 11 |
from utilities_language_general.esp_utils import prepare_tasks
|
| 12 |
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
| 13 |
import utilities_language_general.esp_constants as esp_constants
|
| 14 |
+
from utilities_language_general.esp_constants import w2v_model_path
|
|
|
|
|
|
|
| 15 |
from utilities_language_general.esp_utils import prepare_target_words
|
| 16 |
from utilities_language_general.esp_utils import compute_frequency_dict
|
| 17 |
from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
|
|
|
|
| 63 |
MAX_FREQUENCY = 0
|
| 64 |
|
| 65 |
# Define which model is used for distractor generation
|
| 66 |
+
mask_filler = load_w2v(w2v_model_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# Get input text
|
| 69 |
if file is not None:
|
|
|
|
| 129 |
elif level == 'C2':
|
| 130 |
target_minimum = esp_constants.c2_target_set
|
| 131 |
distractor_minimum = esp_constants.c2_distractor_set
|
| 132 |
+
elif level == 'Без уровня':
|
| 133 |
target_minimum = None
|
| 134 |
distractor_minimum = None
|
| 135 |
else:
|