Spaces:
Sleeping
Sleeping
a-v-bely
committed on
Commit
·
94004b3
1
Parent(s):
f49c9b7
distractor classification for bert
Browse files
- language_data/model3_with_wn_catboost_classifier.pickle +3 -0
- language_data/model3_with_wn_minmaxscaler.pickle +3 -0
- utilities_cookies/cookie_manager.py +2 -4
- utilities_cookies/encrypted_cookie_manager.py +1 -3
- utilities_database/user_database_utils.py +7 -11
- utilities_database/user_database_widgets.py +1 -2
- utilities_language_bert/esp_main_workflow_bert.py +23 -46
- utilities_language_bert/esp_sentence_bert.py +5 -4
- utilities_language_general/esp_constants.py +0 -1
- utilities_language_general/esp_utils.py +5 -6
- utilities_language_w2v/esp_main_workflow_w2v.py +1 -5
language_data/model3_with_wn_catboost_classifier.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d27b12b7d7c7aa81da02aba229941ffef9e51879be6673c4f389bea10cd1a2db
|
| 3 |
+
size 2425245
|
language_data/model3_with_wn_minmaxscaler.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c415fb5e8b4258876b11043f43593fde8026456202629c5280cc59a1a5c5351b
|
| 3 |
+
size 1404
|
utilities_cookies/cookie_manager.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from pathlib import Path
|
| 3 |
-
from typing import Mapping
|
| 4 |
-
from datetime import datetime
|
| 5 |
-
from datetime import timedelta
|
| 6 |
from urllib.parse import unquote
|
| 7 |
-
from
|
|
|
|
| 8 |
from streamlit.components.v1 import components
|
| 9 |
|
| 10 |
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
| 3 |
from urllib.parse import unquote
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
from typing import Mapping, MutableMapping
|
| 6 |
from streamlit.components.v1 import components
|
| 7 |
|
| 8 |
|
utilities_cookies/encrypted_cookie_manager.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
| 1 |
import os
|
| 2 |
import base64
|
| 3 |
import streamlit as st
|
| 4 |
-
from typing import Tuple
|
| 5 |
-
from typing import Optional
|
| 6 |
from cryptography import fernet
|
| 7 |
-
from typing import MutableMapping
|
| 8 |
from cryptography.fernet import Fernet
|
| 9 |
from cryptography.hazmat.primitives import hashes
|
| 10 |
from utilities_cookies.cookie_manager import CookieManager
|
|
|
|
| 1 |
import os
|
| 2 |
import base64
|
| 3 |
import streamlit as st
|
| 4 |
+
from typing import Tuple, Optional, MutableMapping
|
|
|
|
| 5 |
from cryptography import fernet
|
|
|
|
| 6 |
from cryptography.fernet import Fernet
|
| 7 |
from cryptography.hazmat.primitives import hashes
|
| 8 |
from utilities_cookies.cookie_manager import CookieManager
|
utilities_database/user_database_utils.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import json
|
| 3 |
-
import secrets
|
| 4 |
import pandas as pd
|
| 5 |
import streamlit as st
|
|
|
|
|
|
|
| 6 |
from trycourier import Courier
|
|
|
|
| 7 |
from argon2 import PasswordHasher
|
| 8 |
from argon2.exceptions import VerifyMismatchError
|
| 9 |
|
|
@@ -37,7 +37,7 @@ def check_valid_name(name_sign_up: str) -> bool:
|
|
| 37 |
name_regex_eng = r'^[A-Za-z_]\w *'
|
| 38 |
name_regex_rus = r'^[А-Яа-я_][А-Яа-я0-9_] *'
|
| 39 |
|
| 40 |
-
if
|
| 41 |
return True
|
| 42 |
return False
|
| 43 |
|
|
@@ -46,12 +46,8 @@ def check_valid_email(email_sign_up: str) -> bool:
|
|
| 46 |
"""
|
| 47 |
Checks if the user entered a valid email while creating the account.
|
| 48 |
"""
|
| 49 |
-
regex =
|
| 50 |
return True
|
| 51 |
-
|
| 52 |
-
# if re.fullmatch(regex, email_sign_up):
|
| 53 |
-
# return True
|
| 54 |
-
# return False
|
| 55 |
|
| 56 |
|
| 57 |
def check_unique_email(user_log_in_database, email_sign_up: str) -> bool:
|
|
@@ -133,7 +129,7 @@ def generate_random_passwd() -> str:
|
|
| 133 |
Generates a random password to be sent in email.
|
| 134 |
"""
|
| 135 |
password_length = 10
|
| 136 |
-
return
|
| 137 |
|
| 138 |
|
| 139 |
def send_passwd_in_email(auth_token: str, user_name_forgot_passwd: str, email_forgot_passwd: str, company_name: str,
|
|
@@ -266,5 +262,5 @@ def load_users_particular_task(user_task_database, load_mode, creator_name, save
|
|
| 266 |
.eq('save_name', save_name)\
|
| 267 |
.eq('save_type', load_mode)\
|
| 268 |
.eq('cefr_level',cefr_level).execute().data[0]['generated_result']
|
| 269 |
-
return_data =
|
| 270 |
return return_data
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import streamlit as st
|
| 3 |
+
from json import loads
|
| 4 |
+
from re import search, compile
|
| 5 |
from trycourier import Courier
|
| 6 |
+
from secrets import token_urlsafe
|
| 7 |
from argon2 import PasswordHasher
|
| 8 |
from argon2.exceptions import VerifyMismatchError
|
| 9 |
|
|
|
|
| 37 |
name_regex_eng = r'^[A-Za-z_]\w *'
|
| 38 |
name_regex_rus = r'^[А-Яа-я_][А-Яа-я0-9_] *'
|
| 39 |
|
| 40 |
+
if search(name_regex_eng, name_sign_up) or search(name_regex_rus, name_sign_up):
|
| 41 |
return True
|
| 42 |
return False
|
| 43 |
|
|
|
|
| 46 |
"""
|
| 47 |
Checks if the user entered a valid email while creating the account.
|
| 48 |
"""
|
| 49 |
+
regex = compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
|
| 50 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
def check_unique_email(user_log_in_database, email_sign_up: str) -> bool:
|
|
|
|
| 129 |
Generates a random password to be sent in email.
|
| 130 |
"""
|
| 131 |
password_length = 10
|
| 132 |
+
return token_urlsafe(password_length)
|
| 133 |
|
| 134 |
|
| 135 |
def send_passwd_in_email(auth_token: str, user_name_forgot_passwd: str, email_forgot_passwd: str, company_name: str,
|
|
|
|
| 262 |
.eq('save_name', save_name)\
|
| 263 |
.eq('save_type', load_mode)\
|
| 264 |
.eq('cefr_level',cefr_level).execute().data[0]['generated_result']
|
| 265 |
+
return_data = loads(return_data.replace("'", '"'), strict=False)
|
| 266 |
return return_data
|
utilities_database/user_database_widgets.py
CHANGED
|
@@ -3,7 +3,6 @@ from datetime import datetime
|
|
| 3 |
from supabase import create_client, Client
|
| 4 |
from utilities_option_menu.option_menu import option_menu
|
| 5 |
import utilities_database.user_database_utils as db_utils
|
| 6 |
-
from utilities_database.user_database_utils import check_usr_pass
|
| 7 |
from utilities_cookies.encrypted_cookie_manager import EncryptedCookieManager
|
| 8 |
|
| 9 |
DB_URL = st.secrets['SUPABASE_URL']
|
|
@@ -91,7 +90,7 @@ class LogIn:
|
|
| 91 |
login_submit_button = st.form_submit_button(label='Войти')
|
| 92 |
|
| 93 |
if login_submit_button:
|
| 94 |
-
authenticate_user_check = check_usr_pass(user_log_in_database=user_login_table,
|
| 95 |
user_name=user_name,
|
| 96 |
password=password)
|
| 97 |
|
|
|
|
| 3 |
from supabase import create_client, Client
|
| 4 |
from utilities_option_menu.option_menu import option_menu
|
| 5 |
import utilities_database.user_database_utils as db_utils
|
|
|
|
| 6 |
from utilities_cookies.encrypted_cookie_manager import EncryptedCookieManager
|
| 7 |
|
| 8 |
DB_URL = st.secrets['SUPABASE_URL']
|
|
|
|
| 90 |
login_submit_button = st.form_submit_button(label='Войти')
|
| 91 |
|
| 92 |
if login_submit_button:
|
| 93 |
+
authenticate_user_check = db_utils.check_usr_pass(user_log_in_database=user_login_table,
|
| 94 |
user_name=user_name,
|
| 95 |
password=password)
|
| 96 |
|
utilities_language_bert/esp_main_workflow_bert.py
CHANGED
|
@@ -1,29 +1,20 @@
|
|
| 1 |
import datetime
|
| 2 |
from io import StringIO
|
|
|
|
| 3 |
from random import sample
|
| 4 |
from collections import defaultdict
|
| 5 |
-
from streamlit import progress as st_progress
|
| 6 |
-
from streamlit.elements import WIDGETS as ST_WIDGETS
|
| 7 |
-
from utilities_language_general.esp_constants import st
|
| 8 |
-
from utilities_language_bert.esp_sentence_bert import TASK
|
| 9 |
-
from utilities_language_bert.esp_sentence_bert import SENTENCE
|
| 10 |
-
from utilities_language_general.esp_utils import prepare_tasks
|
| 11 |
-
from utilities_language_general.esp_constants import load_bert
|
| 12 |
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
| 13 |
-
|
| 14 |
-
from utilities_language_general.
|
| 15 |
-
from utilities_language_general.
|
| 16 |
-
from utilities_language_general.esp_utils import compute_frequency_dict
|
| 17 |
-
from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
|
| 18 |
-
|
| 19 |
|
| 20 |
|
| 21 |
def main_workflow(
|
| 22 |
-
file: UploadedFile
|
| 23 |
text: str,
|
| 24 |
-
logs
|
| 25 |
-
progress
|
| 26 |
-
progress_d
|
| 27 |
level: str,
|
| 28 |
tw_mode_automatic_mode: str,
|
| 29 |
target_words: str,
|
|
@@ -58,6 +49,7 @@ def main_workflow(
|
|
| 58 |
MAX_FREQUENCY = 0
|
| 59 |
|
| 60 |
logs.update(label='Загружаем языковые модели и другие данные', state='running')
|
|
|
|
| 61 |
mask_filler = load_bert()
|
| 62 |
|
| 63 |
# Get input text
|
|
@@ -67,15 +59,15 @@ def main_workflow(
|
|
| 67 |
elif text != '':
|
| 68 |
current_text = text
|
| 69 |
else:
|
| 70 |
-
|
| 71 |
current_text = ''
|
| 72 |
-
|
| 73 |
|
| 74 |
# Process target words
|
| 75 |
if tw_mode_automatic_mode == 'Самостоятельно':
|
| 76 |
if target_words == '':
|
| 77 |
-
|
| 78 |
-
|
| 79 |
# Cannot make up paradigm, so only USER_TARGET_WORDS is used
|
| 80 |
USER_TARGET_WORDS = prepare_target_words(target_words)
|
| 81 |
tw_mode_automatic_mode = False
|
|
@@ -89,7 +81,7 @@ def main_workflow(
|
|
| 89 |
.replace(' ', ' ').replace('…', '...').replace('…', '...')
|
| 90 |
.replace('—', '-').replace('\u2014', '-').replace('—', '-')
|
| 91 |
.replace('-\n', '').replace('\n', '%^&*'))
|
| 92 |
-
current_text_sentences = [sent.text.strip() for sent in
|
| 93 |
logs.update(label='Получили Ваш текст!', state='running')
|
| 94 |
progress.progress(10)
|
| 95 |
|
|
@@ -106,27 +98,8 @@ def main_workflow(
|
|
| 106 |
progress.progress(15)
|
| 107 |
|
| 108 |
# Choose necessary language minimum according to user's input
|
| 109 |
-
if level
|
| 110 |
-
target_minimum =
|
| 111 |
-
distractor_minimum = esp_constants.a1_distractor_set
|
| 112 |
-
elif level == 'A2':
|
| 113 |
-
target_minimum = esp_constants.a2_target_set
|
| 114 |
-
distractor_minimum = esp_constants.a2_distractor_set
|
| 115 |
-
elif level == 'B1':
|
| 116 |
-
target_minimum = esp_constants.b1_target_set
|
| 117 |
-
distractor_minimum = esp_constants.b1_distractor_set
|
| 118 |
-
elif level == 'B2':
|
| 119 |
-
target_minimum = esp_constants.b2_target_set
|
| 120 |
-
distractor_minimum = esp_constants.b2_distractor_set
|
| 121 |
-
elif level == 'C1':
|
| 122 |
-
target_minimum = esp_constants.c1_target_set
|
| 123 |
-
distractor_minimum = esp_constants.c1_distractor_set
|
| 124 |
-
elif level == 'C2':
|
| 125 |
-
target_minimum = esp_constants.c2_target_set
|
| 126 |
-
distractor_minimum = esp_constants.c2_distractor_set
|
| 127 |
-
elif level == 'Без уровня':
|
| 128 |
-
target_minimum = None
|
| 129 |
-
distractor_minimum = None
|
| 130 |
else:
|
| 131 |
target_minimum = None
|
| 132 |
distractor_minimum = None
|
|
@@ -204,7 +177,11 @@ def main_workflow(
|
|
| 204 |
RESULT_TASKS.append(task)
|
| 205 |
|
| 206 |
for num, task in enumerate(RESULT_TASKS):
|
| 207 |
-
task.attach_distractors_to_target_word(model=mask_filler,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
global_distractors=GLOBAL_DISTRACTORS,
|
| 209 |
distractor_minimum=distractor_minimum,
|
| 210 |
max_frequency=MAX_FREQUENCY)
|
|
@@ -240,8 +217,8 @@ def main_workflow(
|
|
| 240 |
NUMBER_TASKS = 10
|
| 241 |
else:
|
| 242 |
NUMBER_TASKS = len(RESULT_TASKS)
|
| 243 |
-
RESULT_TASKS_in_summary = filter(lambda task: task.in_summary, RESULT_TASKS)
|
| 244 |
-
RESULT_TASTS_not_in_summary = filter(lambda task: not task.in_summary, RESULT_TASKS)
|
| 245 |
if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
|
| 246 |
RESULT_TASKS = RESULT_TASKS_in_summary
|
| 247 |
else:
|
|
|
|
| 1 |
import datetime
|
| 2 |
from io import StringIO
|
| 3 |
+
from typing import Union
|
| 4 |
from random import sample
|
| 5 |
from collections import defaultdict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
| 7 |
+
from utilities_language_bert.esp_sentence_bert import TASK, SENTENCE
|
| 8 |
+
from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
|
| 9 |
+
from utilities_language_general.esp_constants import st, load_bert, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def main_workflow(
|
| 13 |
+
file: Union[UploadedFile, None],
|
| 14 |
text: str,
|
| 15 |
+
logs,
|
| 16 |
+
progress,
|
| 17 |
+
progress_d,
|
| 18 |
level: str,
|
| 19 |
tw_mode_automatic_mode: str,
|
| 20 |
target_words: str,
|
|
|
|
| 49 |
MAX_FREQUENCY = 0
|
| 50 |
|
| 51 |
logs.update(label='Загружаем языковые модели и другие данные', state='running')
|
| 52 |
+
pos_dict, scaler, classifier = load_classifiers('model3')
|
| 53 |
mask_filler = load_bert()
|
| 54 |
|
| 55 |
# Get input text
|
|
|
|
| 59 |
elif text != '':
|
| 60 |
current_text = text
|
| 61 |
else:
|
| 62 |
+
st.warning('Вы и текст не вставили, и файл не выбрали 😢')
|
| 63 |
current_text = ''
|
| 64 |
+
st.stop()
|
| 65 |
|
| 66 |
# Process target words
|
| 67 |
if tw_mode_automatic_mode == 'Самостоятельно':
|
| 68 |
if target_words == '':
|
| 69 |
+
st.warning('Вы не ввели целевые слова')
|
| 70 |
+
st.stop()
|
| 71 |
# Cannot make up paradigm, so only USER_TARGET_WORDS is used
|
| 72 |
USER_TARGET_WORDS = prepare_target_words(target_words)
|
| 73 |
tw_mode_automatic_mode = False
|
|
|
|
| 81 |
.replace(' ', ' ').replace('…', '...').replace('…', '...')
|
| 82 |
.replace('—', '-').replace('\u2014', '-').replace('—', '-')
|
| 83 |
.replace('-\n', '').replace('\n', '%^&*'))
|
| 84 |
+
current_text_sentences = [sent.text.strip() for sent in nlp(current_text).sents]
|
| 85 |
logs.update(label='Получили Ваш текст!', state='running')
|
| 86 |
progress.progress(10)
|
| 87 |
|
|
|
|
| 98 |
progress.progress(15)
|
| 99 |
|
| 100 |
# Choose necessary language minimum according to user's input
|
| 101 |
+
if level:
|
| 102 |
+
target_minimum, distractor_minimum = MINIMUM_SETS[level]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
else:
|
| 104 |
target_minimum = None
|
| 105 |
distractor_minimum = None
|
|
|
|
| 177 |
RESULT_TASKS.append(task)
|
| 178 |
|
| 179 |
for num, task in enumerate(RESULT_TASKS):
|
| 180 |
+
task.attach_distractors_to_target_word(model=mask_filler,
|
| 181 |
+
scaler=scaler,
|
| 182 |
+
classifier=classifier,
|
| 183 |
+
pos_dict=pos_dict,
|
| 184 |
+
level_name=level,
|
| 185 |
global_distractors=GLOBAL_DISTRACTORS,
|
| 186 |
distractor_minimum=distractor_minimum,
|
| 187 |
max_frequency=MAX_FREQUENCY)
|
|
|
|
| 217 |
NUMBER_TASKS = 10
|
| 218 |
else:
|
| 219 |
NUMBER_TASKS = len(RESULT_TASKS)
|
| 220 |
+
RESULT_TASKS_in_summary = list(filter(lambda task: task.in_summary, RESULT_TASKS))
|
| 221 |
+
RESULT_TASTS_not_in_summary = list(filter(lambda task: not task.in_summary, RESULT_TASKS))
|
| 222 |
if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
|
| 223 |
RESULT_TASKS = RESULT_TASKS_in_summary
|
| 224 |
else:
|
utilities_language_bert/esp_sentence_bert.py
CHANGED
|
@@ -43,6 +43,7 @@ class SENTENCE:
|
|
| 43 |
if not previous_was_phrase:
|
| 44 |
self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
|
| 45 |
previous_was_phrase = False
|
|
|
|
| 46 |
|
| 47 |
def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary:list=None):
|
| 48 |
for token in self.sentence_phrases:
|
|
@@ -188,11 +189,11 @@ class TASK:
|
|
| 188 |
def __repr__(self):
|
| 189 |
return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])
|
| 190 |
|
| 191 |
-
def attach_distractors_to_target_word(self, model,
|
| 192 |
-
level_name, max_frequency):
|
| 193 |
pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
|
| 194 |
-
distractors_sentence = get_distractors_from_model_bert(model=model,
|
| 195 |
-
gender=self.gender,
|
| 196 |
text_with_masked_task=self.masked_sentence,
|
| 197 |
global_distractors=global_distractors,
|
| 198 |
distractor_minimum=distractor_minimum,
|
|
|
|
| 43 |
if not previous_was_phrase:
|
| 44 |
self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
|
| 45 |
previous_was_phrase = False
|
| 46 |
+
self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
|
| 47 |
|
| 48 |
def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary:list=None):
|
| 49 |
for token in self.sentence_phrases:
|
|
|
|
| 189 |
def __repr__(self):
|
| 190 |
return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])
|
| 191 |
|
| 192 |
+
def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict,
|
| 193 |
+
global_distractors, distractor_minimum, level_name, max_frequency):
|
| 194 |
pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
|
| 195 |
+
distractors_sentence = get_distractors_from_model_bert(model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
|
| 196 |
+
level_name=level_name, lemma=self.lemma, pos=pos, gender=self.gender,
|
| 197 |
text_with_masked_task=self.masked_sentence,
|
| 198 |
global_distractors=global_distractors,
|
| 199 |
distractor_minimum=distractor_minimum,
|
utilities_language_general/esp_constants.py
CHANGED
|
@@ -2,7 +2,6 @@ import json
|
|
| 2 |
import spacy
|
| 3 |
import gensim
|
| 4 |
import streamlit as st
|
| 5 |
-
|
| 6 |
from pickle import load
|
| 7 |
from transformers import pipeline
|
| 8 |
from summarizer import Summarizer
|
|
|
|
| 2 |
import spacy
|
| 3 |
import gensim
|
| 4 |
import streamlit as st
|
|
|
|
| 5 |
from pickle import load
|
| 6 |
from transformers import pipeline
|
| 7 |
from summarizer import Summarizer
|
utilities_language_general/esp_utils.py
CHANGED
|
@@ -192,9 +192,8 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, ta
|
|
| 192 |
return distractors
|
| 193 |
|
| 194 |
|
| 195 |
-
def get_distractors_from_model_bert(
|
| 196 |
-
global_distractors: set, distractor_minimum: set,
|
| 197 |
-
max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
|
| 198 |
_distractors = []
|
| 199 |
try:
|
| 200 |
bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
|
|
@@ -217,9 +216,9 @@ def get_distractors_from_model_bert(doc, model, scaler, classifier, text_with_ma
|
|
| 217 |
distractor_similarity = candidate_distractor[1]
|
| 218 |
candidate_gender = get_tags(distractor_lemma).get('Gender')
|
| 219 |
length_ratio = abs(len(lemma) - len(distractor_lemma))
|
| 220 |
-
decision = make_decision(doc, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict,
|
| 221 |
-
|
| 222 |
-
|
| 223 |
if ((distractor_pos == pos
|
| 224 |
or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
|
| 225 |
and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
|
|
|
|
| 192 |
return distractors
|
| 193 |
|
| 194 |
|
| 195 |
+
def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, level_name: str, lemma: str, pos: str, gender: str, text_with_masked_task: str,
|
| 196 |
+
global_distractors: set, distractor_minimum: set, max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
|
|
|
|
| 197 |
_distractors = []
|
| 198 |
try:
|
| 199 |
bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
|
|
|
|
| 216 |
distractor_similarity = candidate_distractor[1]
|
| 217 |
candidate_gender = get_tags(distractor_lemma).get('Gender')
|
| 218 |
length_ratio = abs(len(lemma) - len(distractor_lemma))
|
| 219 |
+
decision = make_decision(doc=None, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict, level=level_name,
|
| 220 |
+
target_lemma=lemma, target_text=None, target_pos=pos, target_position=None,
|
| 221 |
+
substitute_lemma=distractor_lemma, substitute_pos=distractor_pos, bert_score=distractor_similarity)
|
| 222 |
if ((distractor_pos == pos
|
| 223 |
or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
|
| 224 |
and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
|
utilities_language_w2v/esp_main_workflow_w2v.py
CHANGED
|
@@ -3,16 +3,12 @@ from io import StringIO
|
|
| 3 |
from typing import Union
|
| 4 |
from random import sample
|
| 5 |
from collections import defaultdict
|
| 6 |
-
from streamlit import progress as st_progress
|
| 7 |
-
from streamlit.elements import WIDGETS as ST_WIDGETS
|
| 8 |
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
| 9 |
-
import utilities_language_general.esp_constants as esp_constants
|
| 10 |
from utilities_language_w2v.esp_sentence_w2v import TASK, SENTENCE
|
| 11 |
from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
|
| 12 |
from utilities_language_general.esp_constants import st, load_w2v, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
|
| 13 |
|
| 14 |
|
| 15 |
-
|
| 16 |
def main_workflow(
|
| 17 |
file: Union[UploadedFile, None],
|
| 18 |
text: str,
|
|
@@ -84,7 +80,7 @@ def main_workflow(
|
|
| 84 |
.replace(' ', ' ').replace('…', '...').replace('…', '...')
|
| 85 |
.replace('—', '-').replace('\u2014', '-').replace('—', '-')
|
| 86 |
.replace('-\n', '').replace('\n', '%^&*'))
|
| 87 |
-
current_text_sentences = [sent.text.strip() for sent in
|
| 88 |
logs.update(label='Получили Ваш текст!', state='running')
|
| 89 |
progress.progress(10)
|
| 90 |
|
|
|
|
| 3 |
from typing import Union
|
| 4 |
from random import sample
|
| 5 |
from collections import defaultdict
|
|
|
|
|
|
|
| 6 |
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
|
|
|
| 7 |
from utilities_language_w2v.esp_sentence_w2v import TASK, SENTENCE
|
| 8 |
from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
|
| 9 |
from utilities_language_general.esp_constants import st, load_w2v, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
|
| 10 |
|
| 11 |
|
|
|
|
| 12 |
def main_workflow(
|
| 13 |
file: Union[UploadedFile, None],
|
| 14 |
text: str,
|
|
|
|
| 80 |
.replace(' ', ' ').replace('…', '...').replace('…', '...')
|
| 81 |
.replace('—', '-').replace('\u2014', '-').replace('—', '-')
|
| 82 |
.replace('-\n', '').replace('\n', '%^&*'))
|
| 83 |
+
current_text_sentences = [sent.text.strip() for sent in nlp(current_text).sents]
|
| 84 |
logs.update(label='Получили Ваш текст!', state='running')
|
| 85 |
progress.progress(10)
|
| 86 |
|