antimoda1 committed on
Commit ·
3d115be
1
Parent(s): c1c2970
complete refactor
Browse files
calculate_params.py
CHANGED
|
@@ -96,15 +96,6 @@ test_cases = (
|
|
| 96 |
)
|
| 97 |
|
| 98 |
|
| 99 |
-
def get_ranks(scores, good_paragraphs):
|
| 100 |
-
scores = np.array(scores)
|
| 101 |
-
ranks = np.argsort(scores, axis=1) # важно: сортировка по возрастанию от нерелевантных к релевантным
|
| 102 |
-
mask = np.array([np.isin(rank_for_case, good_paragraphs_for_case)
|
| 103 |
-
for rank_for_case, good_paragraphs_for_case in zip(ranks, good_paragraphs, strict=True)])
|
| 104 |
-
relevant_ranks = [ranks_case[mask_case] for mask_case, ranks_case in zip(mask, ranks, strict=True)]
|
| 105 |
-
breakpoint()
|
| 106 |
-
return relevant_ranks
|
| 107 |
-
|
| 108 |
def test_cross_encoder_vs_bm25():
|
| 109 |
"""Тестирует кросс-энкодер vs BM25 на всех документах."""
|
| 110 |
# Создаем объект Retrieval (загружает корпус автоматически)
|
|
|
|
| 96 |
)
|
| 97 |
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
def test_cross_encoder_vs_bm25():
|
| 100 |
"""Тестирует кросс-энкодер vs BM25 на всех документах."""
|
| 101 |
# Создаем объект Retrieval (загружает корпус автоматически)
|
_1_get_documents.py → get_documents.py
RENAMED
|
@@ -1,20 +1,12 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
def get_text(inst):
|
| 4 |
-
if isinstance(inst, str):
|
| 5 |
-
return inst
|
| 6 |
-
if isinstance(inst, list):
|
| 7 |
-
return ' '.join([get_text(inst_) for inst_ in inst])
|
| 8 |
-
if isinstance(inst, dict):
|
| 9 |
-
return get_text(inst['text'])
|
| 10 |
-
|
| 11 |
-
|
| 12 |
def process_file(file_path):
|
| 13 |
with open(file_path, 'r', encoding='utf-8-sig') as f:
|
| 14 |
text = f.read()
|
| 15 |
assert text
|
| 16 |
return text, str(file_path).split('.')[0]
|
| 17 |
|
|
|
|
| 18 |
def process_folder_recursive(folder_path):
|
| 19 |
all_messages = []
|
| 20 |
for file in os.listdir(folder_path):
|
|
@@ -24,8 +16,9 @@ def process_folder_recursive(folder_path):
|
|
| 24 |
else:
|
| 25 |
all_messages += process_folder_recursive(file_path)
|
| 26 |
return all_messages
|
| 27 |
-
|
|
|
|
| 28 |
def load_and_process_data() -> list[dict]:
|
| 29 |
"""Загрузка и предобработка данных из JSON файлов"""
|
| 30 |
all_messages = process_folder_recursive('texts')
|
| 31 |
-
return [x[0] for x in all_messages], [x[1][:-3] for x in all_messages] # возвращаем расширения и тексты документов
|
|
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
def process_file(file_path):
|
| 4 |
with open(file_path, 'r', encoding='utf-8-sig') as f:
|
| 5 |
text = f.read()
|
| 6 |
assert text
|
| 7 |
return text, str(file_path).split('.')[0]
|
| 8 |
|
| 9 |
+
|
| 10 |
def process_folder_recursive(folder_path):
|
| 11 |
all_messages = []
|
| 12 |
for file in os.listdir(folder_path):
|
|
|
|
| 16 |
else:
|
| 17 |
all_messages += process_folder_recursive(file_path)
|
| 18 |
return all_messages
|
| 19 |
+
|
| 20 |
+
|
| 21 |
def load_and_process_data() -> list[dict]:
|
| 22 |
"""Загрузка и предобработка данных из JSON файлов"""
|
| 23 |
all_messages = process_folder_recursive('texts')
|
| 24 |
+
return [x[0] for x in all_messages], [x[1][:-3] for x in all_messages] # возвращаем расширения и тексты документов
|
_2_splitting.py → parse_documents.py
RENAMED
|
File without changes
|
tests/test_lemmatization.py
CHANGED
|
@@ -69,14 +69,12 @@ def lemmas_in_sentence(query: str, sentence: str) -> bool:
|
|
| 69 |
def run_test_suite(test_set_name: str, test_set: list[TestSearch]) -> tuple[int, int]:
|
| 70 |
"""Запускает набор тестов и возвращает (пройдено, провалено)"""
|
| 71 |
|
| 72 |
-
print("\n" + "-"*70)
|
| 73 |
print(f"НАБОР ТЕСТОВ: {test_set_name}")
|
| 74 |
-
print("-"*70)
|
| 75 |
|
| 76 |
passed = 0
|
| 77 |
failed = 0
|
| 78 |
|
| 79 |
-
for
|
| 80 |
word = test.word
|
| 81 |
sentence = test.sentence
|
| 82 |
|
|
@@ -108,30 +106,22 @@ def run_test_suite(test_set_name: str, test_set: list[TestSearch]) -> tuple[int,
|
|
| 108 |
|
| 109 |
def test_lemmatization():
|
| 110 |
"""Запускает все наборы тестов"""
|
| 111 |
-
|
| 112 |
-
print("\n" + "="*70)
|
| 113 |
-
print("ТЕСТ ЛЕММАТИЗАЦИИ для русского языка")
|
| 114 |
-
print("="*70)
|
| 115 |
-
|
| 116 |
total_passed = 0
|
| 117 |
total_failed = 0
|
| 118 |
|
| 119 |
# Запускаем все наборы тестов
|
| 120 |
-
for test_name, test_set in
|
| 121 |
("ТОПОНИМЫ (падежи и словоформы)", TESTS_TOPONIMS),
|
| 122 |
("ЛЕММАТИЗАЦИЯ (беглые гласные)", TESTS_LEMMATIZATION),
|
| 123 |
("ОБРАБОТКА Е/Ё", TESTS_LETTER_E),
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
total_failed += failed
|
| 129 |
|
| 130 |
# Финальный результат
|
| 131 |
-
print("
|
| 132 |
-
print(f"ИТОГО: {total_passed} пройдено, {total_failed} провалено из {total_passed + total_failed}")
|
| 133 |
-
print("="*70 + "\n")
|
| 134 |
-
|
| 135 |
return total_failed == 0
|
| 136 |
|
| 137 |
|
|
|
|
| 69 |
def run_test_suite(test_set_name: str, test_set: list[TestSearch]) -> tuple[int, int]:
|
| 70 |
"""Запускает набор тестов и возвращает (пройдено, провалено)"""
|
| 71 |
|
|
|
|
| 72 |
print(f"НАБОР ТЕСТОВ: {test_set_name}")
|
|
|
|
| 73 |
|
| 74 |
passed = 0
|
| 75 |
failed = 0
|
| 76 |
|
| 77 |
+
for test in test_set:
|
| 78 |
word = test.word
|
| 79 |
sentence = test.sentence
|
| 80 |
|
|
|
|
| 106 |
|
| 107 |
def test_lemmatization():
|
| 108 |
"""Запускает все наборы тестов"""
|
| 109 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
total_passed = 0
|
| 111 |
total_failed = 0
|
| 112 |
|
| 113 |
# Запускаем все наборы тестов
|
| 114 |
+
for test_name, test_set in (
|
| 115 |
("ТОПОНИМЫ (падежи и словоформы)", TESTS_TOPONIMS),
|
| 116 |
("ЛЕММАТИЗАЦИЯ (беглые гласные)", TESTS_LEMMATIZATION),
|
| 117 |
("ОБРАБОТКА Е/Ё", TESTS_LETTER_E),
|
| 118 |
+
):
|
| 119 |
+
passed, failed = run_test_suite(test_name, test_set)
|
| 120 |
+
total_passed += passed
|
| 121 |
+
total_failed += failed
|
|
|
|
| 122 |
|
| 123 |
# Финальный результат
|
| 124 |
+
print(f"ИТОГО: {total_passed} пройдено, {total_failed} провалено из {total_passed + total_failed}")
|
|
|
|
|
|
|
|
|
|
| 125 |
return total_failed == 0
|
| 126 |
|
| 127 |
|