Spaces:
Running
Running
Vlad Juracovschi committed on
Commit ·
44706f3
1
Parent(s): 8b892b9
OptimizedUserSearchWithMorph
Browse files- app.py +1 -1
- extractors/user_extractor.py +41 -20
app.py
CHANGED
|
@@ -472,7 +472,7 @@ def process_audio():
|
|
| 472 |
|
| 473 |
audio = request.files.get("audio")
|
| 474 |
mode = (request.form.get("mode") or "expense").strip()
|
| 475 |
-
debug = (
|
| 476 |
context = parse_context(request.form.get("context"))
|
| 477 |
|
| 478 |
if audio is None:
|
|
|
|
| 472 |
|
| 473 |
audio = request.files.get("audio")
|
| 474 |
mode = (request.form.get("mode") or "expense").strip()
|
| 475 |
+
debug = (request.args.get("debug") == "1" or "")
|
| 476 |
context = parse_context(request.form.get("context"))
|
| 477 |
|
| 478 |
if audio is None:
|
extractors/user_extractor.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import re
|
|
|
|
| 6 |
from typing import Any
|
| 7 |
|
| 8 |
from extractors.supplier_extractor import ExpenseSupplierExtractor, normalize_text
|
|
@@ -22,38 +23,56 @@ class ExpenseUserExtractor:
|
|
| 22 |
self.threshold = threshold
|
| 23 |
self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
|
| 24 |
self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
|
| 25 |
-
self.
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def _build_user_candidate_text(
|
| 38 |
self,
|
| 39 |
normalized_text: str,
|
| 40 |
supplier_phrase: str | None,
|
| 41 |
date_phrase: str | None,
|
| 42 |
-
) -> tuple[str, list[str]]:
|
| 43 |
excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
|
| 44 |
if supplier_phrase:
|
| 45 |
excluded_tokens.update(normalize_text(supplier_phrase).split())
|
| 46 |
if date_phrase:
|
| 47 |
excluded_tokens.update(normalize_text(date_phrase).split())
|
|
|
|
| 48 |
|
| 49 |
-
|
|
|
|
| 50 |
for token in normalized_text.split():
|
| 51 |
if token in excluded_tokens or token.isdigit() or len(token) <= 1:
|
| 52 |
continue
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def extract(
|
| 59 |
self,
|
|
@@ -77,7 +96,7 @@ class ExpenseUserExtractor:
|
|
| 77 |
}
|
| 78 |
return payload
|
| 79 |
|
| 80 |
-
candidate_text,
|
| 81 |
normalized_text=normalized_text,
|
| 82 |
supplier_phrase=supplier_phrase,
|
| 83 |
date_phrase=date_phrase,
|
|
@@ -96,7 +115,8 @@ class ExpenseUserExtractor:
|
|
| 96 |
"excluded_supplier_phrase": supplier_phrase,
|
| 97 |
"normalized_text": normalized_text,
|
| 98 |
"candidate_text": candidate_text,
|
| 99 |
-
"
|
|
|
|
| 100 |
"matcher_debug": None,
|
| 101 |
}
|
| 102 |
return payload
|
|
@@ -123,7 +143,8 @@ class ExpenseUserExtractor:
|
|
| 123 |
"excluded_supplier_phrase": supplier_phrase,
|
| 124 |
"normalized_text": normalized_text,
|
| 125 |
"candidate_text": candidate_text,
|
| 126 |
-
"
|
|
|
|
| 127 |
"matcher_debug": match.get("supplier_debug"),
|
| 128 |
}
|
| 129 |
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import re
|
| 6 |
+
import importlib
|
| 7 |
from typing import Any
|
| 8 |
|
| 9 |
from extractors.supplier_extractor import ExpenseSupplierExtractor, normalize_text
|
|
|
|
| 23 |
self.threshold = threshold
|
| 24 |
self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
|
| 25 |
self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
|
| 26 |
+
self.morph: Any = None
|
| 27 |
+
try:
|
| 28 |
+
pymorphy3_module = importlib.import_module("pymorphy3")
|
| 29 |
+
self.morph = pymorphy3_module.MorphAnalyzer()
|
| 30 |
+
except Exception:
|
| 31 |
+
self.morph = None
|
| 32 |
+
|
| 33 |
+
def _looks_like_person_token(self, token: str) -> tuple[bool, float, bool]:
|
| 34 |
+
lexical = self.user_matcher.lexical_support(token)
|
| 35 |
+
has_person_grammeme = False
|
| 36 |
+
if self.morph is not None:
|
| 37 |
+
parses = self.morph.parse(token)
|
| 38 |
+
has_person_grammeme = any(
|
| 39 |
+
{"Name", "Surn", "Patr"}.intersection(set(parse.tag.grammemes))
|
| 40 |
+
for parse in parses
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
# Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
|
| 44 |
+
accepted = lexical >= 0.40 or (has_person_grammeme and lexical >= 0.30)
|
| 45 |
+
return accepted, lexical, has_person_grammeme
|
| 46 |
|
| 47 |
def _build_user_candidate_text(
|
| 48 |
self,
|
| 49 |
normalized_text: str,
|
| 50 |
supplier_phrase: str | None,
|
| 51 |
date_phrase: str | None,
|
| 52 |
+
) -> tuple[str, list[str], list[dict[str, Any]]]:
|
| 53 |
excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
|
| 54 |
if supplier_phrase:
|
| 55 |
excluded_tokens.update(normalize_text(supplier_phrase).split())
|
| 56 |
if date_phrase:
|
| 57 |
excluded_tokens.update(normalize_text(date_phrase).split())
|
| 58 |
+
excluded_tokens.update(self.supplier_terms)
|
| 59 |
|
| 60 |
+
candidate_tokens: list[str] = []
|
| 61 |
+
candidate_debug: list[dict[str, Any]] = []
|
| 62 |
for token in normalized_text.split():
|
| 63 |
if token in excluded_tokens or token.isdigit() or len(token) <= 1:
|
| 64 |
continue
|
| 65 |
+
accepted, lexical, has_person_grammeme = self._looks_like_person_token(token)
|
| 66 |
+
candidate_debug.append({
|
| 67 |
+
"token": token,
|
| 68 |
+
"lexical_support": round(lexical, 4),
|
| 69 |
+
"has_person_grammeme": has_person_grammeme,
|
| 70 |
+
"accepted": accepted,
|
| 71 |
+
})
|
| 72 |
+
if accepted:
|
| 73 |
+
candidate_tokens.append(token)
|
| 74 |
+
|
| 75 |
+
return " ".join(candidate_tokens), candidate_tokens, candidate_debug
|
| 76 |
|
| 77 |
def extract(
|
| 78 |
self,
|
|
|
|
| 96 |
}
|
| 97 |
return payload
|
| 98 |
|
| 99 |
+
candidate_text, candidate_tokens, candidate_debug = self._build_user_candidate_text(
|
| 100 |
normalized_text=normalized_text,
|
| 101 |
supplier_phrase=supplier_phrase,
|
| 102 |
date_phrase=date_phrase,
|
|
|
|
| 115 |
"excluded_supplier_phrase": supplier_phrase,
|
| 116 |
"normalized_text": normalized_text,
|
| 117 |
"candidate_text": candidate_text,
|
| 118 |
+
"candidate_tokens": candidate_tokens,
|
| 119 |
+
"candidate_token_debug": candidate_debug,
|
| 120 |
"matcher_debug": None,
|
| 121 |
}
|
| 122 |
return payload
|
|
|
|
| 143 |
"excluded_supplier_phrase": supplier_phrase,
|
| 144 |
"normalized_text": normalized_text,
|
| 145 |
"candidate_text": candidate_text,
|
| 146 |
+
"candidate_tokens": candidate_tokens,
|
| 147 |
+
"candidate_token_debug": candidate_debug,
|
| 148 |
"matcher_debug": match.get("supplier_debug"),
|
| 149 |
}
|
| 150 |
|