ConvertAudioToJSON / extractors /amount_extractor.py
VladGeekPro
SupplierDebugging
c743599
raw
history blame
3.35 kB
"""Простой regex-экстрактор суммы из текста."""
from __future__ import annotations
import re
from typing import Any, Optional
# Amount token: a run of digits, optionally followed by a comma and one or two
# fractional digits (e.g. "150" or "99,5"). The pattern contains no letters,
# so a case-insensitivity flag would have no effect and is omitted.
AMOUNT_PATTERN = re.compile(r"\d+(?:,\d{1,2})?")


class ExpenseAmountExtractor:
    """Extract an amount written as an integer or a comma-decimal number.

    The first numeric token in the text that does not overlap the already
    recognised date phrase or supplier phrase is taken as the amount.
    """

    def __init__(self, suppliers: list[str] | None = None) -> None:
        """Store the supplier list; a falsy value becomes an empty list."""
        # NOTE(review): no method of this class reads ``self.suppliers``;
        # presumably kept for a uniform extractor constructor — confirm.
        self.suppliers = suppliers or []

    @staticmethod
    def to_float(value: str) -> Optional[float]:
        """Parse a comma-decimal string to ``float``; return ``None`` on failure."""
        try:
            return float(value.replace(",", "."))
        except ValueError:
            return None

    @staticmethod
    def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
        """Return the ``(start, end)`` span of the first case-insensitive
        occurrence of *phrase* in *text*, or ``None`` if absent or empty.

        The search runs on the original *text*, so the offsets are directly
        comparable with regex match spans taken from the same string.
        Searching a lower-cased copy instead would be unsafe: ``str.lower()``
        can change the string length (e.g. "\u0130" becomes two code points),
        which shifts every later offset.
        """
        if not phrase:
            return None
        found = re.search(re.escape(phrase), text, re.IGNORECASE)
        if found is None:
            return None
        return found.span()

    @staticmethod
    def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
        """Return ``True`` when the half-open spans intersect.

        A missing second span (``None``) never overlaps anything.
        """
        if span2 is None:
            return False
        return span1[0] < span2[1] and span2[0] < span1[1]

    def extract(
        self,
        text: str,
        matched_date_phrase: Optional[str] = None,
        matched_supplier_phrase: Optional[str] = None,
        debug: bool = False,
    ) -> dict[str, Any]:
        """Find the first amount in *text* outside the date/supplier phrases.

        Args:
            text: Text to scan for numeric tokens.
            matched_date_phrase: Phrase already recognised as the date;
                numbers overlapping it are skipped.
            matched_supplier_phrase: Phrase already recognised as the
                supplier; numbers overlapping it are skipped.
            debug: When true, add an ``"amount_debug"`` entry describing the
                phrase spans and every candidate examined up to and including
                the selected one.

        Returns:
            A dict with ``"amount"`` (float or ``None``) and ``"amount_text"``
            (the matched substring or ``None``), plus ``"amount_debug"`` when
            *debug* is set.
        """
        date_span = self.phrase_span(text, matched_date_phrase)
        supplier_span = self.phrase_span(text, matched_supplier_phrase)

        candidates: list[dict[str, Any]] = []
        selected_amount: Optional[float] = None
        selected_text: Optional[str] = None

        for match in AMOUNT_PATTERN.finditer(text):
            span = match.span()
            overlaps_date = self.overlaps(span, date_span)
            overlaps_supplier = self.overlaps(span, supplier_span)
            amount_text = match.group(0)

            if debug:
                candidates.append({
                    "value": amount_text,
                    "span": [span[0], span[1]],
                    "overlaps_date": overlaps_date,
                    "overlaps_supplier": overlaps_supplier,
                })

            # Digits inside the date or supplier phrase are not amounts.
            if overlaps_date or overlaps_supplier:
                continue

            amount = self.to_float(amount_text)
            if amount is not None:
                # The first acceptable token wins; later ones are not examined.
                selected_amount = amount
                selected_text = amount_text
                break

        payload: dict[str, Any] = {
            "amount": selected_amount,
            "amount_text": selected_text,
        }
        if debug:
            payload["amount_debug"] = {
                "matched_date_phrase": matched_date_phrase,
                "matched_supplier_phrase": matched_supplier_phrase,
                "date_span": list(date_span) if date_span else None,
                "supplier_span": list(supplier_span) if supplier_span else None,
                "candidates": candidates,
                "selected": selected_text,
            }
        return payload