File size: 3,349 Bytes
9fa4ecb
218085c
 
 
 
 
 
 
9fa4ecb
218085c
 
 
9fa4ecb
 
218085c
9fa4ecb
218085c
 
 
 
9fa4ecb
218085c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63a687d
218085c
 
 
63a687d
218085c
9fa4ecb
 
63a687d
 
 
218085c
63a687d
 
 
 
 
 
 
 
 
218085c
9fa4ecb
 
 
63a687d
 
 
c743599
 
63a687d
 
 
 
 
 
218085c
63a687d
 
 
c743599
 
63a687d
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""Простой regex-экстрактор суммы из текста."""

from __future__ import annotations

import re
from typing import Any, Optional


# Matches a run of digits optionally followed by a comma and one or two more
# digits, e.g. "300", "12,5" or "7,05". The pattern contains no letters, so no
# case-insensitivity flag is needed.
AMOUNT_PATTERN = re.compile(r"\d+(?:,\d{1,2})?")


class ExpenseAmountExtractor:
    """Извлекает сумму как целое число или число с запятой."""

    def __init__(self, suppliers: list[str] | None = None) -> None:
        self.suppliers = suppliers or []

    @staticmethod
    def to_float(value: str) -> Optional[float]:
        try:
            return float(value.replace(",", "."))
        except ValueError:
            return None

    @staticmethod
    def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
        if not phrase:
            return None
        idx = text.lower().find(phrase.lower())
        if idx == -1:
            return None
        return idx, idx + len(phrase)

    @staticmethod
    def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
        if span2 is None:
            return False
        return span1[0] < span2[1] and span2[0] < span1[1]

    def extract(
        self,
        text: str,
        matched_date_phrase: Optional[str] = None,
        matched_supplier_phrase: Optional[str] = None,
        debug: bool = False,
    ) -> dict[str, Any]:
        date_span = self.phrase_span(text, matched_date_phrase)
        supplier_span = self.phrase_span(text, matched_supplier_phrase)
        candidates: list[dict[str, Any]] = []

        for match in AMOUNT_PATTERN.finditer(text):
            span = match.span()
            overlaps_date = self.overlaps(span, date_span)
            overlaps_supplier = self.overlaps(span, supplier_span)
            amount_text = match.group(0)

            if debug:
                candidates.append({
                    "value": amount_text,
                    "span": [span[0], span[1]],
                    "overlaps_date": overlaps_date,
                    "overlaps_supplier": overlaps_supplier,
                })

            if overlaps_date or overlaps_supplier:
                continue

            amount = self.to_float(amount_text)
            if amount is not None:
                payload = {"amount": amount, "amount_text": amount_text}
                if debug:
                    payload["amount_debug"] = {
                        "matched_date_phrase": matched_date_phrase,
                        "matched_supplier_phrase": matched_supplier_phrase,
                        "date_span": list(date_span) if date_span else None,
                        "supplier_span": list(supplier_span) if supplier_span else None,
                        "candidates": candidates,
                        "selected": amount_text,
                    }
                return payload

        payload = {"amount": None, "amount_text": None}
        if debug:
            payload["amount_debug"] = {
                "matched_date_phrase": matched_date_phrase,
                "matched_supplier_phrase": matched_supplier_phrase,
                "date_span": list(date_span) if date_span else None,
                "supplier_span": list(supplier_span) if supplier_span else None,
                "candidates": candidates,
                "selected": None,
            }
        return payload