File size: 4,769 Bytes
f725a35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""Deterministic numeric-consistency checker (no LLM, instant, free).

A ≤4B model generating freehand still slips on prose numbers (e.g. "gera R$375k/dia"
when 5.000 rides × R$25 = R$125k). The teacher/code-audit fixes the TRAINING corpus,
but the live student isn't audited at inference (that would 3× latency + burn ZeroGPU
quota). So instead the Space runs this cheap pattern-checker over the generated case and
**flags** likely inconsistencies for the author to verify — fitting the authoring/review
model. It WARNS, never silently "fixes" prose.

Conservative by design: only flags when it confidently extracts a relationship.
"""

from __future__ import annotations

import re

# Regex fragment: countable business units (pt-BR and English, singular or
# plural) that may follow a quantity, e.g. "5.000 corridas" or "300 orders".
_UNIT = r"(?:corridas?|viagens?|unidades?|clientes?|pedidos?|atendimentos?|" \
        r"rides?|trips?|units?|customers?|orders?)"
# Regex fragment: period qualifiers. Despite the name, it matches monthly
# phrasings ("por mês", "/mês", "monthly", "per month") as well as daily ones
# ("por dia", "/dia", "ao dia", "diário", "per day", "daily").
_PERDAY = r"(?:por dia|/\s*dia|ao dia|di[aá]ri[ao]s?|per day|/\s*day|daily|por m[eê]s|/\s*m[eê]s|monthly|per month)"


def _num(text: str) -> float | None:
    """Parse the first pt-BR/en number in *text*, honoring a magnitude suffix.

    Only a magnitude word that directly follows the digits scales the value
    ("3 mil", "2 milhões", "1.5 million", "375k", "2 mi"); the same letters
    appearing elsewhere in the fragment ("family", "miles") are ignored.

    Separator rules:
      * both ',' and '.' present: whichever comes last is the decimal mark
        ("1.250,50" and "1,250.50" are both 1250.5);
      * a single kind of separator is a thousands separator only when it
        forms proper groups of three ("5.000", "5,000", "1,000,000");
        otherwise ',' is a decimal mark ("12,5") and '.' is left to float().

    Args:
        text: Fragment that contains a number, possibly with trailing words.

    Returns:
        The parsed value as a float, or None when no parsable number exists.
    """
    m = re.search(
        r"(\d[\d.,]*)"
        r"(?:\s*(milh[õo]es|milh[ãa]o|million|thousand|mil\b|mi\b|k\b))?",
        text.lower(),
    )
    if not m:
        return None

    # Sentence punctuation glued to the number ("25." / "12,5,") is not part of it.
    digits = m.group(1).rstrip(".,")

    word = m.group(2)
    if not word:
        mult = 1.0
    elif word in ("mil", "thousand", "k"):
        mult = 1_000.0
    else:  # milhão / milhões / million / mi
        mult = 1_000_000.0

    last_comma = digits.rfind(",")
    last_dot = digits.rfind(".")
    if last_comma >= 0 and last_dot >= 0:
        if last_comma > last_dot:      # pt-BR: '.' thousands, ',' decimal
            digits = digits.replace(".", "").replace(",", ".")
        else:                          # en: ',' thousands, '.' decimal
            digits = digits.replace(",", "")
    elif last_comma >= 0:
        if re.fullmatch(r"[1-9]\d{0,2}(?:,\d{3})+", digits):
            digits = digits.replace(",", "")    # en thousands: 5,000
        else:
            digits = digits.replace(",", ".")   # pt-BR decimal: 12,5
    elif last_dot >= 0 and re.fullmatch(r"[1-9]\d{0,2}(?:\.\d{3})+", digits):
        digits = digits.replace(".", "")        # pt-BR thousands: 375.000

    try:
        return float(digits) * mult
    except ValueError:
        return None


def _money(seg: str) -> float | None:
    """Return the first R$/$ amount found in *seg*, or None if there is none.

    The amount must start with a digit and may carry a magnitude word right
    after it (mil, milhão/milhões, million, thousand, mi, k); the number and
    that word are handed to `_num` for parsing.

    Longer magnitude words are listed before ``mil`` and the group ends on a
    word boundary, so "R$ 2 milhões" / "$2 million" are not cut down to
    "2 mil" (which would be read as two thousand).
    """
    m = re.search(
        r"(?:R\$|\$)\s*"
        r"(\d[\d.,]*(?:\s*(?:milh[õo]es|milh[ãa]o|million|thousand|mil|mi|k)\b)?)",
        seg,
        re.I,
    )
    return _num(m.group(1)) if m else None


def _close(a: float, b: float, tol: float = 0.05) -> bool:
    """Tell whether two amounts agree within a relative tolerance.

    The gap between the values is measured against the larger of their
    magnitudes, floored at 1.0. A missing (None) operand counts as agreement,
    so nothing gets flagged when a value could not be extracted.
    """
    if a is not None and b is not None:
        scale = max(abs(a), abs(b), 1.0)
        gap = abs(a - b)
        return gap / scale <= tol
    return True


def _fmt(v: float) -> str:
    """Format *v* for display with '.' as thousands and ',' as decimal mark.

    Whole amounts (after rounding to cents) print without decimals
    ("125.000"); fractional ones keep two decimals ("2,50") so that the
    arithmetic quoted in a warning uses the operands that were actually
    multiplied rather than integer-rounded ones.
    """
    cents = round(v, 2)
    if float(cents).is_integer():
        text = f"{int(cents):,}"
    else:
        text = f"{cents:,.2f}"
    # Swap the separators: "1,234.50" -> "1.234,50".
    return text.translate(str.maketrans(",.", ".,"))


def _sentences(text: str) -> list[str]:
    """Break *text* into rough sentence-like pieces.

    A cut is made at any whitespace run that follows '.', ';' or ':' (the
    punctuation stays with the preceding piece), and at every newline. A dot
    inside a number such as "5.000" is not followed by whitespace, so it does
    not cause a cut.
    """
    boundary = re.compile(r"(?<=[.;:])\s+|\n")
    pieces = boundary.split(text)
    return pieces


def check(obj: dict, lang: str = "pt") -> list[str]:
    """Return author-facing warnings about likely numeric inconsistencies.

    The case text (context, data items and exhibit contents joined into one
    string) is scanned for two relationships:

    1. per-period revenue ≈ quantity per period × unit price;
    2. cost + profit ≈ price.

    A warning is produced only when every operand of a relationship was
    extracted and the two sides differ by more than the `_close` tolerance.
    For (1) the quantity and the revenue must also refer to the same period
    (both daily or both monthly); otherwise the comparison is skipped.

    Args:
        obj: Generated payload. Text is read from ``obj["case"]`` under the
            keys ``context``, ``data`` and ``exhibits`` (each exhibit's
            ``content``). Missing or malformed parts are tolerated.
        lang: ``"pt"`` for Portuguese messages; any other value yields English.

    Returns:
        A list of warning strings; empty when nothing was flagged or when the
        payload has no usable ``case`` mapping.
    """
    if not obj or not isinstance(obj, dict):
        return []
    case = obj.get("case")
    if not isinstance(case, dict):
        return []

    def _as_list(value):
        # Tolerate a missing field or a single bare item instead of a list.
        if not value:
            return []
        return list(value) if isinstance(value, (list, tuple)) else [value]

    context = case.get("context")
    parts = ["" if context is None else str(context)]
    parts.extend(str(d) for d in _as_list(case.get("data")))
    parts.extend(
        str(e.get("content", "") if isinstance(e, dict) else e)
        for e in _as_list(case.get("exhibits"))
    )
    text = " ".join(parts)
    warns: list[str] = []

    def _is_monthly(period: str) -> bool:
        # _PERDAY matches both daily and monthly wording; tell them apart.
        return re.search(r"m[eê]s|month", period, re.I) is not None

    # --- 1) daily/monthly revenue ≈ quantity × unit price -------------------
    # Only the number (plus a magnitude word glued to it) goes to _num, so
    # unrelated words later in the fragment cannot act as multipliers.
    magnitude = r"(?:\s*(?:milh[õo]es|milh[ãa]o|million|thousand|mil)\b)?"
    qty_re = rf"(\d[\d.,]*{magnitude})\s*{_UNIT}[^.]*?({_PERDAY})"
    qtys: list[tuple[float, bool]] = []
    for m in re.finditer(qty_re, text, re.I):
        value = _num(m.group(1))
        if value:
            qtys.append((value, _is_monthly(m.group(2))))

    unit_price = None
    mp = re.search(rf"(?:R\$|\$)\s*([\d.,]+)\s*(?:por|/|each|per)\s*{_UNIT}", text, re.I)
    if mp:
        unit_price = _num(mp.group(1))

    # The revenue sentence: has "receita/gera/faturamento" + a per-period amount.
    rev = None
    rev_monthly = False
    for s in _sentences(text):
        if not re.search(r"receita|fatur|gera|revenue|generat|bring", s, re.I):
            continue
        periods = list(re.finditer(_PERDAY, s, re.I))
        if not periods:
            continue
        rev = _money(s)
        if rev:
            # Use the period qualifier nearest to the amount: the first one at
            # or after the currency symbol, else the last one before it.
            symbol = re.search(r"R\$|\$", s)
            start = symbol.start() if symbol else 0
            nearest = next((p for p in periods if p.start() >= start), periods[-1])
            rev_monthly = _is_monthly(nearest.group(0))
            break

    if qtys and unit_price and rev:
        # Comparing a daily quantity with monthly revenue (or vice versa)
        # would always "mismatch", so only same-period quantities count.
        comparable = [value for value, monthly in qtys if monthly == rev_monthly]
        if comparable:
            q = max(comparable)
            expected = q * unit_price
            if not _close(expected, rev):
                if lang == "pt":
                    msg = (f"receita ({_fmt(rev)}) não bate com {_fmt(q)} × {_fmt(unit_price)} = "
                           f"{_fmt(expected)}")
                else:
                    msg = (f"revenue ({_fmt(rev)}) doesn't match {_fmt(q)} × {_fmt(unit_price)} = "
                           f"{_fmt(expected)}")
                warns.append(msg)

    # --- 2) cost + profit ≈ price ------------------------------------------
    def _find(label_re: str) -> float | None:
        # Word boundaries keep "customers" from matching "custo", "profitable"
        # from matching "profit", etc.; a plural "s" is still accepted.
        m = re.search(rf"\b(?:{label_re})s?\b[^.]*?(?:R\$|\$)\s*([\d.,]+)", text, re.I)
        return _num(m.group(1)) if m else None

    custo = _find(r"custo|cost")
    lucro = _find(r"lucro|margem de lucro|profit")
    preco = _find(r"pre[çc]o|price")
    if custo and lucro and preco and not _close(custo + lucro, preco):
        total = custo + lucro
        if lang == "pt":
            msg = (f"custo ({_fmt(custo)}) + lucro ({_fmt(lucro)}) = {_fmt(total)} "
                   f"≠ preço ({_fmt(preco)})")
        else:
            msg = (f"cost ({_fmt(custo)}) + profit ({_fmt(lucro)}) = {_fmt(total)} "
                   f"≠ price ({_fmt(preco)})")
        warns.append(msg)

    return warns


# `check` is the only name exported by `from <module> import *`; the
# underscore-prefixed helpers above are not listed.
__all__ = ["check"]