"""Deterministic numeric-consistency checker (no LLM, instant, free). A ≤4B model generating freehand still slips on prose numbers (e.g. "gera R$375k/dia" when 5.000 rides × R$25 = R$125k). The teacher/code-audit fixes the TRAINING corpus, but the live student isn't audited at inference (that would 3× latency + burn ZeroGPU quota). So instead the Space runs this cheap pattern-checker over the generated case and **flags** likely inconsistencies for the author to verify — fitting the authoring/review model. It WARNS, never silently "fixes" prose. Conservative by design: only flags when it confidently extracts a relationship. """ from __future__ import annotations import re _UNIT = r"(?:corridas?|viagens?|unidades?|clientes?|pedidos?|atendimentos?|" \ r"rides?|trips?|units?|customers?|orders?)" _PERDAY = r"(?:por dia|/\s*dia|ao dia|di[aá]ri[ao]s?|per day|/\s*day|daily|por m[eê]s|/\s*m[eê]s|monthly|per month)" def _num(text: str) -> float | None: """Parse a pt-BR/en number, honoring mil/milhão (and thousands/decimal seps).""" t = text.lower() mult = 1.0 if "milh" in t or "million" in t or "mi " in t: mult = 1_000_000.0 elif "mil" in t or "thousand" in t: mult = 1_000.0 m = re.search(r"\d[\d.,]*", t) if not m: return None n = m.group(0) if "," in n and "." in n: # pt-BR: '.' thousands, ',' decimal n = n.replace(".", "").replace(",", ".") elif "," in n: # only comma → decimal sep n = n.replace(",", ".") else: # only dots → thousands sep (375.000) if re.search(r"\.\d{3}\b", n): n = n.replace(".", "") try: return float(n) * mult except ValueError: return None def _money(seg: str): """First R$/$ amount in a fragment (carrying mil/milhão words after it).""" m = re.search(r"(?:R\$|\$)\s*([\d.,]+(?:\s*(?:mil|milh[õo]es?|milh[ãa]o|million|thousand))?)", seg, re.I) return _num(m.group(1)) if m else None def _close(a: float, b: float, tol: float = 0.05) -> bool: if a is None or b is None: return True big = max(abs(a), abs(b), 1.0) return abs(a - b) / big <= tol def _fmt(v: float) -> str: v = round(v) return f"{v:,.0f}".replace(",", ".") def _sentences(text: str) -> list[str]: return re.split(r"(?<=[.;:])\s+|\n", text) def check(obj: dict, lang: str = "pt") -> list[str]: """Return author-facing warnings about likely numeric inconsistencies.""" if not obj: return [] c = obj.get("case") or {} data = c.get("data") or [] text = " ".join([c.get("context", "")] + [str(d) for d in data] + [str(e.get("content", "")) for e in (c.get("exhibits") or [])]) warns: list[str] = [] # --- 1) daily/monthly revenue ≈ quantity × unit price ------------------- # quantity per period (prefer "média"/"total"/largest) qtys = [_num(m.group(0)) for m in re.finditer(rf"\d[\d.,]*\s*{_UNIT}[^.]*?{_PERDAY}", text, re.I)] qtys = [q for q in qtys if q] unit_price = None mp = re.search(rf"(?:R\$|\$)\s*([\d.,]+)\s*(?:por|/|each|per)\s*{_UNIT}", text, re.I) if mp: unit_price = _num(mp.group(1)) # the revenue sentence: has "receita/gera/faturamento" + a per-period amount rev = None for s in _sentences(text): if re.search(r"receita|fatur|gera|revenue|generat|bring", s, re.I) and re.search(_PERDAY, s, re.I): rev = _money(s) if rev: break if qtys and unit_price and rev: q = max(qtys) if not _close(q * unit_price, rev): warns.append( (f"receita ({_fmt(rev)}) não bate com {_fmt(q)} × {_fmt(unit_price)} = " f"{_fmt(q * unit_price)}" if lang == "pt" else f"revenue ({_fmt(rev)}) doesn't match {_fmt(q)} × {_fmt(unit_price)} = " f"{_fmt(q * unit_price)}")) # --- 2) cost + profit ≈ price ------------------------------------------ def _find(label_re): m = re.search(rf"(?:{label_re})[^.]*?(?:R\$|\$)\s*([\d.,]+)", text, re.I) return _num(m.group(1)) if m else None custo = _find(r"custo|cost") lucro = _find(r"lucro|margem de lucro|profit") preco = _find(r"pre[çc]o|price") if custo and lucro and preco and not _close(custo + lucro, preco): warns.append( (f"custo ({_fmt(custo)}) + lucro ({_fmt(lucro)}) = {_fmt(custo + lucro)} " f"≠ preço ({_fmt(preco)})" if lang == "pt" else f"cost ({_fmt(custo)}) + profit ({_fmt(lucro)}) = {_fmt(custo + lucro)} " f"≠ price ({_fmt(preco)})")) return warns __all__ = ["check"]