"""Deterministic numeric-consistency checker (no LLM, instant, free).

A ≤4B model generating freehand still slips on prose numbers (e.g. "gera R$375k/dia"
when 5.000 rides × R$25 = R$125k). The teacher/code-audit fixes the TRAINING corpus,
but the live student isn't audited at inference (that would 3× latency + burn ZeroGPU
quota). So instead the Space runs this cheap pattern-checker over the generated case and
**flags** likely inconsistencies for the author to verify — fitting the authoring/review
model. It WARNS, never silently "fixes" prose.
Conservative by design: only flags when it confidently extracts a relationship.
"""
from __future__ import annotations
import re
# Regex fragment: countable "things sold/served" nouns, pt-BR first and then
# English, each accepted in singular or plural form.
_UNIT = (
    r"(?:corridas?|viagens?|unidades?|clientes?|pedidos?|atendimentos?|"
    r"rides?|trips?|units?|customers?|orders?)"
)
# Regex fragment: period qualifiers. Despite the name it covers both the
# per-day and the per-month phrasings (pt-BR and English).
_PERDAY = (
    r"(?:por dia|/\s*dia|ao dia|di[aá]ri[ao]s?|per day|/\s*day|daily"
    r"|por m[eê]s|/\s*m[eê]s|monthly|per month)"
)
def _num(text: str) -> float | None:
    """Parse the first pt-BR/en number in *text*, honoring magnitude words.

    Only the first run of digits (with embedded '.' / ',') is considered.
    A magnitude word scales the value only when it directly follows that
    number ("2,5 mil", "3 milhões", "2 mi", "4 million", "7 thousand").
    Words elsewhere in the text ("similares", "milhas", ...) are ignored,
    so an unrelated substring can no longer inflate the result.

    Separator rules:
      * both '.' and ',' present: the last one is the decimal mark, the
        other is a thousands separator ("1.234,56" and "1,234.56").
      * only ',': groups of exactly three digits after a non-zero lead are
        thousands ("5,000"); anything else is a decimal comma ("2,5").
      * only '.': a dot followed by exactly three digits is a thousands
        separator ("375.000"); otherwise it is a decimal point ("1.5").

    Returns the parsed value, or None when no number can be extracted.
    """
    lowered = text.lower()
    found = re.search(r"\d[\d.,]*", lowered)
    if not found:
        return None

    # The magnitude word has to sit right after the digits.
    scale = 1.0
    suffix = re.match(
        r"\s*(?:(milh(?:[õo]es|[ãa]o)|millions?|mi\b)|(mil\b|thousands?\b))",
        lowered[found.end():],
    )
    if suffix:
        scale = 1_000_000.0 if suffix.group(1) else 1_000.0

    # Sentence punctuation glued to the number ("5.000," / "125.") is not
    # part of the value.
    digits = found.group(0).rstrip(".,")
    has_dot = "." in digits
    has_comma = "," in digits
    if has_dot and has_comma:
        if digits.rfind(",") > digits.rfind("."):
            # pt-BR style: '.' thousands, ',' decimal
            digits = digits.replace(".", "").replace(",", ".")
        else:
            # en style: ',' thousands, '.' decimal
            digits = digits.replace(",", "")
    elif has_comma:
        if re.fullmatch(r"[1-9]\d{0,2}(?:,\d{3})+", digits):
            # en thousands grouping. Ambiguous with a three-decimal pt-BR
            # value, resolved the same way the dot-only case is.
            digits = digits.replace(",", "")
        else:
            digits = digits.replace(",", ".")
    elif has_dot and re.search(r"\.\d{3}\b", digits):
        digits = digits.replace(".", "")

    try:
        return float(digits) * scale
    except ValueError:
        # e.g. "12.34.56": not a number we can interpret
        return None
def _money(seg: str) -> float | None:
    """Return the first R$/$ amount found in *seg*, or None if there is none.

    A magnitude word right after the digits (mil, milhão/milhões, million,
    thousand) is passed on to ``_num`` together with the number. The longer
    words are tried before "mil" and a word boundary is required, so
    "R$ 2 milhões" is not truncated to "2 mil" and "R$ 5 milhas" does not
    pick up a multiplier. A "k" suffix glued to the digits ("R$375k") is
    applied here as ×1000.
    """
    found = re.search(
        r"(?:R\$|\$)\s*"
        r"([\d.,]+(?:\s*(?:milh(?:[õo]es|[ãa]o)|millions?|mil|thousands?)\b)?)"
        r"(k\b)?",
        seg,
        re.I,
    )
    if not found:
        return None
    amount = _num(found.group(1))
    if amount is not None and found.group(2):
        amount *= 1_000.0
    return amount
def _close(a: float, b: float, tol: float = 0.05) -> bool:
    """Tell whether *a* and *b* agree within the relative tolerance *tol*.

    The gap is measured against the larger magnitude of the two values,
    floored at 1.0 so that tiny numbers are not compared on a near-zero
    scale. If either value is None there is nothing to compare, and the
    pair is reported as close so that no warning is raised.
    """
    for value in (a, b):
        if value is None:
            return True
    gap = abs(a - b)
    scale = max(abs(a), abs(b), 1.0)
    return gap / scale <= tol
def _fmt(v: float) -> str:
    """Render *v* as a whole number with '.' as the thousands separator.

    The value is rounded with the built-in ``round`` first, then grouped
    with commas and the commas are swapped for dots (125000 -> "125.000").
    """
    grouped = format(round(v), ",.0f")
    return grouped.replace(",", ".")
def _sentences(text: str) -> list[str]:
    """Split *text* into rough sentence-like fragments.

    A break happens at any whitespace run that follows '.', ';' or ':'
    (the punctuation stays attached to the preceding fragment) and at every
    newline. Empty fragments are kept.
    """
    boundary = re.compile(r"(?<=[.;:])\s+|\n")
    return boundary.split(text)
def check(obj: dict, lang: str = "pt") -> list[str]:
    """Return author-facing warnings about likely numeric inconsistencies.

    The text scanned is ``obj["case"]["context"]``, every item of
    ``obj["case"]["data"]`` and the ``content`` of every entry in
    ``obj["case"]["exhibits"]``, joined with spaces. Missing, null or
    oddly-typed pieces are tolerated rather than raising: the checker only
    ever warns, it must not break the caller.

    Two relationships are checked, each only when every operand could be
    extracted:
      1. revenue per period ≈ quantity per period × unit price. Only
         quantities stated for the same period (day vs month) as the
         revenue are used.
      2. cost + profit ≈ price.

    Args:
        obj: generated case payload; anything falsy or not a dict yields [].
        lang: "pt" for Portuguese messages, any other value for English.

    Returns:
        A list of warning strings (possibly empty).
    """
    if not obj or not isinstance(obj, dict):
        return []
    case = obj.get("case") or {}
    if not isinstance(case, dict):
        return []

    # Flatten the case into one searchable string, tolerating null or
    # unexpected shapes.
    data = case.get("data") or []
    if not isinstance(data, (list, tuple)):
        data = [data]
    exhibits = case.get("exhibits") or []
    if not isinstance(exhibits, (list, tuple)):
        exhibits = [exhibits]
    parts = [str(case.get("context") or "")]
    parts += [str(item) for item in data]
    parts += [
        str(ex.get("content") or "") if isinstance(ex, dict) else str(ex)
        for ex in exhibits
    ]
    text = " ".join(parts)

    in_pt = lang == "pt"
    warns: list[str] = []

    def _period(fragment: str) -> str:
        # _PERDAY mixes per-day and per-month phrasings; tell them apart so
        # a daily quantity is never multiplied against a monthly revenue.
        return "month" if re.search(r"m[eê]s|month", fragment, re.I) else "day"

    # --- 1) daily/monthly revenue ≈ quantity × unit price -------------------
    # Only the numeric token is parsed, so words between the unit and the
    # period phrase cannot be mistaken for a magnitude word.
    quantities: list[tuple[float, str]] = []
    for hit in re.finditer(
        rf"(\d[\d.,]*)\s*{_UNIT}[^.]*?({_PERDAY})", text, re.I
    ):
        qty = _num(hit.group(1))
        if qty:
            quantities.append((qty, _period(hit.group(2))))

    unit_price = None
    price_hit = re.search(
        rf"(?:R\$|\$)\s*([\d.,]+)\s*(?:por|/|each|per)\s*{_UNIT}", text, re.I
    )
    if price_hit:
        unit_price = _num(price_hit.group(1))

    # The revenue sentence: a revenue-ish keyword plus a per-period phrase.
    # NOTE(review): _money takes the first amount in that sentence, which may
    # not be the revenue figure when the sentence holds several amounts.
    rev = None
    rev_period = None
    for sentence in _sentences(text):
        if not re.search(
            r"receita|fatur|gera|revenue|generat|bring", sentence, re.I
        ):
            continue
        period_hit = re.search(_PERDAY, sentence, re.I)
        if not period_hit:
            continue
        rev = _money(sentence)
        if rev:
            rev_period = _period(period_hit.group(0))
            break

    comparable = [qty for qty, period in quantities if period == rev_period]
    if comparable and unit_price and rev:
        q = max(comparable)
        expected = q * unit_price
        if not _close(expected, rev):
            if in_pt:
                msg = (f"receita ({_fmt(rev)}) não bate com {_fmt(q)} × "
                       f"{_fmt(unit_price)} = {_fmt(expected)}")
            else:
                msg = (f"revenue ({_fmt(rev)}) doesn't match {_fmt(q)} × "
                       f"{_fmt(unit_price)} = {_fmt(expected)}")
            warns.append(msg)

    # --- 2) cost + profit ≈ price ------------------------------------------
    def _find(label_re: str) -> float | None:
        # First currency amount following the label within the same sentence.
        hit = re.search(
            rf"(?:{label_re})[^.]*?(?:R\$|\$)\s*([\d.,]+)", text, re.I
        )
        return _num(hit.group(1)) if hit else None

    custo = _find(r"custo|cost")
    lucro = _find(r"lucro|margem de lucro|profit")
    preco = _find(r"pre[çc]o|price")
    if custo and lucro and preco and not _close(custo + lucro, preco):
        total = custo + lucro
        if in_pt:
            msg = (f"custo ({_fmt(custo)}) + lucro ({_fmt(lucro)}) = "
                   f"{_fmt(total)} ≠ preço ({_fmt(preco)})")
        else:
            msg = (f"cost ({_fmt(custo)}) + profit ({_fmt(lucro)}) = "
                   f"{_fmt(total)} ≠ price ({_fmt(preco)})")
        warns.append(msg)

    return warns
# Public surface of the module: only the checker entry point is exported.
__all__ = ["check"]