# case-forge/core/numcheck.py
# Commit f725a35 (verified), author nextmarte:
#   "Add deterministic numeric-consistency checker (flags prose number slips)"
# File size at that commit: 4.77 kB
"""Deterministic numeric-consistency checker (no LLM, instant, free).
A ≤4B model generating freehand still slips on prose numbers (e.g. "gera R$375k/dia"
when 5.000 rides × R$25 = R$125k). The teacher/code-audit fixes the TRAINING corpus,
but the live student isn't audited at inference (that would 3× latency + burn ZeroGPU
quota). So instead the Space runs this cheap pattern-checker over the generated case and
**flags** likely inconsistencies for the author to verify — fitting the authoring/review
model. It WARNS, never silently "fixes" prose.
Conservative by design: only flags when it confidently extracts a relationship.
"""
from __future__ import annotations
import re
# Countable things a case may quote a volume of (Portuguese first, then English).
_UNIT = "(?:" + "|".join((
    r"corridas?", r"viagens?", r"unidades?", r"clientes?", r"pedidos?",
    r"atendimentos?",
    r"rides?", r"trips?", r"units?", r"customers?", r"orders?",
)) + ")"
# Period markers. Despite the name, this matches per-day AND per-month phrasings.
_PERDAY = "(?:" + "|".join((
    r"por dia", r"/\s*dia", r"ao dia", r"di[aá]ri[ao]s?",
    r"per day", r"/\s*day", r"daily",
    r"por m[eê]s", r"/\s*m[eê]s", r"monthly", r"per month",
)) + ")"
def _num(text: str) -> float | None:
    """Parse the first pt-BR/en number in *text*, honoring mil/milhão words.

    A multiplier word (``mil``, ``milhão``/``milhões``, ``mi``, ``thousand``,
    ``million``) only counts when it directly follows the digits. Words that
    merely contain those letters elsewhere in the fragment ("familiares",
    "milhas") are ignored.

    Separator rules:
      * both ``.`` and ``,`` present: the one appearing last is the decimal
        mark, so "1.250,50" (pt-BR) and "1,250.50" (en) both give 1250.5;
      * commas only: a single comma is a decimal mark ("25,50"); two or more
        3-digit groups ("1,000,000") are thousands separators. A single
        "5,000" stays decimal because pt-BR prices such as "5,899" are real;
      * dots only: strict 3-digit grouping with a non-zero lead ("375.000")
        is thousands; anything else ("1.5", "0.125") is a decimal.

    Returns:
        The parsed value times its multiplier, or ``None`` when *text* holds
        no parseable number.
    """
    m = re.search(
        r"(\d+(?:[.,]\d+)*)"
        r"(?:\s*(milh(?:ões|oes|ão|ao)|millions?|thousands?|mil|mi)\b)?",
        text.lower(),
    )
    if not m:
        return None
    digits, word = m.group(1), m.group(2)

    if word is None:
        mult = 1.0
    elif word in ("mil", "thousand", "thousands"):
        mult = 1_000.0
    else:  # milhão / milhões / million(s) / mi
        mult = 1_000_000.0

    has_comma, has_dot = "," in digits, "." in digits
    if has_comma and has_dot:
        if digits.rfind(",") > digits.rfind("."):
            # pt-BR: '.' thousands, ',' decimal
            digits = digits.replace(".", "").replace(",", ".")
        else:
            # en: ',' thousands, '.' decimal
            digits = digits.replace(",", "")
    elif has_comma:
        if digits.count(",") > 1 and re.fullmatch(r"[1-9]\d{0,2}(?:,\d{3})+", digits):
            digits = digits.replace(",", "")
        else:
            digits = digits.replace(",", ".")
    elif has_dot and re.fullmatch(r"[1-9]\d{0,2}(?:\.\d{3})+", digits):
        digits = digits.replace(".", "")

    try:
        return float(digits) * mult
    except ValueError:
        # e.g. "1.2.3": not a number in either convention
        return None
def _money(seg: str) -> float | None:
    """Return the first R$/$ amount in *seg*, scaled by a trailing magnitude.

    Recognized magnitudes directly after the digits: ``mil``/``thousand``/``k``
    (×1 000) and ``milhão``/``milhões``/``million``/``mi`` (×1 000 000).

    The alternatives are ordered longest-first and closed with ``\\b``.
    Regex alternation takes the first branch that matches, so with ``mil``
    listed first "2 milhões" was captured as "2 mil" and came out 1000× too
    small. The multiplier is applied here and only the bare digits go to
    ``_num``, so the scaling does not depend on how ``_num`` treats
    magnitude words.

    Returns:
        The amount as a float, or ``None`` when no currency amount is found.
    """
    m = re.search(
        r"(?:R\$|\$)\s*(\d+(?:[.,]\d+)*)"
        r"(?:\s*(milh[õo]es|milh[ãa]o|millions?|thousands?|mil|mi|k)\b)?",
        seg,
        re.I,
    )
    if not m:
        return None
    value = _num(m.group(1))
    if value is None:
        return None
    suffix = (m.group(2) or "").lower()
    if not suffix:
        return value
    if suffix in ("mil", "thousand", "thousands", "k"):
        return value * 1_000.0
    return value * 1_000_000.0
def _close(a: float, b: float, tol: float = 0.05) -> bool:
    """Tell whether *a* and *b* agree within a relative tolerance.

    The difference is measured against the larger magnitude of the two,
    floored at 1.0 so values near zero are compared on an absolute scale.
    If either value is ``None`` the pair is reported as close.
    """
    if a is not None and b is not None:
        scale = max(abs(a), abs(b), 1.0)
        gap = abs(a - b)
        return gap / scale <= tol
    return True
def _fmt(v: float) -> str:
    """Format *v* pt-BR style: '.' for thousands, ',' for decimals.

    Whole amounts (after rounding to cents) print without decimals, e.g.
    ``125000`` -> ``"125.000"``. Fractional amounts keep two decimals, e.g.
    ``2.5`` -> ``"2,50"``. Rounding everything to whole units made a warning
    such as "5.000 × 2 = 12.500" look wrong when the price was really 2,50.
    """
    cents = round(v, 2)
    if cents == int(cents):
        return f"{int(cents):,d}".replace(",", ".")
    # Swap the two separators in one pass: "1,234.50" -> "1.234,50".
    return f"{cents:,.2f}".translate(str.maketrans(",.", ".,"))
def _sentences(text: str) -> list[str]:
    """Cut *text* into rough sentence-like pieces.

    A cut happens at whitespace that follows '.', ';' or ':' and at every
    newline. Punctuation stays attached to the piece it ends, a dot inside
    a number such as "1.5" is not a cut point, and consecutive newlines
    produce empty pieces.
    """
    boundary = re.compile(r"(?<=[.;:])\s+|\n")
    pieces = boundary.split(text)
    return pieces
def check(obj: dict, lang: str = "pt") -> list[str]:
    """Return author-facing warnings about likely numeric inconsistencies.

    The text scanned is the case's ``context``, its ``data`` items and the
    ``content`` of each exhibit, joined with spaces. Two relationships are
    checked:

    1. revenue per period ≈ quantity per period × unit price, compared only
       when quantity and revenue refer to the same period (day or month);
    2. cost + profit ≈ price.

    Args:
        obj: Generated payload; the case is read from ``obj["case"]``.
        lang: ``"pt"`` for Portuguese messages; any other value gives English.

    Returns:
        A list of warning strings. It is empty when the payload has no usable
        case, when a relationship cannot be extracted, or when the numbers
        agree within the ``_close`` tolerance.
    """
    case = obj.get("case") if isinstance(obj, dict) else None
    if not isinstance(case, dict):
        return []

    def _as_list(value):
        # Model output is not schema-checked: tolerate a scalar or a dict
        # where a list was expected instead of crashing.
        if not value:
            return []
        return list(value) if isinstance(value, (list, tuple)) else [value]

    def _period(fragment):
        # _PERDAY covers both daily and monthly phrasings; tell them apart.
        return "month" if re.search(r"m[eê]s|month", fragment, re.I) else "day"

    parts = [str(case.get("context") or "")]
    parts += [str(d) for d in _as_list(case.get("data"))]
    parts += [
        str((e.get("content") or "") if isinstance(e, dict) else e)
        for e in _as_list(case.get("exhibits"))
    ]
    text = " ".join(parts)
    is_pt = lang == "pt"
    warns: list[str] = []

    # --- 1) daily/monthly revenue ≈ quantity × unit price -------------------
    # Quantities are grouped by the period they are quoted for. Only the
    # leading digits go to _num, so words later in the fragment can never be
    # mistaken for a mil/milhão multiplier.
    qty_by_period: dict[str, list[float]] = {}
    for m in re.finditer(rf"(\d[\d.,]*)\s*{_UNIT}[^.]*?({_PERDAY})", text, re.I):
        q = _num(m.group(1))
        if q:
            qty_by_period.setdefault(_period(m.group(2)), []).append(q)

    unit_price = None
    mp = re.search(rf"(?:R\$|\$)\s*([\d.,]+)\s*(?:por|/|each|per)\s*{_UNIT}", text, re.I)
    if mp:
        unit_price = _num(mp.group(1))

    # The revenue sentence has a revenue keyword plus a per-period marker;
    # its first currency amount is taken as the revenue.
    rev = None
    rev_periods: set[str] = set()
    for s in _sentences(text):
        if not re.search(r"receita|fatur|gera|revenue|generat|bring", s, re.I):
            continue
        kinds = {_period(p.group(0)) for p in re.finditer(_PERDAY, s, re.I)}
        if not kinds:
            continue
        rev = _money(s)
        if rev:
            rev_periods = kinds
            break

    # Compare only when the revenue sentence names exactly one period and a
    # quantity exists for that same period. Daily volume against monthly
    # revenue would always "mismatch" and produce a false warning.
    if unit_price and rev and len(rev_periods) == 1:
        same_period = qty_by_period.get(next(iter(rev_periods)))
        if same_period:
            q = max(same_period)
            expected = q * unit_price
            if not _close(expected, rev):
                if is_pt:
                    warns.append(
                        f"receita ({_fmt(rev)}) não bate com {_fmt(q)} × {_fmt(unit_price)} = "
                        f"{_fmt(expected)}")
                else:
                    warns.append(
                        f"revenue ({_fmt(rev)}) doesn't match {_fmt(q)} × {_fmt(unit_price)} = "
                        f"{_fmt(expected)}")

    # --- 2) cost + profit ≈ price ------------------------------------------
    def _find(label_re):
        # \b keeps labels from matching inside other words ("custo" in
        # "customers"). The amount is the first R$/$ value after the label
        # before any '.'.
        m = re.search(rf"\b(?:{label_re})\b[^.]*?(?:R\$|\$)\s*([\d.,]+)", text, re.I)
        return _num(m.group(1)) if m else None

    custo = _find(r"custos?|costs?")
    lucro = _find(r"lucros?|margem de lucro|profits?")
    preco = _find(r"pre[çc]os?|prices?")
    if custo and lucro and preco and not _close(custo + lucro, preco):
        if is_pt:
            warns.append(
                f"custo ({_fmt(custo)}) + lucro ({_fmt(lucro)}) = {_fmt(custo + lucro)} "
                f"≠ preço ({_fmt(preco)})")
        else:
            warns.append(
                f"cost ({_fmt(custo)}) + profit ({_fmt(lucro)}) = {_fmt(custo + lucro)} "
                f"≠ price ({_fmt(preco)})")
    return warns
# Public API: `check` is the only exported name; underscore-prefixed helpers are internal.
__all__ = ["check"]