Spaces:

build-small-hackathon
/

case-forge

Running on Zero

App Files Files Community

case-forge / core /numcheck.py

nextmarte

Add deterministic numeric-consistency checker (flags prose number slips)

f725a35 verified 16 days ago

Raw

History Blame Contribute Delete

4.77 kB

	"""Deterministic numeric-consistency checker (no LLM, instant, free).

	A ≤4B model generating freehand still slips on prose numbers (e.g. "gera R$375k/dia"
	when 5.000 rides × R$25 = R$125k). The teacher/code audit fixes the TRAINING corpus,
	but the live student isn't audited at inference time (that would triple latency and
	burn ZeroGPU quota). Instead, the Space runs this cheap pattern checker over the
	generated case and flags likely inconsistencies for the author to verify, which fits
	the authoring/review workflow. It only WARNS; it never silently "fixes" prose.

	Conservative by design: only flags when it confidently extracts a relationship.
	"""

	from __future__ import annotations

	import re

	# Countable nouns (pt-BR and English) that can follow a quantity, e.g. "5.000 corridas".
	# The alternatives must be separated by a bare `|`; an escaped `\|` would only match a
	# literal pipe character and the pattern would never fire on real prose.
	_UNIT = (
	    r"(?:corridas?|viagens?|unidades?|clientes?|pedidos?|atendimentos?|"
	    r"rides?|trips?|units?|customers?|orders?)"
	)
	# Per-period markers, daily or monthly. `/\s*` accepts both "/dia" and "/ dia" so that
	# compact forms such as "R$375k/dia" are recognised; the monthly forms mirror the
	# daily ones ("ao dia" ↔ "ao mês", "diário" ↔ "mensal").
	_PERDAY = (
	    r"(?:por dia|/\s*dia|ao dia|di[aá]ri[ao]s?|per day|/\s*day|daily|"
	    r"por m[eê]s|/\s*m[eê]s|ao m[eê]s|mensa(?:l|is)|monthly|per month)"
	)


	def _num(text: str) -> float \| None:
	"""Parse a pt-BR/en number, honoring mil/milhão (and thousands/decimal seps)."""
	t = text.lower()
	mult = 1.0
	if "milh" in t or "million" in t or "mi " in t:
	mult = 1_000_000.0
	elif "mil" in t or "thousand" in t:
	mult = 1_000.0
	m = re.search(r"\d[\d.,]*", t)
	if not m:
	return None
	n = m.group(0)
	if "," in n and "." in n: # pt-BR: '.' thousands, ',' decimal
	n = n.replace(".", "").replace(",", ".")
	elif "," in n: # only comma → decimal sep
	n = n.replace(",", ".")
	else: # only dots → thousands sep (375.000)
	if re.search(r"\.\d{3}\b", n):
	n = n.replace(".", "")
	try:
	return float(n) * mult
	except ValueError:
	return None


	def _money(seg: str) -> float | None:
	    """Return the first R$/$ amount found in *seg*, or ``None`` if there is none.

	    The amount may be glued to the currency symbol ("R$375") or separated by
	    whitespace ("R$ 375"). A magnitude suffix right after the digits is kept
	    with the number and handed to ``_num`` for scaling: ``k`` glued to the
	    digits, or one of mil / mi / milhão / milhões / million / thousand.

	    The longer magnitude words are listed before ``mil`` and each is closed
	    with a word boundary. Otherwise "2,5 milhões" would be captured as
	    "2,5 mil" and scaled by a thousand instead of a million.
	    """
	    m = re.search(
	        r"(?:R\$|\$)\s*"
	        r"(\d[\d.,]*"
	        r"(?:k\b|\s*(?:milh[õo]es|milh[ãa]o|millions?|thousands?|mil|mi)\b)?)",
	        seg,
	        re.I,
	    )
	    return _num(m.group(1)) if m else None


	def _close(a: float, b: float, tol: float = 0.05) -> bool:
	    """Tell whether *a* and *b* agree within the relative tolerance *tol*.

	    The absolute gap is divided by the larger magnitude of the two operands,
	    floored at 1.0 so that very small values are not compared on a tiny
	    scale. When either operand is ``None`` there is nothing to compare, and
	    the pair is reported as close so that no warning is raised.
	    """
	    if a is not None and b is not None:
	        scale = max(abs(a), abs(b), 1.0)
	        return abs(a - b) / scale <= tol
	    return True


	def _fmt(v: float) -> str:
	    """Format *v* for a warning message using pt-BR separators.

	    Thousands are grouped with ``.`` and decimals use ``,``. Whole values are
	    printed without a fractional part ("125.000"); anything else keeps two
	    decimals ("2,50"). Rounding every value to an integer would show a unit
	    price of 2,50 as "2" and make the arithmetic in the warning look wrong.
	    """
	    # Format English-style first ("1,234.50"), then swap the two separators.
	    text = f"{v:,.2f}".translate(str.maketrans(",.", ".,"))
	    return text[:-3] if text.endswith(",00") else text


	def _sentences(text: str) -> list[str]:
	    """Split *text* into sentence-like fragments.

	    A break happens at whitespace that follows ``.``, ``;`` or ``:`` and at
	    every newline. A dot inside a number ("375.000") is followed by a digit,
	    not whitespace, so numbers stay intact. Empty or whitespace-only
	    fragments (e.g. from consecutive newlines) are dropped.

	    NOTE(review): breaking after ":" separates "label: value" pairs into two
	    fragments; confirm that this is intended for the revenue check.
	    """
	    # The two alternatives need a bare `|`; an escaped pipe would require a
	    # literal "|" in the text and the pattern would never split anything.
	    pieces = re.split(r"(?<=[.;:])\s+|\n", text)
	    return [piece for piece in pieces if piece.strip()]


	def _period(fragment: str):
	    """Classify the first period marker in *fragment* as "day" or "month" (else None)."""
	    marker = re.search(_PERDAY, fragment, re.I)
	    if marker is None:
	        return None
	    if re.search(r"m[eê]s|mensa|month", marker.group(0), re.I):
	        return "month"
	    return "day"


	def _case_text(obj: dict) -> str:
	    """Flatten the case's context, data items and exhibit contents into one string."""
	    case = obj.get("case") if isinstance(obj, dict) else None
	    if not isinstance(case, dict):
	        return ""

	    def _as_list(value):
	        # Generated JSON is not always well-shaped: accept a lone item where a
	        # list was expected instead of iterating a string character by character.
	        if isinstance(value, (list, tuple)):
	            return list(value)
	        return [value] if value else []

	    parts = [str(case.get("context") or "")]
	    parts.extend(str(item) for item in _as_list(case.get("data")))
	    for exhibit in _as_list(case.get("exhibits")):
	        content = exhibit.get("content", "") if isinstance(exhibit, dict) else exhibit
	        parts.append("" if content is None else str(content))
	    return " ".join(parts)


	def _revenue_warning(text: str, lang: str):
	    """Warn when a per-period revenue is not close to quantity × unit price."""
	    # Quantities per period, e.g. "5.000 corridas ... por dia"; remember the period
	    # so that a daily volume is never compared with a monthly revenue.
	    quantities = []
	    qty_re = rf"\d[\d.,]*(?:\s*(?:mil|thousand)\b)?\s*{_UNIT}[^.]*?({_PERDAY})"
	    for found in re.finditer(qty_re, text, re.I):
	        value = _num(found.group(0))
	        if value:
	            quantities.append((value, _period(found.group(1))))

	    # Unit price, e.g. "R$ 25 por corrida" / "$25 per ride".
	    price_match = re.search(
	        rf"(?:R\$|\$)\s*(\d[\d.,]*)\s*(?:por|/|each|per)\s*{_UNIT}", text, re.I
	    )
	    unit_price = _num(price_match.group(1)) if price_match else None

	    # Revenue sentence: a revenue keyword, a period marker and a money amount.
	    revenue = None
	    revenue_period = None
	    for sentence in _sentences(text):
	        if not re.search(r"receita|fatur|gera|revenue|generat|bring", sentence, re.I):
	            continue
	        if not re.search(_PERDAY, sentence, re.I):
	            continue
	        revenue = _money(sentence)
	        if revenue:
	            # Prefer the marker that follows the amount ("R$ X por mês"); fall
	            # back to the first marker anywhere in the sentence.
	            symbol = re.search(r"R\$|\$", sentence)
	            tail = sentence[symbol.end():] if symbol else ""
	            revenue_period = _period(tail) or _period(sentence)
	            break

	    if not (quantities and unit_price and revenue):
	        return None
	    comparable = [qty for qty, period in quantities if period == revenue_period]
	    if not comparable:
	        # Only mismatched periods were found: stay silent, do not guess.
	        return None
	    q = max(comparable)
	    expected = q * unit_price
	    if _close(expected, revenue):
	        return None
	    if lang == "pt":
	        return (f"receita ({_fmt(revenue)}) não bate com {_fmt(q)} × {_fmt(unit_price)} = "
	                f"{_fmt(expected)}")
	    return (f"revenue ({_fmt(revenue)}) doesn't match {_fmt(q)} × {_fmt(unit_price)} = "
	            f"{_fmt(expected)}")


	def _margin_warning(text: str, lang: str):
	    """Warn when the stated cost plus profit is not close to the stated price."""

	    def _labelled(label_re):
	        # First currency amount after the label, without crossing a ".". The word
	        # boundaries keep "custo" from matching inside "customers".
	        found = re.search(
	            rf"\b(?:{label_re})\b[^.]*?(?:R\$|\$)\s*(\d[\d.,]*)", text, re.I
	        )
	        return _num(found.group(1)) if found else None

	    custo = _labelled(r"custos?|costs?")
	    lucro = _labelled(r"lucros?|profits?")
	    preco = _labelled(r"pre[çc]os?|prices?")
	    if not (custo and lucro and preco) or _close(custo + lucro, preco):
	        return None
	    if lang == "pt":
	        return (f"custo ({_fmt(custo)}) + lucro ({_fmt(lucro)}) = {_fmt(custo + lucro)} "
	                f"≠ preço ({_fmt(preco)})")
	    return (f"cost ({_fmt(custo)}) + profit ({_fmt(lucro)}) = {_fmt(custo + lucro)} "
	            f"≠ price ({_fmt(preco)})")


	def check(obj: dict, lang: str = "pt") -> list[str]:
	    """Return author-facing warnings about likely numeric inconsistencies.

	    Two relationships are checked over the concatenated text of
	    ``obj["case"]`` (its ``context``, ``data`` items and exhibit ``content``):

	    1. per-period revenue ≈ quantity × unit price, compared only when the
	       quantity and the revenue refer to the same period (day or month);
	    2. cost + profit ≈ price.

	    Args:
	        obj: Generated case object. Missing or oddly shaped fields are
	            tolerated and simply contribute no text.
	        lang: ``"pt"`` for Portuguese messages; any other value gives English.

	    Returns:
	        A list of warning strings, empty when nothing could be extracted with
	        confidence or when the numbers agree within tolerance.
	    """
	    if not obj:
	        return []
	    text = _case_text(obj)
	    findings = (_revenue_warning(text, lang), _margin_warning(text, lang))
	    return [warning for warning in findings if warning]


	# Public API: only `check` is exported; the underscore-prefixed helpers are internal.
	__all__ = ["check"]