# AutoMathReasoner/env/verifier.py
import re
import math
from typing import Dict, Any, Tuple
class VerifierSystem:
"""
Multi-stage verification system that returns graduated correctness scores
instead of binary pass/fail. This provides a dense reward signal for RL
training, enabling faster convergence.
Correctness tiers:
    1.0  - Fully correct (exact or numerical match)
    0.7  - Structurally correct (right form, wrong coefficient)
    0.4  - Partially correct (correct technique identified)
    0.15 - Minimal credit (parseable math expression attempted)
    0.0  - Garbage / trivial output
References:
- DeepSeek-R1 GRPO reward design
- arxiv:2408.10215 (Reward Engineering for RL)
- arxiv:2601.19100 (Reward Engineering for Software Tasks)
"""
# Integration techniques and their associated keywords
TECHNIQUE_KEYWORDS = {
'u_substitution': ['substitut', 'u =', 'u=', 'let u', 'du'],
'by_parts': ['by parts', 'integration by parts', 'ibp', 'uv -', 'udv'],
        'trig_sub': ['trig sub', 'trigonometric substitution', 'sin(θ)', 'cos(θ)', 'tan(θ)'],
'partial_fraction': ['partial fraction', 'decompos'],
'power_rule': ['power rule', 'x^n', 'x**'],
'exponential': ['exponential', 'e^', 'exp('],
'trigonometric': ['sin', 'cos', 'tan', 'sec', 'csc', 'cot'],
'logarithmic': ['ln', 'log', 'logarithm'],
}
# Mathematical reasoning markers for process supervision
MATH_MARKERS = [
'step', 'first', 'then', 'next', 'therefore', 'because', 'since',
'equals', 'simplif', 'substitut', 'evaluat', 'factor', 'expand',
'differentiat', 'integrat', 'apply', 'using', 'recall', 'note that',
'we get', 'we have', 'we know', 'this gives', 'which yields',
]
    MATH_SYMBOLS = set('∫∂∑∏√±×÷≠≤≥≈∞∝∈∉⊂⊃∩∪αβγδεζηθλμπσφψω')
def __init__(self):
pass
def check_exact_match(self, prediction: str, ground_truth: str) -> bool:
"""1. Exact match verifier"""
return prediction.strip().lower() == ground_truth.strip().lower()
def check_numeric_tolerance(self, prediction: str, ground_truth: str, tol: float = 1e-4) -> bool:
"""2. Numeric tolerance checker"""
try:
pred_val = float(prediction.strip())
gt_val = float(ground_truth.strip())
return math.isclose(pred_val, gt_val, rel_tol=tol, abs_tol=tol)
except ValueError:
return False
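    # Illustrative examples (not executed), assuming the default tol=1e-4:
    #   check_numeric_tolerance("3.14159", "3.1416")  -> True   (within tolerance)
    #   check_numeric_tolerance("2.0", "2.1")         -> False
    #   check_numeric_tolerance("x**2/2", "0.5")      -> False  (not a float literal)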
def check_python_execution(self, prediction: str, ground_truth: str) -> bool:
"""3. Python execution (eval safe expressions)"""
# If prediction is an expression like "2+3", try evaluating it safely
safe_dict = {"__builtins__": None, "math": math}
try:
# We are verifying if evaluating the prediction gives ground truth
pred_eval = eval(prediction.strip(), safe_dict, {})
try:
gt_eval = float(ground_truth.strip())
return math.isclose(float(pred_eval), gt_eval, rel_tol=1e-4, abs_tol=1e-4)
except ValueError:
return str(pred_eval).strip().lower() == ground_truth.strip().lower()
except Exception:
return False
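    # Illustrative examples (not executed): the prediction is evaluated with no
    # builtins and only the math module in scope.
    #   check_python_execution("2 + 3", "5")               -> True
    #   check_python_execution("math.sqrt(2)", "1.41421")  -> True   (1e-4 tolerance)
    #   check_python_execution("import os", "5")           -> False  (statements fail in eval)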
def check_numerical_integration(self, prediction: str, sympy_f: Any) -> bool:
"""
[PAPER TRACEABILITY: Section 3.1.3 Solution Verification]
Numerical multi-point quadrature verification.
Differentiates the prediction F_pred(x) and compares it to the ground
truth integrand f(x) at 5 random points.
"""
import sympy as sp
import random
x = sp.Symbol('x')
try:
clean_pred = self._clean_math_answer(prediction)
F_pred = sp.parse_expr(clean_pred)
f_pred = sp.diff(F_pred, x)
# Evaluate at 5 random points
for _ in range(5):
test_point = random.uniform(-5, 5)
p_val = float(f_pred.subs(x, test_point).evalf())
t_val = float(sympy_f.subs(x, test_point).evalf())
# Paper uses 10^-2 relative tolerance
if not math.isclose(p_val, t_val, rel_tol=1e-2, abs_tol=1e-2):
return False
return True
except Exception:
return False
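    # Illustrative example (not executed), assuming sympy_f is the SymPy integrand x:
    # the prediction "x**2/2 + C" is cleaned to "x**2/2", differentiated to x, and
    # agrees with sympy_f at the sampled points -> True; a prediction of "x**2"
    # differentiates to 2*x and (almost surely) fails the tolerance -> False.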
def check_structural_similarity(self, prediction: str, ground_truth: str, sympy_f: Any = None) -> float:
"""
Graduated structural similarity check.
Compares SymPy expression trees to provide partial credit when the
model's answer has the right structure but wrong coefficients.
        Returns:
            0.7  if the structure matches and the expressions are proportional
            0.5  if the structure matches but the expressions are not proportional
            0.4  if the expression is parseable and shares some operand types
            0.15 if the prediction is at least a parseable math expression
            0.0  if unparseable and it does not even look like math
        Note: sympy_f is accepted for signature compatibility but is not used here.
"""
import sympy as sp
x = sp.Symbol('x')
try:
clean_pred = self._clean_math_answer(prediction)
clean_gt = self._clean_math_answer(ground_truth)
pred_expr = sp.parse_expr(clean_pred)
gt_expr = sp.parse_expr(clean_gt)
except Exception:
            # Can't even parse; check whether it at least looks like math
if self._looks_like_math(prediction):
return 0.15
return 0.0
# Check if expression trees have similar structure
try:
pred_funcs = self._extract_function_types(pred_expr)
gt_funcs = self._extract_function_types(gt_expr)
# Count overlapping function types (sin, cos, exp, log, Pow, etc.)
overlap = pred_funcs & gt_funcs
union = pred_funcs | gt_funcs
if not union:
return 0.15 # Both are just constants/variables
jaccard = len(overlap) / len(union)
if jaccard >= 0.8:
                # Very similar structure: likely the right form with a wrong coefficient
# Verify by checking at sample points if shapes are proportional
if self._check_proportional(pred_expr, gt_expr, x):
return 0.7
return 0.5
elif jaccard >= 0.4:
return 0.4
else:
return 0.15
except Exception:
return 0.15
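    # Illustrative examples (not executed):
    #   "2*x**3"  vs "5*x**3"  -> 0.7   (same {Mul, Pow} structure and proportional)
    #   "x**2"    vs "x**2/2"  -> 0.4   (the 1/2 adds a Mul node, so Jaccard is 0.5)
    #   "sin(x)"  vs "x**2"    -> 0.15  (parseable, but no shared function types)
    #   "no idea" vs "x**2"    -> 0.0   (unparseable and does not look like math)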
def check_technique_recognition(self, reasoning: str, technique_hint: str = "") -> float:
"""
Checks if the model identified the correct integration technique.
        Returns a score ∈ [0, 1] based on technique match.
This provides reward signal even when the final answer is wrong,
as long as the model is using the right approach.
"""
if not technique_hint:
return 0.0
lower_r = reasoning.lower()
# Check if the correct technique keywords appear in reasoning
technique_kws = self.TECHNIQUE_KEYWORDS.get(technique_hint, [])
if not technique_kws:
return 0.0
matches = sum(1 for kw in technique_kws if kw in lower_r)
if matches >= 2:
return 1.0 # Strong evidence of correct technique
elif matches == 1:
return 0.6 # Some evidence
# Check if any technique was attempted at all
any_technique = False
for tech, kws in self.TECHNIQUE_KEYWORDS.items():
if any(kw in lower_r for kw in kws):
any_technique = True
break
return 0.2 if any_technique else 0.0
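    # Illustrative example (not executed): with technique_hint='u_substitution',
    # the reasoning "Let u = x**2, so du = 2*x dx" hits the keywords 'let u', 'u ='
    # and 'du' (>= 2 matches) -> 1.0, while reasoning that only mentions "by parts"
    # scores 0.2 (some technique attempted, but not the hinted one).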
def mock_llm_judge(self, reasoning: str, prediction: str, ground_truth: str) -> float:
"""4. LLM judge (mock or placeholder scoring reasoning quality)
Returns reasoning quality score Q (0.0 to 1.0)
Improved with mathematical density scoring and better structural analysis.
"""
score = 0.0
lower_reasoning = reasoning.lower()
words = reasoning.split()
length = len(words)
        # Length bonus (up to 0.25): diminishing returns, gentle curve
score += min(0.25, length * 0.005)
# Mathematical marker bonus (up to 0.35)
marker_count = sum(1 for m in self.MATH_MARKERS if m in lower_reasoning)
score += min(0.35, marker_count * 0.05)
# Mathematical symbol density bonus (up to 0.2)
        math_chars = sum(1 for c in reasoning if c in '=+-*/^()∫∂∑√' or c in self.MATH_SYMBOLS)
if length > 0:
math_density = math_chars / max(1, len(reasoning))
score += min(0.2, math_density * 2.0)
# Structured step progression bonus (up to 0.2)
has_numbered_steps = bool(re.search(r'step\s*\d|^\d+[\.\)]', lower_reasoning, re.MULTILINE))
has_logical_flow = ('therefore' in lower_reasoning or 'thus' in lower_reasoning or
'hence' in lower_reasoning or 'so we' in lower_reasoning)
if has_numbered_steps:
score += 0.12
if has_logical_flow:
score += 0.08
return round(min(1.0, score), 3)
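    # Rough illustration (not executed): the components cap at 0.25 (length),
    # 0.35 (reasoning markers), 0.2 (symbol density) and 0.2 (step structure), so a
    # well-structured multi-step derivation can approach Q = 1.0, while a bare final
    # answer with no prose stays low.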
def check_process_supervision(self, reasoning: str) -> float:
"""
[PAPER TRACEABILITY: Process Supervision (Lightweight PRM)]
E. PROCESS SUPERVISION (STEP-AWARE REWARD)
Improved with:
- Mathematical density scoring
- Multi-level step detection
- Granular logical jump penalties
- Technique-specific reward signals
"""
lower_r = reasoning.lower()
words = lower_r.split()
word_count = len(words)
score = 0.0
# 1. Check stepwise structure (up to 0.4)
numbered_steps = len(re.findall(r'step\s*\d', lower_r))
if numbered_steps >= 3:
score += 0.4
elif numbered_steps >= 2:
score += 0.3
elif numbered_steps >= 1:
score += 0.2
elif 'first' in lower_r and ('then' in lower_r or 'next' in lower_r):
score += 0.15
# 2. Mathematical operation density (up to 0.3)
math_ops = len(re.findall(r'[=+\-*/^]', reasoning))
if word_count > 0:
op_density = math_ops / word_count
score += min(0.3, op_density * 3.0)
# 3. Technique identification bonus (up to 0.2)
techniques_mentioned = 0
for tech, kws in self.TECHNIQUE_KEYWORDS.items():
if any(kw in lower_r for kw in kws):
techniques_mentioned += 1
score += min(0.2, techniques_mentioned * 0.1)
# 4. Logical jump penalty β€” short reasoning with complex claims
if word_count < 10 and ('=' in lower_r or 'so' in lower_r):
score -= 0.3
elif word_count < 20 and math_ops > 3:
            score -= 0.15  # Slightly suspicious: many operations, few words
# 5. Bonus for showing intermediate results
intermediate_results = len(re.findall(r'=\s*[\d\w]', reasoning))
score += min(0.1, intermediate_results * 0.02)
return max(-1.0, min(1.0, score))
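    # Illustrative behaviour (not executed): three or more "Step n" markers alone
    # contribute +0.4, with operator density and technique mentions adding up to
    # another +0.5, while a bare "so the answer = 5" (under 10 words containing '=')
    # incurs the -0.3 logical-jump penalty.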
def check_reflection(self, reasoning: str, c: float) -> float:
"""
[PAPER TRACEABILITY: Reflection Module]
H. REFLECTION MODULE
Model generates "What could be wrong?"
Penalize if contradiction with final answer, reward correct self-correction.
Improved with graduated scoring based on reflection quality.
"""
lower_r = reasoning.lower()
score = 0.0
reflection_phrases = [
"what could be wrong", "wait,", "let me check", "alternatively",
"let me verify", "double check", "reconsider", "hmm",
"actually,", "correction:", "i made an error", "let me redo"
]
reflections_found = sum(1 for phrase in reflection_phrases if phrase in lower_r)
if reflections_found > 0:
if c >= 0.7: # At least partially correct
# Graduated reward based on how many reflection markers used
score += min(1.0, 0.5 + reflections_found * 0.2)
elif c >= 0.4:
                # Some credit: reflected but didn't fully fix the answer
score += 0.1
else:
                # Reflected but still wrong: mild penalty (not as harsh as before)
score -= 0.3
return max(-1.0, min(1.0, score))
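    # Illustrative behaviour (not executed): reflection markers such as "let me verify"
    # or "wait," earn 0.5 + 0.2 per marker (capped at 1.0) when c >= 0.7, a token 0.1
    # when c is in [0.4, 0.7), and a -0.3 penalty when the answer is still wrong.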
def verify(self, reasoning: str, prediction: str, ground_truth: str,
sympy_f: Any = None, technique_hint: str = "") -> Tuple[float, float, float, float]:
"""
Run all verifiers with GRADUATED CORRECTNESS scoring.
Returns:
            C - Correctness ∈ [0, 1] (graduated, not binary)
            Q - Reasoning Quality ∈ [0, 1]
            P - Process Supervision ∈ [-1, 1]
            R - Reflection Score ∈ [-1, 1]
"""
# --- Graduated Correctness ---
c = 0.0
# Tier 1: Full correctness (1.0)
if self.check_exact_match(prediction, ground_truth):
c = 1.0
elif sympy_f is not None and self.check_numerical_integration(prediction, sympy_f):
c = 1.0
elif self.check_numeric_tolerance(prediction, ground_truth):
c = 1.0
elif self.check_python_execution(prediction, ground_truth):
c = 1.0
# Tier 2-4: Partial credit (only if not fully correct)
if c < 1.0:
structural_score = self.check_structural_similarity(prediction, ground_truth, sympy_f)
technique_score = self.check_technique_recognition(reasoning, technique_hint)
# Take the best partial credit signal
c = max(c, structural_score)
# Technique recognition can boost partial credit
if technique_score > 0 and c < 0.7:
c = max(c, 0.15 + technique_score * 0.25) # Up to 0.4 from technique alone
q = self.mock_llm_judge(reasoning, prediction, ground_truth)
p = self.check_process_supervision(reasoning)
r = self.check_reflection(reasoning, c)
return c, q, p, r
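    # Illustrative flow (not executed): an exact or numerically verified answer
    # short-circuits to C = 1.0; otherwise the best of the structural-similarity and
    # technique-recognition signals is used, so a right-form / wrong-coefficient
    # answer typically lands near C = 0.7 and a correct-technique-only attempt at
    # up to C = 0.4.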
# --- Private Helpers ---
def _clean_math_answer(self, text: str) -> str:
"""Clean a math answer string for SymPy parsing."""
clean = text.strip()
if "Answer:" in clean:
clean = clean.split("Answer:")[-1].strip()
        # Remove LaTeX wrappers first so a trailing "+ C" inside $...$ is still caught.
        # Only the '$' and '\' characters are stripped; LaTeX commands such as \frac
        # are not translated into SymPy syntax.
        clean = clean.replace('$', '').replace('\\', '')
        # Remove the constant of integration
        clean = re.sub(r'\+\s*[Cc]\s*$', '', clean).strip()
return clean
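    # Illustrative example (not executed):
    #   _clean_math_answer("Answer: x**2/2 + C")  ->  "x**2/2"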
def _looks_like_math(self, text: str) -> bool:
"""Check if text contains mathematical content."""
math_indicators = ['=', '+', '-', '*', '/', '^', 'x', 'sin', 'cos', 'exp', 'log', '(']
return sum(1 for m in math_indicators if m in text.lower()) >= 2
def _extract_function_types(self, expr) -> set:
"""Extract the set of function types from a SymPy expression tree."""
import sympy as sp
types = set()
if isinstance(expr, sp.Add):
types.add('Add')
elif isinstance(expr, sp.Mul):
types.add('Mul')
elif isinstance(expr, sp.Pow):
types.add('Pow')
func_type = type(expr).__name__
if func_type in ('sin', 'cos', 'tan', 'exp', 'log', 'ln', 'Abs',
'asin', 'acos', 'atan', 'sinh', 'cosh', 'tanh'):
types.add(func_type)
# Recurse into sub-expressions
if hasattr(expr, 'args'):
for arg in expr.args:
types |= self._extract_function_types(arg)
return types
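    # Illustrative examples (not executed):
    #   sin(x)*exp(x)  -> {'Mul', 'sin', 'exp'}
    #   3*x**2 + 1     -> {'Add', 'Mul', 'Pow'}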
def _check_proportional(self, expr1, expr2, x) -> bool:
"""Check if two expressions are proportional (differ only by a constant factor)."""
import sympy as sp
import random
try:
ratios = []
for _ in range(3):
pt = random.uniform(-3, 3)
v1 = float(expr1.subs(x, pt).evalf())
v2 = float(expr2.subs(x, pt).evalf())
if abs(v2) < 1e-10:
continue
ratios.append(v1 / v2)
if len(ratios) < 2:
return False
# Check if all ratios are approximately equal (constant factor)
return all(math.isclose(r, ratios[0], rel_tol=0.1) for r in ratios)
except Exception:
return False
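

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the training pipeline).
    # The second call relies on SymPy being installed, because the partial-credit
    # path in check_structural_similarity parses both answers with sympy.parse_expr.
    verifier = VerifierSystem()
    reasoning = ("Step 1: apply the power rule. Step 2: integrating x gives x**2/2. "
                 "Therefore the antiderivative is x**2/2 + C.")

    # Exact string match: expect C = 1.0.
    print(verifier.verify(reasoning, "x**2/2 + C", "x**2/2 + C"))

    # Right form, wrong coefficient: expect C around 0.7 via structural similarity.
    print(verifier.verify(reasoning, "2*x**3", "5*x**3"))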