# mvm2-math-verification / math_utils.py
# Antigravity Agent
# fix(ocr): apply universal CJK filter and early input sanitization
# be2c8ad
import re
# Single character class matching CJK script characters and the CJK /
# full-width punctuation blocks.  The pieces below are adjacent raw-string
# literals, so they concatenate into one pattern at compile time.
CJK_PATTERN = re.compile(
    r'['
    r'\u4e00-\u9fff'  # CJK Unified Ideographs
    r'\u3040-\u30ff'  # Hiragana and Katakana
    r'\uac00-\ud7af'  # Hangul Syllables
    r'\u3000-\u303f'  # CJK Symbols and Punctuation
    r'\uff00-\uffef'  # Halfwidth and Fullwidth Forms (includes full-width digits/letters)
    # Individual punctuation marks; each already falls inside a range above.
    r'\u3001\u3002\uff0c\uff0e\uff1a\uff1b\uff1f\uff01'
    r']'
)
def clean_latex(text: str) -> str:
    """Standardized cleaning for both OCR and LLM logic.

    The steps run in this order:

    1. CJK characters (as matched by ``CJK_PATTERN``) are replaced by a space.
    2. The LaTeX wrapper characters ``\\``, ``{``, ``}`` and ``$`` are deleted.
    3. The instruction words prove / solve / calculate / find / simplify /
       evaluate / where are deleted (case-insensitive, whole words only).
    4. Implicit multiplication is made explicit: ``2x`` -> ``2*x``,
       ``2(`` -> ``2*(``, ``x2`` -> ``x*2``, ``)2`` -> ``)*2``.
    5. Runs of whitespace collapse to a single space and the ends are stripped.

    Note that backslashes and braces are deleted outright, so grouping that
    LaTeX expressed through braces is not preserved.

    Args:
        text: Raw OCR or LLM output. Any falsy value (``""``, ``None``)
            yields an empty string.

    Returns:
        The cleaned expression string.
    """
    if not text:
        return ""

    # Substitute a space instead of deleting: CJK words and full-width
    # punctuation often separate tokens ("1，2", "x是5").  Deleting them
    # would fuse the neighbours into "12" / "x5", and the latter would then
    # be rewritten to "x*5" by the implicit-multiplication step below.  The
    # extra spaces are collapsed by the final whitespace normalization.
    text = CJK_PATTERN.sub(' ', text)

    # Remove LaTeX wrappers in a single pass.
    text = text.translate(str.maketrans('', '', '\\{}$'))

    # Remove common conversational prefixes in math problems.
    text = re.sub(r'(?i)\b(prove|solve|calculate|find|simplify|evaluate|where)\b', '', text)

    # Expand implicit multiplication: digit followed by letter or "(" ...
    text = re.sub(r'(\d)([a-zA-Z\(])', r'\1*\2', text)
    # ... and letter or ")" followed by digit.
    text = re.sub(r'([a-zA-Z\)])(\d)', r'\1*\2', text)

    # Normalize whitespace and strip.
    return re.sub(r'\s+', ' ', text).strip()
def normalize_math_string(s: str) -> str:
    """Normalize mathematical strings for comparison.

    All space characters are removed and the string is lower-cased.  Then:

    * If the string contains a comma it is treated as a list: each
      comma-separated part is normalized on its own, the normalized parts
      are sorted (as strings) and re-joined with ``,`` so that element
      order does not affect comparison.
    * Otherwise, if the string parses as a float it is rounded to six
      decimal places and rendered as an integer when the rounded value is
      whole (``"3.0"`` -> ``"3"``), else as the rounded float
      (``"1.50"`` -> ``"1.5"``).
    * Anything else (symbolic expressions, ``inf``, ``nan``) is returned
      in its space-stripped, lower-cased form.

    Args:
        s: The string to normalize. Any falsy value yields ``""``.

    Returns:
        The canonical form of ``s``.
    """
    if not s:
        return ""

    s = s.replace(" ", "").lower()

    if ',' in s:
        # Parts contain no commas, so the recursion is one level deep.
        parts = [normalize_math_string(p) for p in s.split(',')]
        return ','.join(sorted(parts))

    try:
        # Round before the integrality check so that values which round to
        # a whole number ("2.0000001", "1e-9") get the same canonical form
        # as the whole number itself ("2", "0") rather than "2.0" / "0.0".
        f = round(float(s), 6)
        return str(int(f)) if f == int(f) else str(f)
    except (ValueError, OverflowError):
        # ValueError: not a number, or int(nan).  OverflowError: int(inf).
        return s