| import re | |
# Matches a single CJK character or CJK/fullwidth punctuation mark.
# The character class is built from these Unicode block ranges:
#   U+4E00-U+9FFF  CJK Unified Ideographs
#   U+3040-U+30FF  Hiragana and Katakana
#   U+AC00-U+D7AF  Hangul Syllables
#   U+3000-U+303F  CJK Symbols and Punctuation (covers U+3001 and U+3002)
#   U+FF00-U+FFEF  Halfwidth and Fullwidth Forms (covers the fullwidth
#                  comma, full stop, colon, semicolon, ? and !)
# NOTE: the last range also contains fullwidth digits, Latin letters and
# operators, so those are matched too.
CJK_PATTERN = re.compile(
    r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]'
)
def clean_latex(text: str) -> str:
    """Strip LaTeX markup and filler words from a math expression.

    The same cleaning is meant to be shared by the OCR and LLM code paths.
    Steps, applied in order:

    1. Delete every character matched by the module-level ``CJK_PATTERN``.
    2. Delete the LaTeX wrapper characters ``\\``, ``{``, ``}`` and ``$``
       (command names such as ``frac`` therefore remain as plain text).
    3. Delete the whole words prove/solve/calculate/find/simplify/evaluate/
       where, case-insensitively, wherever they occur.
    4. Insert ``*`` between a digit and a following letter or ``(``, and
       between a letter or ``)`` and a following digit (``2x`` -> ``2*x``,
       ``x2`` -> ``x*2``).
    5. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Raw expression text; a falsy value yields ``""``.

    Returns:
        The cleaned expression string.
    """
    if not text:
        return ""

    cleaned = CJK_PATTERN.sub('', text)

    # Drop LaTeX wrapper characters one at a time.
    for wrapper in ('\\', '{', '}', '$'):
        cleaned = cleaned.replace(wrapper, '')

    # Drop instruction words commonly found in math problem statements.
    cleaned = re.sub(
        r'(?i)\b(prove|solve|calculate|find|simplify|evaluate|where)\b',
        '',
        cleaned,
    )

    # Make implicit multiplication explicit; digit-first pattern runs first.
    for implicit_product in (r'(\d)([a-zA-Z\(])', r'([a-zA-Z\)])(\d)'):
        cleaned = re.sub(implicit_product, r'\1*\2', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()
def normalize_math_string(s: str) -> str:
    """Normalize a mathematical answer string for comparison.

    Spaces are removed and the text is lower-cased. After that:

    * If the string contains commas, each comma-separated part is
      normalized on its own and the parts are sorted, so the result does
      not depend on the order in which the parts were written.
    * If the string parses as a finite float, it is rounded to 6 decimal
      places. A value that is integral after rounding is rendered without
      a fractional part (``"3.0"`` and ``"3.0000001"`` both give ``"3"``);
      otherwise the rounded float's ``str()`` is returned.
    * Anything else (symbolic text, ``inf``, ``nan``) is returned as the
      space-stripped, lower-cased string.

    Args:
        s: The string to normalize; a falsy value yields ``""``.

    Returns:
        The normalized string.
    """
    if not s:
        return ""
    s = s.replace(" ", "").lower()

    if "," in s:
        # Parts contain no commas, so the recursion is at most one level deep.
        return ",".join(sorted(normalize_math_string(part) for part in s.split(",")))

    try:
        # Round before the integrality test so near-integers such as
        # 1.0000001 normalize to the same text as 1 and 1.0.
        value = round(float(s), 6)
        # int() raises OverflowError for inf and ValueError for nan.
        if value == int(value):
            return str(int(value))
    except (ValueError, OverflowError):
        # Not a finite number: keep the cleaned text unchanged.
        return s
    return str(value)