File size: 1,381 Bytes
ce0c46c
 
be2c8ad
 
ce0c46c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re

# Characters treated as CJK text or CJK/fullwidth punctuation and stripped
# from input.  The character class is assembled from adjacent raw literals so
# each range can be annotated; the compiled pattern string is unchanged.
CJK_PATTERN = re.compile(
    r'['
    r'\u4e00-\u9fff'  # CJK Unified Ideographs
    r'\u3040-\u30ff'  # Hiragana and Katakana
    r'\uac00-\ud7af'  # Hangul Syllables
    r'\u3000-\u303f'  # CJK Symbols and Punctuation
    r'\uff00-\uffef'  # Halfwidth and Fullwidth Forms (incl. fullwidth digits/letters)
    # Individual punctuation marks; each already falls inside one of the two
    # punctuation ranges above.
    r'\u3001\u3002\uff0c\uff0e\uff1a\uff1b\uff1f\uff01'
    r']'
)

# Deletion table for the LaTeX wrapper characters: backslash, braces, dollar.
_LATEX_WRAPPER_TABLE = str.maketrans('', '', '\\{}$')

# Ordered (pattern, replacement) passes applied after the wrapper characters
# are gone.  Order matters: the two multiplication passes run back to back so
# that "2x3" becomes "2*x*3", and whitespace is collapsed last.
_CLEANING_PASSES = (
    # Drop common instruction words that precede a math expression.
    (r'(?i)\b(prove|solve|calculate|find|simplify|evaluate|where)\b', ''),
    # Digit followed by a letter or "(":  2x -> 2*x,  3(x) -> 3*(x)
    (r'(\d)([a-zA-Z\(])', r'\1*\2'),
    # Letter or ")" followed by a digit:  x2 -> x*2,  (x)2 -> (x)*2
    (r'([a-zA-Z\)])(\d)', r'\1*\2'),
    # Collapse every whitespace run to a single space.
    (r'\s+', ' '),
)


def clean_latex(text: str) -> str:
    """Return *text* reduced to a plain, normalized math expression string.

    The same cleaning is meant to be shared by the OCR and LLM code paths.
    Steps, in order:

    1. Remove every character matched by ``CJK_PATTERN``.
    2. Delete the LaTeX wrapper characters ``\\``, ``{``, ``}`` and ``$``
       (only the characters themselves; command names such as ``frac``
       remain in the text).
    3. Remove the instruction words prove/solve/calculate/find/simplify/
       evaluate/where (case-insensitive, whole words only).
    4. Insert ``*`` between a digit and an adjacent letter or parenthesis.
    5. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: Raw expression text. Any falsy value (``""`` or ``None``)
            yields ``""``.

    Returns:
        The cleaned string.
    """
    if not text:
        return ""

    cleaned = CJK_PATTERN.sub('', text).translate(_LATEX_WRAPPER_TABLE)
    for pattern, replacement in _CLEANING_PASSES:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def normalize_math_string(s: str) -> str:
    """Normalize a mathematical answer string so equal answers compare equal.

    Spaces are removed and the text is lower-cased. Then:

    * If the result contains commas, it is treated as a list: every item is
      normalized on its own and the items are sorted, so the comparison is
      order-insensitive (``"2, 1"`` and ``"1,2"`` both give ``"1,2"``).
    * If the result parses as a finite number, it is rounded to 6 decimal
      places and rendered as an integer when the rounded value is whole
      (``"3.0"`` -> ``"3"``, ``"2.9999999999"`` -> ``"3"``), otherwise as a
      float (``"1.50"`` -> ``"1.5"``).
    * Anything else (symbolic text, ``inf``, ``nan``) is returned as the
      space-stripped, lower-cased string.

    Args:
        s: The string to normalize. Any falsy value (``""`` or ``None``)
            yields ``""``.

    Returns:
        The canonical string form described above.
    """
    if not s:
        return ""

    s = s.replace(" ", "").lower()

    if ',' in s:
        # Items contain no commas after the split, so recursion depth is 1.
        return ','.join(sorted(normalize_math_string(p) for p in s.split(',')))

    try:
        # Round BEFORE the integrality check: otherwise a value such as
        # 2.0000001 is judged non-integral and then rendered as "2.0",
        # which does not match the "2" produced for an exact integer.
        value = round(float(s), 6)
        # int() raises OverflowError for +/-inf and ValueError for nan;
        # float() raises ValueError for non-numeric text.
        whole = int(value)
    except (ValueError, OverflowError):
        return s

    return str(whole) if value == whole else str(value)