Antigravity Agent committed on
Commit ·
be2c8ad
1
Parent(s): ce0c46c
fix(ocr): apply universal CJK filter and early input sanitization
Browse files
- llm_agent.py +3 -0
- math_utils.py +2 -2
- ocr_module.py +6 -4
llm_agent.py
CHANGED
|
@@ -104,6 +104,9 @@ class LLMAgent:
|
|
| 104 |
self.use_real_api = False
|
| 105 |
|
| 106 |
def generate_solution(self, problem: str) -> dict:
|
|
|
|
|
|
|
|
|
|
| 107 |
if self.use_real_api and self.client:
|
| 108 |
return self._call_real_gemini(problem)
|
| 109 |
return self._simulate_agent(problem)
|
|
|
|
| 104 |
self.use_real_api = False
|
| 105 |
|
| 106 |
def generate_solution(self, problem: str) -> dict:
|
| 107 |
+
# Expert ML Patch: Clean input early to prevent CJK leakage to APIs
|
| 108 |
+
problem = clean_latex(problem)
|
| 109 |
+
|
| 110 |
if self.use_real_api and self.client:
|
| 111 |
return self._call_real_gemini(problem)
|
| 112 |
return self._simulate_agent(problem)
|
math_utils.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
-
# CJK character ranges (Chinese, Japanese, Korean)
|
| 4 |
-
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]')
|
| 5 |
|
| 6 |
def clean_latex(text: str) -> str:
|
| 7 |
"""Standardized cleaning for both OCR and LLM logic."""
|
|
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
+
# CJK character ranges (Chinese, Japanese, Korean) including punctuation
|
| 4 |
+
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef\u3001\u3002\uff0c\uff0e\uff1a\uff1b\uff1f\uff01]')
|
| 5 |
|
| 6 |
def clean_latex(text: str) -> str:
|
| 7 |
"""Standardized cleaning for both OCR and LLM logic."""
|
ocr_module.py
CHANGED
|
@@ -9,8 +9,8 @@ from PIL import Image
|
|
| 9 |
CRITICAL_OPERATORS = ["\\int", "\\sum", "=", "\\frac", "+", "-", "*", "\\times", "\\div"]
|
| 10 |
BRACKETS_LIMITS = ["(", ")", "[", "]", "\\{", "\\}", "^", "_"]
|
| 11 |
AMBIGUOUS_SYMBOLS = ["8", "B", "0", "O", "l", "1", "I", "S", "5", "Z", "2"]
|
| 12 |
-
# CJK character ranges (Chinese, Japanese, Korean)
|
| 13 |
-
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]')
|
| 14 |
|
| 15 |
def get_symbol_weight(symbol: str) -> float:
|
| 16 |
if symbol in CRITICAL_OPERATORS: return 1.5
|
|
@@ -46,9 +46,11 @@ def calculate_weighted_confidence(latex_string: str, mock_logits: bool = True) -
|
|
| 46 |
return round(total_weighted_ci / total_weights, 4)
|
| 47 |
|
| 48 |
def clean_latex_output(text: str) -> str:
|
| 49 |
-
"""Aggressively remove CJK characters from OCR output."""
|
|
|
|
| 50 |
cleaned = CJK_PATTERN.sub('', text)
|
| 51 |
-
#
|
|
|
|
| 52 |
cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()
|
| 53 |
return cleaned
|
| 54 |
|
|
|
|
| 9 |
CRITICAL_OPERATORS = ["\\int", "\\sum", "=", "\\frac", "+", "-", "*", "\\times", "\\div"]
|
| 10 |
BRACKETS_LIMITS = ["(", ")", "[", "]", "\\{", "\\}", "^", "_"]
|
| 11 |
AMBIGUOUS_SYMBOLS = ["8", "B", "0", "O", "l", "1", "I", "S", "5", "Z", "2"]
|
| 12 |
+
# CJK character ranges (Chinese, Japanese, Korean) including punctuation
|
| 13 |
+
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef\u3001\u3002\uff0c\uff0e\uff1a\uff1b\uff1f\uff01]')
|
| 14 |
|
| 15 |
def get_symbol_weight(symbol: str) -> float:
|
| 16 |
if symbol in CRITICAL_OPERATORS: return 1.5
|
|
|
|
| 46 |
return round(total_weighted_ci / total_weights, 4)
|
| 47 |
|
| 48 |
def clean_latex_output(text: str) -> str:
|
| 49 |
+
"""Aggressively remove CJK characters and punctuation from OCR output."""
|
| 50 |
+
if not text: return ""
|
| 51 |
cleaned = CJK_PATTERN.sub('', text)
|
| 52 |
+
# Remove common conversational noise
|
| 53 |
+
cleaned = re.sub(r'(?i)\b(solve|find|evaluate|simplify)\b', '', cleaned)
|
| 54 |
cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()
|
| 55 |
return cleaned
|
| 56 |
|