Antigravity Agent committed on
Commit ·
be2c8ad
1
Parent(s): ce0c46c
fix(ocr): apply universal CJK filter and early input sanitization
Browse files
- llm_agent.py +3 -0
- math_utils.py +2 -2
- ocr_module.py +6 -4
llm_agent.py
CHANGED
|
@@ -104,6 +104,9 @@ class LLMAgent:
|
|
| 104 |
self.use_real_api = False
|
| 105 |
|
| 106 |
def generate_solution(self, problem: str) -> dict:
|
|
|
|
|
|
|
|
|
|
| 107 |
if self.use_real_api and self.client:
|
| 108 |
return self._call_real_gemini(problem)
|
| 109 |
return self._simulate_agent(problem)
|
|
|
|
| 104 |
self.use_real_api = False
|
| 105 |
|
| 106 |
def generate_solution(self, problem: str) -> dict:
|
| 107 |
+
# Expert ML Patch: Clean input early to prevent CJK leakage to APIs
|
| 108 |
+
problem = clean_latex(problem)
|
| 109 |
+
|
| 110 |
if self.use_real_api and self.client:
|
| 111 |
return self._call_real_gemini(problem)
|
| 112 |
return self._simulate_agent(problem)
|
math_utils.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
-
# CJK character ranges (Chinese, Japanese, Korean)
|
| 4 |
-
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]')
|
| 5 |
|
| 6 |
def clean_latex(text: str) -> str:
|
| 7 |
"""Standardized cleaning for both OCR and LLM logic."""
|
|
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
+
# CJK character ranges (Chinese, Japanese, Korean) including punctuation
|
| 4 |
+
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef\u3001\u3002\uff0c\uff0e\uff1a\uff1b\uff1f\uff01]')
|
| 5 |
|
| 6 |
def clean_latex(text: str) -> str:
|
| 7 |
"""Standardized cleaning for both OCR and LLM logic."""
|
ocr_module.py
CHANGED
|
@@ -9,8 +9,8 @@ from PIL import Image
|
|
| 9 |
CRITICAL_OPERATORS = ["\\int", "\\sum", "=", "\\frac", "+", "-", "*", "\\times", "\\div"]
|
| 10 |
BRACKETS_LIMITS = ["(", ")", "[", "]", "\\{", "\\}", "^", "_"]
|
| 11 |
AMBIGUOUS_SYMBOLS = ["8", "B", "0", "O", "l", "1", "I", "S", "5", "Z", "2"]
|
| 12 |
-
# CJK character ranges (Chinese, Japanese, Korean)
|
| 13 |
-
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]')
|
| 14 |
|
| 15 |
def get_symbol_weight(symbol: str) -> float:
|
| 16 |
if symbol in CRITICAL_OPERATORS: return 1.5
|
|
@@ -46,9 +46,11 @@ def calculate_weighted_confidence(latex_string: str, mock_logits: bool = True) -
|
|
| 46 |
return round(total_weighted_ci / total_weights, 4)
|
| 47 |
|
| 48 |
def clean_latex_output(text: str) -> str:
|
| 49 |
-
"""Aggressively remove CJK characters from OCR output."""
|
|
|
|
| 50 |
cleaned = CJK_PATTERN.sub('', text)
|
| 51 |
-
#
|
|
|
|
| 52 |
cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()
|
| 53 |
return cleaned
|
| 54 |
|
|
|
|
| 9 |
CRITICAL_OPERATORS = ["\\int", "\\sum", "=", "\\frac", "+", "-", "*", "\\times", "\\div"]
|
| 10 |
BRACKETS_LIMITS = ["(", ")", "[", "]", "\\{", "\\}", "^", "_"]
|
| 11 |
AMBIGUOUS_SYMBOLS = ["8", "B", "0", "O", "l", "1", "I", "S", "5", "Z", "2"]
|
| 12 |
+
# CJK character ranges (Chinese, Japanese, Korean) including punctuation
|
| 13 |
+
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef\u3001\u3002\uff0c\uff0e\uff1a\uff1b\uff1f\uff01]')
|
| 14 |
|
| 15 |
def get_symbol_weight(symbol: str) -> float:
|
| 16 |
if symbol in CRITICAL_OPERATORS: return 1.5
|
|
|
|
| 46 |
return round(total_weighted_ci / total_weights, 4)
|
| 47 |
|
| 48 |
def clean_latex_output(text: str) -> str:
|
| 49 |
+
"""Aggressively remove CJK characters and punctuation from OCR output."""
|
| 50 |
+
if not text: return ""
|
| 51 |
cleaned = CJK_PATTERN.sub('', text)
|
| 52 |
+
# Remove common conversational noise
|
| 53 |
+
cleaned = re.sub(r'(?i)\b(solve|find|evaluate|simplify)\b', '', cleaned)
|
| 54 |
cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()
|
| 55 |
return cleaned
|
| 56 |
|