Antigravity Agent committed on
Commit
be2c8ad
·
1 Parent(s): ce0c46c

fix(ocr): apply universal CJK filter and early input sanitization

Browse files
Files changed (3) hide show
  1. llm_agent.py +3 -0
  2. math_utils.py +2 -2
  3. ocr_module.py +6 -4
llm_agent.py CHANGED
@@ -104,6 +104,9 @@ class LLMAgent:
104
  self.use_real_api = False
105
 
106
  def generate_solution(self, problem: str) -> dict:
 
 
 
107
  if self.use_real_api and self.client:
108
  return self._call_real_gemini(problem)
109
  return self._simulate_agent(problem)
 
104
  self.use_real_api = False
105
 
106
def generate_solution(self, problem: str) -> dict:
    """Return a solution dict for *problem*.

    The problem text is sanitized first so CJK characters never leak to
    the downstream API, then the call is routed to the live Gemini
    client when one is configured, otherwise to the local simulator.
    """
    sanitized = clean_latex(problem)  # scrub CJK/noise before any backend sees it
    live_backend_ready = bool(self.use_real_api and self.client)
    if live_backend_ready:
        return self._call_real_gemini(sanitized)
    return self._simulate_agent(sanitized)
math_utils.py CHANGED
@@ -1,7 +1,7 @@
1
  import re
2
 
3
- # CJK character ranges (Chinese, Japanese, Korean)
4
- CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]')
5
 
6
  def clean_latex(text: str) -> str:
7
  """Standardized cleaning for both OCR and LLM logic."""
 
1
  import re
2
 
3
# CJK character ranges (Chinese, Japanese, Korean) including punctuation.
# NOTE: U+3000-303F already covers ideographic punctuation such as
# U+3001 and U+3002, and U+FF00-FFEF covers the fullwidth forms
# (U+FF01, U+FF0C, U+FF0E, U+FF1A, U+FF1B, U+FF1F) — the previously
# appended explicit singletons were redundant no-ops, so the pattern is
# kept to the five covering ranges.
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]')
5
 
6
  def clean_latex(text: str) -> str:
7
  """Standardized cleaning for both OCR and LLM logic."""
ocr_module.py CHANGED
@@ -9,8 +9,8 @@ from PIL import Image
9
  CRITICAL_OPERATORS = ["\\int", "\\sum", "=", "\\frac", "+", "-", "*", "\\times", "\\div"]
10
  BRACKETS_LIMITS = ["(", ")", "[", "]", "\\{", "\\}", "^", "_"]
11
  AMBIGUOUS_SYMBOLS = ["8", "B", "0", "O", "l", "1", "I", "S", "5", "Z", "2"]
12
- # CJK character ranges (Chinese, Japanese, Korean)
13
- CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]')
14
 
15
  def get_symbol_weight(symbol: str) -> float:
16
  if symbol in CRITICAL_OPERATORS: return 1.5
@@ -46,9 +46,11 @@ def calculate_weighted_confidence(latex_string: str, mock_logits: bool = True) -
46
  return round(total_weighted_ci / total_weights, 4)
47
 
48
  def clean_latex_output(text: str) -> str:
49
- """Aggressively remove CJK characters from OCR output."""
 
50
  cleaned = CJK_PATTERN.sub('', text)
51
- # Also remove lone punctuation clusters that result from CJK removal
 
52
  cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()
53
  return cleaned
54
 
 
9
  CRITICAL_OPERATORS = ["\\int", "\\sum", "=", "\\frac", "+", "-", "*", "\\times", "\\div"]
10
  BRACKETS_LIMITS = ["(", ")", "[", "]", "\\{", "\\}", "^", "_"]
11
  AMBIGUOUS_SYMBOLS = ["8", "B", "0", "O", "l", "1", "I", "S", "5", "Z", "2"]
12
# CJK character ranges (Chinese, Japanese, Korean) including punctuation.
# NOTE: U+3000-303F already covers ideographic punctuation such as
# U+3001 and U+3002, and U+FF00-FFEF covers the fullwidth forms
# (U+FF01, U+FF0C, U+FF0E, U+FF1A, U+FF1B, U+FF1F) — the previously
# appended explicit singletons were redundant no-ops, so the pattern is
# kept to the five covering ranges.
CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]')
14
 
15
  def get_symbol_weight(symbol: str) -> float:
16
  if symbol in CRITICAL_OPERATORS: return 1.5
 
46
  return round(total_weighted_ci / total_weights, 4)
47
 
48
def clean_latex_output(text: str) -> str:
    """Aggressively remove CJK characters and punctuation from OCR output."""
    if not text:
        return ""
    # Drop every CJK codepoint (handled by the module-level CJK_PATTERN).
    without_cjk = CJK_PATTERN.sub('', text)
    # Remove common conversational noise
    without_noise = re.sub(r'(?i)\b(solve|find|evaluate|simplify)\b', '', without_cjk)
    # Collapse whitespace runs left behind by the deletions.
    return re.sub(r'\s{2,}', ' ', without_noise).strip()
56