Varshithdharmajv committed on
Commit
1af77e5
·
verified ·
1 Parent(s): 52eaa3d

Upload ocr_module.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ocr_module.py +50 -20
ocr_module.py CHANGED
@@ -104,12 +104,37 @@ class MVM2OCREngine:
104
  except Exception as e:
105
  print(f"[OCR] Warning: HandwritingTranscriber unavailable ({e})")
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  def process_image(self, image_path: str) -> Dict[str, Any]:
108
- """Full OCR pipeline with CJK filtering and confidence scoring."""
109
  if not os.path.exists(image_path):
110
  return {"error": f"Image not found: {image_path}", "latex_output": "", "weighted_confidence": 0.0}
111
 
112
- # Validate image
113
  try:
114
  with Image.open(image_path) as img:
115
  width, height = img.size
@@ -121,33 +146,36 @@ class MVM2OCREngine:
121
  raw_latex = ""
122
  if self.model_loaded and self.p2t:
123
  try:
124
- # Primary: use recognize() for formula detection
125
- out = self.p2t.recognize(image_path)
126
- raw_latex = extract_latex_from_pix2text(out)
127
-
128
- # Fallback if empty result
129
- if not raw_latex.strip() or raw_latex.strip() in [".", ","]:
130
- try:
131
- out2 = self.p2t.recognize_formula(image_path)
132
- raw_latex = clean_latex_output(str(out2))
133
- except:
134
- pass
135
-
 
 
 
 
 
 
136
  if not raw_latex.strip():
137
- raw_latex = "No math content detected."
138
 
139
  except Exception as e:
140
  print(f"[OCR] Inference error: {e}")
141
  raw_latex = f"OCR Error: {str(e)}"
142
  else:
143
- # Simulation mode disabled. If Pix2Text is not loaded, we return empty so the user knows.
144
  raw_latex = "No math detected (OCR model not loaded)."
145
 
146
- # Final CJK cleanup pass (catches anything that slipped through)
147
  raw_latex = clean_latex_output(raw_latex)
148
 
149
- # If no math detected by Pix2Text, try HandwritingTranscriber for InkML
150
- if (not raw_latex.strip() or "No math content" in raw_latex) and self.transcriber and image_path.endswith('.inkml'):
151
  try:
152
  raw_latex, _ = self.transcriber.transcribe_inkml(image_path)
153
  print(f"[OCR] Used HandwritingTranscriber for InkML: {raw_latex}")
@@ -159,5 +187,7 @@ class MVM2OCREngine:
159
  return {
160
  "latex_output": raw_latex,
161
  "weighted_confidence": ocr_conf,
162
- "backend": "handwriting" if self.transcriber and image_path.endswith('.inkml') else ("pix2text" if self.model_loaded else "simulation")
 
 
163
  }
 
104
  except Exception as e:
105
  print(f"[OCR] Warning: HandwritingTranscriber unavailable ({e})")
106
 
107
def _extract_formulas_only(self, pix2text_output) -> str:
    """Extract only math formula regions, discarding prose text regions.

    Args:
        pix2text_output: Either a plain string or a list of region items.
            Dict items are inspected through their ``type``, ``text`` and
            ``latex`` keys; list items that are not dicts are ignored.

    Returns:
        The cleaned formula fragments joined by newlines, or ``""`` when
        nothing formula-like is found or the input is neither a string
        nor a list.
    """
    if isinstance(pix2text_output, str):
        # Heuristic: keep a bare string only if it contains at least one
        # character that commonly appears in LaTeX / arithmetic.
        if any(op in pix2text_output for op in ('\\', '^', '_', '=', '+', '-')):
            return clean_latex_output(pix2text_output)
        return ""
    if not isinstance(pix2text_output, list):
        return ""

    formula_types = ('isolated_equation', 'embedding', 'formula', 'math')
    # Display delimiters ($$...$$, \[...\]) are tried before the inline ones;
    # otherwise "$$x$$" is consumed as two empty "$...$" matches and the
    # formula is lost. Only display math is allowed to span newlines.
    math_span = re.compile(
        r'\$\$(?s:(.+?))\$\$'
        r'|\\\[(?s:(.+?))\\\]'
        r'|\$(.+?)\$'
        r'|\\\((.+?)\\\)'
    )

    formula_parts = []
    for item in pix2text_output:
        if not isinstance(item, dict):
            continue
        item_type = item.get('type', 'text')
        if item_type in formula_types:
            # Fall back to '' so a None value is never stringified to "None".
            candidates = [item.get('text') or item.get('latex') or '']
        elif item_type == 'text':
            # Prose region: keep only the delimited math embedded in it.
            raw = item.get('text') or ''
            candidates = [
                next((group for group in groups if group), '')
                for groups in math_span.findall(str(raw))
            ]
        else:
            continue
        for candidate in candidates:
            cleaned = clean_latex_output(str(candidate)).strip()
            if cleaned:
                formula_parts.append(cleaned)
    return '\n'.join(formula_parts)
132
+
133
  def process_image(self, image_path: str) -> Dict[str, Any]:
134
+ """Full OCR pipeline: formula-first mode with prose filtering and confidence scoring."""
135
  if not os.path.exists(image_path):
136
  return {"error": f"Image not found: {image_path}", "latex_output": "", "weighted_confidence": 0.0}
137
 
 
138
  try:
139
  with Image.open(image_path) as img:
140
  width, height = img.size
 
146
  raw_latex = ""
147
  if self.model_loaded and self.p2t:
148
  try:
149
+ # --- PASS 1: Formula-only mode (cleanest LaTeX output) ---
150
+ try:
151
+ formula_out = self.p2t.recognize_formula(image_path)
152
+ raw_latex = clean_latex_output(str(formula_out)).strip()
153
+ print(f"[OCR] Pass 1 (formula mode): {raw_latex[:80]}")
154
+ except Exception as e1:
155
+ print(f"[OCR] Pass 1 formula mode failed: {e1}")
156
+ raw_latex = ""
157
+
158
+ # --- PASS 2: General recognize(), extract formula regions only ---
159
+ if not raw_latex or len(raw_latex) < 3:
160
+ out2 = self.p2t.recognize(image_path)
161
+ raw_latex = self._extract_formulas_only(out2)
162
+ print(f"[OCR] Pass 2 (formula extraction): {raw_latex[:80]}")
163
+
164
+ # --- PASS 3: Full text fallback ---
165
+ if not raw_latex.strip():
166
+ raw_latex = extract_latex_from_pix2text(out2 if 'out2' in dir() else "")
167
  if not raw_latex.strip():
168
+ raw_latex = "No mathematical formula detected."
169
 
170
  except Exception as e:
171
  print(f"[OCR] Inference error: {e}")
172
  raw_latex = f"OCR Error: {str(e)}"
173
  else:
 
174
  raw_latex = "No math detected (OCR model not loaded)."
175
 
 
176
  raw_latex = clean_latex_output(raw_latex)
177
 
178
+ if (not raw_latex.strip() or "No math" in raw_latex) and self.transcriber and image_path.endswith('.inkml'):
 
179
  try:
180
  raw_latex, _ = self.transcriber.transcribe_inkml(image_path)
181
  print(f"[OCR] Used HandwritingTranscriber for InkML: {raw_latex}")
 
187
  return {
188
  "latex_output": raw_latex,
189
  "weighted_confidence": ocr_conf,
190
+ "backend": "handwriting" if self.transcriber and image_path.endswith('.inkml') else (
191
+ "pix2text-formula" if self.model_loaded else "simulation"
192
+ )
193
  }