Spaces:

Varshithdharmajv
/

mvm2-math-verification

Sleeping

App Files Files Community

Varshithdharmajv committed 20 days ago

Commit

1af77e5

verified ·

1 Parent(s): 52eaa3d

Upload ocr_module.py with huggingface_hub

Browse files

Files changed (1) hide show

ocr_module.py +50 -20

ocr_module.py CHANGED Viewed

@@ -104,12 +104,37 @@ class MVM2OCREngine:
         except Exception as e:
             print(f"[OCR] Warning: HandwritingTranscriber unavailable ({e})")
     def process_image(self, image_path: str) -> Dict[str, Any]:
-        """Full OCR pipeline with CJK filtering and confidence scoring."""
         if not os.path.exists(image_path):
             return {"error": f"Image not found: {image_path}", "latex_output": "", "weighted_confidence": 0.0}
-        # Validate image
         try:
             with Image.open(image_path) as img:
                 width, height = img.size
@@ -121,33 +146,36 @@ class MVM2OCREngine:
         raw_latex = ""
         if self.model_loaded and self.p2t:
             try:
-                # Primary: use recognize() for formula detection
-                out = self.p2t.recognize(image_path)
-                raw_latex = extract_latex_from_pix2text(out)
-                # Fallback if empty result
-                if not raw_latex.strip() or raw_latex.strip() in [".", ","]:
-                    try:
-                        out2 = self.p2t.recognize_formula(image_path)
-                        raw_latex = clean_latex_output(str(out2))
-                    except:
-                        pass
                 if not raw_latex.strip():
-                    raw_latex = "No math content detected."
             except Exception as e:
                 print(f"[OCR] Inference error: {e}")
                 raw_latex = f"OCR Error: {str(e)}"
         else:
-            # Simulation mode disabled. If Pix2Text is not loaded, we return empty so the user knows.
             raw_latex = "No math detected (OCR model not loaded)."
-        # Final CJK cleanup pass (catches anything that slipped through)
         raw_latex = clean_latex_output(raw_latex)
-        # If no math detected by Pix2Text, try HandwritingTranscriber for InkML
-        if (not raw_latex.strip() or "No math content" in raw_latex) and self.transcriber and image_path.endswith('.inkml'):
             try:
                 raw_latex, _ = self.transcriber.transcribe_inkml(image_path)
                 print(f"[OCR] Used HandwritingTranscriber for InkML: {raw_latex}")
@@ -159,5 +187,7 @@ class MVM2OCREngine:
         return {
             "latex_output": raw_latex,
             "weighted_confidence": ocr_conf,
-            "backend": "handwriting" if self.transcriber and image_path.endswith('.inkml') else ("pix2text" if self.model_loaded else "simulation")
         }

         except Exception as e:
             print(f"[OCR] Warning: HandwritingTranscriber unavailable ({e})")
+    def _extract_formulas_only(self, pix2text_output) -> str:
+        """Extract ONLY math formula regions, discarding prose text regions.
+
+        Args:
+            pix2text_output: Either a plain string or a list of region items.
+                Within a list, only dict items are inspected; other items are
+                skipped. NOTE(review): presumably this is the return value of
+                Pix2Text's recognize() -- confirm the item schema against the
+                installed Pix2Text version.
+
+        Returns:
+            The cleaned formula strings joined with newlines. Returns "" when
+            nothing qualifies or when the input is neither a str nor a list.
+        """
+        if isinstance(pix2text_output, str):
+            # A bare string is kept only if it contains at least one of these
+            # math-looking characters; otherwise it is treated as prose.
+            if any(op in pix2text_output for op in ['\\', '^', '_', '=', '+', '-']):
+                return clean_latex_output(pix2text_output)
+            return ""
+        if isinstance(pix2text_output, list):
+            formula_parts = []
+            for item in pix2text_output:
+                if isinstance(item, dict):
+                    # Items without a 'type' key are handled as plain text.
+                    item_type = item.get('type', 'text')
+                    if item_type in ('isolated_equation', 'embedding', 'formula', 'math'):
+                        # Fall back to 'latex' when 'text' is missing or empty.
+                        text = item.get('text', '') or item.get('latex', '')
+                        text = clean_latex_output(str(text)).strip()
+                        if text:
+                            formula_parts.append(text)
+                    elif item_type == 'text':
+                        raw = item.get('text', '')
+                        # Pull inline math delimited by $...$ or \(...\) out of
+                        # the prose; the rest of the text is dropped.
+                        inline = re.findall(r'\$(.*?)\$|\\\((.*?)\\\)', raw)
+                        for match in inline:
+                            # findall yields one tuple per match with a slot per
+                            # capture group; take whichever group is non-empty.
+                            part = match[0] or match[1]
+                            if part.strip():
+                                formula_parts.append(clean_latex_output(part))
+            return '\n'.join(formula_parts)
+        # Any other input type (e.g. None) yields no formulas.
+        return ""
     def process_image(self, image_path: str) -> Dict[str, Any]:
+        """Full OCR pipeline: formula-first mode with prose filtering and confidence scoring."""
         if not os.path.exists(image_path):
             return {"error": f"Image not found: {image_path}", "latex_output": "", "weighted_confidence": 0.0}
         try:
             with Image.open(image_path) as img:
                 width, height = img.size
         raw_latex = ""
         if self.model_loaded and self.p2t:
             try:
+                # --- PASS 1: Formula-only mode (cleanest LaTeX output) ---
+                try:
+                    formula_out = self.p2t.recognize_formula(image_path)
+                    raw_latex = clean_latex_output(str(formula_out)).strip()
+                    print(f"[OCR] Pass 1 (formula mode): {raw_latex[:80]}")
+                except Exception as e1:
+                    print(f"[OCR] Pass 1 formula mode failed: {e1}")
+                    raw_latex = ""
+                # --- PASS 2: General recognize(), extract formula regions only ---
+                if not raw_latex or len(raw_latex) < 3:
+                    out2 = self.p2t.recognize(image_path)
+                    raw_latex = self._extract_formulas_only(out2)
+                    print(f"[OCR] Pass 2 (formula extraction): {raw_latex[:80]}")
+                # --- PASS 3: Full text fallback ---
+                if not raw_latex.strip():
+                    raw_latex = extract_latex_from_pix2text(out2 if 'out2' in dir() else "")
                 if not raw_latex.strip():
+                    raw_latex = "No mathematical formula detected."
             except Exception as e:
                 print(f"[OCR] Inference error: {e}")
                 raw_latex = f"OCR Error: {str(e)}"
         else:
             raw_latex = "No math detected (OCR model not loaded)."
         raw_latex = clean_latex_output(raw_latex)
+        if (not raw_latex.strip() or "No math" in raw_latex) and self.transcriber and image_path.endswith('.inkml'):
             try:
                 raw_latex, _ = self.transcriber.transcribe_inkml(image_path)
                 print(f"[OCR] Used HandwritingTranscriber for InkML: {raw_latex}")
         return {
             "latex_output": raw_latex,
             "weighted_confidence": ocr_conf,
+            "backend": "handwriting" if self.transcriber and image_path.endswith('.inkml') else (
+                "pix2text-formula" if self.model_loaded else "simulation"
+            )
         }