Upload ocr_module.py with huggingface_hub
Browse files- ocr_module.py +25 -6
ocr_module.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import json
|
| 3 |
-
import random
|
| 4 |
-
import re
|
| 5 |
-
from typing import Dict, List, Any
|
| 6 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# MVM2 Configuration for OCR Confidence Weights
|
| 9 |
CRITICAL_OPERATORS = ["\\int", "\\sum", "=", "\\frac", "+", "-", "*", "\\times", "\\div"]
|
|
@@ -90,6 +89,17 @@ class MVM2OCREngine:
|
|
| 90 |
except Exception as e:
|
| 91 |
print(f"[OCR] Warning: Pix2Text unavailable ({e}). Using simulation mode.")
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
def process_image(self, image_path: str) -> Dict[str, Any]:
|
| 94 |
"""Full OCR pipeline with CJK filtering and confidence scoring."""
|
| 95 |
if not os.path.exists(image_path):
|
|
@@ -131,10 +141,19 @@ class MVM2OCREngine:
|
|
| 131 |
|
| 132 |
# Final CJK cleanup pass (catches anything that slipped through)
|
| 133 |
raw_latex = clean_latex_output(raw_latex)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
ocr_conf = calculate_weighted_confidence(raw_latex)
|
| 135 |
|
| 136 |
return {
|
| 137 |
"latex_output": raw_latex,
|
| 138 |
"weighted_confidence": ocr_conf,
|
| 139 |
-
"backend": "pix2text" if self.model_loaded else "simulation"
|
| 140 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from PIL import Image
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
# Handwriting transcription models are bundled in the "handwritten-math-transcription" folder (resolved relative to the current working directory)
|
| 5 |
+
MODEL_PATH = os.path.join(os.getcwd(), "handwritten-math-transcription", "checkpoints", "model_v3_0.pth")
|
| 6 |
|
| 7 |
# MVM2 Configuration for OCR Confidence Weights
|
| 8 |
CRITICAL_OPERATORS = ["\\int", "\\sum", "=", "\\frac", "+", "-", "*", "\\times", "\\div"]
|
|
|
|
| 89 |
except Exception as e:
|
| 90 |
print(f"[OCR] Warning: Pix2Text unavailable ({e}). Using simulation mode.")
|
| 91 |
|
| 92 |
+
self.transcriber = None
|
| 93 |
+
try:
|
| 94 |
+
from handwriting_transcriber import HandwritingTranscriber
|
| 95 |
+
if os.path.exists(MODEL_PATH):
|
| 96 |
+
self.transcriber = HandwritingTranscriber(model_path=MODEL_PATH)
|
| 97 |
+
print(f"[OCR] HandwritingTranscriber loaded with model: {MODEL_PATH}")
|
| 98 |
+
else:
|
| 99 |
+
print(f"[OCR] Warning: Handwriting model not found at {MODEL_PATH}")
|
| 100 |
+
except Exception as e:
|
| 101 |
+
print(f"[OCR] Warning: HandwritingTranscriber unavailable ({e})")
|
| 102 |
+
|
| 103 |
def process_image(self, image_path: str) -> Dict[str, Any]:
|
| 104 |
"""Full OCR pipeline with CJK filtering and confidence scoring."""
|
| 105 |
if not os.path.exists(image_path):
|
|
|
|
| 141 |
|
| 142 |
# Final CJK cleanup pass (catches anything that slipped through)
|
| 143 |
raw_latex = clean_latex_output(raw_latex)
|
| 144 |
+
|
| 145 |
+
# If Pix2Text detected no math content, fall back to HandwritingTranscriber (only for .inkml inputs, and only when the transcriber loaded)
|
| 146 |
+
if (not raw_latex.strip() or "No math content" in raw_latex) and self.transcriber and image_path.endswith('.inkml'):
|
| 147 |
+
try:
|
| 148 |
+
raw_latex, _ = self.transcriber.transcribe_inkml(image_path)
|
| 149 |
+
print(f"[OCR] Used HandwritingTranscriber for InkML: {raw_latex}")
|
| 150 |
+
except Exception as e:
|
| 151 |
+
print(f"[OCR] HandwritingTranscriber error: {e}")
|
| 152 |
+
|
| 153 |
ocr_conf = calculate_weighted_confidence(raw_latex)
|
| 154 |
|
| 155 |
return {
|
| 156 |
"latex_output": raw_latex,
|
| 157 |
"weighted_confidence": ocr_conf,
|
| 158 |
+
"backend": "handwriting" if self.transcriber and image_path.endswith('.inkml') else ("pix2text" if self.model_loaded else "simulation")
|
| 159 |
}
|