Upload ocr_module.py with huggingface_hub
Browse files- ocr_module.py +50 -20
ocr_module.py
CHANGED
|
@@ -104,12 +104,37 @@ class MVM2OCREngine:
|
|
| 104 |
except Exception as e:
|
| 105 |
print(f"[OCR] Warning: HandwritingTranscriber unavailable ({e})")
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
def process_image(self, image_path: str) -> Dict[str, Any]:
|
| 108 |
-
"""Full OCR pipeline with
|
| 109 |
if not os.path.exists(image_path):
|
| 110 |
return {"error": f"Image not found: {image_path}", "latex_output": "", "weighted_confidence": 0.0}
|
| 111 |
|
| 112 |
-
# Validate image
|
| 113 |
try:
|
| 114 |
with Image.open(image_path) as img:
|
| 115 |
width, height = img.size
|
|
@@ -121,33 +146,36 @@ class MVM2OCREngine:
|
|
| 121 |
raw_latex = ""
|
| 122 |
if self.model_loaded and self.p2t:
|
| 123 |
try:
|
| 124 |
-
#
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
if not raw_latex.strip():
|
| 137 |
-
raw_latex = "No
|
| 138 |
|
| 139 |
except Exception as e:
|
| 140 |
print(f"[OCR] Inference error: {e}")
|
| 141 |
raw_latex = f"OCR Error: {str(e)}"
|
| 142 |
else:
|
| 143 |
-
# Simulation mode disabled. If Pix2Text is not loaded, we return empty so the user knows.
|
| 144 |
raw_latex = "No math detected (OCR model not loaded)."
|
| 145 |
|
| 146 |
-
# Final CJK cleanup pass (catches anything that slipped through)
|
| 147 |
raw_latex = clean_latex_output(raw_latex)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
if (not raw_latex.strip() or "No math content" in raw_latex) and self.transcriber and image_path.endswith('.inkml'):
|
| 151 |
try:
|
| 152 |
raw_latex, _ = self.transcriber.transcribe_inkml(image_path)
|
| 153 |
print(f"[OCR] Used HandwritingTranscriber for InkML: {raw_latex}")
|
|
@@ -159,5 +187,7 @@ class MVM2OCREngine:
|
|
| 159 |
return {
|
| 160 |
"latex_output": raw_latex,
|
| 161 |
"weighted_confidence": ocr_conf,
|
| 162 |
-
"backend": "handwriting" if self.transcriber and image_path.endswith('.inkml') else (
|
|
|
|
|
|
|
| 163 |
}
|
|
|
|
| 104 |
except Exception as e:
|
| 105 |
print(f"[OCR] Warning: HandwritingTranscriber unavailable ({e})")
|
| 106 |
|
| 107 |
+
def _extract_formulas_only(self, pix2text_output) -> str:
|
| 108 |
+
"""Extract ONLY math formula regions, discarding prose text regions."""
|
| 109 |
+
if isinstance(pix2text_output, str):
|
| 110 |
+
if any(op in pix2text_output for op in ['\\', '^', '_', '=', '+', '-']):
|
| 111 |
+
return clean_latex_output(pix2text_output)
|
| 112 |
+
return ""
|
| 113 |
+
if isinstance(pix2text_output, list):
|
| 114 |
+
formula_parts = []
|
| 115 |
+
for item in pix2text_output:
|
| 116 |
+
if isinstance(item, dict):
|
| 117 |
+
item_type = item.get('type', 'text')
|
| 118 |
+
if item_type in ('isolated_equation', 'embedding', 'formula', 'math'):
|
| 119 |
+
text = item.get('text', '') or item.get('latex', '')
|
| 120 |
+
text = clean_latex_output(str(text)).strip()
|
| 121 |
+
if text:
|
| 122 |
+
formula_parts.append(text)
|
| 123 |
+
elif item_type == 'text':
|
| 124 |
+
raw = item.get('text', '')
|
| 125 |
+
inline = re.findall(r'\$(.*?)\$|\\\((.*?)\\\)', raw)
|
| 126 |
+
for match in inline:
|
| 127 |
+
part = match[0] or match[1]
|
| 128 |
+
if part.strip():
|
| 129 |
+
formula_parts.append(clean_latex_output(part))
|
| 130 |
+
return '\n'.join(formula_parts)
|
| 131 |
+
return ""
|
| 132 |
+
|
| 133 |
def process_image(self, image_path: str) -> Dict[str, Any]:
|
| 134 |
+
"""Full OCR pipeline: formula-first mode with prose filtering and confidence scoring."""
|
| 135 |
if not os.path.exists(image_path):
|
| 136 |
return {"error": f"Image not found: {image_path}", "latex_output": "", "weighted_confidence": 0.0}
|
| 137 |
|
|
|
|
| 138 |
try:
|
| 139 |
with Image.open(image_path) as img:
|
| 140 |
width, height = img.size
|
|
|
|
| 146 |
raw_latex = ""
|
| 147 |
if self.model_loaded and self.p2t:
|
| 148 |
try:
|
| 149 |
+
# --- PASS 1: Formula-only mode (cleanest LaTeX output) ---
|
| 150 |
+
try:
|
| 151 |
+
formula_out = self.p2t.recognize_formula(image_path)
|
| 152 |
+
raw_latex = clean_latex_output(str(formula_out)).strip()
|
| 153 |
+
print(f"[OCR] Pass 1 (formula mode): {raw_latex[:80]}")
|
| 154 |
+
except Exception as e1:
|
| 155 |
+
print(f"[OCR] Pass 1 formula mode failed: {e1}")
|
| 156 |
+
raw_latex = ""
|
| 157 |
+
|
| 158 |
+
# --- PASS 2: General recognize(), extract formula regions only ---
|
| 159 |
+
if not raw_latex or len(raw_latex) < 3:
|
| 160 |
+
out2 = self.p2t.recognize(image_path)
|
| 161 |
+
raw_latex = self._extract_formulas_only(out2)
|
| 162 |
+
print(f"[OCR] Pass 2 (formula extraction): {raw_latex[:80]}")
|
| 163 |
+
|
| 164 |
+
# --- PASS 3: Full text fallback ---
|
| 165 |
+
if not raw_latex.strip():
|
| 166 |
+
raw_latex = extract_latex_from_pix2text(out2 if 'out2' in dir() else "")
|
| 167 |
if not raw_latex.strip():
|
| 168 |
+
raw_latex = "No mathematical formula detected."
|
| 169 |
|
| 170 |
except Exception as e:
|
| 171 |
print(f"[OCR] Inference error: {e}")
|
| 172 |
raw_latex = f"OCR Error: {str(e)}"
|
| 173 |
else:
|
|
|
|
| 174 |
raw_latex = "No math detected (OCR model not loaded)."
|
| 175 |
|
|
|
|
| 176 |
raw_latex = clean_latex_output(raw_latex)
|
| 177 |
|
| 178 |
+
if (not raw_latex.strip() or "No math" in raw_latex) and self.transcriber and image_path.endswith('.inkml'):
|
|
|
|
| 179 |
try:
|
| 180 |
raw_latex, _ = self.transcriber.transcribe_inkml(image_path)
|
| 181 |
print(f"[OCR] Used HandwritingTranscriber for InkML: {raw_latex}")
|
|
|
|
| 187 |
return {
|
| 188 |
"latex_output": raw_latex,
|
| 189 |
"weighted_confidence": ocr_conf,
|
| 190 |
+
"backend": "handwriting" if self.transcriber and image_path.endswith('.inkml') else (
|
| 191 |
+
"pix2text-formula" if self.model_loaded else "simulation"
|
| 192 |
+
)
|
| 193 |
}
|