mcp_ocr_chandra

Sleeping

App Files Files Community

Vachudev committed on Dec 5, 2025

Commit

678c3d1

verified ·

1 Parent(s): 88f995b

chandra ocr

Browse files

Files changed (1) hide show

ocr_engine.py +113 -47

ocr_engine.py CHANGED Viewed

@@ -1,63 +1,129 @@
-import pytesseract
-from pytesseract import Output
-from pdf2image import convert_from_path
-from PIL import Image
 import os
 import logging
-import numpy as np
 logger = logging.getLogger("ocr_engine")
def extract_text_and_conf(file_path: str) -> tuple[str, float]:
    """
    Extract text AND an average confidence score from a PDF or image via Tesseract.

    PDFs are rasterized with ``convert_from_path``; files ending in
    .png/.jpg/.jpeg/.tiff/.bmp are opened with ``Image.open``. Each page is
    OCR'd twice: once with ``image_to_string`` for the text and once with
    ``image_to_data`` for the per-element confidences.

    Args:
        file_path: Path to the PDF or image file.

    Returns: (text_content, average_confidence_0_to_100)
        ``("", 0.0)`` is returned when the file does not exist, has an
        unsupported extension, cannot be loaded, or OCR fails. This function
        does not raise; failures are logged on the module ``logger``.
    """
    if not os.path.exists(file_path):
        return "", 0.0

    lowered = file_path.lower()
    try:
        # 1. Load images (one per page)
        if lowered.endswith(".pdf"):
            try:
                images = convert_from_path(file_path)
            except Exception as e:
                logger.error(f"PDF Convert Error: {e}")
                return "", 0.0
        elif lowered.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
            try:
                images = [Image.open(file_path)]
            except Exception as e:
                logger.error(f"Image Open Error: {e}")
                return "", 0.0
        else:
            # Previously this case fell through silently with no pages to
            # process; the result is unchanged but the reason is now logged.
            logger.warning("Unsupported file type for OCR: %s", file_path)
            return "", 0.0

        page_texts = []
        confidences = []
        try:
            # 2. Process each page
            for page_no, image in enumerate(images, start=1):
                # A. Plain text for the page, prefixed with a page separator
                page_text = pytesseract.image_to_string(image)
                page_texts.append(f"--- Page {page_no} ---\n{page_text}\n")

                # B. Confidence data; -1 marks structural elements (not words)
                data = pytesseract.image_to_data(image, output_type=Output.DICT)
                confidences.extend(c for c in data["conf"] if c != -1)
        finally:
            # Release the underlying file handles / pixel buffers even when
            # OCR fails part-way through the document.
            for image in images:
                image.close()

        # 3. Average confidence over every recognized element
        avg_conf = sum(confidences) / len(confidences) if confidences else 0.0
        return "".join(page_texts).strip(), round(avg_conf, 2)
    except Exception as e:
        logger.error(f"OCR Critical Error: {e}")
        return "", 0.0

 import os
+import json
 import logging
+import subprocess
+import tempfile
+from typing import Tuple, List
 logger = logging.getLogger("ocr_engine")
def _as_list(value) -> list:
    """Return *value* when it is a list, otherwise an empty list (tolerates null/missing fields)."""
    return value if isinstance(value, list) else []


def _append_conf(node, confidences: List[float]) -> bool:
    """Append ``node["conf"]`` to *confidences* when it is numeric; return whether it was appended."""
    if not isinstance(node, dict):
        return False
    raw = node.get("conf")
    if raw is None:
        return False
    try:
        confidences.append(float(raw))
    except (TypeError, ValueError):
        return False
    return True


def _parse_chandra_jsonl(jsonl_path: str) -> Tuple[List[str], List[float]]:
    """Read the per-page JSONL file and return ``(text_parts, confidences)``.

    Expected record shape (one JSON object per line), as sketched by the
    original author:
        {"page": 1, "blocks": [{"type": "text", "bbox": [...],
                                "lines": [{"text": "...", "conf": 0.97, "tokens": [...]}]}]}

    Malformed lines and records that are not JSON objects are skipped with a
    warning, so one bad record does not discard the rest of the document.
    """
    text_parts: List[str] = []
    confidences: List[float] = []
    page_counter = 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                page_data = json.loads(raw_line)
            except json.JSONDecodeError as e:
                logger.warning("Skipping malformed Chandra JSONL line %d: %s", line_no, e)
                continue
            if not isinstance(page_data, dict):
                logger.warning("Skipping non-object Chandra JSONL line %d", line_no)
                continue

            # Fall back to a running counter when the record has no "page" key.
            page_counter += 1
            page_num = page_data.get("page", page_counter)
            text_parts.append(f"--- Page {page_num} ---")

            for block in _as_list(page_data.get("blocks")):
                # Only blocks typed "text" contribute text and confidence.
                if not isinstance(block, dict) or block.get("type") != "text":
                    continue
                for line in _as_list(block.get("lines")):
                    if not isinstance(line, dict):
                        continue
                    line_text = line.get("text", "")
                    if line_text:
                        text_parts.append(str(line_text))
                    # Prefer line-level confidence; otherwise use token-level values.
                    if not _append_conf(line, confidences):
                        for tok in _as_list(line.get("tokens")):
                            _append_conf(tok, confidences)
    return text_parts, confidences


def extract_text_and_conf(file_path: str) -> Tuple[str, float]:
    """
    Extracts text AND confidence score from a PDF or Image using Chandra OCR.

    Runs ``chandra pdf <file>`` for ``.pdf`` inputs and ``chandra ocr <file>``
    for everything else, asking for JSONL output in a temporary directory, then
    joins the text lines (with ``--- Page N ---`` separators) and averages the
    reported confidences.

    NOTE(review): the subcommands, the ``--format``/``--out`` flags and the
    JSONL schema are taken from the original author's comments — confirm them
    against the installed chandra-ocr version.

    Args:
        file_path: Path to the PDF or image file.

    Returns: (text_content, average_confidence_0_to_100)
        ``("", 0.0)`` when the file is missing, the CLI is unavailable or
        fails, no output file is produced, or an unexpected error occurs.
        This function does not raise; failures are logged.

    Requirements:
        - pip install chandra-ocr
        - 'chandra' CLI must be on PATH
    """
    if not os.path.exists(file_path):
        logger.error("File not found: %s", file_path)
        return "", 0.0

    # Decide which Chandra subcommand to use (png/jpg/webp/tiff/etc. -> "ocr").
    ext = os.path.splitext(file_path)[1].lower()
    subcommand = "pdf" if ext == ".pdf" else "ocr"

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            out_jsonl = os.path.join(tmpdir, "out.jsonl")
            cmd: List[str] = [
                "chandra",
                subcommand,
                file_path,
                "--format", "jsonl",
                "--out", out_jsonl,
            ]
            logger.info("Running Chandra OCR: %s", " ".join(cmd))

            try:
                proc = subprocess.run(
                    cmd,
                    check=False,
                    capture_output=True,
                    text=True,
                    errors="replace",  # undecodable CLI output must not abort the run
                )
            except FileNotFoundError:
                # Raised by subprocess when the executable itself cannot be found.
                logger.error(
                    "Chandra CLI not found on PATH; install it with 'pip install chandra-ocr'"
                )
                return "", 0.0

            if proc.returncode != 0:
                logger.error(
                    "Chandra OCR failed (exit=%s). stdout: %s\nstderr: %s",
                    proc.returncode,
                    proc.stdout,
                    proc.stderr,
                )
                return "", 0.0

            if not os.path.exists(out_jsonl):
                logger.error(
                    "Chandra OCR did not produce expected output file: %s", out_jsonl
                )
                return "", 0.0

            text_parts, confidences = _parse_chandra_jsonl(out_jsonl)

        full_text = "\n".join(text_parts).strip()

        avg_conf = 0.0
        if confidences:
            mean_conf = sum(confidences) / len(confidences)
            # Confidences are expected in [0, 1] and are scaled to 0-100. If any
            # value exceeds 1 the data is assumed to be on a percent scale
            # already, so it is not scaled a second time.
            if max(confidences) <= 1.0:
                mean_conf *= 100.0
            # Keep the result inside the documented 0-100 range.
            avg_conf = min(max(mean_conf, 0.0), 100.0)

        return full_text, round(avg_conf, 2)
    except Exception as e:
        logger.error("OCR Critical Error (Chandra): %s", e, exc_info=True)
        return "", 0.0