Update main.py
Browse files
main.py
CHANGED
|
@@ -16,6 +16,11 @@ import tempfile
|
|
| 16 |
import base64
|
| 17 |
from typing import Dict, Any, Optional, List
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
app = FastAPI(title="ScanAssured OCR & NER API")
|
| 20 |
|
| 21 |
# --- DRUG INTERACTIONS DATABASE ---
|
|
@@ -88,6 +93,141 @@ NER_MODELS = {
|
|
| 88 |
ner_model_cache: Dict[str, Any] = {}
|
| 89 |
ocr_model_cache: Dict[str, Any] = {}
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
# --- OCR MODEL LOADING ---
|
| 92 |
def get_ocr_predictor(det_arch: str, reco_arch: str):
|
| 93 |
"""Retrieves a loaded OCR predictor from cache or loads it if necessary."""
|
|
@@ -1327,6 +1467,10 @@ async def process_image(
|
|
| 1327 |
print("Generating synthesized document image...")
|
| 1328 |
synthesized_image = generate_synthesized_image(result)
|
| 1329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1330 |
# Method 1: img2table with built-in OCR
|
| 1331 |
print("Running img2table for table detection (Method 1: integrated OCR)...")
|
| 1332 |
table_formatted_text, table_data = extract_text_with_table_detection(
|
|
@@ -1450,6 +1594,18 @@ async def process_image(
|
|
| 1450 |
"formatted_text": block_geo_text if block_geo_data.get('is_table') else None,
|
| 1451 |
"fill_ratio": block_geo_data.get('fill_ratio', 0)
|
| 1452 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1453 |
}
|
| 1454 |
}
|
| 1455 |
|
|
|
|
| 16 |
import base64
|
| 17 |
from typing import Dict, Any, Optional, List
|
| 18 |
|
| 19 |
+
# Docling pipeline
|
| 20 |
+
from docling.document_converter import DocumentConverter, InputFormat, ImageFormatOption
|
| 21 |
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
| 22 |
+
from docling_ocr_onnxtr import OnnxtrOcrOptions
|
| 23 |
+
|
| 24 |
app = FastAPI(title="ScanAssured OCR & NER API")
|
| 25 |
|
| 26 |
# --- DRUG INTERACTIONS DATABASE ---
|
|
|
|
| 93 |
ner_model_cache: Dict[str, Any] = {}
|
| 94 |
ocr_model_cache: Dict[str, Any] = {}
|
| 95 |
|
| 96 |
+
# --- DOCLING CONVERTER CACHE ---
docling_converter_cache: Dict[str, Any] = {}


def get_docling_converter(det_arch: str = "db_mobilenet_v3_large", reco_arch: str = "crnn_vgg16_bn"):
    """Get or create a cached Docling DocumentConverter with OnnxTR OCR.

    Converters are memoized per (det_arch, reco_arch) pair in
    ``docling_converter_cache``. Returns ``None`` when initialization
    fails (e.g. docling is unavailable), so callers can degrade gracefully.
    """
    cache_key = f"docling_{det_arch}_{reco_arch}"

    cached = docling_converter_cache.get(cache_key)
    if cached is not None:
        print(f"Using cached Docling converter: {cache_key}")
        return cached

    try:
        print(f"Initializing Docling converter: det={det_arch}, reco={reco_arch}...")

        pipeline_options = PdfPipelineOptions(
            ocr_options=OnnxtrOcrOptions(det_arch=det_arch, reco_arch=reco_arch)
        )
        pipeline_options.do_table_structure = True
        pipeline_options.do_ocr = True
        # OnnxTR is provided by an external docling plugin package.
        pipeline_options.allow_external_plugins = True

        image_option = ImageFormatOption(pipeline_options=pipeline_options)
        converter = DocumentConverter(format_options={InputFormat.IMAGE: image_option})

        docling_converter_cache[cache_key] = converter
        print(f"Docling converter {cache_key} initialized successfully!")
        return converter
    except Exception as exc:
        print(f"ERROR: Failed to initialize Docling converter: {exc}")
        import traceback
        traceback.print_exc()
        return None
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def run_docling_pipeline(file_content: bytes) -> Dict[str, Any]:
    """
    Run the Docling pipeline on raw image bytes.

    Returns structured results for comparison with docTR.

    Args:
        file_content: Raw image bytes (written unchanged to a temp .png file,
            since Docling converts from a file path).

    Returns:
        On success: ``{"success": True, "markdown_text", "plain_text",
        "tables", "primary_table"}``.
        On any failure: ``{"success": False, "error": <message>}`` — this
        function never raises, so the comparison endpoint stays usable.
    """
    try:
        converter = get_docling_converter()
        if converter is None:
            return {"error": "Docling converter not available", "success": False}

        # Docling needs a file path - write to temp file
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp_file:
            tmp_file.write(file_content)
            tmp_path = tmp_file.name

        try:
            print("Running Docling pipeline...")
            result = converter.convert(source=tmp_path)

            # Extract markdown (preserves headings, tables, paragraphs)
            markdown_text = result.document.export_to_markdown()

            # Extract plain text; fall back to markdown when the document
            # model does not expose a plain-text exporter.
            if hasattr(result.document, 'export_to_text'):
                plain_text = result.document.export_to_text()
            else:
                plain_text = markdown_text

            # Extract tables (skip any that fail to parse)
            docling_tables = []
            if hasattr(result.document, 'tables') and result.document.tables:
                for table in result.document.tables:
                    table_data = _parse_docling_table(table)
                    if table_data:
                        docling_tables.append(table_data)

            print(f"Docling: {len(markdown_text)} chars markdown, {len(docling_tables)} tables")

            return {
                "success": True,
                "markdown_text": markdown_text,
                "plain_text": plain_text,
                "tables": docling_tables,
                "primary_table": docling_tables[0] if docling_tables else None,
            }
        finally:
            # Best-effort temp-file cleanup. Catch only OSError: a bare
            # `except:` here would also swallow KeyboardInterrupt/SystemExit.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    except Exception as e:
        print(f"Docling pipeline error: {e}")
        import traceback
        traceback.print_exc()
        return {"error": str(e), "success": False}
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def _parse_docling_table(table) -> Optional[Dict]:
|
| 195 |
+
"""Parse a Docling table into {cells, num_rows, num_columns} format."""
|
| 196 |
+
try:
|
| 197 |
+
if hasattr(table, 'export_to_dataframe'):
|
| 198 |
+
df = table.export_to_dataframe()
|
| 199 |
+
if df is not None and not df.empty:
|
| 200 |
+
cells = []
|
| 201 |
+
header = [str(col) if col is not None else '' for col in df.columns.tolist()]
|
| 202 |
+
cells.append(header)
|
| 203 |
+
for _, row in df.iterrows():
|
| 204 |
+
row_cells = [str(val).strip() if val is not None else '' for val in row.tolist()]
|
| 205 |
+
cells.append(row_cells)
|
| 206 |
+
|
| 207 |
+
return {
|
| 208 |
+
"cells": cells,
|
| 209 |
+
"num_rows": len(cells),
|
| 210 |
+
"num_columns": len(header),
|
| 211 |
+
"method": "docling_tableformer"
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
if hasattr(table, 'export_to_markdown'):
|
| 215 |
+
md = table.export_to_markdown()
|
| 216 |
+
if md:
|
| 217 |
+
return {
|
| 218 |
+
"cells": [],
|
| 219 |
+
"num_rows": 0,
|
| 220 |
+
"num_columns": 0,
|
| 221 |
+
"method": "docling_tableformer",
|
| 222 |
+
"markdown": md
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
return None
|
| 226 |
+
except Exception as e:
|
| 227 |
+
print(f"Docling table parse error: {e}")
|
| 228 |
+
return None
|
| 229 |
+
|
| 230 |
+
|
| 231 |
# --- OCR MODEL LOADING ---
|
| 232 |
def get_ocr_predictor(det_arch: str, reco_arch: str):
|
| 233 |
"""Retrieves a loaded OCR predictor from cache or loads it if necessary."""
|
|
|
|
| 1467 |
print("Generating synthesized document image...")
|
| 1468 |
synthesized_image = generate_synthesized_image(result)
|
| 1469 |
|
| 1470 |
+
# --- DOCLING PIPELINE (runs on raw bytes, not preprocessed) ---
|
| 1471 |
+
print("Running Docling pipeline for comparison...")
|
| 1472 |
+
docling_result = run_docling_pipeline(file_content)
|
| 1473 |
+
|
| 1474 |
# Method 1: img2table with built-in OCR
|
| 1475 |
print("Running img2table for table detection (Method 1: integrated OCR)...")
|
| 1476 |
table_formatted_text, table_data = extract_text_with_table_detection(
|
|
|
|
| 1594 |
"formatted_text": block_geo_text if block_geo_data.get('is_table') else None,
|
| 1595 |
"fill_ratio": block_geo_data.get('fill_ratio', 0)
|
| 1596 |
}
|
| 1597 |
+
},
|
| 1598 |
+
# Docling pipeline results (side-by-side comparison)
|
| 1599 |
+
"docling_result": {
|
| 1600 |
+
"available": docling_result.get("success", False),
|
| 1601 |
+
"markdown_text": docling_result.get("markdown_text", ""),
|
| 1602 |
+
"plain_text": docling_result.get("plain_text", ""),
|
| 1603 |
+
"table_detected": bool(docling_result.get("tables")),
|
| 1604 |
+
"table_data": docling_result.get("primary_table"),
|
| 1605 |
+
"error": docling_result.get("error"),
|
| 1606 |
+
} if docling_result else {
|
| 1607 |
+
"available": False,
|
| 1608 |
+
"error": "Docling pipeline did not run",
|
| 1609 |
}
|
| 1610 |
}
|
| 1611 |
|