Update main.py
Browse files
main.py
CHANGED
|
@@ -16,6 +16,11 @@ import tempfile
|
|
| 16 |
import base64
|
| 17 |
from typing import Dict, Any, Optional, List
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
app = FastAPI(title="ScanAssured OCR & NER API")
|
| 20 |
|
| 21 |
# --- DRUG INTERACTIONS DATABASE ---
|
|
@@ -88,6 +93,141 @@ NER_MODELS = {
|
|
| 88 |
ner_model_cache: Dict[str, Any] = {}
|
| 89 |
ocr_model_cache: Dict[str, Any] = {}
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
# --- OCR MODEL LOADING ---
|
| 92 |
def get_ocr_predictor(det_arch: str, reco_arch: str):
|
| 93 |
"""Retrieves a loaded OCR predictor from cache or loads it if necessary."""
|
|
@@ -1327,6 +1467,10 @@ async def process_image(
|
|
| 1327 |
print("Generating synthesized document image...")
|
| 1328 |
synthesized_image = generate_synthesized_image(result)
|
| 1329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1330 |
# Method 1: img2table with built-in OCR
|
| 1331 |
print("Running img2table for table detection (Method 1: integrated OCR)...")
|
| 1332 |
table_formatted_text, table_data = extract_text_with_table_detection(
|
|
@@ -1450,6 +1594,18 @@ async def process_image(
|
|
| 1450 |
"formatted_text": block_geo_text if block_geo_data.get('is_table') else None,
|
| 1451 |
"fill_ratio": block_geo_data.get('fill_ratio', 0)
|
| 1452 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1453 |
}
|
| 1454 |
}
|
| 1455 |
|
|
|
|
| 16 |
import base64
|
| 17 |
from typing import Dict, Any, Optional, List
|
| 18 |
|
| 19 |
+
# Docling pipeline
|
| 20 |
+
from docling.document_converter import DocumentConverter, InputFormat, ImageFormatOption
|
| 21 |
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
| 22 |
+
from docling_ocr_onnxtr import OnnxtrOcrOptions
|
| 23 |
+
|
| 24 |
app = FastAPI(title="ScanAssured OCR & NER API")
|
| 25 |
|
| 26 |
# --- DRUG INTERACTIONS DATABASE ---
|
|
|
|
| 93 |
ner_model_cache: Dict[str, Any] = {}
|
| 94 |
ocr_model_cache: Dict[str, Any] = {}
|
| 95 |
|
| 96 |
+
# --- DOCLING CONVERTER CACHE ---
docling_converter_cache: Dict[str, Any] = {}


def get_docling_converter(det_arch: str = "db_mobilenet_v3_large", reco_arch: str = "crnn_vgg16_bn"):
    """Get or create a cached Docling DocumentConverter with OnnxTR OCR.

    Converters are memoized per (det_arch, reco_arch) pair in
    ``docling_converter_cache``. Returns ``None`` when initialization
    fails (e.g. docling is unavailable), so callers can degrade gracefully.
    """
    cache_key = f"docling_{det_arch}_{reco_arch}"

    cached = docling_converter_cache.get(cache_key)
    if cached is not None:
        print(f"Using cached Docling converter: {cache_key}")
        return cached

    try:
        print(f"Initializing Docling converter: det={det_arch}, reco={reco_arch}...")

        pipeline_options = PdfPipelineOptions(
            ocr_options=OnnxtrOcrOptions(det_arch=det_arch, reco_arch=reco_arch)
        )
        pipeline_options.do_table_structure = True
        pipeline_options.do_ocr = True
        # OnnxTR is provided by an external docling plugin package.
        pipeline_options.allow_external_plugins = True

        image_option = ImageFormatOption(pipeline_options=pipeline_options)
        converter = DocumentConverter(format_options={InputFormat.IMAGE: image_option})

        docling_converter_cache[cache_key] = converter
        print(f"Docling converter {cache_key} initialized successfully!")
        return converter
    except Exception as exc:
        print(f"ERROR: Failed to initialize Docling converter: {exc}")
        import traceback
        traceback.print_exc()
        return None
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def run_docling_pipeline(file_content: bytes) -> Dict[str, Any]:
    """
    Run the Docling pipeline on raw image bytes.

    Returns structured results for comparison with docTR.

    Args:
        file_content: Raw image bytes (written unchanged to a temp .png file,
            since Docling converts from a file path).

    Returns:
        On success: ``{"success": True, "markdown_text", "plain_text",
        "tables", "primary_table"}``.
        On any failure: ``{"success": False, "error": <message>}`` — this
        function never raises, so the comparison endpoint stays usable.
    """
    try:
        converter = get_docling_converter()
        if converter is None:
            return {"error": "Docling converter not available", "success": False}

        # Docling needs a file path - write to temp file
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp_file:
            tmp_file.write(file_content)
            tmp_path = tmp_file.name

        try:
            print("Running Docling pipeline...")
            result = converter.convert(source=tmp_path)

            # Extract markdown (preserves headings, tables, paragraphs)
            markdown_text = result.document.export_to_markdown()

            # Extract plain text; fall back to markdown when the document
            # model does not expose a plain-text exporter.
            if hasattr(result.document, 'export_to_text'):
                plain_text = result.document.export_to_text()
            else:
                plain_text = markdown_text

            # Extract tables (skip any that fail to parse)
            docling_tables = []
            if hasattr(result.document, 'tables') and result.document.tables:
                for table in result.document.tables:
                    table_data = _parse_docling_table(table)
                    if table_data:
                        docling_tables.append(table_data)

            print(f"Docling: {len(markdown_text)} chars markdown, {len(docling_tables)} tables")

            return {
                "success": True,
                "markdown_text": markdown_text,
                "plain_text": plain_text,
                "tables": docling_tables,
                "primary_table": docling_tables[0] if docling_tables else None,
            }
        finally:
            # Best-effort temp-file cleanup. Catch only OSError: a bare
            # `except:` here would also swallow KeyboardInterrupt/SystemExit.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    except Exception as e:
        print(f"Docling pipeline error: {e}")
        import traceback
        traceback.print_exc()
        return {"error": str(e), "success": False}
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def _parse_docling_table(table) -> Optional[Dict]:
|
| 195 |
+
"""Parse a Docling table into {cells, num_rows, num_columns} format."""
|
| 196 |
+
try:
|
| 197 |
+
if hasattr(table, 'export_to_dataframe'):
|
| 198 |
+
df = table.export_to_dataframe()
|
| 199 |
+
if df is not None and not df.empty:
|
| 200 |
+
cells = []
|
| 201 |
+
header = [str(col) if col is not None else '' for col in df.columns.tolist()]
|
| 202 |
+
cells.append(header)
|
| 203 |
+
for _, row in df.iterrows():
|
| 204 |
+
row_cells = [str(val).strip() if val is not None else '' for val in row.tolist()]
|
| 205 |
+
cells.append(row_cells)
|
| 206 |
+
|
| 207 |
+
return {
|
| 208 |
+
"cells": cells,
|
| 209 |
+
"num_rows": len(cells),
|
| 210 |
+
"num_columns": len(header),
|
| 211 |
+
"method": "docling_tableformer"
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
if hasattr(table, 'export_to_markdown'):
|
| 215 |
+
md = table.export_to_markdown()
|
| 216 |
+
if md:
|
| 217 |
+
return {
|
| 218 |
+
"cells": [],
|
| 219 |
+
"num_rows": 0,
|
| 220 |
+
"num_columns": 0,
|
| 221 |
+
"method": "docling_tableformer",
|
| 222 |
+
"markdown": md
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
return None
|
| 226 |
+
except Exception as e:
|
| 227 |
+
print(f"Docling table parse error: {e}")
|
| 228 |
+
return None
|
| 229 |
+
|
| 230 |
+
|
| 231 |
# --- OCR MODEL LOADING ---
|
| 232 |
def get_ocr_predictor(det_arch: str, reco_arch: str):
|
| 233 |
"""Retrieves a loaded OCR predictor from cache or loads it if necessary."""
|
|
|
|
| 1467 |
print("Generating synthesized document image...")
|
| 1468 |
synthesized_image = generate_synthesized_image(result)
|
| 1469 |
|
| 1470 |
+
# --- DOCLING PIPELINE (runs on raw bytes, not preprocessed) ---
|
| 1471 |
+
print("Running Docling pipeline for comparison...")
|
| 1472 |
+
docling_result = run_docling_pipeline(file_content)
|
| 1473 |
+
|
| 1474 |
# Method 1: img2table with built-in OCR
|
| 1475 |
print("Running img2table for table detection (Method 1: integrated OCR)...")
|
| 1476 |
table_formatted_text, table_data = extract_text_with_table_detection(
|
|
|
|
| 1594 |
"formatted_text": block_geo_text if block_geo_data.get('is_table') else None,
|
| 1595 |
"fill_ratio": block_geo_data.get('fill_ratio', 0)
|
| 1596 |
}
|
| 1597 |
+
},
|
| 1598 |
+
# Docling pipeline results (side-by-side comparison)
|
| 1599 |
+
"docling_result": {
|
| 1600 |
+
"available": docling_result.get("success", False),
|
| 1601 |
+
"markdown_text": docling_result.get("markdown_text", ""),
|
| 1602 |
+
"plain_text": docling_result.get("plain_text", ""),
|
| 1603 |
+
"table_detected": bool(docling_result.get("tables")),
|
| 1604 |
+
"table_data": docling_result.get("primary_table"),
|
| 1605 |
+
"error": docling_result.get("error"),
|
| 1606 |
+
} if docling_result else {
|
| 1607 |
+
"available": False,
|
| 1608 |
+
"error": "Docling pipeline did not run",
|
| 1609 |
}
|
| 1610 |
}
|
| 1611 |
|