hkai20000 committed on
Commit
50ca5e1
·
verified ·
1 Parent(s): 8e73c00

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +156 -0
main.py CHANGED
@@ -16,6 +16,11 @@ import tempfile
16
  import base64
17
  from typing import Dict, Any, Optional, List
18
 
 
 
 
 
 
19
  app = FastAPI(title="ScanAssured OCR & NER API")
20
 
21
  # --- DRUG INTERACTIONS DATABASE ---
@@ -88,6 +93,141 @@ NER_MODELS = {
88
  ner_model_cache: Dict[str, Any] = {}
89
  ocr_model_cache: Dict[str, Any] = {}
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  # --- OCR MODEL LOADING ---
92
  def get_ocr_predictor(det_arch: str, reco_arch: str):
93
  """Retrieves a loaded OCR predictor from cache or loads it if necessary."""
@@ -1327,6 +1467,10 @@ async def process_image(
1327
  print("Generating synthesized document image...")
1328
  synthesized_image = generate_synthesized_image(result)
1329
 
 
 
 
 
1330
  # Method 1: img2table with built-in OCR
1331
  print("Running img2table for table detection (Method 1: integrated OCR)...")
1332
  table_formatted_text, table_data = extract_text_with_table_detection(
@@ -1450,6 +1594,18 @@ async def process_image(
1450
  "formatted_text": block_geo_text if block_geo_data.get('is_table') else None,
1451
  "fill_ratio": block_geo_data.get('fill_ratio', 0)
1452
  }
 
 
 
 
 
 
 
 
 
 
 
 
1453
  }
1454
  }
1455
 
 
16
  import base64
17
  from typing import Dict, Any, Optional, List
18
 
19
+ # Docling pipeline
20
+ from docling.document_converter import DocumentConverter, InputFormat, ImageFormatOption
21
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
22
+ from docling_ocr_onnxtr import OnnxtrOcrOptions
23
+
24
  app = FastAPI(title="ScanAssured OCR & NER API")
25
 
26
  # --- DRUG INTERACTIONS DATABASE ---
 
93
  ner_model_cache: Dict[str, Any] = {}
94
  ocr_model_cache: Dict[str, Any] = {}
95
 
96
# --- DOCLING CONVERTER CACHE ---
# Successfully built converters, keyed by "docling_<det_arch>_<reco_arch>".
docling_converter_cache: Dict[str, Any] = {}


def get_docling_converter(det_arch: str = "db_mobilenet_v3_large", reco_arch: str = "crnn_vgg16_bn"):
    """Return a Docling DocumentConverter that uses OnnxTR for OCR.

    Converters are memoised in ``docling_converter_cache`` under a key built
    from the two architecture names, so each (det_arch, reco_arch) pair is
    constructed at most once per successful initialisation.

    Args:
        det_arch: Text-detection architecture name passed to ``OnnxtrOcrOptions``.
        reco_arch: Text-recognition architecture name passed to ``OnnxtrOcrOptions``.

    Returns:
        The cached or newly built converter, or ``None`` when construction
        raised. Failures are printed with a traceback and are not cached, so
        the next call attempts initialisation again.
    """
    cache_key = f"docling_{det_arch}_{reco_arch}"

    try:
        cached_converter = docling_converter_cache[cache_key]
    except KeyError:
        pass  # Not built yet; fall through to initialisation.
    else:
        print(f"Using cached Docling converter: {cache_key}")
        return cached_converter

    try:
        print(f"Initializing Docling converter: det={det_arch}, reco={reco_arch}...")

        image_pipeline = PdfPipelineOptions(
            ocr_options=OnnxtrOcrOptions(
                det_arch=det_arch,
                reco_arch=reco_arch,
            )
        )
        image_pipeline.do_table_structure = True
        image_pipeline.do_ocr = True
        # The OnnxTR options come from the separate docling_ocr_onnxtr package;
        # presumably this flag is what lets Docling load that external engine.
        image_pipeline.allow_external_plugins = True

        new_converter = DocumentConverter(
            format_options={
                InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_pipeline)
            }
        )

        docling_converter_cache[cache_key] = new_converter
        print(f"Docling converter {cache_key} initialized successfully!")
        return new_converter
    except Exception as e:
        print(f"ERROR: Failed to initialize Docling converter: {e}")
        import traceback
        traceback.print_exc()
        return None
134
+
135
+
136
def run_docling_pipeline(file_content: bytes) -> Dict[str, Any]:
    """Run the Docling pipeline on raw image bytes.

    The bytes are written to a temporary ``.png`` file, converted with the
    converter returned by ``get_docling_converter()``, and the temporary file
    is removed afterwards whether or not conversion succeeded.

    Args:
        file_content: Raw image bytes. Empty or ``None`` input is rejected
            up front without touching the converter or the filesystem.

    Returns:
        On success, a dict with ``success`` (True), ``markdown_text``,
        ``plain_text``, ``tables`` (list of parsed tables) and
        ``primary_table`` (first parsed table or ``None``).
        On any failure, ``{"error": <message>, "success": False}``; this
        function does not raise for ordinary errors.
    """
    if not file_content:
        return {"error": "No image data provided to Docling pipeline", "success": False}

    tmp_path = None
    try:
        converter = get_docling_converter()
        if converter is None:
            return {"error": "Docling converter not available", "success": False}

        # The converter is handed a path, so spool the bytes to a temp file.
        # tmp_path is recorded before writing so a failed write still gets
        # cleaned up by the finally clause below.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(file_content)

        print("Running Docling pipeline...")
        result = converter.convert(source=tmp_path)
        document = result.document

        # Markdown export (used as the plain-text fallback as well).
        markdown_text = document.export_to_markdown()

        if hasattr(document, 'export_to_text'):
            plain_text = document.export_to_text()
        else:
            plain_text = markdown_text

        # Keep only the tables that could be parsed into a non-empty result.
        docling_tables = []
        for table in getattr(document, 'tables', None) or []:
            parsed_table = _parse_docling_table(table)
            if parsed_table:
                docling_tables.append(parsed_table)

        print(f"Docling: {len(markdown_text)} chars markdown, {len(docling_tables)} tables")

        return {
            "success": True,
            "markdown_text": markdown_text,
            "plain_text": plain_text,
            "tables": docling_tables,
            "primary_table": docling_tables[0] if docling_tables else None,
        }
    except Exception as e:
        print(f"Docling pipeline error: {e}")
        import traceback
        traceback.print_exc()
        return {"error": str(e), "success": False}
    finally:
        # Best-effort cleanup: a leftover temp file must not fail the request,
        # but only filesystem errors are tolerated here.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                print(f"Warning: could not remove Docling temp file {tmp_path}: {cleanup_error}")
192
+
193
+
194
def _parse_docling_table(table) -> Optional[Dict]:
    """Parse a Docling table into {cells, num_rows, num_columns} format.

    Two strategies are tried in order:

    1. ``table.export_to_dataframe()``: when it yields a non-empty frame, the
       column labels become the first row of ``cells`` and every data value
       is stringified and stripped. Missing values (``None``, NaN, ``pd.NA``)
       become empty strings.
    2. ``table.export_to_markdown()``: when it yields non-empty text, an
       entry with empty ``cells`` and the raw text under ``markdown`` is
       returned.

    Args:
        table: Object exposing ``export_to_dataframe`` and/or
            ``export_to_markdown``; either method may be absent.

    Returns:
        A dict describing the table, or ``None`` when neither strategy
        produced content or an exception occurred (the error is printed).
    """
    try:
        if hasattr(table, 'export_to_dataframe'):
            df = table.export_to_dataframe()
            if df is not None and not df.empty:
                header = [str(col) if col is not None else '' for col in df.columns.tolist()]

                # Work on plain arrays: object dtype keeps every value as it
                # was stored (row-wise iteration via iterrows would upcast
                # e.g. ints to floats), and the notna mask identifies NaN/NA
                # as well as None so they are not rendered as "nan".
                present = df.notna().to_numpy()
                values = df.astype(object).to_numpy()
                body = [
                    [str(val).strip() if is_present else '' for val, is_present in zip(row_values, row_present)]
                    for row_values, row_present in zip(values, present)
                ]
                cells = [header] + body

                return {
                    "cells": cells,
                    "num_rows": len(cells),
                    "num_columns": len(header),
                    "method": "docling_tableformer"
                }

        if hasattr(table, 'export_to_markdown'):
            md = table.export_to_markdown()
            if md:
                return {
                    "cells": [],
                    "num_rows": 0,
                    "num_columns": 0,
                    "method": "docling_tableformer",
                    "markdown": md
                }

        return None
    except Exception as e:
        print(f"Docling table parse error: {e}")
        return None
229
+
230
+
231
  # --- OCR MODEL LOADING ---
232
  def get_ocr_predictor(det_arch: str, reco_arch: str):
233
  """Retrieves a loaded OCR predictor from cache or loads it if necessary."""
 
1467
  print("Generating synthesized document image...")
1468
  synthesized_image = generate_synthesized_image(result)
1469
 
1470
+ # --- DOCLING PIPELINE (runs on raw bytes, not preprocessed) ---
1471
+ print("Running Docling pipeline for comparison...")
1472
+ docling_result = run_docling_pipeline(file_content)
1473
+
1474
  # Method 1: img2table with built-in OCR
1475
  print("Running img2table for table detection (Method 1: integrated OCR)...")
1476
  table_formatted_text, table_data = extract_text_with_table_detection(
 
1594
  "formatted_text": block_geo_text if block_geo_data.get('is_table') else None,
1595
  "fill_ratio": block_geo_data.get('fill_ratio', 0)
1596
  }
1597
+ },
1598
+ # Docling pipeline results (side-by-side comparison)
1599
+ "docling_result": {
1600
+ "available": docling_result.get("success", False),
1601
+ "markdown_text": docling_result.get("markdown_text", ""),
1602
+ "plain_text": docling_result.get("plain_text", ""),
1603
+ "table_detected": bool(docling_result.get("tables")),
1604
+ "table_data": docling_result.get("primary_table"),
1605
+ "error": docling_result.get("error"),
1606
+ } if docling_result else {
1607
+ "available": False,
1608
+ "error": "Docling pipeline did not run",
1609
  }
1610
  }
1611