Spaces:
Paused
Paused
Suvadeep Das
committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -38,7 +38,7 @@ def load_model():
|
|
| 38 |
"openbmb/MiniCPM-V-2_6",
|
| 39 |
trust_remote_code=True,
|
| 40 |
torch_dtype=torch.float16,
|
| 41 |
-
device_map="auto"
|
| 42 |
)
|
| 43 |
return _model, _tokenizer
|
| 44 |
except Exception as e:
|
|
@@ -195,8 +195,45 @@ def extract_data_from_image(image, extraction_prompt, model, tokenizer):
|
|
| 195 |
"extracted_data": None
|
| 196 |
}
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
def combine_page_data(pages_data):
|
| 199 |
-
"""Combine extracted data from multiple pages into final medical record"""
|
| 200 |
combined_data = {
|
| 201 |
"date_of_receipt": "",
|
| 202 |
"patient_first_name": "",
|
|
@@ -238,24 +275,25 @@ def combine_page_data(pages_data):
|
|
| 238 |
|
| 239 |
# Combine data from all pages
|
| 240 |
for page_num, page_data in enumerate(pages_data, 1):
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
# If we got JSON data, merge it
|
| 245 |
-
if isinstance(extracted, dict) and "data" in extracted:
|
| 246 |
-
page_info = extracted["data"]
|
| 247 |
-
|
| 248 |
-
# Merge non-empty fields (first non-empty value wins)
|
| 249 |
-
for field, value in page_info.items():
|
| 250 |
-
if field in combined_data and value and not combined_data[field]:
|
| 251 |
-
combined_data[field] = value
|
| 252 |
-
combined_data["extracted_page_numbers"].append(page_num)
|
| 253 |
|
| 254 |
-
#
|
| 255 |
-
if "
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
return {
|
| 261 |
"data": combined_data,
|
|
@@ -282,7 +320,7 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
| 282 |
"pages_data": []
|
| 283 |
}
|
| 284 |
|
| 285 |
-
# Step 1: Convert PDF to images (CPU operation
|
| 286 |
print("Converting PDF to images...")
|
| 287 |
images = pdf_to_images(pdf_file)
|
| 288 |
|
|
@@ -296,7 +334,7 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
| 296 |
|
| 297 |
print(f"Converted {len(images)} pages. Starting GPU processing...")
|
| 298 |
|
| 299 |
-
# Step 2: Load model on GPU
|
| 300 |
model, tokenizer = load_model()
|
| 301 |
|
| 302 |
# Step 3: Use custom prompt or default
|
|
@@ -314,7 +352,7 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
| 314 |
|
| 315 |
print("GPU processing complete. Combining results...")
|
| 316 |
|
| 317 |
-
# Step 5: Combine data from all pages
|
| 318 |
combined_result = combine_page_data(pages_data)
|
| 319 |
|
| 320 |
# Final result
|
|
@@ -331,6 +369,7 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
| 331 |
return result
|
| 332 |
|
| 333 |
except Exception as e:
|
|
|
|
| 334 |
return {
|
| 335 |
"status": "error",
|
| 336 |
"error": str(e),
|
|
@@ -340,9 +379,9 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
| 340 |
|
| 341 |
# Create Gradio Interface
|
| 342 |
def create_gradio_interface():
|
| 343 |
-
with gr.Blocks(title="eFax PDF Data Extractor -
|
| 344 |
gr.Markdown("# π₯ eFax Medical Data Extraction API")
|
| 345 |
-
gr.Markdown("π **
|
| 346 |
|
| 347 |
with gr.Tab("π PDF Upload & Extraction"):
|
| 348 |
with gr.Row():
|
|
@@ -361,13 +400,14 @@ def create_gradio_interface():
|
|
| 361 |
placeholder="Leave empty to use optimized medical data extraction prompt..."
|
| 362 |
)
|
| 363 |
|
| 364 |
-
extract_btn = gr.Button("π Extract Medical Data (
|
| 365 |
|
| 366 |
gr.Markdown("""
|
| 367 |
-
###
|
| 368 |
-
- **
|
| 369 |
-
- **
|
| 370 |
-
- **
|
|
|
|
| 371 |
""")
|
| 372 |
|
| 373 |
with gr.Column():
|
|
@@ -376,7 +416,7 @@ def create_gradio_interface():
|
|
| 376 |
|
| 377 |
with gr.Tab("π API Usage"):
|
| 378 |
gr.Markdown("""
|
| 379 |
-
##
|
| 380 |
|
| 381 |
### Python Usage
|
| 382 |
```
|
|
@@ -396,36 +436,14 @@ def create_gradio_interface():
|
|
| 396 |
}
|
| 397 |
)
|
| 398 |
|
| 399 |
-
#
|
| 400 |
result = response.json()
|
| 401 |
-
|
|
|
|
|
|
|
| 402 |
```
|
| 403 |
""")
|
| 404 |
|
| 405 |
-
with gr.Tab("β‘ Performance Info"):
|
| 406 |
-
gr.Markdown("""
|
| 407 |
-
## Optimized ZeroGPU Performance
|
| 408 |
-
|
| 409 |
-
### Before Optimization (β Had Timeout Issues)
|
| 410 |
-
- GPU session per page = 13 Γ 30 seconds = 6.5 minutes
|
| 411 |
-
- Model loading repeated = wasted time
|
| 412 |
-
- Timeout around page 11/13
|
| 413 |
-
|
| 414 |
-
### After Optimization (β
No Timeouts)
|
| 415 |
-
- **Single 10-minute GPU session** for entire document
|
| 416 |
-
- Model loads once, processes all pages
|
| 417 |
-
- Handles 15-20+ page documents easily
|
| 418 |
-
- PDF conversion on CPU (doesn't count toward GPU time)
|
| 419 |
-
|
| 420 |
-
### Processing Flow
|
| 421 |
-
1. **PDF β Images** (CPU, before GPU starts)
|
| 422 |
-
2. **π GPU Session Starts** (10 minutes allocated)
|
| 423 |
-
3. **Load Model** (once, on GPU)
|
| 424 |
-
4. **Process All Pages** (GPU, sequential)
|
| 425 |
-
5. **GPU Session Ends**
|
| 426 |
-
6. **Combine Results** (CPU, after GPU)
|
| 427 |
-
""")
|
| 428 |
-
|
| 429 |
def process_with_status(pdf_file, custom_prompt):
|
| 430 |
if pdf_file is None:
|
| 431 |
return "β No PDF file uploaded", {"error": "Please upload a PDF file"}
|
|
@@ -436,7 +454,7 @@ def create_gradio_interface():
|
|
| 436 |
result = extract_efax_from_pdf(pdf_file, custom_prompt if custom_prompt.strip() else None)
|
| 437 |
|
| 438 |
if result["status"] == "success":
|
| 439 |
-
yield f"β
Successfully processed {result['total_pages']} pages
|
| 440 |
else:
|
| 441 |
yield f"β Error: {result.get('error', 'Unknown error')}", result
|
| 442 |
|
|
@@ -463,4 +481,4 @@ if __name__ == "__main__":
|
|
| 463 |
server_name="0.0.0.0",
|
| 464 |
server_port=7860,
|
| 465 |
show_error=True
|
| 466 |
-
)
|
|
|
|
| 38 |
"openbmb/MiniCPM-V-2_6",
|
| 39 |
trust_remote_code=True,
|
| 40 |
torch_dtype=torch.float16,
|
| 41 |
+
device_map="auto"
|
| 42 |
)
|
| 43 |
return _model, _tokenizer
|
| 44 |
except Exception as e:
|
|
|
|
| 195 |
"extracted_data": None
|
| 196 |
}
|
| 197 |
|
| 198 |
+
def safe_merge_field(combined_data, field, value, page_num, extracted_pages):
    """Merge one extracted field into the combined record; first non-empty value wins.

    Args:
        combined_data: Record being built up; mutated in place. Only keys
            already present in it are ever written.
        field: Name of the field reported by the page.
        value: Value reported for ``field``. Falsy values are ignored.
        page_num: Number of the page the value came from.
        extracted_pages: List of page numbers that contributed data; mutated
            in place. ``page_num`` is appended at most once.

    Merging rules:
        * dict value into dict slot: each sub-field already present in the
          slot is filled if it is still empty and the new sub-value is truthy.
        * scalar value into scalar slot: filled if the slot is still empty.
        * any other combination (e.g. scalar into a list/dict slot, list
          values) is skipped, so a container-typed slot is never replaced by
          a scalar.

    Errors are reported with a printed warning rather than raised, so one bad
    field does not abort the merge of the remaining fields.
    """
    try:
        if field not in combined_data or not value:
            return

        current = combined_data[field]

        # Handle nested dictionaries (like insurance)
        if isinstance(value, dict) and isinstance(current, dict):
            for sub_field, sub_value in value.items():
                if sub_field in current and sub_value and not current[sub_field]:
                    current[sub_field] = sub_value
                    if page_num not in extracted_pages:
                        extracted_pages.append(page_num)
            return

        # Handle simple fields. The slot itself must also be a non-container:
        # otherwise an empty list/dict slot (falsy) would be overwritten by a
        # scalar, which in particular could replace the page-number list that
        # callers pass in as ``extracted_pages``.
        value_is_simple = not isinstance(value, (dict, list))
        slot_is_simple = not isinstance(current, (dict, list))
        if value_is_simple and slot_is_simple and not current:
            combined_data[field] = value
            if page_num not in extracted_pages:
                extracted_pages.append(page_num)
    except Exception as e:
        print(f"Warning: Error merging field {field}: {e}")
|
| 216 |
+
|
| 217 |
+
def safe_merge_confidence(combined_confidence, field, score):
    """Merge one confidence score into the combined scores; first valid score wins.

    Args:
        combined_confidence: Mapping of field name to score (or to a dict of
            sub-field scores); mutated in place.
        field: Name of the field the score belongs to.
        score: Either a number, or a dict mapping sub-field names to numbers
            (like primary_insurance). Anything else is ignored.

    A score is accepted only if it is an ``int`` or ``float`` (not ``bool``)
    and greater than zero. An already recorded score is never replaced.

    Errors are reported with a printed warning rather than raised, so one bad
    score does not abort the merge of the remaining scores.
    """
    def is_valid(candidate):
        # bool is a subclass of int, so True would otherwise pass as a score.
        return (
            isinstance(candidate, (int, float))
            and not isinstance(candidate, bool)
            and candidate > 0
        )

    try:
        # Handle nested confidence scores (like primary_insurance)
        if isinstance(score, dict):
            bucket = combined_confidence.setdefault(field, {})
            if not isinstance(bucket, dict):
                # A plain score was already recorded for this field; keep it
                # instead of failing on a membership test against a number.
                return
            for sub_field, sub_score in score.items():
                if sub_field not in bucket and is_valid(sub_score):
                    bucket[sub_field] = sub_score
        # Handle simple confidence scores
        elif is_valid(score) and field not in combined_confidence:
            combined_confidence[field] = score
    except Exception as e:
        print(f"Warning: Error merging confidence for {field}: {e}")
|
| 234 |
+
|
| 235 |
def combine_page_data(pages_data):
|
| 236 |
+
"""Combine extracted data from multiple pages into final medical record - FIXED VERSION"""
|
| 237 |
combined_data = {
|
| 238 |
"date_of_receipt": "",
|
| 239 |
"patient_first_name": "",
|
|
|
|
| 275 |
|
| 276 |
# Combine data from all pages
|
| 277 |
for page_num, page_data in enumerate(pages_data, 1):
|
| 278 |
+
try:
|
| 279 |
+
if page_data.get("page_data", {}).get("status") == "success":
|
| 280 |
+
extracted = page_data["page_data"].get("extracted_data", {})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
+
# If we got JSON data, merge it
|
| 283 |
+
if isinstance(extracted, dict) and "data" in extracted:
|
| 284 |
+
page_info = extracted["data"]
|
| 285 |
+
|
| 286 |
+
# Safely merge each field
|
| 287 |
+
for field, value in page_info.items():
|
| 288 |
+
safe_merge_field(combined_data, field, value, page_num, combined_data["extracted_page_numbers"])
|
| 289 |
+
|
| 290 |
+
# Safely merge confidence scores
|
| 291 |
+
if "confidence_scores" in extracted:
|
| 292 |
+
for field, score in extracted["confidence_scores"].items():
|
| 293 |
+
safe_merge_confidence(combined_confidence, field, score)
|
| 294 |
+
except Exception as e:
|
| 295 |
+
print(f"Warning: Error processing page {page_num}: {e}")
|
| 296 |
+
continue
|
| 297 |
|
| 298 |
return {
|
| 299 |
"data": combined_data,
|
|
|
|
| 320 |
"pages_data": []
|
| 321 |
}
|
| 322 |
|
| 323 |
+
# Step 1: Convert PDF to images (CPU operation)
|
| 324 |
print("Converting PDF to images...")
|
| 325 |
images = pdf_to_images(pdf_file)
|
| 326 |
|
|
|
|
| 334 |
|
| 335 |
print(f"Converted {len(images)} pages. Starting GPU processing...")
|
| 336 |
|
| 337 |
+
# Step 2: Load model on GPU
|
| 338 |
model, tokenizer = load_model()
|
| 339 |
|
| 340 |
# Step 3: Use custom prompt or default
|
|
|
|
| 352 |
|
| 353 |
print("GPU processing complete. Combining results...")
|
| 354 |
|
| 355 |
+
# Step 5: Combine data from all pages (with error handling)
|
| 356 |
combined_result = combine_page_data(pages_data)
|
| 357 |
|
| 358 |
# Final result
|
|
|
|
| 369 |
return result
|
| 370 |
|
| 371 |
except Exception as e:
|
| 372 |
+
print(f"Error in extract_efax_from_pdf: {e}")
|
| 373 |
return {
|
| 374 |
"status": "error",
|
| 375 |
"error": str(e),
|
|
|
|
| 379 |
|
| 380 |
# Create Gradio Interface
|
| 381 |
def create_gradio_interface():
|
| 382 |
+
with gr.Blocks(title="eFax PDF Data Extractor - Fixed", theme=gr.themes.Soft()) as demo:
|
| 383 |
gr.Markdown("# π₯ eFax Medical Data Extraction API")
|
| 384 |
+
gr.Markdown("π **Fixed Version** - Single 10-minute GPU session with proper error handling")
|
| 385 |
|
| 386 |
with gr.Tab("π PDF Upload & Extraction"):
|
| 387 |
with gr.Row():
|
|
|
|
| 400 |
placeholder="Leave empty to use optimized medical data extraction prompt..."
|
| 401 |
)
|
| 402 |
|
| 403 |
+
extract_btn = gr.Button("π Extract Medical Data (Fixed)", variant="primary", size="lg")
|
| 404 |
|
| 405 |
gr.Markdown("""
|
| 406 |
+
### β
Bug Fixes Applied
|
| 407 |
+
- **Fixed**: Dict/int comparison error
|
| 408 |
+
- **Added**: Safe type checking for all operations
|
| 409 |
+
- **Improved**: Error handling and logging
|
| 410 |
+
- **Single GPU Session**: No more timeouts
|
| 411 |
""")
|
| 412 |
|
| 413 |
with gr.Column():
|
|
|
|
| 416 |
|
| 417 |
with gr.Tab("π API Usage"):
|
| 418 |
gr.Markdown("""
|
| 419 |
+
## Fixed API (No More Errors)
|
| 420 |
|
| 421 |
### Python Usage
|
| 422 |
```
|
|
|
|
| 436 |
}
|
| 437 |
)
|
| 438 |
|
| 439 |
+
# Should work without dict/int comparison errors
|
| 440 |
result = response.json()
|
| 441 |
+
if result["data"]["status"] == "success":
|
| 442 |
+
medical_data = result["data"]["combined_extraction"]
|
| 443 |
+
print("Patient:", medical_data["data"]["patient_first_name"])
|
| 444 |
```
|
| 445 |
""")
|
| 446 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
def process_with_status(pdf_file, custom_prompt):
|
| 448 |
if pdf_file is None:
|
| 449 |
return "β No PDF file uploaded", {"error": "Please upload a PDF file"}
|
|
|
|
| 454 |
result = extract_efax_from_pdf(pdf_file, custom_prompt if custom_prompt.strip() else None)
|
| 455 |
|
| 456 |
if result["status"] == "success":
|
| 457 |
+
yield f"β
Successfully processed {result['total_pages']} pages", result
|
| 458 |
else:
|
| 459 |
yield f"β Error: {result.get('error', 'Unknown error')}", result
|
| 460 |
|
|
|
|
| 481 |
server_name="0.0.0.0",
|
| 482 |
server_port=7860,
|
| 483 |
show_error=True
|
| 484 |
+
)
|