Spaces:

crimson-suv
/

miniCPM

Paused

App Files Files Community

Suvadeep Das committed on Aug 5

Commit

998302b

verified ·

1 Parent(s): f1ee120

Update app.py

Browse files

Files changed (1) hide show

app.py +302 -221

app.py CHANGED Viewed

@@ -9,10 +9,9 @@ import os
 import json
 from huggingface_hub import login
 from pdf2image import convert_from_bytes
-import tempfile
 from datetime import datetime
-# Set your HF token (add this to your Space secrets)
 HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")
 if HF_TOKEN:
     login(token=HF_TOKEN)
@@ -22,7 +21,7 @@ _model = None
 _tokenizer = None
 def load_model():
-    """Load MiniCPM model (CPU loading, GPU usage happens in main function)"""
     global _model, _tokenizer
     if _model is not None and _tokenizer is not None:
@@ -57,7 +56,7 @@ def load_model():
         return _model, _tokenizer
 def pdf_to_images(pdf_file):
-    """Convert PDF file to list of PIL images (CPU operation)"""
     try:
         if hasattr(pdf_file, 'read'):
             pdf_bytes = pdf_file.read()
@@ -71,12 +70,20 @@ def pdf_to_images(pdf_file):
         print(f"Error converting PDF to images: {e}")
         return []
-def get_medical_extraction_prompt():
-    """Get the medical data extraction prompt"""
-    return """You are a medical document OCR and data extraction specialist. Analyze this medical document image and extract ALL visible information. Return the data in this exact JSON format:
 {
-  "data": {
     "date_of_receipt": "",
     "patient_first_name": "",
     "patient_last_name": "",
@@ -114,7 +121,12 @@ def get_medical_extraction_prompt():
         "description": ""
       }
     ],
-    "refine_reason": ""
   },
   "confidence_scores": {
     "date_of_receipt": 0.0,
@@ -136,28 +148,106 @@ def get_medical_extraction_prompt():
       "member_id": 0.0,
       "group_id": 0.0
     },
     "priority": 0.0,
-    "reason_for_referral": 0.0
   }
 }
-INSTRUCTIONS:
-1. Read ALL text visible in the document
-2. Extract exact values as they appear (no modifications)
-3. For dates, use MM/DD/YYYY format
-4. For phone numbers, use format like 850-463-0143
-5. Assign confidence scores 0.0-1.0 (1.0 = completely certain, 0.0 = not found)
-6. If information is not visible, leave field empty but still include it
-7. Return ONLY the JSON, no other text"""
-def extract_data_from_image(image, extraction_prompt, model, tokenizer):
-    """Extract data from a single image using MiniCPM (runs within GPU session)"""
     try:
-        # Convert PIL image to proper format if needed
         if hasattr(image, 'convert'):
             image = image.convert('RGB')
-        # Use the correct MiniCPM chat interface
         response = model.chat(
             image=image,
             msgs=[{
@@ -167,301 +257,293 @@ def extract_data_from_image(image, extraction_prompt, model, tokenizer):
             tokenizer=tokenizer,
             sampling=False,
             temperature=0.1,
-            max_new_tokens=2048
         )
-        # Try to parse JSON response
         try:
             parsed_data = json.loads(response)
             return {
                 "status": "success",
-                "extracted_data": parsed_data,
                 "raw_response": response,
-                "model_used": "MiniCPM-V-2_6-GPU"
             }
         except json.JSONDecodeError:
             return {
-                "status": "partial_success",
-                "extracted_data": response,
                 "raw_response": response,
-                "model_used": "MiniCPM-V-2_6-GPU",
-                "note": "Response was not valid JSON"
             }
     except Exception as e:
         return {
-            "status": "error",
             "error": str(e),
-            "extracted_data": None
         }
-def safe_merge_field(combined_data, field, value, page_num, extracted_pages):
-    """Safely merge field data with type checking"""
-    try:
-        if field in combined_data and value:
-            # Handle nested dictionaries (like insurance)
-            if isinstance(value, dict) and isinstance(combined_data[field], dict):
-                for sub_field, sub_value in value.items():
-                    if sub_field in combined_data[field] and sub_value and not combined_data[field][sub_field]:
-                        combined_data[field][sub_field] = sub_value
-                        if page_num not in extracted_pages:
-                            extracted_pages.append(page_num)
-            # Handle simple fields
-            elif not isinstance(value, (dict, list)) and not combined_data[field]:
-                combined_data[field] = value
-                if page_num not in extracted_pages:
-                    extracted_pages.append(page_num)
-    except Exception as e:
-        print(f"Warning: Error merging field {field}: {e}")
-def safe_merge_confidence(combined_confidence, field, score):
-    """Safely merge confidence scores with type checking"""
-    try:
-        # Handle nested confidence scores (like primary_insurance)
-        if isinstance(score, dict):
-            if field not in combined_confidence:
-                combined_confidence[field] = {}
-            for sub_field, sub_score in score.items():
-                if (sub_field not in combined_confidence[field] and
-                    isinstance(sub_score, (int, float)) and sub_score > 0):
-                    combined_confidence[field][sub_field] = sub_score
-        # Handle simple confidence scores
-        elif isinstance(score, (int, float)) and score > 0:
-            if field not in combined_confidence:
-                combined_confidence[field] = score
-    except Exception as e:
-        print(f"Warning: Error merging confidence for {field}: {e}")
-def combine_page_data(pages_data):
-    """Combine extracted data from multiple pages into final medical record - FIXED VERSION"""
-    combined_data = {
-        "date_of_receipt": "",
-        "patient_first_name": "",
-        "patient_last_name": "",
-        "patient_dob": "",
-        "patient_gender": "",
-        "patient_primary_phone_number": "",
-        "patient_secondary_phone_number": "",
-        "patient_email": "",
-        "patient_address": "",
-        "patient_zip_code": "",
-        "referral_source": "",
-        "referral_source_phone_no": "",
-        "referral_source_fax_no": "",
-        "referral_source_email": "",
-        "primary_insurance": {
-            "payer_name": "",
-            "member_id": "",
-            "group_id": ""
-        },
-        "secondary_insurance": {
-            "payer_name": None,
-            "member_id": None,
-            "group_id": None
-        },
-        "tertiary_insurance": {
-            "payer_name": None,
-            "member_id": None,
-            "group_id": None
-        },
-        "priority": "",
-        "reason_for_referral": "",
-        "diagnosis_informations": [],
-        "refine_reason": "",
-        "extracted_page_numbers": []
-    }
-    combined_confidence = {}
-    # Combine data from all pages
-    for page_num, page_data in enumerate(pages_data, 1):
-        try:
-            if page_data.get("page_data", {}).get("status") == "success":
-                extracted = page_data["page_data"].get("extracted_data", {})
-                # If we got JSON data, merge it
-                if isinstance(extracted, dict) and "data" in extracted:
-                    page_info = extracted["data"]
-                    # Safely merge each field
-                    for field, value in page_info.items():
-                        safe_merge_field(combined_data, field, value, page_num, combined_data["extracted_page_numbers"])
-                    # Safely merge confidence scores
-                    if "confidence_scores" in extracted:
-                        for field, score in extracted["confidence_scores"].items():
-                            safe_merge_confidence(combined_confidence, field, score)
-        except Exception as e:
-            print(f"Warning: Error processing page {page_num}: {e}")
-            continue
-    return {
-        "data": combined_data,
-        "confidence_scores": combined_confidence,
-        "fields_needing_review": [],
-        "metadata": {
-            "extraction_timestamp": datetime.now().isoformat(),
-            "model_used": "MiniCPM-V-2_6-GPU",
-            "confidence_threshold": 0.9,
-            "requires_human_review": False,
-            "total_pages_processed": len(pages_data)
-        }
-    }
-@spaces.GPU(duration=600)  # 10 minutes for large documents
-def extract_efax_from_pdf(pdf_file, custom_prompt=None):
-    """Main function to process multi-page PDF eFax - ALL GPU processing happens here"""
     try:
         if pdf_file is None:
-            return {
-                "status": "error",
-                "error": "No PDF file provided",
-                "total_pages": 0,
-                "pages_data": []
-            }
-        # Step 1: Convert PDF to images (CPU operation)
         print("Converting PDF to images...")
         images = pdf_to_images(pdf_file)
         if not images:
-            return {
-                "status": "error",
-                "error": "Could not convert PDF to images",
-                "total_pages": 0,
-                "pages_data": []
-            }
-        print(f"Converted {len(images)} pages. Starting GPU processing...")
-        # Step 2: Load model on GPU
         model, tokenizer = load_model()
-        # Step 3: Use custom prompt or default
-        extraction_prompt = custom_prompt if custom_prompt else get_medical_extraction_prompt()
-        # Step 4: Process all pages within single GPU session
-        pages_data = []
         for i, image in enumerate(images):
-            print(f"Processing page {i+1}/{len(images)} on GPU...")
-            page_result = extract_data_from_image(image, extraction_prompt, model, tokenizer)
-            pages_data.append({
                 "page_number": i + 1,
-                "page_data": page_result
             })
-        print("GPU processing complete. Combining results...")
-        # Step 5: Combine data from all pages (with error handling)
-        combined_result = combine_page_data(pages_data)
-        # Final result
-        result = {
-            "status": "success",
             "total_pages": len(images),
-            "pages_data": pages_data,
-            "combined_extraction": combined_result,
-            "model_used": "MiniCPM-V-2_6-ZeroGPU",
-            "hardware": "ZeroGPU",
-            "processing_time": "Within 10-minute GPU session"
         }
-        return result
     except Exception as e:
-        print(f"Error in extract_efax_from_pdf: {e}")
         return {
             "status": "error",
             "error": str(e),
             "total_pages": 0,
-            "pages_data": []
         }
-# Create Gradio Interface
 def create_gradio_interface():
-    with gr.Blocks(title="eFax PDF Data Extractor - Fixed", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🏥 eFax Medical Data Extraction API")
-        gr.Markdown("🚀 **Fixed Version** - Single 10-minute GPU session with proper error handling")
-        with gr.Tab("📄 PDF Upload & Extraction"):
             with gr.Row():
                 with gr.Column():
                     pdf_input = gr.File(
                         file_types=[".pdf"],
-                        label="Upload eFax PDF (up to 20 pages)",
                         file_count="single"
                     )
-                    with gr.Accordion("🔧 Advanced Options", open=False):
                         prompt_input = gr.Textbox(
                             value="",
-                            label="Custom Extraction Prompt (leave empty for default medical extraction)",
-                            lines=5,
-                            placeholder="Leave empty to use optimized medical data extraction prompt..."
                         )
-                    extract_btn = gr.Button("🚀 Extract Medical Data (Fixed)", variant="primary", size="lg")
                     gr.Markdown("""
-                    ### ✅ Bug Fixes Applied
-                    - **Fixed**: Dict/int comparison error
-                    - **Added**: Safe type checking for all operations
-                    - **Improved**: Error handling and logging
-                    - **Single GPU Session**: No more timeouts
                     """)
                 with gr.Column():
                     status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
-                    output = gr.JSON(label="📋 Extracted Medical Data", show_label=True)
         with gr.Tab("🔌 API Usage"):
             gr.Markdown("""
-            ## Fixed API (No More Errors)
             ### Python Usage
             ```
             import requests
             import base64
-            with open("large_medical_fax.pdf", "rb") as f:
                 pdf_b64 = base64.b64encode(f.read()).decode()
             response = requests.post(
                 "https://your-username-extracting-efax.hf.space/api/predict",
                 json={
                     "data": [
-                        {"name": "medical_fax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
-                        ""  # Empty for default prompt
                     ]
                 }
             )
-            # Should work without dict/int comparison errors
             result = response.json()
-            if result["data"]["status"] == "success":
-                medical_data = result["data"]["combined_extraction"]
-                print("Patient:", medical_data["data"]["patient_first_name"])
             ```
             """)
         def process_with_status(pdf_file, custom_prompt):
             if pdf_file is None:
-                return "❌ No PDF file uploaded", {"error": "Please upload a PDF file"}
             yield "📄 Converting PDF to images...", {}
             try:
-                result = extract_efax_from_pdf(pdf_file, custom_prompt if custom_prompt.strip() else None)
                 if result["status"] == "success":
-                    yield f"✅ Successfully processed {result['total_pages']} pages", result
                 else:
-                    yield f"❌ Error: {result.get('error', 'Unknown error')}", result
             except Exception as e:
-                yield f"❌ Processing failed: {str(e)}", {"error": str(e)}
-        # Connect the interface
         extract_btn.click(
             fn=process_with_status,
             inputs=[pdf_input, prompt_input],
@@ -471,7 +553,6 @@ def create_gradio_interface():
     return demo
-# Launch the app
 if __name__ == "__main__":
     demo = create_gradio_interface()
     demo.queue(

 import json
 from huggingface_hub import login
 from pdf2image import convert_from_bytes
 from datetime import datetime
+# Set your HF token
 HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")
 if HF_TOKEN:
     login(token=HF_TOKEN)
 _tokenizer = None
 def load_model():
+    """Load MiniCPM model"""
     global _model, _tokenizer
     if _model is not None and _tokenizer is not None:
         return _model, _tokenizer
 def pdf_to_images(pdf_file):
+    """Convert PDF file to list of PIL images"""
     try:
         if hasattr(pdf_file, 'read'):
             pdf_bytes = pdf_file.read()
         print(f"Error converting PDF to images: {e}")
         return []
+def get_comprehensive_medical_extraction_prompt():
+    """Complete medical data extraction prompt with all fields"""
+    return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.
+Your response MUST follow this exact JSON format:
 {
+  "page_analysis": {
+    "page_contains_text": true,
+    "page_type": "cover_page|patient_demographics|insurance|medical_history|referral_info|other",
+    "overall_page_confidence": 0.0,
+    "all_visible_text": "Complete text transcription of everything visible on this page"
+  },
+  "extracted_data": {
     "date_of_receipt": "",
     "patient_first_name": "",
     "patient_last_name": "",
         "description": ""
       }
     ],
+    "refine_reason": "",
+    "additional_medical_info": "",
+    "provider_names": [],
+    "appointment_dates": [],
+    "medication_info": [],
+    "other_important_details": ""
   },
   "confidence_scores": {
     "date_of_receipt": 0.0,
       "member_id": 0.0,
       "group_id": 0.0
     },
+    "secondary_insurance": {
+      "payer_name": 0.0,
+      "member_id": 0.0,
+      "group_id": 0.0
+    },
+    "tertiary_insurance": {
+      "payer_name": 0.0,
+      "member_id": 0.0,
+      "group_id": 0.0
+    },
     "priority": 0.0,
+    "reason_for_referral": 0.0,
+    "diagnosis_informations": 0.0,
+    "refine_reason": 0.0
+  },
+  "fields_found_on_this_page": [],
+  "metadata": {
+    "extraction_timestamp": "",
+    "model_used": "MiniCPM-V-2_6-GPU",
+    "page_processing_notes": ""
   }
 }
+--------------------------------
+STRICT FIELD FORMATTING RULES:
+--------------------------------
+• Dates: Format as MM/DD/YYYY only
+• Phone numbers: Use digits and hyphens only (e.g., 406-596-1901), no extensions or parentheses
+• Gender: "Male", "Female", or "Other" only
+• Email: Must contain @ and valid domain, otherwise leave empty
+• Zip code: Only extract as last 5 digits of address
+--------------------------------
+REFERRAL SOURCE RULES:
+--------------------------------
+• Extract clinic/hospital/facility name ONLY – never the provider's name
+• Use facility's phone/fax/email, not individual provider's contact
+• Prefer header/fax banner for referral source over body text
+• Do not extract receiver clinic names (e.g., Frontier Psychiatry) as referral source
+--------------------------------
+INSURANCE EXTRACTION FORMAT:
+--------------------------------
+Each tier must follow this structure:
+"primary_insurance": {
+  "payer_name": "string",
+  "member_id": "string",
+  "group_id": "string"
+},
+"secondary_insurance": { ... },
+"tertiary_insurance": { ... }
+• Use "member_id" for any ID (Policy, Insurance ID, Subscriber ID, etc.)
+• Use "group_id" ONLY if explicitly labeled as "Group ID", "Group Number", etc.
+• Leave all fields empty if "Self Pay" is indicated
+--------------------------------
+DIAGNOSIS EXTRACTION RULES:
+--------------------------------
+• Extract diagnosis codes AND their descriptions
+• If only code is present, set description to "" and confidence ≤ 0.6
+• DO NOT infer description from ICD code
+--------------------------------
+CONFIDENCE SCORING:
+--------------------------------
+Assign realistic confidence (0.0–1.0) per field, e.g.:
+• 0.95–1.0 → Clearly labeled, unambiguous data
+• 0.7–0.94 → Some uncertainty (low quality, odd format)
+• 0.0–0.6 → Missing, ambiguous, or noisy data
+• Use float precision (e.g., 0.87, not just 1.0)
+Always populate the `confidence_scores` dictionary with the same structure as `extracted_data`.
+--------------------------------
+CRITICAL INSTRUCTIONS:
+--------------------------------
+1. READ EVERYTHING: Transcribe all visible text in "all_visible_text"
+2. EXTRACT PRECISELY: Only extract what's actually visible on THIS page
+3. NO ASSUMPTIONS: Don't guess or infer information not present
+4. FIELD CLASSIFICATION: List which fields were actually found in "fields_found_on_this_page"
+5. CONFIDENCE: Be realistic - 0.0 if not found, up to 1.0 if completely certain
+6. FORMAT EXACTLY: Follow date/phone/address formatting rules strictly
+7. JSON ONLY: Return only valid JSON, no other text
+This is ONE PAGE of a multi-page document. Extract only what's visible on this specific page."""
+def extract_single_page(image, extraction_prompt, model, tokenizer):
+    """Extract data from a single page with comprehensive medical fields"""
     try:
         if hasattr(image, 'convert'):
             image = image.convert('RGB')
         response = model.chat(
             image=image,
             msgs=[{
             tokenizer=tokenizer,
             sampling=False,
             temperature=0.1,
+            max_new_tokens=4000  # More tokens for comprehensive extraction
         )
+        # Try to parse JSON
         try:
             parsed_data = json.loads(response)
             return {
                 "status": "success",
+                "data": parsed_data,
                 "raw_response": response,
+                "model": "MiniCPM-V-2_6-GPU"
             }
         except json.JSONDecodeError:
+            # Return structured error with raw text
             return {
+                "status": "json_parse_error",
+                "data": {
+                    "page_analysis": {
+                        "page_contains_text": True,
+                        "page_type": "unknown",
+                        "overall_page_confidence": 0.5,
+                        "all_visible_text": response
+                    },
+                    "extracted_data": {},
+                    "confidence_scores": {},
+                    "fields_found_on_this_page": [],
+                    "parsing_error": "Could not parse JSON response"
+                },
                 "raw_response": response,
+                "model": "MiniCPM-V-2_6-GPU",
+                "error": "JSON parsing failed - returned raw text"
             }
     except Exception as e:
         return {
+            "status": "extraction_error",
             "error": str(e),
+            "data": None,
+            "raw_response": ""
         }
+@spaces.GPU(duration=600)  # 10 minutes
+def extract_pages_individually(pdf_file, custom_prompt=None):
+    """Extract each page individually with comprehensive medical data"""
     try:
         if pdf_file is None:
+            return {"status": "error", "error": "No PDF provided"}
+        # Convert PDF to images
         print("Converting PDF to images...")
         images = pdf_to_images(pdf_file)
         if not images:
+            return {"status": "error", "error": "Could not convert PDF"}
+        print(f"Processing {len(images)} pages individually with comprehensive extraction...")
+        # Load model once
         model, tokenizer = load_model()
+        extraction_prompt = custom_prompt or get_comprehensive_medical_extraction_prompt()
+        # Process each page independently
+        results = []
+        successful_extractions = 0
         for i, image in enumerate(images):
+            print(f"Extracting page {i+1}/{len(images)} with full medical fields...")
+            page_result = extract_single_page(image, extraction_prompt, model, tokenizer)
+            if page_result["status"] == "success":
+                successful_extractions += 1
+            results.append({
                 "page_number": i + 1,
+                "extraction_result": page_result,
+                "timestamp": datetime.now().isoformat()
             })
+        return {
+            "status": "success",
             "total_pages": len(images),
+            "successful_extractions": successful_extractions,
+            "individual_pages": results,
+            "processing_info": {
+                "model_used": "MiniCPM-V-2_6-GPU",
+                "extraction_timestamp": datetime.now().isoformat(),
+                "processing_method": "comprehensive_individual_page_extraction",
+                "extraction_prompt_used": "comprehensive_medical_fields",
+                "note": "Each page processed with full medical field extraction - combine results with separate AI"
+            },
+            "next_step_instructions": {
+                "combination_method": "Use ChatGPT/Claude to combine all pages into final medical record",
+                "fields_to_aggregate": [
+                    "date_of_receipt", "patient_demographics", "insurance_info",
+                    "referral_source", "diagnosis_codes", "reason_for_referral"
+                ],
+                "confidence_handling": "Take highest confidence values across pages for each field"
+            }
         }
     except Exception as e:
         return {
             "status": "error",
             "error": str(e),
             "total_pages": 0,
+            "individual_pages": []
         }
 def create_gradio_interface():
+    with gr.Blocks(title="Comprehensive Medical Page Extractor", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🏥 Comprehensive Medical Data Extractor")
+        gr.Markdown("📋 **Complete Field Extraction** - All medical fields extracted per page, ready for AI combination")
+        with gr.Tab("📄 Comprehensive Page Extraction"):
             with gr.Row():
                 with gr.Column():
                     pdf_input = gr.File(
                         file_types=[".pdf"],
+                        label="Upload Medical eFax PDF",
                         file_count="single"
                     )
+                    with gr.Accordion("🔧 Custom Prompt", open=False):
                         prompt_input = gr.Textbox(
                             value="",
+                            label="Custom Extraction Prompt (optional)",
+                            lines=4,
+                            placeholder="Leave empty for comprehensive medical extraction with all fields..."
                         )
+                    extract_btn = gr.Button("🏥 Extract All Medical Fields Per Page", variant="primary", size="lg")
                     gr.Markdown("""
+                    ### 📋 Comprehensive Fields Extracted:
+                    - ✅ **Patient Demographics** (name, DOB, gender, address, phone, email)
+                    - ✅ **Insurance Information** (primary/secondary/tertiary with IDs)
+                    - ✅ **Referral Source** (clinic, phone, fax, email)
+                    - ✅ **Medical Codes** (diagnosis codes with descriptions)
+                    - ✅ **Clinical Info** (priority, reason for referral, medical history)
+                    - ✅ **Confidence Scores** (0.0-1.0 for each field)
+                    - ✅ **Full Text Transcription** (everything visible on each page)
                     """)
                 with gr.Column():
                     status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
+                    output = gr.JSON(label="📋 Comprehensive Page Results", show_label=True)
         with gr.Tab("🔌 API Usage"):
             gr.Markdown("""
+            ## Comprehensive Medical Extraction API
             ### Python Usage
             ```
             import requests
             import base64
+            with open("medical_efax.pdf", "rb") as f:
                 pdf_b64 = base64.b64encode(f.read()).decode()
             response = requests.post(
                 "https://your-username-extracting-efax.hf.space/api/predict",
                 json={
                     "data": [
+                        {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
+                        ""  # Custom prompt (optional)
                     ]
                 }
             )
             result = response.json()
+            # Access comprehensive page results
+            for page in result["data"]["individual_pages"]:
+                page_num = page["page_number"]
+                extraction = page["extraction_result"]
+                if extraction["status"] == "success":
+                    data = extraction["data"]
+                    # Page analysis
+                    print(f"Page {page_num} Type: {data['page_analysis']['page_type']}")
+                    print(f"Confidence: {data['page_analysis']['overall_page_confidence']}")
+                    # Extracted medical fields
+                    extracted = data['extracted_data']
+                    print(f"Patient: {extracted['patient_first_name']} {extracted['patient_last_name']}")
+                    print(f"Insurance: {extracted['primary_insurance']['payer_name']}")
+                    print(f"Diagnosis: {extracted['diagnosis_informations']}")
+                    # Fields found on this page
+                    print(f"Fields found: {data['fields_found_on_this_page']}")
+            ```
+            ### Use ChatGPT/Claude for Final Combination
+            ```
+            # Prepare all page data for combination
+            all_pages_data = []
+            for page in result["data"]["individual_pages"]:
+                if page["extraction_result"]["status"] == "success":
+                    all_pages_data.append({
+                        "page": page["page_number"],
+                        "extracted_data": page["extraction_result"]["data"]["extracted_data"],
+                        "confidence_scores": page["extraction_result"]["data"]["confidence_scores"],
+                        "fields_found": page["extraction_result"]["data"]["fields_found_on_this_page"]
+                    })
+            # Send to ChatGPT for combination
+            combination_prompt = f'''
+            Combine these {len(all_pages_data)} medical document pages into a single comprehensive patient record.
+            For each field, choose the value with highest confidence across all pages.
+            If multiple pages have the same field, verify consistency.
+            Page Data:
+            {json.dumps(all_pages_data, indent=2)}
+            Return the final medical record in the same structure with:
+            - Combined data from all pages
+            - Highest confidence scores per field
+            - List of pages where each field was found
+            - Fields needing human review (confidence < 0.9)
+            '''
             ```
             """)
+        with gr.Tab("📊 Field Mapping"):
+            gr.Markdown("""
+            ## Complete Medical Fields Extracted Per Page
+            ### Patient Demographics
+            - `date_of_receipt` - Document receipt date (MM/DD/YYYY)
+            - `patient_first_name` - Patient's first name
+            - `patient_last_name` - Patient's last name
+            - `patient_dob` - Date of birth (MM/DD/YYYY)
+            - `patient_gender` - Male/Female/Other only
+            - `patient_primary_phone_number` - Main phone (###-###-####)
+            - `patient_secondary_phone_number` - Secondary phone
+            - `patient_email` - Email address (must have @ and domain)
+            - `patient_address` - Full address
+            - `patient_zip_code` - Last 5 digits only
+            ### Referral Information
+            - `referral_source` - Clinic/hospital name (NOT provider name)
+            - `referral_source_phone_no` - Facility phone
+            - `referral_source_fax_no` - Facility fax
+            - `referral_source_email` - Facility email
+            ### Insurance (Primary/Secondary/Tertiary)
+            - `payer_name` - Insurance company name
+            - `member_id` - Any ID (policy, subscriber, member, etc.)
+            - `group_id` - Only if explicitly labeled as "Group"
+            ### Medical Information
+            - `priority` - "Routine" or "Urgent" only
+            - `reason_for_referral` - Why patient was referred
+            - `diagnosis_informations` - Array of {code, description}
+            - `refine_reason` - Additional refinement details
+            ### Page Analysis
+            - `page_type` - Classification of page content
+            - `all_visible_text` - Complete text transcription
+            - `overall_page_confidence` - Page extraction confidence
+            - `fields_found_on_this_page` - List of fields with data
+            ### Confidence Scoring (0.0 - 1.0)
+            - `0.95-1.0` → Clearly visible, unambiguous
+            - `0.7-0.94` → Some uncertainty, formatting issues
+            - `0.0-0.6` → Missing, unclear, or poor quality
+            """)
         def process_with_status(pdf_file, custom_prompt):
             if pdf_file is None:
+                return "❌ No PDF uploaded", {"error": "Upload a PDF file"}
             yield "📄 Converting PDF to images...", {}
             try:
+                result = extract_pages_individually(pdf_file, custom_prompt if custom_prompt.strip() else None)
                 if result["status"] == "success":
+                    yield f"✅ Extracted comprehensive medical data from {result['successful_extractions']}/{result['total_pages']} pages", result
                 else:
+                    yield f"❌ Error: {result.get('error')}", result
             except Exception as e:
+                yield f"❌ Failed: {str(e)}", {"error": str(e)}
         extract_btn.click(
             fn=process_with_status,
             inputs=[pdf_input, prompt_input],
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
     demo.queue(