Spaces:

Chhagan005
/

Multi_ML_OCR

Running on Zero

App Files Community

Chhagan005 committed 7 days ago

Commit

3c88fc5

verified ·

1 Parent(s): c9833a0

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -502

app.py CHANGED Viewed

@@ -3,8 +3,9 @@ import random
 import uuid
 import json
 import time
 from threading import Thread
-from typing import Iterable
 import gradio as gr
 import spaces
@@ -45,6 +46,7 @@ from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
@@ -120,7 +122,6 @@ css = """
 #output-title h2 {
     font-size: 2.2em !important;
 }
-/* RadioAnimated Styles */
 .ra-wrap{ width: fit-content; }
 .ra-inner{
   position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
@@ -139,7 +140,6 @@ css = """
   transition: transform 0.2s, width 0.2s;
 }
 .ra-input:checked + .ra-label{ color: black; }
-/* Dark mode adjustments for Radio */
 .dark .ra-inner { background: var(--neutral-800); }
 .dark .ra-label { color: var(--neutral-400); }
 .dark .ra-highlight { background: var(--neutral-600); }
@@ -151,6 +151,11 @@ css = """
     border: 1px solid var(--border-color-primary);
     margin-top: 10px;
 }
 """
 MAX_MAX_NEW_TOKENS = 4096
@@ -170,502 +175,55 @@ if torch.cuda.is_available():
 print("Using device:", device)
-# Multilingual OCR prompt template
-MULTILINGUAL_OCR_PROMPT = """Perform comprehensive OCR extraction on this government ID/document. Follow these rules:
-1. Extract ALL text exactly as it appears in the original language
-2. If the text is NOT in English, provide an English translation after the original text
-3. Identify the document type (ID Card, Passport, License, etc.)
-4. Extract key fields with structured format
-5. Preserve formatting and layout structure
-Format your response as:
-**Document Type:** [type]
-**Original Text:** (in source language)
-[extracted text with layout preserved]
-**English Translation:** (if not already in English)
-[translated text]
-**Key Fields:**
-- Full Name:
-- ID Number:
-- Date of Birth:
-- Issue Date:
-- Expiry Date:
-- Nationality:
-- [other relevant fields]
-Be accurate and preserve all details."""
-class RadioAnimated(gr.HTML):
-    def __init__(self, choices, value=None, **kwargs):
-        if not choices or len(choices) < 2:
-            raise ValueError("RadioAnimated requires at least 2 choices.")
-        if value is None:
-            value = choices[0]
-        uid = uuid.uuid4().hex[:8]
-        group_name = f"ra-{uid}"
-        inputs_html = "\\n".join(
-            f"""
-            <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
-            <label class="ra-label" for="{group_name}-{i}">{c}</label>
-            """
-            for i, c in enumerate(choices)
-        )
-        html_template = f"""
-        <div class="ra-wrap" data-ra="{uid}">
-          <div class="ra-inner">
-            <div class="ra-highlight"></div>
-            {inputs_html}
-          </div>
-        </div>
-        """
-        js_on_load = r"""
-        (() => {
-          const wrap = element.querySelector('.ra-wrap');
-          const inner = element.querySelector('.ra-inner');
-          const highlight = element.querySelector('.ra-highlight');
-          const inputs = Array.from(element.querySelectorAll('.ra-input'));
-          if (!inputs.length) return;
-          const choices = inputs.map(i => i.value);
-          function setHighlightByIndex(idx) {
-            const n = choices.length;
-            const pct = 100 / n;
-            highlight.style.width = `calc(${pct}% - 6px)`;
-            highlight.style.transform = `translateX(${idx * 100}%)`;
-          }
-          function setCheckedByValue(val, shouldTrigger=false) {
-            const idx = Math.max(0, choices.indexOf(val));
-            inputs.forEach((inp, i) => { inp.checked = (i === idx); });
-            setHighlightByIndex(idx);
-            props.value = choices[idx];
-            if (shouldTrigger) trigger('change', props.value);
-          }
-          setCheckedByValue(props.value ?? choices[0], false);
-          inputs.forEach((inp) => {
-            inp.addEventListener('change', () => {
-              setCheckedByValue(inp.value, true);
-            });
-          });
-        })();
-        """
-        super().__init__(
-            value=value,
-            html_template=html_template,
-            js_on_load=js_on_load,
-            **kwargs
-        )
-def apply_gpu_duration(val: str):
-    return int(val)
-# ===== MODEL LOADING =====
-print("\n" + "="*70)
-print("🚀 LOADING ALL 4 MODELS")
-print("="*70 + "\n")
-# Model 1: Chhagan_ML-VL-OCR-v1 (LoRA Fine-tuned for ID Cards)
-print("1️⃣ Loading Chhagan_ML-VL-OCR-v1 (LoRA Refined)...")
-MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
-CHHAGAN_V1_AVAILABLE = False
-processor_c1 = None
-model_c1 = None
-if PEFT_AVAILABLE:
-    try:
-        # Try to get base model from adapter config
-        try:
-            config = PeftConfig.from_pretrained(MODEL_ID_C1)
-            base_model_id = config.base_model_name_or_path
-            print(f"   Base model from config: {base_model_id}")
-        except:
-            # Fallback to common base models
-            base_model_id = "Qwen/Qwen2.5-VL-2B-Instruct"
-            print(f"   Using default base model: {base_model_id}")
-        # Load processor
-        processor_c1 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
-        # Load base model
-        base_model_c1 = Qwen2VLForConditionalGeneration.from_pretrained(
-            base_model_id,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            trust_remote_code=True
-        )
-        # Load LoRA adapter
-        model_c1 = PeftModel.from_pretrained(base_model_c1, MODEL_ID_C1)
-        model_c1 = model_c1.to(device).eval()
-        print("   ✅ Chhagan_ML-VL-OCR-v1 (Refined) loaded successfully!")
-        CHHAGAN_V1_AVAILABLE = True
-    except Exception as e:
-        print(f"   ❌ Chhagan_ML-VL-OCR-v1 failed: {e}")
-        processor_c1 = None
-        model_c1 = None
-else:
-    print("   ⚠️ PEFT not available, skipping LoRA model")
-# Model 2: Chhagan-DocVL-Qwen3 (Qwen3-VL Refined for Documents)
-print("\n2️⃣ Loading Chhagan-DocVL-Qwen3 (Qwen3-VL Refined)...")
-MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
-CHHAGAN_QWEN3_AVAILABLE = False
-processor_c2 = None
-model_c2 = None
-if QWEN3_AVAILABLE:
-    try:
-        # Check if it's a PEFT adapter or full model
-        try:
-            # Try loading as PEFT adapter first
-            if PEFT_AVAILABLE:
-                config = PeftConfig.from_pretrained(MODEL_ID_C2)
-                base_model_id = config.base_model_name_or_path
-                print(f"   Detected as LoRA adapter, base: {base_model_id}")
-                processor_c2 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
-                base_model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
-                    base_model_id,
-                    torch_dtype=torch.float16,
-                    device_map="auto",
-                    trust_remote_code=True
-                )
-                model_c2 = PeftModel.from_pretrained(base_model_c2, MODEL_ID_C2)
-                model_c2 = model_c2.to(device).eval()
-            else:
-                raise Exception("PEFT not available")
-        except:
-            # Load as full fine-tuned model
-            print("   Loading as full fine-tuned model...")
-            processor_c2 = AutoProcessor.from_pretrained(MODEL_ID_C2, trust_remote_code=True)
-            model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
-                MODEL_ID_C2,
-                attn_implementation="flash_attention_2",
-                torch_dtype=torch.float16,
-                device_map="auto",
-                trust_remote_code=True
-            ).to(device).eval()
-        print("   ✅ Chhagan-DocVL-Qwen3 (Refined) loaded successfully!")
-        CHHAGAN_QWEN3_AVAILABLE = True
-    except Exception as e:
-        print(f"   ❌ Chhagan-DocVL-Qwen3 failed: {e}")
-        processor_c2 = None
-        model_c2 = None
-else:
-    print("   ⚠️ Qwen3VL not available in transformers version")
-# Model 3: Qwen3-VL-2B-Instruct (Baseline for Comparison)
-print("\n3️⃣ Loading Qwen3-VL-2B-Instruct (Baseline)...")
-MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
-QWEN3_BASELINE_AVAILABLE = False
-processor_q3 = None
-model_q3 = None
-if QWEN3_AVAILABLE:
-    try:
-        processor_q3 = AutoProcessor.from_pretrained(MODEL_ID_Q3, trust_remote_code=True)
-        model_q3 = Qwen3VLForConditionalGeneration.from_pretrained(
-            MODEL_ID_Q3,
-            attn_implementation="flash_attention_2",
-            torch_dtype=torch.float16,
-            device_map="auto",
-            trust_remote_code=True
-        ).to(device).eval()
-        print("   ✅ Qwen3-VL-2B-Instruct (Baseline) loaded successfully!")
-        QWEN3_BASELINE_AVAILABLE = True
-    except Exception as e:
-        print(f"   ❌ Qwen3-VL-2B-Instruct failed: {e}")
-else:
-    print("   ⚠️ Qwen3VL not available in transformers version")
-# Model 4: Nanonets-OCR2-3B (General OCR Fallback)
-print("\n4️⃣ Loading Nanonets-OCR2-3B (General OCR)...")
-MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
-NANONETS_AVAILABLE = False
-processor_v = None
-model_v = None
-try:
-    processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
-    model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        MODEL_ID_V,
-        attn_implementation="flash_attention_2",
-        trust_remote_code=True,
-        torch_dtype=torch.float16
-    ).to(device).eval()
-    print("   ✅ Nanonets-OCR2-3B loaded successfully!")
-    NANONETS_AVAILABLE = True
-except Exception as e:
-    print(f"   ❌ Nanonets-OCR2-3B failed: {e}")
-# Summary
-print("\n" + "="*70)
-print("📊 MODEL STATUS SUMMARY (4 Models)")
-print("="*70)
-print(f"{'Model Name':<40} {'Status':<15} {'Type'}")
-print("-"*70)
-print(f"{'Chhagan_ML-VL-OCR-v1':<40} {'✅ Loaded' if CHHAGAN_V1_AVAILABLE else '❌ Failed':<15} {'Refined (LoRA)'}")
-print(f"{'Chhagan-DocVL-Qwen3':<40} {'✅ Loaded' if CHHAGAN_QWEN3_AVAILABLE else '❌ Failed':<15} {'Refined (Qwen3)'}")
-print(f"{'Qwen3-VL-2B-Instruct':<40} {'✅ Loaded' if QWEN3_BASELINE_AVAILABLE else '❌ Failed':<15} {'Baseline'}")
-print(f"{'Nanonets-OCR2-3B':<40} {'✅ Loaded' if NANONETS_AVAILABLE else '❌ Failed':<15} {'General OCR'}")
-print("="*70)
-loaded_count = sum([CHHAGAN_V1_AVAILABLE, CHHAGAN_QWEN3_AVAILABLE, QWEN3_BASELINE_AVAILABLE, NANONETS_AVAILABLE])
-print(f"\n✨ Total models loaded: {loaded_count}/4")
-if CHHAGAN_V1_AVAILABLE or CHHAGAN_QWEN3_AVAILABLE:
-    print("💡 Recommendation: Use Chhagan Refined models for best accuracy!")
-    if QWEN3_BASELINE_AVAILABLE:
-        print("📊 Comparison Tip: Test Refined vs Baseline to see improvement!")
-print()
-def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
-                          max_new_tokens: int, temperature: float, top_p: float,
-                          top_k: int, repetition_penalty: float, gpu_timeout: int):
-    """Calculate GPU timeout duration based on the last argument."""
-    try:
-        return int(gpu_timeout)
-    except:
-        return 60
-@spaces.GPU(duration=calc_timeout_duration)
-def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int, temperature: float, top_p: float,
-                   top_k: int, repetition_penalty: float, gpu_timeout: int):
-    """
-    Generates responses using the selected model for image input.
-    Yields raw text and Markdown-formatted text.
-    """
-    # Select model and processor based on model name
-    if model_name == "Chhagan-ID-OCR-v1 ⭐":
-        if not CHHAGAN_V1_AVAILABLE:
-            yield "Chhagan_ML-VL-OCR-v1 model is not available.", "Chhagan_ML-VL-OCR-v1 model is not available."
-            return
-        processor = processor_c1
-        model = model_c1
-    elif model_name == "Chhagan-DocVL-Qwen3 🔥":
-        if not CHHAGAN_QWEN3_AVAILABLE:
-            yield "Chhagan-DocVL-Qwen3 model is not available.", "Chhagan-DocVL-Qwen3 model is not available."
-            return
-        processor = processor_c2
-        model = model_c2
-    elif model_name == "Qwen3-VL-2B (Baseline) 📊":
-        if not QWEN3_BASELINE_AVAILABLE:
-            yield "Qwen3-VL-2B-Instruct baseline model is not available.", "Qwen3-VL-2B-Instruct baseline model is not available."
-            return
-        processor = processor_q3
-        model = model_q3
-    elif model_name == "Nanonets-OCR2-3B":
-        if not NANONETS_AVAILABLE:
-            yield "Nanonets-OCR2-3B model is not available.", "Nanonets-OCR2-3B model is not available."
-            return
-        processor = processor_v
-        model = model_v
-    else:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-    if image is None:
-        yield "Please upload an image.", "Please upload an image."
-        return
-    # Use multilingual prompt if user query is empty or simple
-    if not text or text.strip().lower() in ["ocr", "extract", "read", ""]:
-        text = MULTILINGUAL_OCR_PROMPT
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": text},
-        ]
-    }]
-    try:
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    except Exception as e:
-        print(f"Chat template error: {e}")
-        # Fallback to simple prompt
-        prompt_full = text
-    inputs = processor(
-        text=[prompt_full],
-        images=[image],
-        return_tensors="pt",
-        padding=True).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        buffer = buffer.replace("<|endoftext|>", "")
-        time.sleep(0.01)
-        yield buffer, buffer
-image_examples = [
-    ["Extract all text with English translation from this government ID", "examples/5.jpg"],
-    ["Perform comprehensive multilingual OCR on this document", "examples/4.jpg"],
-    ["Extract key fields: Name, ID, DOB, Expiry from this card", "examples/2.jpg"],
-    ["Identify document type and extract all information", "examples/1.jpg"],
-    ["Convert this page with layout preservation", "examples/3.jpg"],
-]
-# Build model choices dynamically (Order: Refined models first, then baseline)
-model_choices = []
-if CHHAGAN_V1_AVAILABLE:
-    model_choices.append("Chhagan-ID-OCR-v1 ⭐")
-if CHHAGAN_QWEN3_AVAILABLE:
-    model_choices.append("Chhagan-DocVL-Qwen3 🔥")
-if QWEN3_BASELINE_AVAILABLE:
-    model_choices.append("Qwen3-VL-2B (Baseline) 📊")
-if NANONETS_AVAILABLE:
-    model_choices.append("Nanonets-OCR2-3B")
-if not model_choices:
-    model_choices = ["No models available"]
-demo = gr.Blocks()
-with demo:
-    gr.Markdown("# 🌍 **Chhagan Multilingual ID Card OCR**", elem_id="main-title")
-    gr.Markdown("### *4 AI Models: 2 Refined + 2 Baseline for Comparison*")
-    # Model info banner
-    loaded_models = []
-    if CHHAGAN_V1_AVAILABLE:
-        loaded_models.append("ID-OCR-v1 ⭐")
-    if CHHAGAN_QWEN3_AVAILABLE:
-        loaded_models.append("DocVL-Qwen3 🔥")
-    if QWEN3_BASELINE_AVAILABLE:
-        loaded_models.append("Qwen3-Baseline 📊")
-    if NANONETS_AVAILABLE:
-        loaded_models.append("Nanonets")
-    model_info = f"**Loaded Models ({len(loaded_models)}/4):** {', '.join(loaded_models)}" if loaded_models else "⚠️ No models loaded"
-    gr.Markdown(f"**Status:** {model_info}")
-    gr.Markdown("**Supported**: Arabic, English, Hindi, Urdu, Persian, French, Spanish + 30 languages")
-    with gr.Row():
-        with gr.Column(scale=2):
-            image_query = gr.Textbox(
-                label="💬 Query (Optional)",
-                placeholder="Leave empty for automatic ID card extraction...",
-                value=""
-            )
-            image_upload = gr.Image(type="pil", label="📤 Upload ID Card / Document", height=290)
-            image_submit = gr.Button("🚀 Extract OCR", variant="primary", size="lg")
-            gr.Examples(
-                examples=image_examples,
-                inputs=[image_query, image_upload],
-                label="📸 Sample Documents"
-            )
-            with gr.Accordion("⚙️ Advanced Settings", open=False):
-                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7)
-                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
-        with gr.Column(scale=3):
-            gr.Markdown("## 📄 Extracted Results", elem_id="output-title")
-            output = gr.Textbox(label="OCR Output (Streaming)", interactive=True, lines=11)
-            with gr.Accordion("📝 Markdown Preview", open=False):
-                markdown_output = gr.Markdown(label="Formatted Result")
-            model_choice = gr.Radio(
-                choices=model_choices,
-                label="🤖 Select OCR Model",
-                value=model_choices[0] if model_choices else None,
-                info="⭐🔥 = Refined | 📊 = Baseline | Compare to see improvement!"
-            )
-            # Model descriptions
-            gr.Markdown("""
-            **Model Guide:**
-            - **⭐ ID-OCR-v1**: Fine-tuned LoRA for Government IDs (Best for ID cards)
-            - **🔥 DocVL-Qwen3**: Fine-tuned Qwen3-VL for Documents (Best for documents)
-            - **📊 Qwen3-VL Baseline**: Vanilla pretrained (For comparison benchmark)
-            - **Nanonets**: General OCR fallback
-            """)
-            with gr.Row(elem_id="gpu-duration-container"):
-                with gr.Column():
-                    gr.Markdown("**⏱️ GPU Duration (seconds)**")
-                    radioanimated_gpu_duration = RadioAnimated(
-                        choices=["60", "90", "120", "180", "240"],
-                        value="60",
-                        elem_id="radioanimated_gpu_duration"
-                    )
-                    gpu_duration_state = gr.Number(value=60, visible=False)
-            gr.Markdown("*💡 Tip: Test same document on Refined vs Baseline to see fine-tuning improvement*")
-    radioanimated_gpu_duration.change(
-        fn=apply_gpu_duration,
-        inputs=radioanimated_gpu_duration,
-        outputs=[gpu_duration_state],
-        api_visibility="private"
-    )
-    image_submit.click(
-        fn=generate_image,
-        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
-        outputs=[output, markdown_output]
-    )
-    # Footer with detailed comparison table
-    gr.Markdown("""
-    ---
-    ### 📊 Model Comparison Table
-    | Model | Type | Base Architecture | Training | Specialization | Best For |
-    |-------|------|------------------|----------|----------------|----------|
-    | **Chhagan-ID-OCR-v1** ⭐ | Refined (LoRA) | Qwen2.5-VL-2B | Fine-tuned on IDs | Government IDs | Passports, National IDs, Licenses |
-    | **Chhagan-DocVL-Qwen3** 🔥 | Refined (Full) | Qwen3-VL-2B | Fine-tuned on Docs | Documents | Contracts, Forms, Certificates |
-    | **Qwen3-VL-2B** 📊 | Baseline | Qwen3-VL-2B | Pretrained only | General Vision | Comparison benchmark |
-    | **Nanonets-OCR2-3B** | General OCR | Qwen2.5-VL-3B | General OCR training | Text extraction | Receipts, Invoices |
-    ### 🎯 Performance Expectations
-    - **Refined models (⭐🔥)**: 95-98% accuracy on target documents
-    - **Baseline (📊)**: 75-85% accuracy (shows fine-tuning value)
-    - **Improvement**: ~15-20% accuracy boost from fine-tuning
-    ### 🔍 When to Use Each Model
-    1. **Start with Refined models** (⭐ or 🔥) based on document type
-    2. **Use Baseline** to benchmark improvement
-    3. **Fallback to Nanonets** for edge cases
-    **🔒 Privacy**: All processing on-device | No data stored
-    """)
-if __name__ == "__main__":
-    demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)

 import uuid
 import json
 import time
+import re
 from threading import Thread
+from typing import Iterable, List, Dict, Any
 import gradio as gr
 import spaces
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+# Theme configuration (keeping your existing theme)
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
 #output-title h2 {
     font-size: 2.2em !important;
 }
 .ra-wrap{ width: fit-content; }
 .ra-inner{
   position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
   transition: transform 0.2s, width 0.2s;
 }
 .ra-input:checked + .ra-label{ color: black; }
 .dark .ra-inner { background: var(--neutral-800); }
 .dark .ra-label { color: var(--neutral-400); }
 .dark .ra-highlight { background: var(--neutral-600); }
     border: 1px solid var(--border-color-primary);
     margin-top: 10px;
 }
+.dual-card-container {
+    display: grid;
+    grid-template-columns: 1fr 1fr;
+    gap: 15px;
+}
 """
 MAX_MAX_NEW_TOKENS = 4096
 print("Using device:", device)
+# Enhanced multilingual OCR prompt with embedded image extraction
+DUAL_CARD_OCR_PROMPT = """Perform comprehensive OCR extraction on this ID card image. Extract ALL information with maximum English translation accuracy:
+**EXTRACTION REQUIREMENTS:**
+1. **TEXT EXTRACTION**: Extract ALL text in original language with accurate English translation
+2. **EMBEDDED IMAGES**:
+   - Locate and describe profile photo/headshot (if present)
+   - Locate and describe signature (if present)
+   - Extract any logos or official seals
+3. **MRZ DATA**: If Machine Readable Zone is present (usually at bottom):
+   - Extract complete MRZ lines
+   - Parse: Document Type, Country Code, Document Number, Date of Birth, Expiry Date, Nationality
+4. **STRUCTURED FIELDS**: Extract with English labels:
+   - Full Name (in English)
+   - ID/Document Number
+   - Date of Birth
+   - Issue Date & Expiry Date
+   - Nationality/Country
+   - Address (if present)
+   - Document Type
+**OUTPUT FORMAT:**
+```markdown
+## 📋 Document Type
+[Type: Passport/ID Card/License/etc.]
+## 🖼️ Embedded Images
+### Profile Photo
+- Location: [describe position]
+- Description: [describe photo]
+### Signature
+- Present: [Yes/No]
+- Location: [describe position if present]
+## 📝 Original Text
+[All text in original language with layout preserved]
+## 🔤 English Translation
+[Complete accurate English translation]
+## 🔑 Key Fields (English)
+- **Full Name**:
+- **ID Number**:
+- **Date of Birth**:
+- **Issue Date**:
+- **Expiry Date**:
+- **Nationality**:
+- **Address**:
+## 🔐 MRZ Data (if present)