Spaces:

vithacocf
/

ocr

Sleeping

App Files Files Community

vithacocf committed 17 days ago

Commit

86ba016

verified ·

1 Parent(s): 2364e8e

Update app.py

Browse files

Files changed (1) hide show

app.py +522 -165

app.py CHANGED Viewed

@@ -1,232 +1,589 @@
-# =========================
-# CAMEL-DOC-OCR (HF Spaces SAFE)
-# Single-file – NO CUDA init at global scope
-# =========================
 import os
 import gc
 import torch
-import fitz
 import gradio as gr
 import spaces
-from PIL import Image
 from transformers import AutoProcessor, BitsAndBytesConfig
 from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
-# =========================
-# CONFIG
-# =========================
-MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
-DPI = 150
-MAX_IMAGE_SIZE = 2048
-# =========================
-# TORCH FLAGS (SAFE FOR SPACES)
-# =========================
-torch.set_grad_enabled(False)
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-# =========================
-# LOAD MODEL (NO CUDA INIT HERE)
-# =========================
 bnb = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.float16,
-)
-processor = AutoProcessor.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True
 )
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     quantization_config=bnb,
-    device_map="auto",            # HF Spaces will inject GPU here
-    torch_dtype=torch.float16,
     trust_remote_code=True
 ).eval()
 processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
-# =========================
-# PDF → IMAGE (FAST & SAFE)
-# =========================
-def pdf_to_images(pdf_bytes):
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    images = []
-    scale = DPI / 72.0
-    mat = fitz.Matrix(scale, scale)
-    for page in doc:
-        pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
-        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        if max(img.size) > MAX_IMAGE_SIZE:
-            img.thumbnail((MAX_IMAGE_SIZE, MAX_IMAGE_SIZE), Image.Resampling.LANCZOS)
-        images.append(img)
-    return images
-# =========================
-# OCR INFERENCE (CUDA ONLY HERE)
-# =========================
 @spaces.GPU
-def run_inference(image, prompt, max_new_tokens):
-    if image.mode != "RGB":
-        image = image.convert("RGB")
     messages = [{
         "role": "user",
         "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": prompt}
         ]
     }]
     text_prompt = processor.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
     )
     inputs = processor(
-                            text=[text_prompt],
-                            images=[image],
-                            return_tensors="pt",
-                            truncation=False,        # 🔴 BẮT BUỘC
-                            padding="longest"        # 🔴 BẮT BUỘC
-                        ).to(model.device)
-    with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
-        outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
             do_sample=False,
-            use_cache=True,
             eos_token_id=processor.tokenizer.eos_token_id
         )
-    outputs = outputs[:, inputs["input_ids"].shape[1]:]
-    return processor.tokenizer.decode(
-        outputs[0],
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=True
-    ).strip()
-# =========================
-# FILE HANDLER
-# =========================
-def handle_file(file, prompt, max_new_tokens, progress=gr.Progress()):
-    file_path = file.name
-    ext = file_path.lower().split(".")[-1]
-    prompt = prompt.strip()
-    if ext == "pdf":
-        with open(file_path, "rb") as f:
-            images = pdf_to_images(f.read())
-        results = []
-        for i, img in enumerate(images):
-            text = run_inference(img, prompt, max_new_tokens)
-            results.append(text)
-            progress((i + 1) / len(images), desc=f"Page {i+1}/{len(images)}")
-        return "\n\n--- PAGE BREAK ---\n\n".join(results)
-    else:
-        img = Image.open(file_path)
-        return run_inference(img, prompt, max_new_tokens)
-# =========================
-# DEFAULT PROMPT (CAMEL OCR)
-# =========================
-DEFAULT_PROMPT = """
-You are an OCR + Information Extraction engine.
-Extract data strictly from the document.
-Return JSON ONLY. NO explanation.
-OUTPUT FORMAT:
-{
-  "price": "",
-  "vat": "",
-  "invoiceNo": "",
-  "invoiceDate": "",
-  "billingToTaxCode": "",
-  "accountingObjectTaxCode": "",
-  "description": ""
 }
-""".strip()
-# =========================
-# GRADIO UI
-# =========================
-with gr.Blocks(title="Camel-Doc-OCR") as demo:
-    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL – 4bit, HF Spaces Safe)")
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
-                label="Upload Image / PDF",
-                file_types=[".jpg", ".jpeg", ".png", ".pdf"]
             )
             prompt_input = gr.Textbox(
-                label="Prompt",
-                value=DEFAULT_PROMPT,
-                lines=10
             )
-            max_tokens = gr.Radio(
-                [256, 512, 1024, 2048],
                 value=512,
-                label="Max new tokens"
             )
-            run_btn = gr.Button("🚀 Run OCR", variant="primary")
         with gr.Column(scale=1):
-            output = gr.Textbox(
-                label="Result",
-                lines=20
             )
     run_btn.click(
         fn=handle_file,
-        inputs=[file_input, prompt_input, max_tokens],
-        outputs=output
     )
-# =========================
-# CLEANUP
-# =========================
-def cleanup():
-    torch.cuda.empty_cache()
-    gc.collect()
-# =========================
-# LAUNCH
-# =========================
 if __name__ == "__main__":
     demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True
-    )

 import os
+import json
+import re
+import hashlib
 import gc
+from io import BytesIO
+from collections import OrderedDict
+from PIL import Image, UnidentifiedImageError
 import torch
+from transformers import AutoProcessor, BitsAndBytesConfig
+from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+from pdf2image import convert_from_bytes
 import gradio as gr
+import fitz
 import spaces
+# --- CONFIGURATION ---
+# NOTE: this first group of settings is immediately followed by a second
+# group that reassigns MODEL_ID, CACHE_MAX_SIZE, DPI, IMAGE_MAX_DIM,
+# JPEG_QUALITY and GPU_MEMORY_FRACTION. Only THREAD_COUNT and PAD_TOKEN_ID
+# keep the values assigned in this first group.
+MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
+CACHE_MAX_SIZE = 128
+DPI = 100
+THREAD_COUNT = 4
+IMAGE_MAX_DIM = 1024
+JPEG_QUALITY = 75
+GPU_MEMORY_FRACTION = 0.8  # use 80% of GPU memory
+PAD_TOKEN_ID = None  # set later to avoid warnings
+# --- CONFIGURATION ---
+# Effective values: the assignments below win over the group above.
+MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
+CACHE_MAX_SIZE = 128
+DPI = 200  # keep it moderate, not too high
+IMAGE_MAX_DIM = None  # do not resize unless needed
+JPEG_QUALITY = 80
+GPU_MEMORY_FRACTION = 0.8
+# --- 1. Device ---
+# The device is pinned to CPU here (the CUDA auto-detection is commented out),
+# so the memory-fraction branch below does not execute as written.
+device = torch.device("cpu") #torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch.backends.cudnn.benchmark = True
+if device.type == 'cuda':
+    torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
+# --- 2. Load model ---
 from transformers import AutoProcessor, BitsAndBytesConfig
 from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+# 4-bit NF4 quantization with double quantization; compute runs in float16.
 bnb = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16
 )
+# Processor and model are both loaded from MODEL_ID at import time.
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     quantization_config=bnb,
+    device_map="auto",
     trust_remote_code=True
 ).eval()
+# Reuse the EOS token as the padding token for the tokenizer.
 processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
+# --- 8. File handler ---
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+# --- 8. File handler ---
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+def handle_file(file, prompt, extra_prompt, max_new_tokens, progress=gr.Progress()):
+    """Run OCR on an uploaded image or PDF and return ``(filename, text)``.
+
+    Args:
+        file: Uploaded file object exposing ``.name``, or a plain path string.
+        prompt: Free-form prompt text.
+        extra_prompt: Additional prompt text (e.g. a template) appended on a
+            new line after ``prompt``.
+        max_new_tokens: Generation limit forwarded to ``run_inference``.
+        progress: Gradio progress tracker updated once per PDF page.
+
+    Returns:
+        A ``(name, text)`` tuple. For PDFs the per-page outputs are joined
+        with a page-break marker. Failures never raise; they are reported as
+        ``"[ERROR] ..."`` text so the UI always receives two values.
+    """
+    try:
+        if file is None:
+            return "error", "[ERROR] No file uploaded"
+        file_path = file.name if hasattr(file, "name") else file
+        filename = os.path.basename(file_path)
+        ext = filename.lower().split('.')[-1]
+        # Join only the non-empty parts so a missing prompt cannot raise.
+        full_prompt = "\n".join(p for p in (prompt, extra_prompt) if p).strip()
+        print(f"[INFO] handle_file → {filename} (.{ext})")
+        if ext == "pdf":
+            try:
+                with open(file_path, "rb") as f:
+                    pdf_bytes = f.read()
+                print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")
+                pages = []
+                # PDF user space is 72 units per inch, so the scale factor for
+                # a target DPI is DPI / 72 (passing DPI itself would magnify
+                # every page DPI-fold).
+                zoom = DPI / 72.0
+                mat = fitz.Matrix(zoom, zoom)
+                # The context manager closes the document once rendering ends.
+                with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+                    for page in doc:
+                        pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
+                        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                        # Cap the longest side to bound memory and token count.
+                        if max(img.size) > 3072:
+                            img.thumbnail((3072, 3072), Image.Resampling.LANCZOS)
+                        pages.append(img)
+                print(f"[INFO] Converted PDF → {len(pages)} pages")
+            except Exception as e:
+                traceback.print_exc()
+                return filename, f"[ERROR] PDF conversion failed: {e}"
+            outputs = []
+            with ThreadPoolExecutor(max_workers=4) as executor:
+                futures = [executor.submit(run_inference, img, full_prompt, max_new_tokens) for img in pages]
+                # Collect in submission order so the output keeps page order.
+                for idx, future in enumerate(futures):
+                    try:
+                        out = future.result()
+                    except Exception as e:
+                        traceback.print_exc()
+                        out = f"[ERROR] Inference page {idx+1} failed: {e}"
+                    outputs.append(out)
+                    # Report completed pages, so the last page reaches 100%.
+                    progress((idx + 1) / len(pages), desc=f"Page {idx+1}/{len(pages)}")
+            result = "\n\n--- Page Break ---\n\n".join(outputs)
+            print("[INFO] handle_file done")
+            return filename, result
+        else:
+            try:
+                img = Image.open(file_path)
+                print(f"[INFO] Opened image: {img.mode}, {img.size}")
+            except Exception as e:
+                traceback.print_exc()
+                return filename, f"[ERROR] Image open failed: {e}"
+            return filename, run_inference(img, full_prompt, max_new_tokens)
+    except Exception as e:
+        # Last-resort boundary: the UI callback must not raise.
+        traceback.print_exc()
+        return "error", f"[ERROR] handle_file unexpected: {e}"
+# --- 3. Inference Function ---
 @spaces.GPU
+def run_inference(img, prompt="", max_new_tokens=512):
+    """Generate the model's text answer for one image and prompt.
+
+    Args:
+        img: PIL image; converted to RGB when it is in another mode.
+        prompt: Instruction text sent alongside the image; ``None`` is
+            treated as an empty prompt.
+        max_new_tokens: Upper bound on the number of generated tokens.
+
+    Returns:
+        The decoded completion with the prompt tokens sliced off and
+        surrounding whitespace stripped.
+    """
+    # NOTE(review): the model is loaded elsewhere in this file with 4-bit
+    # quantization and device_map="auto"; confirm that an explicit
+    # .to("cuda") is supported by the installed transformers/bitsandbytes.
+    model.to("cuda")
+    if img.mode != "RGB":
+        img = img.convert("RGB")
+    prompt_text = (prompt or "").strip()
     messages = [{
         "role": "user",
         "content": [
+            {"type": "image", "image": img},
+            {"type": "text", "text": prompt_text}
         ]
     }]
     text_prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
     )
     inputs = processor(
+        text=[text_prompt], images=[img], return_tensors="pt", padding=True
+    ).to("cuda")  # inputs are moved to the GPU to match the model
+    # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
+    with torch.inference_mode(), torch.autocast(device_type="cuda"):
+        gen = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
             do_sample=False,
             eos_token_id=processor.tokenizer.eos_token_id
         )
+    # Drop the leading prompt tokens from each generated sequence.
+    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], gen)]
+    result = processor.tokenizer.batch_decode(
+        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
+    )[0].strip()
+    return result
+# --- 9. Prompt templates & JSON export ---
+prompt_templates = {
+    "Electrolux": """Extract all structured information from the delivery order document image.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_electrolux_form>
+  <document_number>Số lệnh giao nhận hàng</document_number>
+  <order_number>Số đơn hàng</order_number>
+  <customer_code>Mã số khách hàng</customer_code>
+  <customer_order_code>Mã đơn khách hàng</customer_order_code>
+  <customer_order_date>Ngày đặt hàng của khách</customer_order_date>
+  <delivery_date>Ngày giao hàng</delivery_date>
+  <requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
+  <invoice_number>Số hóa đơn</invoice_number>
+  <shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
+  <shipper_address>Địa chỉ gửi hàng</shipper_address>
+  <shipper_phone>Số điện thoại</shipper_phone>
+  <shipper_fax>Số fax</shipper_fax>
+  <shipper_tax_code>Mã số thuế</shipper_tax_code>
+  <consignee_customer_code>Mã khách hàng</consignee_customer_code>
+  <consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
+  <shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
+  <city_province>Tỉnh/Thành phố</city_province>
+  <postal_code>Mã bưu chính</postal_code>
+  <preparer_name>Họ tên người lập phiếu</preparer_name>
+  <preparer_date>Ngày lập phiếu</preparer_date>
+  <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
+</s_electrolux_form>
+""",
+    "Jotun": """Extract all structured information from the delivery order document.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_jotun_form>
+  <document_number>Số lệnh giao hàng</document_number>
+  <delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
+  <customer_code>Mã khách hàng</customer_code>
+  <customer_name>Tên khách hàng</customer_name>
+  <customer_address>Địa chỉ khách hàng</customer_address>
+  <customer_phone>Điện thoại khách hàng</customer_phone>
+  <invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
+  <invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
+  <order_code>Số đơn đặt hàng</order_code>
+  <order_date>Ngày đặt hàng</order_date>
+  <order_number>Số đơn hàng</order_number>
+  <delivery_date>Ngày giao hàng</delivery_date>
+  <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
+</s_jotun_form>
+""",
+    "MAWB": """Extract all structured information from the Master Air Waybill (MAWB) document.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_mawb_form>
+  <air_waybill_number>Số MAWB</air_waybill_number>
+  <shipper_name>Tên người gửi hàng</shipper_name>
+  <shipper_address>Địa chỉ người gửi hàng</shipper_address>
+  <shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
+  <consignee_name>Tên người nhận hàng</consignee_name>
+  <consignee_address>Địa chỉ người nhận hàng</consignee_address>
+  <consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
+  <dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
+  <shipper_signature>Chữ ký người gửi</shipper_signature>
+</s_mawb_form>
+""",
+    "Phiếu Cân": """Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_weight_ticket>
+  <awb_number>Số AWB</awb_number>
+  <shipper_name>Tên người gửi hàng</shipper_name>
+  <shipper_address>Địa chỉ người gửi hàng</shipper_address>
+  <shipper_contact>Số điện thoại người gửi</shipper_contact>
+  <consignee_name>Tên người nhận hàng</consignee_name>
+  <consignee_address>Địa chỉ người nhận hàng</consignee_address>
+  <cargo_description>Tên hàng hóa</cargo_description>
+  <security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
+  <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
+  <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
+</s_weight_ticket>
+""",
+    "PC 3U": """Extract all structured information from the PC 3U air cargo instruction document.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_pc3u_form>
+  <awb_number>Số AWB</awb_number>
+  <cargo_service_code>Mã dịch vụ</cargo_service_code>
+  <shipper_name>Tên người gửi</shipper_name>
+  <shipper_address>Địa chỉ người gửi</shipper_address>
+  <shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
+  <payer_name>Người thanh toán</payer_name>
+  <payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
+  <consignee_name>Tên người nhận</consignee_name>
+  <consignee_address>Địa chỉ người nhận</consignee_address>
+  <consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
+  <shipper_signature>Chữ ký người gửi</shipper_signature>
+  <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
+</s_pc3u_form>
+""",
+    "SLIS-AVS DAD": """Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_avs_dad>
+  <air_waybill_number>Số AWB</air_waybill_number>
+  <form_code>Mã biểu mẫu</form_code>
+  <shipper_name>Tên người gửi</shipper_name>
+  <shipper_address>Địa chỉ người gửi</shipper_address>
+  <shipper_phone>Điện thoại người gửi</shipper_phone>
+  <shipper_email>Email người gửi</shipper_email>
+  <shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
+  <consignee_name>Tên người nhận</consignee_name>
+  <consignee_address>Địa chỉ người nhận</consignee_address>
+  <consignee_phone>Điện thoại người nhận</consignee_phone>
+  <consignee_email>Email người nhận</consignee_email>
+  <departure_airport>Nơi đi</departure_airport>
+  <destination_airport>Nơi đến</destination_airport>
+  <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
+  <acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
+  <acceptance_time>Thời điểm tiếp nhận</acceptance_time>
+  <shipper_signature>Chữ ký người gửi</shipper_signature>
+  <shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
+</s_avs_dad>
+"""
 }
+def insert_template(name):
+    """Return the prompt template registered under *name*, or "" if unknown."""
+    try:
+        return prompt_templates[name]
+    except KeyError:
+        return ""
+def sanitize_filename(name):
+    """Replace every character other than ASCII letters, digits, ``_``, ``-`` and ``.`` with ``_``."""
+    unsafe_chars = re.compile(r'[^a-zA-Z0-9_\-\.]')
+    return unsafe_chars.sub('_', name)
+def clean_text(text):
+    """Return *text* with leading and trailing whitespace removed.
+
+    The previous implementation also ran two regex substitutions whose
+    replacement was ``match.strip()``. Every match began with ``<`` and ended
+    with ``>``, so those substitutions could never change the text and have
+    been dropped; the result is unchanged.
+
+    NOTE(review): if the intent was to trim whitespace *inside* tag values
+    (e.g. ``<a> x </a>``), that was never implemented — confirm before adding.
+    """
+    return text.strip()
+def export_json(image_name, result_text):
+    """Write the OCR result to a JSON file and return ``(path, json_string)``.
+
+    The file is named after the sanitized *image_name* and holds the original
+    name plus the cleaned result text. On any failure the function returns an
+    empty path together with an error message instead of raising.
+    """
+    try:
+        clean_name = sanitize_filename(image_name)
+        payload = {
+            "image": image_name,
+            "text_sequence": clean_text(result_text),
+        }
+        # Serialize once; the same string is written to disk and returned.
+        serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+        path = f"/tmp/{clean_name}.json"
+        with open(path, "w", encoding="utf-8") as handle:
+            handle.write(serialized)
+        return path, serialized
+    except Exception as e:
+        return "", f"[Export JSON Failed]: {e}"
+# --- 10. Gradio UI ---
+# --- 10. Gradio UI ---
+css = """
+.gradio-textbox textarea {
+    font-size: 13px !important;
+    line-height: 1.3 !important;
+    padding: 6px 8px !important;
+}
+.gradio-textbox label {
+    font-size: 13px !important;
+    font-weight: 600 !important;
+    margin-bottom: 4px !important;
+}
+.gradio-button {
+    font-size: 12px !important;
+    padding: 4px 8px !important;
+    height: 28px !important;
+    min-height: 28px !important;
+    margin: 2px !important;
+}
+.gradio-button[data-variant="primary"] {
+    height: 36px !important;
+    font-size: 13px !important;
+    padding: 8px 16px !important;
+}
+.gradio-file {
+    font-size: 13px !important;
+}
+.gradio-file .file-upload {
+    padding: 8px !important;
+    min-height: 80px !important;
+}
+.gradio-markdown h3 {
+    font-size: 14px !important;
+    margin: 8px 0 4px 0 !important;
+}
+.gradio-markdown h2 {
+    font-size: 18px !important;
+    margin: 8px 0 !important;
+}
+.gradio-code {
+    font-size: 12px !important;
+}
+"""
+# Two-column Gradio UI: inputs (file, prompts, token limit, template buttons)
+# on the left; extraction result and JSON export widgets on the right.
+with gr.Blocks(title="Camel-Doc-OCR", css=css) as demo:
+    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")
+    # --- Main Layout: 2 Columns ---
     with gr.Row():
+        # === LEFT COLUMN: Input ===
         with gr.Column(scale=1):
+            gr.Markdown("### 📥 INPUT")
+            # File Input
             file_input = gr.File(
+                label="📤 Tải ảnh hoặc PDF",
+                file_types=[".jpg", ".jpeg", ".png", ".pdf"],
+                height=100
             )
+            # Prompt Input
             prompt_input = gr.Textbox(
+                label="Prompt thuần",
+                lines=2,
+                placeholder="Nhập prompt tùy chỉnh...",
+                max_lines=3
             )
+            # JSON Config
+            config_input = gr.Textbox(
+                label="JSON Prompt",
+                lines=6,
+                placeholder="Cấu hình JSON sẽ xuất hiện ở đây...",
+                max_lines=8
+            )
+            # Max New Tokens Radio
+            max_new_tokens_input = gr.Radio(
+                choices=[128, 256, 512, 1024, 1536, 2048],
                 value=512,
+                label="🔢 Chọn max_new_tokens (giới hạn độ dài đầu ra)",
+                info="Chọn độ dài tối đa cho đầu ra của mô hình"
             )
+            # Prompt Templates
+            gr.Markdown("### 📑 Mẫu:")
+            with gr.Row():
+                for key in list(prompt_templates.keys()):  # All buttons in one row
+                    # k=key binds the current template name at definition time,
+                    # so each button inserts its own template.
+                    gr.Button(f"{key}", size="sm", scale=1).click(
+                        fn=lambda *, k=key: insert_template(k),
+                        inputs=[],
+                        outputs=config_input
+                    )
+            # Run Button
+            run_btn = gr.Button("🚀 Chạy OCR", variant="primary")
+        # === RIGHT COLUMN: Output ===
         with gr.Column(scale=1):
+            gr.Markdown("### 📤 OUTPUT")
+            # Result Output
+            result_output = gr.Textbox(
+                label="Kết quả trích xuất",
+                lines=10,
+                placeholder="Kết quả sẽ hiển thị ở đây sau khi chạy OCR...",
+                max_lines=12
+            )
+            # Export Section
+            # NOTE(review): export_btn is created hidden and no handler in this
+            # block makes it visible — confirm whether export is meant to be
+            # reachable from the UI.
+            with gr.Row():
+                export_btn = gr.Button("📦 Xuất JSON", visible=False, variant="secondary", size="sm")
+            # JSON Output
+            json_text = gr.Code(
+                label="JSON Output",
+                language="json",
+                lines=6,
+                visible=False
+            )
+            # Download File
+            json_file = gr.File(
+                label="File JSON để tải",
+                visible=False,
+                file_types=[".json"]
             )
+    # --- Hidden Fields ---
+    # Carries the processed file name from handle_file to export_json.
+    hidden_name = gr.Textbox(visible=False)
+    # --- Event Handlers ---
+    # Run Inference
     run_btn.click(
         fn=handle_file,
+        inputs=[file_input, prompt_input, config_input, max_new_tokens_input],
+        outputs=[hidden_name, result_output]
     )
+    # Export JSON
+    export_btn.click(
+        fn=export_json,
+        inputs=[hidden_name, result_output],
+        outputs=[json_file, json_text]
+    )
+    # Two further click handlers reveal the hidden JSON file and code widgets.
+    export_btn.click(fn=lambda: gr.update(visible=True), outputs=[json_file])
+    export_btn.click(fn=lambda: gr.update(visible=True), outputs=[json_text])
 if __name__ == "__main__":
+    # Serve on all interfaces at port 7860 and request a public share link.
     demo.launch(
+        share=True,
+        server_name="0.0.0.0",
+        server_port=7860
+    )