Spaces:

vithacocf
/

ocr

Paused

App Files Community

vithacocf committed on Dec 30, 2025

Commit

0460893

verified ·

1 Parent(s): ebff030

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -516

app.py CHANGED Viewed

@@ -1,589 +1,233 @@
 import os
 import json
 import re
-import hashlib
-import gc
-from io import BytesIO
-from collections import OrderedDict
-from PIL import Image, UnidentifiedImageError
 import torch
-from transformers import AutoProcessor, BitsAndBytesConfig
-from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
-from pdf2image import convert_from_bytes
-import gradio as gr
 import fitz
 import spaces
-# --- CONFIGURATION ---
-MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
-CACHE_MAX_SIZE = 128
-DPI = 100
-THREAD_COUNT = 4
-IMAGE_MAX_DIM = 1024
-JPEG_QUALITY = 75
-GPU_MEMORY_FRACTION = 0.8  # use 80% of GPU memory
-PAD_TOKEN_ID = None  # set later to avoid warnings
-# --- CONFIGURATION ---
 MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
-CACHE_MAX_SIZE = 128
-DPI = 150  # Giữ vừa đủ, không quá cao
-IMAGE_MAX_DIM = None  # Không resize nếu không cần
-JPEG_QUALITY = 70
 GPU_MEMORY_FRACTION = 0.8
-# --- 1. Device ---
-device = torch.device("cpu") #torch.device("cuda" if torch.cuda.is_available() else "cpu")
-torch.backends.cudnn.benchmark = True
-if device.type == 'cuda':
     torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
-# --- 2. Load model ---
-from transformers import AutoProcessor, BitsAndBytesConfig
-from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 bnb = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.float16
 )
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     quantization_config=bnb,
     device_map="auto",
     trust_remote_code=True
 ).eval()
 processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
-# --- 8. File handler ---
-import traceback
-from concurrent.futures import ThreadPoolExecutor
-# --- 8. File handler ---
-import traceback
-from concurrent.futures import ThreadPoolExecutor
-def handle_file(file, prompt, extra_prompt, max_new_tokens, progress=gr.Progress()):
-    try:
-        file_path = file.name if hasattr(file, "name") else file
-        filename = os.path.basename(file_path)
-        ext = filename.lower().split('.')[-1]
-        full_prompt = (prompt + "\n" + extra_prompt).strip() or ""
-        print(f"[INFO] handle_file → {filename} (.{ext})")
-        if ext == "pdf":
-            try:
-                with open(file_path, "rb") as f:
-                    pdf_bytes = f.read()
-                print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")
-                doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-                pages = []
-                zoom = DPI
-                mat = fitz.Matrix(zoom, zoom)
-                for i, page in enumerate(doc):
-                    pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
-                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-                    if max(img.size) > 3072:
-                        img.thumbnail((3072, 3072), Image.Resampling.LANCZOS)
-                    pages.append(img)
-                print(f"[INFO] Converted PDF → {len(pages)} pages")
-            except Exception as e:
-                traceback.print_exc()
-                return filename, f"[ERROR] PDF conversion failed: {e}"
-            outputs = []
-            with ThreadPoolExecutor(max_workers=4) as executor:
-                futures = [executor.submit(run_inference, img, full_prompt, max_new_tokens) for img in pages]
-                for idx, future in enumerate(futures):
-                    try:
-                        out = future.result()
-                    except Exception as e:
-                        traceback.print_exc()
-                        out = f"[ERROR] Inference page {idx+1} failed: {e}"
-                    outputs.append(out)
-                    progress((idx) / len(pages), desc=f"Page {idx+1}/{len(pages)}")
-            result = "\n\n--- Page Break ---\n\n".join(outputs)
-            print("[INFO] handle_file done")
-            return filename, result
-        else:
-            try:
-                img = Image.open(file_path)
-                print(f"[INFO] Opened image: {img.mode}, {img.size}")
-            except Exception as e:
-                traceback.print_exc()
-                return filename, f"[ERROR] Image open failed: {e}"
-            return filename, run_inference(img, full_prompt, max_new_tokens)
-    except Exception as e:
-        traceback.print_exc()
-        return "error", f"[ERROR] handle_file unexpected: {e}"
-# --- 3. Inference Function ---
-@spaces.GPU
-def run_inference(img, prompt="", max_new_tokens=512):
-    model.to("cuda")
-    if img.mode != "RGB":
-        img = img.convert("RGB")
-    prompt_text = prompt.strip()
     messages = [{
         "role": "user",
         "content": [
-            {"type": "image", "image": img},
-            {"type": "text", "text": prompt_text}
         ]
     }]
     text_prompt = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
     )
     inputs = processor(
-        text=[text_prompt], images=[img], return_tensors="pt", padding=True
-    ).to("cuda")  # Sửa ở đây
-    with torch.inference_mode(), torch.cuda.amp.autocast():
-        gen = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
             do_sample=False,
             eos_token_id=processor.tokenizer.eos_token_id
         )
-    trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], gen)]
-    result = processor.tokenizer.batch_decode(
-        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
-    )[0].strip()
-    return result
-# --- 9. Prompt templates & JSON export ---
-prompt_templates = {
-    "Electrolux": """Extract all structured information from the delivery order document image.
-You must return the result as a valid XML block that strictly follows the structure below.
-STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
-1. Return **ONLY** the XML block – nothing before or after it.
-2. DO NOT add, remove, rename, or reorder any XML tags.
-3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
-4. For every tag, fill in the exact value read from the image.
-   • NEVER copy or repeat the label/placeholder text.
-   • NEVER guess or invent values.
-5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
-6. DO NOT include Vietnamese text or translations inside tag values.
-7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
-8. Dates must be in YYYY-MM-DD format.
-9. Boolean tags must be exactly true or false (lower-case, no quotes).
-   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
-10. **Inside each value**
-    • Replace every internal line-break with “, ” (comma + space).
-    • Trim leading/trailing whitespace.
-    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
-11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
-12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
-13. Ignore any information not represented by the tags below.
-<s_electrolux_form>
-  <document_number>Số lệnh giao nhận hàng</document_number>
-  <order_number>Số đơn hàng</order_number>
-  <customer_code>Mã số khách hàng</customer_code>
-  <customer_order_code>Mã đơn khách hàng</customer_order_code>
-  <customer_order_date>Ngày đặt hàng của khách</customer_order_date>
-  <delivery_date>Ngày giao hàng</delivery_date>
-  <requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
-  <invoice_number>Số hóa đơn</invoice_number>
-  <shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
-  <shipper_address>Địa chỉ gửi hàng</shipper_address>
-  <shipper_phone>Số điện thoại</shipper_phone>
-  <shipper_fax>Số fax</shipper_fax>
-  <shipper_tax_code>Mã số thuế</shipper_tax_code>
-  <consignee_customer_code>Mã khách hàng</consignee_customer_code>
-  <consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
-  <shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
-  <city_province>Tỉnh/Thành phố</city_province>
-  <postal_code>Mã bưu chính</postal_code>
-  <preparer_name>Họ tên người lập phiếu</preparer_name>
-  <preparer_date>Ngày lập phiếu</preparer_date>
-  <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
-</s_electrolux_form>
-""",
-    "Jotun": """Extract all structured information from the delivery order document.
-You must return the result as a valid XML block that strictly follows the structure below.
-STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
-1. Return **ONLY** the XML block – nothing before or after it.
-2. DO NOT add, remove, rename, or reorder any XML tags.
-3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
-4. For every tag, fill in the exact value read from the image.
-   • NEVER copy or repeat the label/placeholder text.
-   • NEVER guess or invent values.
-5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
-6. DO NOT include Vietnamese text or translations inside tag values.
-7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
-8. Dates must be in YYYY-MM-DD format.
-9. Boolean tags must be exactly true or false (lower-case, no quotes).
-   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
-10. **Inside each value**
-    • Replace every internal line-break with “, ” (comma + space).
-    • Trim leading/trailing whitespace.
-    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
-11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
-12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
-13. Ignore any information not represented by the tags below.
-<s_jotun_form>
-  <document_number>Số lệnh giao hàng</document_number>
-  <delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
-  <customer_code>Mã khách hàng</customer_code>
-  <customer_name>Tên khách hàng</customer_name>
-  <customer_address>Địa chỉ khách hàng</customer_address>
-  <customer_phone>Điện thoại khách hàng</customer_phone>
-  <invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
-  <invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
-  <order_code>Số đơn đặt hàng</order_code>
-  <order_date>Ngày đặt hàng</order_date>
-  <order_number>Số đơn hàng</order_number>
-  <delivery_date>Ngày giao hàng</delivery_date>
-  <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
-</s_jotun_form>
-""",
-    "MAWB": """Extract all structured information from the Master Air Waybill (MAWB) document.
-You must return the result as a valid XML block that strictly follows the structure below.
-STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
-1. Return **ONLY** the XML block – nothing before or after it.
-2. DO NOT add, remove, rename, or reorder any XML tags.
-3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
-4. For every tag, fill in the exact value read from the image.
-   • NEVER copy or repeat the label/placeholder text.
-   • NEVER guess or invent values.
-5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
-6. DO NOT include Vietnamese text or translations inside tag values.
-7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
-8. Dates must be in YYYY-MM-DD format.
-9. Boolean tags must be exactly true or false (lower-case, no quotes).
-   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
-10. **Inside each value**
-    • Replace every internal line-break with “, ” (comma + space).
-    • Trim leading/trailing whitespace.
-    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
-11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
-12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
-13. Ignore any information not represented by the tags below.
-<s_mawb_form>
-  <air_waybill_number>Số MAWB</air_waybill_number>
-  <shipper_name>Tên người gửi hàng</shipper_name>
-  <shipper_address>Địa chỉ người gửi hàng</shipper_address>
-  <shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
-  <consignee_name>Tên người nhận hàng</consignee_name>
-  <consignee_address>Địa chỉ người nhận hàng</consignee_address>
-  <consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
-  <dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
-  <shipper_signature>Chữ ký người gửi</shipper_signature>
-</s_mawb_form>
-""",
-    "Phiếu Cân": """Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.
-You must return the result as a valid XML block that strictly follows the structure below.
-STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
-1. Return **ONLY** the XML block – nothing before or after it.
-2. DO NOT add, remove, rename, or reorder any XML tags.
-3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
-4. For every tag, fill in the exact value read from the image.
-   • NEVER copy or repeat the label/placeholder text.
-   • NEVER guess or invent values.
-5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
-6. DO NOT include Vietnamese text or translations inside tag values.
-7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
-8. Dates must be in YYYY-MM-DD format.
-9. Boolean tags must be exactly true or false (lower-case, no quotes).
-   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
-10. **Inside each value**
-    • Replace every internal line-break with “, ” (comma + space).
-    • Trim leading/trailing whitespace.
-    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
-11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
-12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
-13. Ignore any information not represented by the tags below.
-<s_weight_ticket>
-  <awb_number>Số AWB</awb_number>
-  <shipper_name>Tên người gửi hàng</shipper_name>
-  <shipper_address>Địa chỉ người gửi hàng</shipper_address>
-  <shipper_contact>Số điện thoại người gửi</shipper_contact>
-  <consignee_name>Tên người nhận hàng</consignee_name>
-  <consignee_address>Địa chỉ người nhận hàng</consignee_address>
-  <cargo_description>Tên hàng hóa</cargo_description>
-  <security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
-  <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
-  <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
-</s_weight_ticket>
-""",
-    "PC 3U": """Extract all structured information from the PC 3U air cargo instruction document.
-You must return the result as a valid XML block that strictly follows the structure below.
-STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
-1. Return **ONLY** the XML block – nothing before or after it.
-2. DO NOT add, remove, rename, or reorder any XML tags.
-3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
-4. For every tag, fill in the exact value read from the image.
-   • NEVER copy or repeat the label/placeholder text.
-   • NEVER guess or invent values.
-5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
-6. DO NOT include Vietnamese text or translations inside tag values.
-7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
-8. Dates must be in YYYY-MM-DD format.
-9. Boolean tags must be exactly true or false (lower-case, no quotes).
-   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
-10. **Inside each value**
-    • Replace every internal line-break with “, ” (comma + space).
-    • Trim leading/trailing whitespace.
-    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
-11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
-12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
-13. Ignore any information not represented by the tags below.
-<s_pc3u_form>
-  <awb_number>Số AWB</awb_number>
-  <cargo_service_code>Mã dịch vụ</cargo_service_code>
-  <shipper_name>Tên người gửi</shipper_name>
-  <shipper_address>Địa chỉ người gửi</shipper_address>
-  <shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
-  <payer_name>Người thanh toán</payer_name>
-  <payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
-  <consignee_name>Tên người nhận</consignee_name>
-  <consignee_address>Địa chỉ người nhận</consignee_address>
-  <consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
-  <shipper_signature>Chữ ký người gửi</shipper_signature>
-  <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
-</s_pc3u_form>
-""",
-    "SLIS-AVS DAD": """Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.
-You must return the result as a valid XML block that strictly follows the structure below.
-STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
-1. Return **ONLY** the XML block – nothing before or after it.
-2. DO NOT add, remove, rename, or reorder any XML tags.
-3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
-4. For every tag, fill in the exact value read from the image.
-   • NEVER copy or repeat the label/placeholder text.
-   • NEVER guess or invent values.
-5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
-6. DO NOT include Vietnamese text or translations inside tag values.
-7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
-8. Dates must be in YYYY-MM-DD format.
-9. Boolean tags must be exactly true or false (lower-case, no quotes).
-   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
-10. **Inside each value**
-    • Replace every internal line-break with “, ” (comma + space).
-    • Trim leading/trailing whitespace.
-    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
-11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
-12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
-13. Ignore any information not represented by the tags below.
-<s_avs_dad>
-  <air_waybill_number>Số AWB</air_waybill_number>
-  <form_code>Mã biểu mẫu</form_code>
-  <shipper_name>Tên người gửi</shipper_name>
-  <shipper_address>Địa chỉ người gửi</shipper_address>
-  <shipper_phone>Điện thoại người gửi</shipper_phone>
-  <shipper_email>Email người gửi</shipper_email>
-  <shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
-  <consignee_name>Tên người nhận</consignee_name>
-  <consignee_address>Địa chỉ người nhận</consignee_address>
-  <consignee_phone>Điện thoại người nhận</consignee_phone>
-  <consignee_email>Email người nhận</consignee_email>
-  <departure_airport>Nơi đi</departure_airport>
-  <destination_airport>Nơi đến</destination_airport>
-  <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
-  <acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
-  <acceptance_time>Thời điểm tiếp nhận</acceptance_time>
-  <shipper_signature>Chữ ký người gửi</shipper_signature>
-  <shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
-</s_avs_dad>
-"""
 }
-def insert_template(name):
-    return prompt_templates.get(name, "")
-def sanitize_filename(name):
-    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)
-def clean_text(text):
-    text = re.sub(r'<[^<> ]+?>', lambda m: m.group(0).strip(), text)
-    text = re.sub(r'<[^<>]+?>[^<>]*?<[^<>]+?>', lambda m: m.group(0).strip(), text)
-    return text.strip()
-def export_json(image_name, result_text):
-    try:
-        clean_name = sanitize_filename(image_name)
-        content = {"image": image_name, "text_sequence": clean_text(result_text)}
-        path = f"/tmp/{clean_name}.json"
-        with open(path, "w", encoding="utf-8") as f:
-            json.dump(content, f, ensure_ascii=False, indent=2)
-        return path, json.dumps(content, ensure_ascii=False, indent=2)
-    except Exception as e:
-        return "", f"[Export JSON Failed]: {e}"
-# --- 10. Gradio UI ---
-# --- 10. Gradio UI ---
-css = """
-.gradio-textbox textarea {
-    font-size: 13px !important;
-    line-height: 1.3 !important;
-    padding: 6px 8px !important;
-}
-.gradio-textbox label {
-    font-size: 13px !important;
-    font-weight: 600 !important;
-    margin-bottom: 4px !important;
-}
-.gradio-button {
-    font-size: 12px !important;
-    padding: 4px 8px !important;
-    height: 28px !important;
-    min-height: 28px !important;
-    margin: 2px !important;
-}
-.gradio-button[data-variant="primary"] {
-    height: 36px !important;
-    font-size: 13px !important;
-    padding: 8px 16px !important;
-}
-.gradio-file {
-    font-size: 13px !important;
-}
-.gradio-file .file-upload {
-    padding: 8px !important;
-    min-height: 80px !important;
-}
-.gradio-markdown h3 {
-    font-size: 14px !important;
-    margin: 8px 0 4px 0 !important;
-}
-.gradio-markdown h2 {
-    font-size: 18px !important;
-    margin: 8px 0 !important;
-}
-.gradio-code {
-    font-size: 12px !important;
-}
-"""
-with gr.Blocks(title="Camel-Doc-OCR", css=css) as demo:
-    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")
-    # --- Main Layout: 2 Columns ---
     with gr.Row():
-        # === LEFT COLUMN: Input ===
         with gr.Column(scale=1):
-            gr.Markdown("### 📥 INPUT")
-            # File Input
             file_input = gr.File(
-                label="📤 Tải ảnh hoặc PDF",
-                file_types=[".jpg", ".jpeg", ".png", ".pdf"],
-                height=100
             )
-            # Prompt Input
             prompt_input = gr.Textbox(
-                label="Prompt thuần",
-                lines=2,
-                placeholder="Nhập prompt tùy chỉnh...",
-                max_lines=3
             )
-            # JSON Config
-            config_input = gr.Textbox(
-                label="JSON Prompt",
-                lines=6,
-                placeholder="Cấu hình JSON sẽ xuất hiện ở đây...",
-                max_lines=8
-            )
-            # Max New Tokens Radio
-            max_new_tokens_input = gr.Radio(
-                choices=[128, 256, 512, 1024, 1536, 2048],
                 value=512,
-                label="🔢 Chọn max_new_tokens (giới hạn độ dài đầu ra)",
-                info="Chọn độ dài tối đa cho đầu ra của mô hình"
             )
-            # Prompt Templates
-            gr.Markdown("### 📑 Mẫu:")
-            with gr.Row():
-                for key in list(prompt_templates.keys()):  # All buttons in one row
-                    gr.Button(f"{key}", size="sm", scale=1).click(
-                        fn=lambda *, k=key: insert_template(k),
-                        inputs=[],
-                        outputs=config_input
-                    )
-            # Run Button
-            run_btn = gr.Button("🚀 Chạy OCR", variant="primary")
-        # === RIGHT COLUMN: Output ===
         with gr.Column(scale=1):
-            gr.Markdown("### 📤 OUTPUT")
-            # Result Output
-            result_output = gr.Textbox(
-                label="Kết quả trích xuất",
-                lines=10,
-                placeholder="Kết quả sẽ hiển thị ở đây sau khi chạy OCR...",
-                max_lines=12
             )
-            # Export Section
-            with gr.Row():
-                export_btn = gr.Button("📦 Xuất JSON", visible=False, variant="secondary", size="sm")
-            # JSON Output
-            json_text = gr.Code(
-                label="JSON Output",
-                language="json",
-                lines=6,
-                visible=False
-            )
-            # Download File
-            json_file = gr.File(
-                label="File JSON để tải",
-                visible=False,
-                file_types=[".json"]
-            )
-    # --- Hidden Fields ---
-    hidden_name = gr.Textbox(visible=False)
-    # --- Event Handlers ---
-    # Run Inference
     run_btn.click(
         fn=handle_file,
-        inputs=[file_input, prompt_input, config_input, max_new_tokens_input],
-        outputs=[hidden_name, result_output]
     )
-    # Export JSON
-    export_btn.click(
-        fn=export_json,
-        inputs=[hidden_name, result_output],
-        outputs=[json_file, json_text]
-    )
-    export_btn.click(fn=lambda: gr.update(visible=True), outputs=[json_file])
-    export_btn.click(fn=lambda: gr.update(visible=True), outputs=[json_text])
 if __name__ == "__main__":
     demo.launch(
-        share=True,
-        server_name="0.0.0.0",
-        server_port=7860
-    )

+# =========================
+# CAMEL-DOC-OCR (FAST)
+# Single-file version
+# =========================
 import os
+import gc
 import json
 import re
 import torch
 import fitz
+import gradio as gr
 import spaces
+from PIL import Image
+from transformers import AutoProcessor, BitsAndBytesConfig
+from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+# =========================
+# CONFIG
+# =========================
 MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
+DPI = 150
+MAX_IMAGE_SIZE = 2048
 GPU_MEMORY_FRACTION = 0.8
+# =========================
+# TORCH OPTIMIZATION
+# =========================
+torch.set_grad_enabled(False)
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+if torch.cuda.is_available():
     torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
+# =========================
+# LOAD MODEL (ONCE)
+# =========================
 bnb = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+)
+processor = AutoProcessor.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True
 )
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     quantization_config=bnb,
     device_map="auto",
+    torch_dtype=torch.float16,
     trust_remote_code=True
 ).eval()
 processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
+# =========================
+# PDF → IMAGE
+# =========================
+def pdf_to_images(pdf_bytes):
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    images = []
+    scale = DPI / 72.0
+    mat = fitz.Matrix(scale, scale)
+    for page in doc:
+        pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        if max(img.size) > MAX_IMAGE_SIZE:
+            img.thumbnail((MAX_IMAGE_SIZE, MAX_IMAGE_SIZE), Image.Resampling.LANCZOS)
+        images.append(img)
+    return images
+# =========================
+# OCR INFERENCE (FAST)
+# =========================
+@spaces.GPU
+def run_inference(image, prompt, max_new_tokens):
+    if image.mode != "RGB":
+        image = image.convert("RGB")
     messages = [{
         "role": "user",
         "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": prompt}
         ]
     }]
     text_prompt = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
     inputs = processor(
+        text=[text_prompt],
+        images=[image],
+        return_tensors="pt"
+    ).to(model.device)
+    with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
+        outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
             do_sample=False,
+            use_cache=True,
             eos_token_id=processor.tokenizer.eos_token_id
         )
+    outputs = outputs[:, inputs["input_ids"].shape[1]:]
+    return processor.tokenizer.decode(
+        outputs[0],
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=True
+    ).strip()
+# =========================
+# FILE HANDLER
+# =========================
+def handle_file(file, prompt, max_new_tokens, progress=gr.Progress()):
+    file_path = file.name
+    ext = file_path.lower().split(".")[-1]
+    prompt = prompt.strip()
+    if ext == "pdf":
+        with open(file_path, "rb") as f:
+            images = pdf_to_images(f.read())
+        results = []
+        for i, img in enumerate(images):
+            text = run_inference(img, prompt, max_new_tokens)
+            results.append(text)
+            progress((i + 1) / len(images), desc=f"Page {i+1}/{len(images)}")
+        return "\n\n--- PAGE BREAK ---\n\n".join(results)
+    else:
+        img = Image.open(file_path)
+        return run_inference(img, prompt, max_new_tokens)
+# =========================
+# DEFAULT PROMPT (CAMEL OCR)
+# =========================
+DEFAULT_PROMPT = """
+You are an OCR + Information Extraction engine.
+Extract data strictly from the document.
+Return JSON ONLY. NO explanation.
+OUTPUT FORMAT:
+{
+  "price": "",
+  "vat": "",
+  "invoiceNo": "",
+  "invoiceDate": "",
+  "billingToTaxCode": "",
+  "accountingObjectTaxCode": "",
+  "description": ""
 }
+""".strip()
+# =========================
+# GRADIO UI
+# =========================
+with gr.Blocks(title="Camel-Doc-OCR") as demo:
+    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL – 4bit, Fast)")
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
+                label="Upload Image / PDF",
+                file_types=[".jpg", ".jpeg", ".png", ".pdf"]
             )
             prompt_input = gr.Textbox(
+                label="Prompt",
+                value=DEFAULT_PROMPT,
+                lines=10
             )
+            max_tokens = gr.Radio(
+                [256, 512, 1024, 2048],
                 value=512,
+                label="Max new tokens"
             )
+            run_btn = gr.Button("🚀 Run OCR", variant="primary")
         with gr.Column(scale=1):
+            output = gr.Textbox(
+                label="Result",
+                lines=20
             )
     run_btn.click(
         fn=handle_file,
+        inputs=[file_input, prompt_input, max_tokens],
+        outputs=output
     )
+# =========================
+# CLEANUP & LAUNCH
+# =========================
+def cleanup():
+    torch.cuda.empty_cache()
+    gc.collect()
 if __name__ == "__main__":
     demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True
+    )