vithacocf commited on
Commit
86ba016
·
verified ·
1 Parent(s): 2364e8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +522 -165
app.py CHANGED
@@ -1,232 +1,589 @@
1
- # =========================
2
- # CAMEL-DOC-OCR (HF Spaces SAFE)
3
- # Single-file – NO CUDA init at global scope
4
- # =========================
5
-
6
  import os
 
 
 
7
  import gc
 
 
 
8
  import torch
9
- import fitz
 
 
10
  import gradio as gr
 
11
  import spaces
12
- from PIL import Image
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  from transformers import AutoProcessor, BitsAndBytesConfig
15
  from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
16
 
17
-
18
- # =========================
19
- # CONFIG
20
- # =========================
21
- MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
22
- DPI = 150
23
- MAX_IMAGE_SIZE = 2048
24
-
25
-
26
- # =========================
27
- # TORCH FLAGS (SAFE FOR SPACES)
28
- # =========================
29
- torch.set_grad_enabled(False)
30
- torch.backends.cuda.matmul.allow_tf32 = True
31
- torch.backends.cudnn.allow_tf32 = True
32
-
33
-
34
- # =========================
35
- # LOAD MODEL (NO CUDA INIT HERE)
36
- # =========================
37
  bnb = BitsAndBytesConfig(
38
  load_in_4bit=True,
39
  bnb_4bit_use_double_quant=True,
40
  bnb_4bit_quant_type="nf4",
41
- bnb_4bit_compute_dtype=torch.float16,
42
- )
43
-
44
- processor = AutoProcessor.from_pretrained(
45
- MODEL_ID,
46
- trust_remote_code=True
47
  )
48
 
 
49
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
50
  MODEL_ID,
51
  quantization_config=bnb,
52
- device_map="auto", # HF Spaces will inject GPU here
53
- torch_dtype=torch.float16,
54
  trust_remote_code=True
55
  ).eval()
56
-
57
  processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
58
 
59
-
60
- # =========================
61
- # PDF IMAGE (FAST & SAFE)
62
- # =========================
63
def pdf_to_images(pdf_bytes):
    """Render every page of a PDF into a list of RGB PIL images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list with one ``PIL.Image`` per page, rendered at ``DPI`` and
        shrunk in place so that the longest side is at most
        ``MAX_IMAGE_SIZE`` pixels.
    """
    # PDF user space is 72 units per inch, so DPI / 72 is the zoom factor.
    scale = DPI / 72.0
    mat = fitz.Matrix(scale, scale)
    size_cap = (MAX_IMAGE_SIZE, MAX_IMAGE_SIZE)

    images = []
    # The context manager closes the document even when rendering a page
    # raises; previously the document was left open.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            if max(img.size) > MAX_IMAGE_SIZE:
                img.thumbnail(size_cap, Image.Resampling.LANCZOS)

            images.append(img)

    return images
80
-
81
-
82
- # =========================
83
- # OCR INFERENCE (CUDA ONLY HERE)
84
- # =========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
@spaces.GPU
def run_inference(image, prompt, max_new_tokens):
    """Run one generation pass of the model over a single image.

    Args:
        image: PIL image; converted to RGB when it is in another mode.
        prompt: Instruction text sent alongside the image.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model output with the prompt tokens removed and
        surrounding whitespace stripped.
    """
    if image.mode != "RGB":
        image = image.convert("RGB")

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text_prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = processor(
        text=[text_prompt],
        images=[image],
        return_tensors="pt",
        truncation=False,   # marked as mandatory by the original author
        padding="longest"   # marked as mandatory by the original author
    ).to(model.device)

    # torch.cuda.amp.autocast is deprecated; torch.autocast with
    # device_type="cuda" is its documented replacement.
    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            use_cache=True,
            eos_token_id=processor.tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; keep only the continuation.
    outputs = outputs[:, inputs["input_ids"].shape[1]:]

    return processor.tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    ).strip()
128
-
129
-
130
- # =========================
131
- # FILE HANDLER
132
- # =========================
133
def handle_file(file, prompt, max_new_tokens, progress=gr.Progress()):
    """Run OCR on an uploaded image or on every page of an uploaded PDF.

    Args:
        file: Uploaded file, either an object exposing ``.name`` or a path
            string. ``None`` means nothing was uploaded.
        prompt: Instruction text passed to ``run_inference``.
        max_new_tokens: Generation length limit passed to ``run_inference``.
        progress: Gradio progress tracker, updated once per PDF page.

    Returns:
        The model output. For PDFs the per-page outputs are joined with a
        ``--- PAGE BREAK ---`` separator.
    """
    if file is None:
        # Previously this crashed with AttributeError on ``file.name``.
        return "[ERROR] No file uploaded."

    file_path = file.name if hasattr(file, "name") else file
    # splitext only looks at the final component, so dots in directory
    # names or extension-less files are not misread as an extension.
    ext = os.path.splitext(file_path)[1].lower().lstrip(".")
    prompt = prompt.strip()

    if ext == "pdf":
        with open(file_path, "rb") as f:
            images = pdf_to_images(f.read())

        results = []
        for i, img in enumerate(images):
            results.append(run_inference(img, prompt, max_new_tokens))
            progress((i + 1) / len(images), desc=f"Page {i+1}/{len(images)}")

        return "\n\n--- PAGE BREAK ---\n\n".join(results)

    # Close the underlying file handle once inference is done.
    with Image.open(file_path) as img:
        return run_inference(img, prompt, max_new_tokens)
153
-
154
-
155
- # =========================
156
- # DEFAULT PROMPT (CAMEL OCR)
157
- # =========================
158
- DEFAULT_PROMPT = """
159
- You are an OCR + Information Extraction engine.
160
- Extract data strictly from the document.
161
- Return JSON ONLY. NO explanation.
162
-
163
- OUTPUT FORMAT:
164
- {
165
- "price": "",
166
- "vat": "",
167
- "invoiceNo": "",
168
- "invoiceDate": "",
169
- "billingToTaxCode": "",
170
- "accountingObjectTaxCode": "",
171
- "description": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  }
173
- """.strip()
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
- # =========================
177
- # GRADIO UI
178
- # =========================
179
- with gr.Blocks(title="Camel-Doc-OCR") as demo:
180
- gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL – 4bit, HF Spaces Safe)")
181
 
 
182
  with gr.Row():
 
183
  with gr.Column(scale=1):
 
 
 
184
  file_input = gr.File(
185
- label="Upload Image / PDF",
186
- file_types=[".jpg", ".jpeg", ".png", ".pdf"]
 
187
  )
188
 
 
189
  prompt_input = gr.Textbox(
190
- label="Prompt",
191
- value=DEFAULT_PROMPT,
192
- lines=10
 
193
  )
194
 
195
- max_tokens = gr.Radio(
196
- [256, 512, 1024, 2048],
 
 
 
 
 
 
 
 
 
197
  value=512,
198
- label="Max new tokens"
 
199
  )
200
 
201
- run_btn = gr.Button("🚀 Run OCR", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
202
 
 
203
  with gr.Column(scale=1):
204
- output = gr.Textbox(
205
- label="Result",
206
- lines=20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  )
208
 
 
 
 
 
 
 
209
  run_btn.click(
210
  fn=handle_file,
211
- inputs=[file_input, prompt_input, max_tokens],
212
- outputs=output
213
  )
214
 
 
 
 
 
 
 
215
 
216
- # =========================
217
- # CLEANUP
218
- # =========================
219
def cleanup():
    """Free unreferenced Python objects, then release cached CUDA memory."""
    # Collect first: the CUDA caching allocator can only hand back blocks
    # whose tensors have already been destroyed, so emptying the cache
    # before the garbage collector runs leaves that memory reserved.
    gc.collect()
    torch.cuda.empty_cache()
222
-
223
 
224
- # =========================
225
- # LAUNCH
226
- # =========================
227
  if __name__ == "__main__":
228
  demo.launch(
229
- server_name="0.0.0.0",
230
- server_port=7860,
231
- share=True
232
- )
 
 
 
 
 
 
1
  import os
2
+ import json
3
+ import re
4
+ import hashlib
5
  import gc
6
+ from io import BytesIO
7
+ from collections import OrderedDict
8
+ from PIL import Image, UnidentifiedImageError
9
  import torch
10
+ from transformers import AutoProcessor, BitsAndBytesConfig
11
+ from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
12
+ from pdf2image import convert_from_bytes
13
  import gradio as gr
14
+ import fitz
15
  import spaces
 
16
 
17
# --- CONFIGURATION ---
# Single definition of every setting. The file used to declare this section
# twice, with the second copy silently overriding DPI, IMAGE_MAX_DIM and
# JPEG_QUALITY; the values below are the ones that were actually in effect.
MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
CACHE_MAX_SIZE = 128
DPI = 200  # kept moderate on purpose, not too high
THREAD_COUNT = 4
IMAGE_MAX_DIM = None  # do not resize unless needed
JPEG_QUALITY = 80
GPU_MEMORY_FRACTION = 0.8  # use 80% of GPU memory
PAD_TOKEN_ID = None  # set later to avoid warnings

# --- 1. Device ---
# Pinned to CPU at import time; the auto-detecting alternative would be
# torch.device("cuda" if torch.cuda.is_available() else "cpu").
device = torch.device("cpu")
torch.backends.cudnn.benchmark = True
if device.type == "cuda":
    torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
40
+
41
+ # --- 2. Load model ---
42
  from transformers import AutoProcessor, BitsAndBytesConfig
43
  from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
# --- 2. Load model ---
# 4-bit NF4 weights with double quantisation; compute runs in float16.
bnb = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb,
)
model.eval()  # inference only

# Reuse EOS as the padding token.
processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
60
 
61
+ # --- 8. File handler ---
62
+ import traceback
63
+ from concurrent.futures import ThreadPoolExecutor
64
+
65
+ # --- 8. File handler ---
66
+ import traceback
67
+ from concurrent.futures import ThreadPoolExecutor
68
+
69
def _render_pdf_pages(pdf_bytes, max_dim=3072):
    """Rasterise each page of a PDF into an RGB PIL image at ``DPI``."""
    # PDF user space is 72 units per inch. Passing DPI itself as the zoom
    # (as was done before) scales every page by a factor of DPI, producing
    # enormous pixmaps.
    zoom = DPI / 72.0
    mat = fitz.Matrix(zoom, zoom)
    pages = []
    # Context manager so the document is closed even if a page fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            if max(img.size) > max_dim:
                img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
            pages.append(img)
    return pages


def handle_file(file, prompt, extra_prompt, max_new_tokens, progress=gr.Progress()):
    """Run OCR on an uploaded image or on every page of an uploaded PDF.

    Args:
        file: Uploaded file, either an object exposing ``.name`` or a path
            string. ``None`` means nothing was uploaded.
        prompt: Free-form prompt text.
        extra_prompt: Additional prompt text (the template box); appended to
            ``prompt`` on a new line.
        max_new_tokens: Generation length limit passed to ``run_inference``.
        progress: Gradio progress tracker, updated once per PDF page.

    Returns:
        A ``(filename, text)`` tuple. Failures are reported in ``text`` as an
        ``[ERROR] ...`` message rather than raised; an unexpected failure
        returns ``"error"`` as the filename.
    """
    try:
        if file is None:
            return "error", "[ERROR] handle_file unexpected: no file uploaded"

        file_path = file.name if hasattr(file, "name") else file
        filename = os.path.basename(file_path)
        ext = os.path.splitext(filename)[1].lower().lstrip(".")
        full_prompt = f"{prompt or ''}\n{extra_prompt or ''}".strip()

        print(f"[INFO] handle_file → {filename} (.{ext})")

        if ext == "pdf":
            try:
                with open(file_path, "rb") as f:
                    pdf_bytes = f.read()
                print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")

                pages = _render_pdf_pages(pdf_bytes)
                print(f"[INFO] Converted PDF → {len(pages)} pages")

            except Exception as e:
                traceback.print_exc()
                return filename, f"[ERROR] PDF conversion failed: {e}"

            outputs = []
            with ThreadPoolExecutor(max_workers=THREAD_COUNT) as executor:
                futures = [
                    executor.submit(run_inference, img, full_prompt, max_new_tokens)
                    for img in pages
                ]
                # Results are consumed in page order so the output keeps the
                # document's page sequence.
                for page_no, future in enumerate(futures, start=1):
                    try:
                        out = future.result()
                    except Exception as e:
                        traceback.print_exc()
                        out = f"[ERROR] Inference page {page_no} failed: {e}"
                    outputs.append(out)
                    # Report completed pages so the bar reaches 100%.
                    progress(page_no / len(pages), desc=f"Page {page_no}/{len(pages)}")

            result = "\n\n--- Page Break ---\n\n".join(outputs)
            print("[INFO] handle_file done")
            return filename, result

        try:
            img = Image.open(file_path)
            print(f"[INFO] Opened image: {img.mode}, {img.size}")
        except Exception as e:
            traceback.print_exc()
            return filename, f"[ERROR] Image open failed: {e}"

        # Release the image's file handle once inference has finished.
        with img:
            return filename, run_inference(img, full_prompt, max_new_tokens)

    except Exception as e:
        traceback.print_exc()
        return "error", f"[ERROR] handle_file unexpected: {e}"
129
+
130
+ # --- 3. Inference Function ---
131
@spaces.GPU
def run_inference(img, prompt="", max_new_tokens=512):
    """Run one generation pass of the model over a single image.

    Args:
        img: PIL image; converted to RGB when it is in another mode.
        prompt: Instruction text sent alongside the image (stripped first).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model output with the prompt tokens removed and
        surrounding whitespace stripped.
    """
    if img.mode != "RGB":
        img = img.convert("RGB")
    prompt_text = prompt.strip()

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": prompt_text}
        ]
    }]

    text_prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The model is loaded with device_map="auto" and bitsandbytes 4-bit
    # weights, so it is already placed and must not be moved with .to();
    # the inputs simply follow wherever the model lives.
    inputs = processor(
        text=[text_prompt], images=[img], return_tensors="pt", padding=True
    ).to(model.device)

    # torch.cuda.amp.autocast is deprecated; float16 was its default dtype.
    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
        gen = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=processor.tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[1]
    trimmed = gen[:, prompt_len:]
    result = processor.tokenizer.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0].strip()

    return result
169
+ # --- 9. Prompt templates & JSON export ---
170
+ prompt_templates = {
171
+ "Electrolux": """Extract all structured information from the delivery order document image.
172
+ You must return the result as a valid XML block that strictly follows the structure below.
173
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
174
+ 1. Return **ONLY** the XML block – nothing before or after it.
175
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
176
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
177
+ 4. For every tag, fill in the exact value read from the image.
178
+ • NEVER copy or repeat the label/placeholder text.
179
+ • NEVER guess or invent values.
180
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
181
+ 6. DO NOT include Vietnamese text or translations inside tag values.
182
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
183
+ 8. Dates must be in YYYY-MM-DD format.
184
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
185
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
186
+ 10. **Inside each value**
187
+ • Replace every internal line-break with “, ” (comma + space).
188
+ • Trim leading/trailing whitespace.
189
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
190
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
191
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
192
+ 13. Ignore any information not represented by the tags below.
193
+ <s_electrolux_form>
194
+ <document_number>Số lệnh giao nhận hàng</document_number>
195
+ <order_number>Số đơn hàng</order_number>
196
+ <customer_code>Mã số khách hàng</customer_code>
197
+ <customer_order_code>Mã đơn khách hàng</customer_order_code>
198
+ <customer_order_date>Ngày đặt hàng của khách</customer_order_date>
199
+ <delivery_date>Ngày giao hàng</delivery_date>
200
+ <requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
201
+ <invoice_number>Số hóa đơn</invoice_number>
202
+ <shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
203
+ <shipper_address>Địa chỉ gửi hàng</shipper_address>
204
+ <shipper_phone>Số điện thoại</shipper_phone>
205
+ <shipper_fax>Số fax</shipper_fax>
206
+ <shipper_tax_code>Mã số thuế</shipper_tax_code>
207
+ <consignee_customer_code>Mã khách hàng</consignee_customer_code>
208
+ <consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
209
+ <shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
210
+ <city_province>Tỉnh/Thành phố</city_province>
211
+ <postal_code>Mã bưu chính</postal_code>
212
+ <preparer_name>Họ tên người lập phiếu</preparer_name>
213
+ <preparer_date>Ngày lập phiếu</preparer_date>
214
+ <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
215
+ </s_electrolux_form>
216
+ """,
217
+
218
+ "Jotun": """Extract all structured information from the delivery order document.
219
+ You must return the result as a valid XML block that strictly follows the structure below.
220
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
221
+ 1. Return **ONLY** the XML block – nothing before or after it.
222
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
223
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
224
+ 4. For every tag, fill in the exact value read from the image.
225
+ • NEVER copy or repeat the label/placeholder text.
226
+ • NEVER guess or invent values.
227
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
228
+ 6. DO NOT include Vietnamese text or translations inside tag values.
229
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
230
+ 8. Dates must be in YYYY-MM-DD format.
231
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
232
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
233
+ 10. **Inside each value**
234
+ • Replace every internal line-break with “, ” (comma + space).
235
+ • Trim leading/trailing whitespace.
236
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
237
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
238
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
239
+ 13. Ignore any information not represented by the tags below.
240
+ <s_jotun_form>
241
+ <document_number>Số lệnh giao hàng</document_number>
242
+ <delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
243
+ <customer_code>Mã khách hàng</customer_code>
244
+ <customer_name>Tên khách hàng</customer_name>
245
+ <customer_address>Địa chỉ khách hàng</customer_address>
246
+ <customer_phone>Điện thoại khách hàng</customer_phone>
247
+ <invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
248
+ <invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
249
+ <order_code>Số đơn đặt hàng</order_code>
250
+ <order_date>Ngày đặt hàng</order_date>
251
+ <order_number>Số đơn hàng</order_number>
252
+ <delivery_date>Ngày giao hàng</delivery_date>
253
+ <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
254
+ </s_jotun_form>
255
+ """,
256
+
257
+ "MAWB": """Extract all structured information from the Master Air Waybill (MAWB) document.
258
+ You must return the result as a valid XML block that strictly follows the structure below.
259
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
260
+ 1. Return **ONLY** the XML block – nothing before or after it.
261
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
262
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
263
+ 4. For every tag, fill in the exact value read from the image.
264
+ • NEVER copy or repeat the label/placeholder text.
265
+ • NEVER guess or invent values.
266
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
267
+ 6. DO NOT include Vietnamese text or translations inside tag values.
268
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
269
+ 8. Dates must be in YYYY-MM-DD format.
270
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
271
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
272
+ 10. **Inside each value**
273
+ • Replace every internal line-break with “, ” (comma + space).
274
+ • Trim leading/trailing whitespace.
275
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
276
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
277
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
278
+ 13. Ignore any information not represented by the tags below.
279
+ <s_mawb_form>
280
+ <air_waybill_number>Số MAWB</air_waybill_number>
281
+ <shipper_name>Tên người gửi hàng</shipper_name>
282
+ <shipper_address>Địa chỉ người gửi hàng</shipper_address>
283
+ <shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
284
+ <consignee_name>Tên người nhận hàng</consignee_name>
285
+ <consignee_address>Địa chỉ người nhận hàng</consignee_address>
286
+ <consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
287
+ <dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
288
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
289
+ </s_mawb_form>
290
+ """,
291
+
292
+ "Phiếu Cân": """Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.
293
+ You must return the result as a valid XML block that strictly follows the structure below.
294
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
295
+ 1. Return **ONLY** the XML block – nothing before or after it.
296
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
297
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
298
+ 4. For every tag, fill in the exact value read from the image.
299
+ • NEVER copy or repeat the label/placeholder text.
300
+ • NEVER guess or invent values.
301
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
302
+ 6. DO NOT include Vietnamese text or translations inside tag values.
303
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
304
+ 8. Dates must be in YYYY-MM-DD format.
305
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
306
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
307
+ 10. **Inside each value**
308
+ • Replace every internal line-break with “, ” (comma + space).
309
+ • Trim leading/trailing whitespace.
310
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
311
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
312
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
313
+ 13. Ignore any information not represented by the tags below.
314
+ <s_weight_ticket>
315
+ <awb_number>Số AWB</awb_number>
316
+ <shipper_name>Tên người gửi hàng</shipper_name>
317
+ <shipper_address>Địa chỉ người gửi hàng</shipper_address>
318
+ <shipper_contact>Số điện thoại người gửi</shipper_contact>
319
+ <consignee_name>Tên người nhận hàng</consignee_name>
320
+ <consignee_address>Địa chỉ người nhận hàng</consignee_address>
321
+ <cargo_description>Tên hàng hóa</cargo_description>
322
+ <security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
323
+ <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
324
+ <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
325
+ </s_weight_ticket>
326
+ """,
327
+
328
+ "PC 3U": """Extract all structured information from the PC 3U air cargo instruction document.
329
+ You must return the result as a valid XML block that strictly follows the structure below.
330
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
331
+ 1. Return **ONLY** the XML block – nothing before or after it.
332
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
333
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
334
+ 4. For every tag, fill in the exact value read from the image.
335
+ • NEVER copy or repeat the label/placeholder text.
336
+ • NEVER guess or invent values.
337
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
338
+ 6. DO NOT include Vietnamese text or translations inside tag values.
339
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
340
+ 8. Dates must be in YYYY-MM-DD format.
341
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
342
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
343
+ 10. **Inside each value**
344
+ • Replace every internal line-break with “, ” (comma + space).
345
+ • Trim leading/trailing whitespace.
346
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
347
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
348
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
349
+ 13. Ignore any information not represented by the tags below.
350
+ <s_pc3u_form>
351
+ <awb_number>Số AWB</awb_number>
352
+ <cargo_service_code>Mã dịch vụ</cargo_service_code>
353
+ <shipper_name>Tên người gửi</shipper_name>
354
+ <shipper_address>Địa chỉ người gửi</shipper_address>
355
+ <shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
356
+ <payer_name>Người thanh toán</payer_name>
357
+ <payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
358
+ <consignee_name>Tên người nhận</consignee_name>
359
+ <consignee_address>Địa chỉ người nhận</consignee_address>
360
+ <consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
361
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
362
+ <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
363
+ </s_pc3u_form>
364
+ """,
365
+
366
+ "SLIS-AVS DAD": """Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.
367
+ You must return the result as a valid XML block that strictly follows the structure below.
368
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
369
+ 1. Return **ONLY** the XML block – nothing before or after it.
370
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
371
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
372
+ 4. For every tag, fill in the exact value read from the image.
373
+ • NEVER copy or repeat the label/placeholder text.
374
+ • NEVER guess or invent values.
375
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
376
+ 6. DO NOT include Vietnamese text or translations inside tag values.
377
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
378
+ 8. Dates must be in YYYY-MM-DD format.
379
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
380
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
381
+ 10. **Inside each value**
382
+ • Replace every internal line-break with “, ” (comma + space).
383
+ • Trim leading/trailing whitespace.
384
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
385
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
386
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
387
+ 13. Ignore any information not represented by the tags below.
388
+ <s_avs_dad>
389
+ <air_waybill_number>Số AWB</air_waybill_number>
390
+ <form_code>Mã biểu mẫu</form_code>
391
+ <shipper_name>Tên người gửi</shipper_name>
392
+ <shipper_address>Địa chỉ người gửi</shipper_address>
393
+ <shipper_phone>Điện thoại người gửi</shipper_phone>
394
+ <shipper_email>Email người gửi</shipper_email>
395
+ <shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
396
+ <consignee_name>Tên người nhận</consignee_name>
397
+ <consignee_address>Địa chỉ người nhận</consignee_address>
398
+ <consignee_phone>Điện thoại người nhận</consignee_phone>
399
+ <consignee_email>Email người nhận</consignee_email>
400
+ <departure_airport>Nơi đi</departure_airport>
401
+ <destination_airport>Nơi đến</destination_airport>
402
+ <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
403
+ <acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
404
+ <acceptance_time>Thời điểm tiếp nhận</acceptance_time>
405
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
406
+ <shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
407
+ </s_avs_dad>
408
+ """
409
  }
 
410
 
411
def insert_template(name):
    """Look up the prompt template registered under *name*.

    Returns the template text, or an empty string when no template with
    that name exists.
    """
    if name in prompt_templates:
        return prompt_templates[name]
    return ""
413
+
414
def sanitize_filename(name):
    """Reduce *name* to characters that are safe inside a file name.

    Every character other than ASCII letters, digits, ``_``, ``-`` and ``.``
    is replaced with ``_``.

    Args:
        name: Proposed file name; ``None`` is treated as empty.

    Returns:
        The sanitised name, or ``"file"`` when nothing usable remains (empty
        input, or a result made only of dots such as ``"."`` / ``".."``),
        so callers never build paths like ``/tmp/.json``.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name or "")
    return cleaned if cleaned.strip(".") else "file"
416
+
417
# Matches a leaf element: <tag>value</tag> where value contains no markup.
_LEAF_ELEMENT = re.compile(r'<([^<>/\s]+)>([^<>]*)</\1>')


def clean_text(text):
    """Trim whitespace around the value of every leaf XML element.

    The previous implementation called ``strip()`` on whole regex matches
    that always began with ``<`` and ended with ``>``, so it never changed
    anything. This version trims the value between a matching opening and
    closing tag; whitespace between sibling elements is left untouched.

    Args:
        text: Model output containing XML-like tags.

    Returns:
        ``text`` with leaf values trimmed and outer whitespace stripped.
    """
    def _trim(match):
        tag = match.group(1)
        return f"<{tag}>{match.group(2).strip()}</{tag}>"

    return _LEAF_ELEMENT.sub(_trim, text).strip()
421
+
422
def export_json(image_name, result_text):
    """Write the OCR result to a JSON file in the system temp directory.

    Args:
        image_name: Name of the processed file; stored verbatim in the JSON
            and, after sanitising, used as the output file's base name.
        result_text: Raw model output; stored after ``clean_text``.

    Returns:
        ``(path, json_string)`` on success. On failure ``(None, message)``,
        where ``None`` leaves the download widget empty instead of handing
        it an empty path.
    """
    import tempfile

    try:
        clean_name = sanitize_filename(image_name)
        content = {"image": image_name, "text_sequence": clean_text(result_text)}
        # Serialise once and reuse the string for both the file and the UI.
        payload = json.dumps(content, ensure_ascii=False, indent=2)
        path = os.path.join(tempfile.gettempdir(), f"{clean_name}.json")
        with open(path, "w", encoding="utf-8") as f:
            f.write(payload)
        return path, payload
    except Exception as e:
        return None, f"[Export JSON Failed]: {e}"
432
+
433
+ # --- 10. Gradio UI ---
434
+ # --- 10. Gradio UI ---
435
+ css = """
436
+ .gradio-textbox textarea {
437
+ font-size: 13px !important;
438
+ line-height: 1.3 !important;
439
+ padding: 6px 8px !important;
440
+ }
441
+ .gradio-textbox label {
442
+ font-size: 13px !important;
443
+ font-weight: 600 !important;
444
+ margin-bottom: 4px !important;
445
+ }
446
+ .gradio-button {
447
+ font-size: 12px !important;
448
+ padding: 4px 8px !important;
449
+ height: 28px !important;
450
+ min-height: 28px !important;
451
+ margin: 2px !important;
452
+ }
453
+ .gradio-button[data-variant="primary"] {
454
+ height: 36px !important;
455
+ font-size: 13px !important;
456
+ padding: 8px 16px !important;
457
+ }
458
+ .gradio-file {
459
+ font-size: 13px !important;
460
+ }
461
+ .gradio-file .file-upload {
462
+ padding: 8px !important;
463
+ min-height: 80px !important;
464
+ }
465
+ .gradio-markdown h3 {
466
+ font-size: 14px !important;
467
+ margin: 8px 0 4px 0 !important;
468
+ }
469
+ .gradio-markdown h2 {
470
+ font-size: 18px !important;
471
+ margin: 8px 0 !important;
472
+ }
473
+ .gradio-code {
474
+ font-size: 12px !important;
475
+ }
476
+ """
477
 
478
def _export_and_reveal(image_name, result_text):
    """Run ``export_json`` and reveal both export widgets with its results."""
    path, payload = export_json(image_name, result_text)
    # An empty path means the export failed; show the file widget empty.
    return (
        gr.update(value=path or None, visible=True),
        gr.update(value=payload, visible=True),
    )


with gr.Blocks(title="Camel-Doc-OCR", css=css) as demo:
    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")

    # --- Main layout: two columns ---
    with gr.Row():
        # === Left column: input ===
        with gr.Column(scale=1):
            gr.Markdown("### 📥 INPUT")

            # Uploaded image or PDF
            file_input = gr.File(
                label="📤 Tải ảnh hoặc PDF",
                file_types=[".jpg", ".jpeg", ".png", ".pdf"],
                height=100
            )

            # Free-form prompt
            prompt_input = gr.Textbox(
                label="Prompt thuần",
                lines=2,
                placeholder="Nhập prompt tùy chỉnh...",
                max_lines=3
            )

            # Template prompt, filled in by the template buttons below
            config_input = gr.Textbox(
                label="JSON Prompt",
                lines=6,
                placeholder="Cấu hình JSON sẽ xuất hiện ở đây...",
                max_lines=8
            )

            # Output length limit
            max_new_tokens_input = gr.Radio(
                choices=[128, 256, 512, 1024, 1536, 2048],
                value=512,
                label="🔢 Chọn max_new_tokens (giới hạn độ dài đầu ra)",
                info="Chọn độ dài tối đa cho đầu ra của mô hình"
            )

            # One button per prompt template, all in one row
            gr.Markdown("### 📑 Mẫu:")
            with gr.Row():
                for key in prompt_templates:
                    # k=key binds the current name; a plain closure would
                    # make every button insert the last template.
                    gr.Button(f"{key}", size="sm", scale=1).click(
                        fn=lambda *, k=key: insert_template(k),
                        inputs=[],
                        outputs=config_input
                    )

            run_btn = gr.Button("🚀 Chạy OCR", variant="primary")

        # === Right column: output ===
        with gr.Column(scale=1):
            gr.Markdown("### 📤 OUTPUT")

            result_output = gr.Textbox(
                label="Kết quả trích xuất",
                lines=10,
                placeholder="Kết quả sẽ hiển thị ở đây sau khi chạy OCR...",
                max_lines=12
            )

            # Hidden until a run has produced something to export
            with gr.Row():
                export_btn = gr.Button("📦 Xuất JSON", visible=False, variant="secondary", size="sm")

            json_text = gr.Code(
                label="JSON Output",
                language="json",
                lines=6,
                visible=False
            )

            json_file = gr.File(
                label="File JSON để tải",
                visible=False,
                file_types=[".json"]
            )

    # Carries the processed file name from the run step to the export step
    hidden_name = gr.Textbox(visible=False)

    # --- Event handlers ---

    # Run inference, then reveal the export button. It was created hidden
    # and nothing ever showed it, so export was unreachable.
    run_btn.click(
        fn=handle_file,
        inputs=[file_input, prompt_input, config_input, max_new_tokens_input],
        outputs=[hidden_name, result_output]
    ).then(
        fn=lambda: gr.update(visible=True),
        outputs=[export_btn]
    )

    # Export and reveal both export widgets in a single handler.
    export_btn.click(
        fn=_export_and_reveal,
        inputs=[hidden_name, result_output],
        outputs=[json_file, json_text]
    )
 
 
 
 
 
583
 
 
 
 
584
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 and request a public share link.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)