vithacocf committed on
Commit
0460893
·
verified ·
1 Parent(s): ebff030

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -516
app.py CHANGED
@@ -1,589 +1,233 @@
 
 
 
 
 
1
  import os
 
2
  import json
3
  import re
4
- import hashlib
5
- import gc
6
- from io import BytesIO
7
- from collections import OrderedDict
8
- from PIL import Image, UnidentifiedImageError
9
  import torch
10
- from transformers import AutoProcessor, BitsAndBytesConfig
11
- from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
12
- from pdf2image import convert_from_bytes
13
- import gradio as gr
14
  import fitz
 
15
  import spaces
 
16
 
17
- # --- CONFIGURATION ---
18
- MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
19
- CACHE_MAX_SIZE = 128
20
- DPI = 100
21
- THREAD_COUNT = 4
22
- IMAGE_MAX_DIM = 1024
23
- JPEG_QUALITY = 75
24
- GPU_MEMORY_FRACTION = 0.8 # use 80% of GPU memory
25
- PAD_TOKEN_ID = None # set later to avoid warnings
26
-
27
- # --- CONFIGURATION ---
28
  MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
29
- CACHE_MAX_SIZE = 128
30
- DPI = 150 # Giữ vừa đủ, không quá cao
31
- IMAGE_MAX_DIM = None # Không resize nếu không cần
32
- JPEG_QUALITY = 70
33
  GPU_MEMORY_FRACTION = 0.8
34
 
35
- # --- 1. Device ---
36
- device = torch.device("cpu") #torch.device("cuda" if torch.cuda.is_available() else "cpu")
37
- torch.backends.cudnn.benchmark = True
38
- if device.type == 'cuda':
 
 
 
 
 
39
  torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
40
 
41
- # --- 2. Load model ---
42
- from transformers import AutoProcessor, BitsAndBytesConfig
43
- from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
44
 
 
 
 
45
  bnb = BitsAndBytesConfig(
46
  load_in_4bit=True,
47
  bnb_4bit_use_double_quant=True,
48
  bnb_4bit_quant_type="nf4",
49
- bnb_4bit_compute_dtype=torch.float16
 
 
 
 
 
50
  )
51
 
52
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
53
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
54
  MODEL_ID,
55
  quantization_config=bnb,
56
  device_map="auto",
 
57
  trust_remote_code=True
58
  ).eval()
 
59
  processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
60
 
61
- # --- 8. File handler ---
62
- import traceback
63
- from concurrent.futures import ThreadPoolExecutor
64
-
65
- # --- 8. File handler ---
66
- import traceback
67
- from concurrent.futures import ThreadPoolExecutor
68
-
69
- def handle_file(file, prompt, extra_prompt, max_new_tokens, progress=gr.Progress()):
70
- try:
71
- file_path = file.name if hasattr(file, "name") else file
72
- filename = os.path.basename(file_path)
73
- ext = filename.lower().split('.')[-1]
74
- full_prompt = (prompt + "\n" + extra_prompt).strip() or ""
75
-
76
- print(f"[INFO] handle_file → {filename} (.{ext})")
77
-
78
- if ext == "pdf":
79
- try:
80
- with open(file_path, "rb") as f:
81
- pdf_bytes = f.read()
82
- print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")
83
-
84
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
85
- pages = []
86
- zoom = DPI
87
- mat = fitz.Matrix(zoom, zoom)
88
- for i, page in enumerate(doc):
89
- pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
90
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
91
- if max(img.size) > 3072:
92
- img.thumbnail((3072, 3072), Image.Resampling.LANCZOS)
93
- pages.append(img)
94
- print(f"[INFO] Converted PDF → {len(pages)} pages")
95
-
96
- except Exception as e:
97
- traceback.print_exc()
98
- return filename, f"[ERROR] PDF conversion failed: {e}"
99
-
100
- outputs = []
101
- with ThreadPoolExecutor(max_workers=4) as executor:
102
- futures = [executor.submit(run_inference, img, full_prompt, max_new_tokens) for img in pages]
103
- for idx, future in enumerate(futures):
104
- try:
105
- out = future.result()
106
- except Exception as e:
107
- traceback.print_exc()
108
- out = f"[ERROR] Inference page {idx+1} failed: {e}"
109
- outputs.append(out)
110
- progress((idx) / len(pages), desc=f"Page {idx+1}/{len(pages)}")
111
-
112
- result = "\n\n--- Page Break ---\n\n".join(outputs)
113
- print("[INFO] handle_file done")
114
- return filename, result
115
-
116
- else:
117
- try:
118
- img = Image.open(file_path)
119
- print(f"[INFO] Opened image: {img.mode}, {img.size}")
120
- except Exception as e:
121
- traceback.print_exc()
122
- return filename, f"[ERROR] Image open failed: {e}"
123
-
124
- return filename, run_inference(img, full_prompt, max_new_tokens)
125
-
126
- except Exception as e:
127
- traceback.print_exc()
128
- return "error", f"[ERROR] handle_file unexpected: {e}"
129
-
130
- # --- 3. Inference Function ---
131
- @spaces.GPU
132
- def run_inference(img, prompt="", max_new_tokens=512):
133
- model.to("cuda")
134
 
135
- if img.mode != "RGB":
136
- img = img.convert("RGB")
137
- prompt_text = prompt.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  messages = [{
140
  "role": "user",
141
  "content": [
142
- {"type": "image", "image": img},
143
- {"type": "text", "text": prompt_text}
144
  ]
145
  }]
146
 
147
  text_prompt = processor.apply_chat_template(
148
- messages, tokenize=False, add_generation_prompt=True
 
 
149
  )
150
 
151
  inputs = processor(
152
- text=[text_prompt], images=[img], return_tensors="pt", padding=True
153
- ).to("cuda") # Sửa ở đây
 
 
154
 
155
- with torch.inference_mode(), torch.cuda.amp.autocast():
156
- gen = model.generate(
157
  **inputs,
158
  max_new_tokens=max_new_tokens,
159
  do_sample=False,
 
160
  eos_token_id=processor.tokenizer.eos_token_id
161
  )
162
 
163
- trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], gen)]
164
- result = processor.tokenizer.batch_decode(
165
- trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
166
- )[0].strip()
167
-
168
- return result
169
- # --- 9. Prompt templates & JSON export ---
170
- prompt_templates = {
171
- "Electrolux": """Extract all structured information from the delivery order document image.
172
- You must return the result as a valid XML block that strictly follows the structure below.
173
- STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
174
- 1. Return **ONLY** the XML block – nothing before or after it.
175
- 2. DO NOT add, remove, rename, or reorder any XML tags.
176
- 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
177
- 4. For every tag, fill in the exact value read from the image.
178
- NEVER copy or repeat the label/placeholder text.
179
- • NEVER guess or invent values.
180
- 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
181
- 6. DO NOT include Vietnamese text or translations inside tag values.
182
- 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
183
- 8. Dates must be in YYYY-MM-DD format.
184
- 9. Boolean tags must be exactly true or false (lower-case, no quotes).
185
- Yes Passed ⇒ true | ✘ X No Fail ⇒ false
186
- 10. **Inside each value**
187
- • Replace every internal line-break with “, ” (comma + space).
188
- Trim leading/trailing whitespace.
189
- • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
190
- 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
191
- 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
192
- 13. Ignore any information not represented by the tags below.
193
- <s_electrolux_form>
194
- <document_number>Số lệnh giao nhận hàng</document_number>
195
- <order_number>Số đơn hàng</order_number>
196
- <customer_code>Mã số khách hàng</customer_code>
197
- <customer_order_code>Mã đơn khách hàng</customer_order_code>
198
- <customer_order_date>Ngày đặt hàng của khách</customer_order_date>
199
- <delivery_date>Ngày giao hàng</delivery_date>
200
- <requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
201
- <invoice_number>Số hóa đơn</invoice_number>
202
- <shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
203
- <shipper_address>Địa chỉ gửi hàng</shipper_address>
204
- <shipper_phone>Số điện thoại</shipper_phone>
205
- <shipper_fax>Số fax</shipper_fax>
206
- <shipper_tax_code>Mã số thuế</shipper_tax_code>
207
- <consignee_customer_code>Mã khách hàng</consignee_customer_code>
208
- <consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
209
- <shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
210
- <city_province>Tỉnh/Thành phố</city_province>
211
- <postal_code>Mã bưu chính</postal_code>
212
- <preparer_name>Họ tên người lập phiếu</preparer_name>
213
- <preparer_date>Ngày lập phiếu</preparer_date>
214
- <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
215
- </s_electrolux_form>
216
- """,
217
-
218
- "Jotun": """Extract all structured information from the delivery order document.
219
- You must return the result as a valid XML block that strictly follows the structure below.
220
- STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
221
- 1. Return **ONLY** the XML block – nothing before or after it.
222
- 2. DO NOT add, remove, rename, or reorder any XML tags.
223
- 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
224
- 4. For every tag, fill in the exact value read from the image.
225
- • NEVER copy or repeat the label/placeholder text.
226
- • NEVER guess or invent values.
227
- 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
228
- 6. DO NOT include Vietnamese text or translations inside tag values.
229
- 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
230
- 8. Dates must be in YYYY-MM-DD format.
231
- 9. Boolean tags must be exactly true or false (lower-case, no quotes).
232
- ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
233
- 10. **Inside each value**
234
- • Replace every internal line-break with “, ” (comma + space).
235
- • Trim leading/trailing whitespace.
236
- • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
237
- 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
238
- 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
239
- 13. Ignore any information not represented by the tags below.
240
- <s_jotun_form>
241
- <document_number>Số lệnh giao hàng</document_number>
242
- <delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
243
- <customer_code>Mã khách hàng</customer_code>
244
- <customer_name>Tên khách hàng</customer_name>
245
- <customer_address>Địa chỉ khách hàng</customer_address>
246
- <customer_phone>Điện thoại khách hàng</customer_phone>
247
- <invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
248
- <invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
249
- <order_code>Số đơn đặt hàng</order_code>
250
- <order_date>Ngày đặt hàng</order_date>
251
- <order_number>Số đơn hàng</order_number>
252
- <delivery_date>Ngày giao hàng</delivery_date>
253
- <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
254
- </s_jotun_form>
255
- """,
256
-
257
- "MAWB": """Extract all structured information from the Master Air Waybill (MAWB) document.
258
- You must return the result as a valid XML block that strictly follows the structure below.
259
- STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
260
- 1. Return **ONLY** the XML block – nothing before or after it.
261
- 2. DO NOT add, remove, rename, or reorder any XML tags.
262
- 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
263
- 4. For every tag, fill in the exact value read from the image.
264
- • NEVER copy or repeat the label/placeholder text.
265
- • NEVER guess or invent values.
266
- 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
267
- 6. DO NOT include Vietnamese text or translations inside tag values.
268
- 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
269
- 8. Dates must be in YYYY-MM-DD format.
270
- 9. Boolean tags must be exactly true or false (lower-case, no quotes).
271
- ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
272
- 10. **Inside each value**
273
- • Replace every internal line-break with “, ” (comma + space).
274
- • Trim leading/trailing whitespace.
275
- • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
276
- 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
277
- 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
278
- 13. Ignore any information not represented by the tags below.
279
- <s_mawb_form>
280
- <air_waybill_number>Số MAWB</air_waybill_number>
281
- <shipper_name>Tên người gửi hàng</shipper_name>
282
- <shipper_address>Địa chỉ người gửi hàng</shipper_address>
283
- <shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
284
- <consignee_name>Tên người nhận hàng</consignee_name>
285
- <consignee_address>Địa chỉ người nhận hàng</consignee_address>
286
- <consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
287
- <dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
288
- <shipper_signature>Chữ ký người gửi</shipper_signature>
289
- </s_mawb_form>
290
- """,
291
-
292
- "Phiếu Cân": """Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.
293
- You must return the result as a valid XML block that strictly follows the structure below.
294
- STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
295
- 1. Return **ONLY** the XML block – nothing before or after it.
296
- 2. DO NOT add, remove, rename, or reorder any XML tags.
297
- 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
298
- 4. For every tag, fill in the exact value read from the image.
299
- • NEVER copy or repeat the label/placeholder text.
300
- • NEVER guess or invent values.
301
- 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
302
- 6. DO NOT include Vietnamese text or translations inside tag values.
303
- 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
304
- 8. Dates must be in YYYY-MM-DD format.
305
- 9. Boolean tags must be exactly true or false (lower-case, no quotes).
306
- ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
307
- 10. **Inside each value**
308
- • Replace every internal line-break with “, ” (comma + space).
309
- • Trim leading/trailing whitespace.
310
- • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
311
- 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
312
- 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
313
- 13. Ignore any information not represented by the tags below.
314
- <s_weight_ticket>
315
- <awb_number>Số AWB</awb_number>
316
- <shipper_name>Tên người gửi hàng</shipper_name>
317
- <shipper_address>Địa chỉ người gửi hàng</shipper_address>
318
- <shipper_contact>Số điện thoại người gửi</shipper_contact>
319
- <consignee_name>Tên người nhận hàng</consignee_name>
320
- <consignee_address>Địa chỉ người nhận hàng</consignee_address>
321
- <cargo_description>Tên hàng hóa</cargo_description>
322
- <security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
323
- <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
324
- <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
325
- </s_weight_ticket>
326
- """,
327
-
328
- "PC 3U": """Extract all structured information from the PC 3U air cargo instruction document.
329
- You must return the result as a valid XML block that strictly follows the structure below.
330
- STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
331
- 1. Return **ONLY** the XML block – nothing before or after it.
332
- 2. DO NOT add, remove, rename, or reorder any XML tags.
333
- 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
334
- 4. For every tag, fill in the exact value read from the image.
335
- • NEVER copy or repeat the label/placeholder text.
336
- • NEVER guess or invent values.
337
- 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
338
- 6. DO NOT include Vietnamese text or translations inside tag values.
339
- 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
340
- 8. Dates must be in YYYY-MM-DD format.
341
- 9. Boolean tags must be exactly true or false (lower-case, no quotes).
342
- ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
343
- 10. **Inside each value**
344
- • Replace every internal line-break with “, ” (comma + space).
345
- • Trim leading/trailing whitespace.
346
- • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
347
- 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
348
- 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
349
- 13. Ignore any information not represented by the tags below.
350
- <s_pc3u_form>
351
- <awb_number>Số AWB</awb_number>
352
- <cargo_service_code>Mã dịch vụ</cargo_service_code>
353
- <shipper_name>Tên người gửi</shipper_name>
354
- <shipper_address>Địa chỉ người gửi</shipper_address>
355
- <shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
356
- <payer_name>Người thanh toán</payer_name>
357
- <payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
358
- <consignee_name>Tên người nhận</consignee_name>
359
- <consignee_address>Địa chỉ người nhận</consignee_address>
360
- <consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
361
- <shipper_signature>Chữ ký người gửi</shipper_signature>
362
- <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
363
- </s_pc3u_form>
364
- """,
365
-
366
- "SLIS-AVS DAD": """Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.
367
- You must return the result as a valid XML block that strictly follows the structure below.
368
- STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
369
- 1. Return **ONLY** the XML block – nothing before or after it.
370
- 2. DO NOT add, remove, rename, or reorder any XML tags.
371
- 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
372
- 4. For every tag, fill in the exact value read from the image.
373
- • NEVER copy or repeat the label/placeholder text.
374
- • NEVER guess or invent values.
375
- 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
376
- 6. DO NOT include Vietnamese text or translations inside tag values.
377
- 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
378
- 8. Dates must be in YYYY-MM-DD format.
379
- 9. Boolean tags must be exactly true or false (lower-case, no quotes).
380
- ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
381
- 10. **Inside each value**
382
- • Replace every internal line-break with “, ” (comma + space).
383
- • Trim leading/trailing whitespace.
384
- • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
385
- 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
386
- 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
387
- 13. Ignore any information not represented by the tags below.
388
- <s_avs_dad>
389
- <air_waybill_number>Số AWB</air_waybill_number>
390
- <form_code>Mã biểu mẫu</form_code>
391
- <shipper_name>Tên người gửi</shipper_name>
392
- <shipper_address>Địa chỉ người gửi</shipper_address>
393
- <shipper_phone>Điện thoại người gửi</shipper_phone>
394
- <shipper_email>Email người gửi</shipper_email>
395
- <shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
396
- <consignee_name>Tên người nhận</consignee_name>
397
- <consignee_address>Địa chỉ người nhận</consignee_address>
398
- <consignee_phone>Điện thoại người nhận</consignee_phone>
399
- <consignee_email>Email người nhận</consignee_email>
400
- <departure_airport>Nơi đi</departure_airport>
401
- <destination_airport>Nơi đến</destination_airport>
402
- <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
403
- <acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
404
- <acceptance_time>Thời điểm tiếp nhận</acceptance_time>
405
- <shipper_signature>Chữ ký người gửi</shipper_signature>
406
- <shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
407
- </s_avs_dad>
408
- """
409
  }
 
410
 
411
- def insert_template(name):
412
- return prompt_templates.get(name, "")
413
-
414
- def sanitize_filename(name):
415
- return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)
416
-
417
- def clean_text(text):
418
- text = re.sub(r'<[^<> ]+?>', lambda m: m.group(0).strip(), text)
419
- text = re.sub(r'<[^<>]+?>[^<>]*?<[^<>]+?>', lambda m: m.group(0).strip(), text)
420
- return text.strip()
421
-
422
- def export_json(image_name, result_text):
423
- try:
424
- clean_name = sanitize_filename(image_name)
425
- content = {"image": image_name, "text_sequence": clean_text(result_text)}
426
- path = f"/tmp/{clean_name}.json"
427
- with open(path, "w", encoding="utf-8") as f:
428
- json.dump(content, f, ensure_ascii=False, indent=2)
429
- return path, json.dumps(content, ensure_ascii=False, indent=2)
430
- except Exception as e:
431
- return "", f"[Export JSON Failed]: {e}"
432
-
433
- # --- 10. Gradio UI ---
434
- # --- 10. Gradio UI ---
435
- css = """
436
- .gradio-textbox textarea {
437
- font-size: 13px !important;
438
- line-height: 1.3 !important;
439
- padding: 6px 8px !important;
440
- }
441
- .gradio-textbox label {
442
- font-size: 13px !important;
443
- font-weight: 600 !important;
444
- margin-bottom: 4px !important;
445
- }
446
- .gradio-button {
447
- font-size: 12px !important;
448
- padding: 4px 8px !important;
449
- height: 28px !important;
450
- min-height: 28px !important;
451
- margin: 2px !important;
452
- }
453
- .gradio-button[data-variant="primary"] {
454
- height: 36px !important;
455
- font-size: 13px !important;
456
- padding: 8px 16px !important;
457
- }
458
- .gradio-file {
459
- font-size: 13px !important;
460
- }
461
- .gradio-file .file-upload {
462
- padding: 8px !important;
463
- min-height: 80px !important;
464
- }
465
- .gradio-markdown h3 {
466
- font-size: 14px !important;
467
- margin: 8px 0 4px 0 !important;
468
- }
469
- .gradio-markdown h2 {
470
- font-size: 18px !important;
471
- margin: 8px 0 !important;
472
- }
473
- .gradio-code {
474
- font-size: 12px !important;
475
- }
476
- """
477
 
478
- with gr.Blocks(title="Camel-Doc-OCR", css=css) as demo:
479
- gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")
 
 
 
480
 
481
- # --- Main Layout: 2 Columns ---
482
  with gr.Row():
483
- # === LEFT COLUMN: Input ===
484
  with gr.Column(scale=1):
485
- gr.Markdown("### 📥 INPUT")
486
-
487
- # File Input
488
  file_input = gr.File(
489
- label="📤 Tải ảnh hoặc PDF",
490
- file_types=[".jpg", ".jpeg", ".png", ".pdf"],
491
- height=100
492
  )
493
 
494
- # Prompt Input
495
  prompt_input = gr.Textbox(
496
- label="Prompt thuần",
497
- lines=2,
498
- placeholder="Nhập prompt tùy chỉnh...",
499
- max_lines=3
500
  )
501
 
502
- # JSON Config
503
- config_input = gr.Textbox(
504
- label="JSON Prompt",
505
- lines=6,
506
- placeholder="Cấu hình JSON sẽ xuất hiện ở đây...",
507
- max_lines=8
508
- )
509
-
510
- # Max New Tokens Radio
511
- max_new_tokens_input = gr.Radio(
512
- choices=[128, 256, 512, 1024, 1536, 2048],
513
  value=512,
514
- label="🔢 Chọn max_new_tokens (giới hạn độ dài đầu ra)",
515
- info="Chọn độ dài tối đa cho đầu ra của mô hình"
516
  )
517
 
518
- # Prompt Templates
519
- gr.Markdown("### 📑 Mẫu:")
520
- with gr.Row():
521
- for key in list(prompt_templates.keys()): # All buttons in one row
522
- gr.Button(f"{key}", size="sm", scale=1).click(
523
- fn=lambda *, k=key: insert_template(k),
524
- inputs=[],
525
- outputs=config_input
526
- )
527
 
528
- # Run Button
529
- run_btn = gr.Button("🚀 Chạy OCR", variant="primary")
530
-
531
- # === RIGHT COLUMN: Output ===
532
  with gr.Column(scale=1):
533
- gr.Markdown("### 📤 OUTPUT")
534
-
535
- # Result Output
536
- result_output = gr.Textbox(
537
- label="Kết quả trích xuất",
538
- lines=10,
539
- placeholder="Kết quả sẽ hiển thị ở đây sau khi chạy OCR...",
540
- max_lines=12
541
  )
542
 
543
- # Export Section
544
- with gr.Row():
545
- export_btn = gr.Button("📦 Xuất JSON", visible=False, variant="secondary", size="sm")
546
-
547
- # JSON Output
548
- json_text = gr.Code(
549
- label="JSON Output",
550
- language="json",
551
- lines=6,
552
- visible=False
553
- )
554
-
555
- # Download File
556
- json_file = gr.File(
557
- label="File JSON để tải",
558
- visible=False,
559
- file_types=[".json"]
560
- )
561
-
562
- # --- Hidden Fields ---
563
- hidden_name = gr.Textbox(visible=False)
564
-
565
- # --- Event Handlers ---
566
-
567
- # Run Inference
568
  run_btn.click(
569
  fn=handle_file,
570
- inputs=[file_input, prompt_input, config_input, max_new_tokens_input],
571
- outputs=[hidden_name, result_output]
572
  )
573
 
574
- # Export JSON
575
- export_btn.click(
576
- fn=export_json,
577
- inputs=[hidden_name, result_output],
578
- outputs=[json_file, json_text]
579
- )
580
 
581
- export_btn.click(fn=lambda: gr.update(visible=True), outputs=[json_file])
582
- export_btn.click(fn=lambda: gr.update(visible=True), outputs=[json_text])
 
 
 
 
 
583
 
584
  if __name__ == "__main__":
585
  demo.launch(
586
- share=True,
587
- server_name="0.0.0.0",
588
- server_port=7860
589
- )
 
1
+ # =========================
2
+ # CAMEL-DOC-OCR (FAST)
3
+ # Single-file version
4
+ # =========================
5
+
6
  import os
7
+ import gc
8
  import json
9
  import re
 
 
 
 
 
10
  import torch
 
 
 
 
11
  import fitz
12
+ import gradio as gr
13
  import spaces
14
+ from PIL import Image
15
 
16
+ from transformers import AutoProcessor, BitsAndBytesConfig
17
+ from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
18
+
19
+
20
+ # =========================
21
+ # CONFIG
22
+ # =========================
 
 
 
 
23
  MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
24
+ DPI = 150
25
+ MAX_IMAGE_SIZE = 2048
 
 
26
  GPU_MEMORY_FRACTION = 0.8
27
 
28
+
29
+ # =========================
30
+ # TORCH OPTIMIZATION
31
+ # =========================
32
+ torch.set_grad_enabled(False)
33
+ torch.backends.cuda.matmul.allow_tf32 = True
34
+ torch.backends.cudnn.allow_tf32 = True
35
+
36
+ if torch.cuda.is_available():
37
  torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
38
 
 
 
 
39
 
40
+ # =========================
41
+ # LOAD MODEL (ONCE)
42
+ # =========================
43
  bnb = BitsAndBytesConfig(
44
  load_in_4bit=True,
45
  bnb_4bit_use_double_quant=True,
46
  bnb_4bit_quant_type="nf4",
47
+ bnb_4bit_compute_dtype=torch.float16,
48
+ )
49
+
50
+ processor = AutoProcessor.from_pretrained(
51
+ MODEL_ID,
52
+ trust_remote_code=True
53
  )
54
 
 
55
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
56
  MODEL_ID,
57
  quantization_config=bnb,
58
  device_map="auto",
59
+ torch_dtype=torch.float16,
60
  trust_remote_code=True
61
  ).eval()
62
+
63
  processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ # =========================
67
+ # PDF → IMAGE
68
+ # =========================
69
def pdf_to_images(pdf_bytes):
    """Render every page of an in-memory PDF to an RGB PIL image.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        A list with one ``PIL.Image.Image`` (mode ``"RGB"``) per page, in
        page order. Any page whose longest side exceeds ``MAX_IMAGE_SIZE``
        is shrunk in place to fit within that bound.
    """
    # PyMuPDF renders at 72 dpi when the zoom factor is 1, so DPI / 72 is the
    # zoom needed to rasterize at the configured DPI.
    scale = DPI / 72.0
    mat = fitz.Matrix(scale, scale)

    images = []
    # The context manager closes the document even if rendering a page fails.
    # Image.frombytes copies the pixel data, so the returned images stay
    # valid after the document is closed.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            if max(img.size) > MAX_IMAGE_SIZE:
                # thumbnail() resizes in place and preserves the aspect ratio.
                img.thumbnail(
                    (MAX_IMAGE_SIZE, MAX_IMAGE_SIZE),
                    Image.Resampling.LANCZOS,
                )

            images.append(img)

    return images
86
+
87
+
88
+ # =========================
89
+ # OCR INFERENCE (FAST)
90
+ # =========================
91
@spaces.GPU
def run_inference(image, prompt, max_new_tokens):
    """Run the vision-language model on one image and return the decoded text.

    Args:
        image: A PIL image; converted to RGB if it is in another mode.
        prompt: Instruction text sent alongside the image.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text with the prompt tokens removed, special tokens
        skipped, and surrounding whitespace stripped.
    """
    if image.mode != "RGB":
        image = image.convert("RGB")

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ],
    }]

    text_prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = processor(
        text=[text_prompt],
        images=[image],
        return_tensors="pt",
    ).to(model.device)

    # torch.autocast("cuda", ...) replaces the deprecated
    # torch.cuda.amp.autocast(...) spelling.
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
        outputs = model.generate(
            **inputs,
            # The UI widget may hand over a non-int numeric value.
            max_new_tokens=int(max_new_tokens),
            do_sample=False,
            use_cache=True,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    outputs = outputs[:, inputs["input_ids"].shape[1]:]

    return processor.tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    ).strip()
132
+
133
+
134
+ # =========================
135
+ # FILE HANDLER
136
+ # =========================
137
def handle_file(file, prompt, max_new_tokens, progress=gr.Progress()):
    """Run OCR on an uploaded image or PDF and return the extracted text.

    Args:
        file: The upload from the Gradio file component, either an object
            exposing a ``name`` path attribute or a plain path string.
        prompt: Instruction text for the model; surrounding whitespace is
            stripped.
        max_new_tokens: Generation length limit forwarded to
            ``run_inference``.
        progress: Gradio progress tracker, updated once per PDF page.

    Returns:
        The model output. For PDFs, the per-page outputs joined with a
        ``--- PAGE BREAK ---`` separator.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        raise gr.Error("Please upload an image or PDF first.")

    # Accept both upload shapes: an object carrying the path in `.name`, or
    # the path string itself.
    file_path = getattr(file, "name", file)
    ext = os.path.splitext(file_path)[1].lower()
    prompt = (prompt or "").strip()

    if ext == ".pdf":
        with open(file_path, "rb") as f:
            images = pdf_to_images(f.read())

        total = len(images)
        results = []
        for page_no, img in enumerate(images, start=1):
            results.append(run_inference(img, prompt, max_new_tokens))
            progress(page_no / total, desc=f"Page {page_no}/{total}")

        return "\n\n--- PAGE BREAK ---\n\n".join(results)

    # Keep the file handle open only for the duration of inference.
    with Image.open(file_path) as img:
        return run_inference(img, prompt, max_new_tokens)
157
+
158
+
159
+ # =========================
160
+ # DEFAULT PROMPT (CAMEL OCR)
161
+ # =========================
162
+ DEFAULT_PROMPT = """
163
+ You are an OCR + Information Extraction engine.
164
+ Extract data strictly from the document.
165
+ Return JSON ONLY. NO explanation.
166
+
167
+ OUTPUT FORMAT:
168
+ {
169
+ "price": "",
170
+ "vat": "",
171
+ "invoiceNo": "",
172
+ "invoiceDate": "",
173
+ "billingToTaxCode": "",
174
+ "accountingObjectTaxCode": "",
175
+ "description": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  }
177
+ """.strip()
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
+ # =========================
181
+ # GRADIO UI
182
+ # =========================
183
+ with gr.Blocks(title="Camel-Doc-OCR") as demo:
184
+ gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL – 4bit, Fast)")
185
 
 
186
  with gr.Row():
 
187
  with gr.Column(scale=1):
 
 
 
188
  file_input = gr.File(
189
+ label="Upload Image / PDF",
190
+ file_types=[".jpg", ".jpeg", ".png", ".pdf"]
 
191
  )
192
 
 
193
  prompt_input = gr.Textbox(
194
+ label="Prompt",
195
+ value=DEFAULT_PROMPT,
196
+ lines=10
 
197
  )
198
 
199
+ max_tokens = gr.Radio(
200
+ [256, 512, 1024, 2048],
 
 
 
 
 
 
 
 
 
201
  value=512,
202
+ label="Max new tokens"
 
203
  )
204
 
205
+ run_btn = gr.Button("🚀 Run OCR", variant="primary")
 
 
 
 
 
 
 
 
206
 
 
 
 
 
207
  with gr.Column(scale=1):
208
+ output = gr.Textbox(
209
+ label="Result",
210
+ lines=20
 
 
 
 
 
211
  )
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  run_btn.click(
214
  fn=handle_file,
215
+ inputs=[file_input, prompt_input, max_tokens],
216
+ outputs=output
217
  )
218
 
 
 
 
 
 
 
219
 
220
+ # =========================
221
+ # CLEANUP & LAUNCH
222
+ # =========================
223
def cleanup():
    """Free unreferenced Python objects, then release cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed before the CUDA caching allocator is asked to
    hand its unused blocks back.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
226
+
227
 
228
  if __name__ == "__main__":
229
  demo.launch(
230
+ server_name="0.0.0.0",
231
+ server_port=7860,
232
+ share=True
233
+ )