vithacocf committed on
Commit
0085935
·
verified ·
1 Parent(s): 421881d

Update app.py

Browse files

Update case pdf

Files changed (1) hide show
  1. app.py +168 -490
app.py CHANGED
@@ -7,10 +7,10 @@ import pandas as pd
7
  import gradio as gr
8
  import google.generativeai as genai
9
  import requests
 
10
 
11
  # ================== CONFIG ==================
12
# Do NOT hardcode the key: the GOOGLE_API_KEY environment variable is required.
# SECURITY: a real key was previously committed on this line; it must be treated
# as compromised and revoked. The default stays empty on purpose.
DEFAULT_API_KEY = ""
14
 
15
  INTERNAL_MODEL_MAP = {
16
  "Gemini 2.5 Flash": "gemini-2.5-flash",
@@ -19,9 +19,10 @@ INTERNAL_MODEL_MAP = {
19
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
20
 
21
  try:
22
- RESAMPLE = Image.Resampling.LANCZOS # Pillow >= 10
23
  except AttributeError:
24
- RESAMPLE = Image.LANCZOS # Pillow < 10
 
25
  PROMPT_FREIGHT_JSON = """
26
  Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
27
  {
@@ -72,7 +73,6 @@ Please analyze the freight rate table in the file I provide and convert it into
72
  }
73
  ]
74
  }
75
-
76
  ### Date rules
77
  - valid_from format:
78
  - `DD/MM/YYYY` (if full date)
@@ -82,7 +82,6 @@ Please analyze the freight rate table in the file I provide and convert it into
82
  - valid_to:
83
  - exact `DD/MM/YYYY` if present
84
  - else `UFN`
85
-
86
  STRICT RULES:
87
  - ONLY return a single JSON object as specified above.
88
  - All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
@@ -98,6 +97,7 @@ STRICT RULES:
98
  - Replace commas in remarks with semicolons.
99
  - Only return JSON.
100
  """
 
101
  # ================== HELPERS ==================
102
  import fitz # PyMuPDF
103
 
@@ -126,23 +126,6 @@ def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> by
126
  return upload.read()
127
  raise TypeError(f"Unsupported file object: {type(upload)}")
128
 
129
- def _make_previews(file_bytes: bytes, max_side: int = 2000) -> List[Image.Image]:
130
- """Trả list PIL.Image đã RGB + resize theo max_side."""
131
- if len(file_bytes) >= 4 and file_bytes[:4] == b"%PDF":
132
- pages = pdf_to_images(file_bytes)
133
- else:
134
- pages = [Image.open(io.BytesIO(file_bytes))]
135
- out = []
136
- for im in pages:
137
- im = ensure_rgb(im)
138
- if max_side:
139
- w, h = im.size
140
- scale = min(max_side / float(w), max_side / float(h), 1.0)
141
- if scale < 1.0:
142
- im = im.resize((max(1, int(w*scale)), max(1, int(h*scale))), RESAMPLE)
143
- out.append(im)
144
- return out
145
-
146
  def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
147
  if isinstance(file, (str, os.PathLike)):
148
  filename = os.path.basename(str(file))
@@ -162,265 +145,39 @@ def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
162
  mime = "image/png"
163
  return filename, mime
164
 
165
- def _extract_json_from_message(msg: str):
166
- """Bóc JSON trong ```json ...``` nếu có. Trả về (obj, cleaned_string)."""
167
- s = (msg or "").strip()
168
- s = re.sub(r"^\s*```(?:json)?\s*", "", s, flags=re.IGNORECASE)
169
- s = re.sub(r"\s*```\s*$", "", s)
170
- try:
171
- return json.loads(s), s
172
- except Exception:
173
- return None, s
174
-
175
- def _pretty_message(msg: str) -> str:
176
- obj, s = _extract_json_from_message(msg)
177
- return json.dumps(obj, ensure_ascii=False, indent=2) if obj is not None else s
178
-
179
- def _safe_text_from_gemini(resp):
180
- try:
181
- return resp.text
182
- except Exception:
183
- pass
184
- texts = []
185
- for c in getattr(resp, "candidates", []) or []:
186
- content = getattr(c, "content", None)
187
- parts = getattr(content, "parts", None) if content else None
188
- if not parts:
189
- continue
190
- for p in parts:
191
- t = getattr(p, "text", None)
192
- if t:
193
- texts.append(t)
194
- return "\n".join(texts).strip()
195
-
196
- def _wait_file_active(file_obj, timeout_s: int = 60) -> object:
197
- """Chờ file upload sang Gemini ở trạng thái ACTIVE, có timeout + backoff."""
198
- start = time.time()
199
- delay = 0.5
200
- while hasattr(file_obj, "state") and getattr(file_obj.state, "name", "") == "PROCESSING":
201
- if time.time() - start > timeout_s:
202
- raise TimeoutError("Upload processing timeout.")
203
- time.sleep(delay)
204
- delay = min(delay * 1.5, 2.0)
205
- file_obj = genai.get_file(file_obj.name)
206
- if not hasattr(file_obj, "state") or file_obj.state.name != "ACTIVE":
207
- st = getattr(file_obj, "state", None)
208
- raise RuntimeError(f"Upload failed or not active. State={getattr(st, 'name', 'UNKNOWN')}")
209
- return file_obj
210
-
211
- # ---------- JSON → Excel (schema-agnostic) ----------
212
- def _flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
213
- """Flatten dict lồng nhau thành 1 level: {'a':{'b':1}} -> {'a.b':1}"""
214
- items = []
215
- for k, v in (d or {}).items():
216
- new_key = f"{parent_key}{sep}{k}" if parent_key else str(k)
217
- if isinstance(v, dict):
218
- items.extend(_flatten_dict(v, new_key, sep=sep).items())
219
- else:
220
- items.append((new_key, v))
221
- return dict(items)
222
-
223
- def _sanitize_sheet_name(name: str, used: set[str]) -> str:
224
- # Excel sheet name ≤ 31 chars, không chứa []:*?/\
225
- invalid = set(r'[]:*?/\'' + '"')
226
- clean = "".join(ch for ch in name if ch not in invalid)
227
- clean = clean.strip()
228
- if not clean:
229
- clean = "sheet"
230
- clean = clean[:31]
231
- # đảm bảo unique
232
- base, idx = clean, 1
233
- while clean in used:
234
- suffix = f"_{idx}"
235
- clean = (base[: (31 - len(suffix))] + suffix)
236
- idx += 1
237
- used.add(clean)
238
- return clean
239
-
240
- def _to_excel_generic(data: Any, path: str) -> str:
241
- """
242
- Quy tắc:
243
- - Nếu là list[dict] -> 1 sheet "data" (json_normalize)
244
- - Nếu là dict:
245
- + Tạo 1 sheet "summary" từ các field dạng scalar/dict (flatten)
246
- + Với mỗi field là list:
247
- · list[dict] -> 1 sheet theo tên key (normalize)
248
- · list[scalar]-> 1 sheet 1 cột 'value'
249
- · list[mixed] -> chuyển thành cột 'value' dạng chuỗi
250
- """
251
- with pd.ExcelWriter(path) as writer:
252
- used_names = set()
253
-
254
- def add_df(df: pd.DataFrame, sheet: str):
255
- sheetname = _sanitize_sheet_name(sheet, used_names)
256
- df.to_excel(writer, index=False, sheet_name=sheetname)
257
-
258
- if isinstance(data, list):
259
- # list tổng quát
260
- try:
261
- df = pd.json_normalize(data, sep=".")
262
- except Exception:
263
- df = pd.DataFrame({"value": [json.dumps(x, ensure_ascii=False) for x in data]})
264
- add_df(df, "data")
265
- return path
266
-
267
- if isinstance(data, dict):
268
- scalars = {}
269
- list_sheets: list[tuple[str, pd.DataFrame]] = []
270
-
271
- for k, v in data.items():
272
- if isinstance(v, list):
273
- if len(v) == 0:
274
- list_sheets.append((k, pd.DataFrame()))
275
- elif isinstance(v[0], dict):
276
- try:
277
- df = pd.json_normalize(v, sep=".")
278
- except Exception:
279
- df = pd.DataFrame({"value": [json.dumps(x, ensure_ascii=False) for x in v]})
280
- list_sheets.append((k, df))
281
- elif not isinstance(v[0], (list, dict)):
282
- df = pd.DataFrame({"value": v})
283
- list_sheets.append((k, df))
284
- else:
285
- df = pd.DataFrame({"value": [json.dumps(x, ensure_ascii=False) for x in v]})
286
- list_sheets.append((k, df))
287
- elif isinstance(v, dict):
288
- scalars.update(_flatten_dict({k: v}))
289
- else:
290
- scalars[k] = v
291
-
292
- # summary sheet
293
- if len(scalars) > 0:
294
- add_df(pd.DataFrame([scalars]), "summary")
295
-
296
- # each list -> one sheet
297
- for k, df in list_sheets:
298
- add_df(df, k if k else "list")
299
-
300
- # nếu dict chỉ có list, không có summary => vẫn OK (chỉ có các sheet list)
301
- return path
302
-
303
- # kiểu khác: ghi thành 1 cột value
304
- add_df(pd.DataFrame({"value": [json.dumps(data, ensure_ascii=False)]}), "data")
305
- return path
306
-
307
- # ================== HANDLERS ==================
308
- def preview_process(file):
309
- """Trả list đường dẫn ảnh PNG tạm cho Gallery (ổn định hơn list PIL)."""
310
- if file is None:
311
- return []
312
  try:
313
- file_bytes = _read_file_bytes(file)
314
- images = _make_previews(file_bytes, max_side=2000)
315
- paths = []
316
- for i, im in enumerate(images):
317
- fd, path = tempfile.mkstemp(suffix=f"_preview_{i}.png")
318
- os.close(fd)
319
- im.save(path, format="PNG")
320
- paths.append(path)
321
- return paths
 
 
 
 
 
 
 
322
  except Exception as e:
323
- print(f"Preview error: {e}")
324
- return []
325
-
326
- def _merge_freight_objects(objs: list[dict]) -> dict | None:
327
- if not objs: return None
328
- base = {}
329
- for k in ["shipping_line","shipping_line_code","shipping_line_reason","fee_type","valid_from","valid_to"]:
330
- for o in objs:
331
- if isinstance(o, dict) and o.get(k):
332
- base[k] = o[k]
333
- break
334
- base.setdefault(k, None)
335
-
336
- seen = set()
337
- merged_charges, merged_local = [], []
338
- def norm(v): return v.replace(",", ";") if isinstance(v, str) else v
339
-
340
- for o in objs:
341
- for c in (o.get("charges") or []):
342
- wb = json.dumps(c.get("weight_breaks", {}), sort_keys=True, ensure_ascii=False)
343
- key = (c.get("origin"), c.get("destination"), c.get("charge_name"), c.get("charge_code"), c.get("currency"), wb)
344
- if key in seen: continue
345
- c["remark"] = norm(c.get("remark"))
346
- merged_charges.append(c)
347
- seen.add(key)
348
- for lc in (o.get("local_charges") or []):
349
- lc["remark"] = norm(lc.get("remark"))
350
- merged_local.append(lc)
351
-
352
- base["charges"] = merged_charges
353
- base["local_charges"] = merged_local
354
- return base
355
- def _coerce_only_json(text: str) -> str:
356
- obj, s = _extract_json_from_message(text)
357
- if obj is not None:
358
- return json.dumps(obj, ensure_ascii=False)
359
- m = re.search(r"\{.*\}\s*$", text, flags=re.DOTALL)
360
- return m.group(0) if m else text.strip()
361
- # -------- Internal (Gemini) - Base (1 lượt, không thinking) --------
362
- def run_process_internal_base(file_bytes, filename, mime, question, model_choice,
363
- temperature, top_p):
364
- api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
365
- if not api_key:
366
- return "ERROR: Missing GOOGLE_API_KEY.", None
367
- genai.configure(api_key=api_key)
368
 
369
- model_name = INTERNAL_MODEL_MAP.get(model_choice, INTERNAL_MODEL_MAP["Gemini 2.5 Flash"])
370
- gen_config = {"temperature": float(temperature), "top_p": float(top_p)}
371
- model = genai.GenerativeModel(model_name=model_name, generation_config=gen_config)
372
-
373
- uploaded = None
374
- tmp_path = None
375
- try:
376
- if file_bytes:
377
- suffix = os.path.splitext(filename)[1] or ".bin"
378
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
379
- tmp.write(file_bytes)
380
- tmp_path = tmp.name
381
- uploaded = genai.upload_file(path=tmp_path, mime_type=mime)
382
- uploaded = _wait_file_active(uploaded, timeout_s=60)
383
-
384
- user_prompt = (question or "").strip()
385
- if not user_prompt:
386
- user_prompt = (
387
- "Perform high-quality OCR on the provided file. If PDF: read all pages in order. "
388
- "Return clean plain text. If structure is obvious (tables, key:value), preserve it. "
389
- "If you can, output JSON that captures the structure."
390
- )
391
-
392
- # Gọi model
393
- if uploaded:
394
- resp = model.generate_content([user_prompt, uploaded])
395
- else:
396
- resp = model.generate_content(user_prompt)
397
-
398
- # Lấy đúng message LLM (pretty nếu là JSON)
399
- answer_raw = _safe_text_from_gemini(resp)
400
- message = _pretty_message(answer_raw)
401
-
402
- # Parse JSON (nếu có) để export. Không validate schema.
403
- parsed_obj, _ = _extract_json_from_message(answer_raw)
404
-
405
- return message, parsed_obj
406
- finally:
407
- if tmp_path and os.path.exists(tmp_path):
408
- try: os.remove(tmp_path)
409
- except Exception: pass
410
- try:
411
- if uploaded and hasattr(uploaded, "name"):
412
- genai.delete_file(uploaded.name)
413
- except Exception:
414
- pass
415
- # ================== MAIN OCR FUNCTION ==================
416
  def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
417
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
418
  if not api_key:
419
  return "ERROR: Missing GOOGLE_API_KEY.", None
420
  genai.configure(api_key=api_key)
421
-
422
  model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
423
- model = genai.GenerativeModel(model_name=model_name, generation_config={"temperature": float(temperature), "top_p": float(top_p)})
 
424
 
425
  if file_bytes[:4] == b"%PDF":
426
  pages = pdf_to_images(file_bytes)
@@ -429,9 +186,14 @@ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_cho
429
 
430
  user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
431
  all_json_results, all_text_results = [], []
432
-
433
  previous_header_json = None
434
 
 
 
 
 
 
 
435
  for i in range(0, len(pages), batch_size):
436
  batch = pages[i:i+batch_size]
437
  uploaded = []
@@ -439,107 +201,132 @@ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_cho
439
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
440
  im.save(tmp.name)
441
  up = genai.upload_file(path=tmp.name, mime_type="image/png")
442
- up = _wait_file_active(up)
443
  uploaded.append(up)
444
-
445
- # build dynamic prompt
446
- if previous_header_json:
447
- context_prompt = (
448
- f"{user_prompt}\n"
449
- "The previous page had this table structure:\n"
450
- f"{json.dumps(previous_header_json, ensure_ascii=False, indent=2)}\n"
451
- "If this page has no header, assume it continues with the same structure."
452
- )
453
- else:
454
- context_prompt = user_prompt
455
-
456
- resp = model.generate_content([f"{context_prompt}\n(This is batch {i//batch_size+1})"] + uploaded)
457
- text = _safe_text_from_gemini(resp)
458
- json_text = _coerce_only_json(text)
459
-
460
- try:
461
- parsed = json.loads(json_text)
462
- all_json_results.append(parsed)
463
-
464
- # ✅ update header context (for next page)
465
- if i == 0:
466
- # chỉ cần giữ phần "charges[0].weight_breaks" làm cấu trúc header
467
- first_charge = (parsed.get("charges") or [{}])[0]
468
- if "weight_breaks" in first_charge:
469
- previous_header_json = first_charge["weight_breaks"]
470
- except Exception:
471
- all_text_results.append(text)
472
- finally:
473
- for up in uploaded:
474
- try: genai.delete_file(up.name)
475
- except: pass
476
-
477
- if all_json_results:
478
- merged_json = _merge_freight_objects(all_json_results)
479
- message = json.dumps(merged_json, ensure_ascii=False, indent=2)
480
- return message, merged_json
481
-
482
- combined_text = "\n\n".join(all_text_results)
483
- message = _pretty_message(combined_text)
484
- parsed_obj, _ = _extract_json_from_message(combined_text)
485
- return message, parsed_obj
486
-
487
- # -------- External API --------
488
- def run_process_external(file_bytes, filename, mime, question, api_url,
489
- temperature, top_p):
490
- if not api_url or not str(api_url).strip():
491
- return "ERROR: Missing external API endpoint (hãy dán URL).", None
492
- try:
493
- user_prompt = (question or "").strip()
494
- if not user_prompt:
495
- user_prompt = (
496
- "Perform high-quality OCR on the provided file. If PDF: read all pages in order. "
497
- "Return clean plain text. If structure is obvious (tables, key:value), preserve it. "
498
- "If you can, output JSON that captures the structure."
499
- )
500
-
501
- data = {"prompt": user_prompt, "temperature": str(temperature), "top_p": str(top_p)}
502
 
503
- if file_bytes:
504
- files = {"file": (filename, file_bytes, mime)}
505
- r = requests.post(api_url, files=files, data=data, timeout=60)
506
- else:
507
- r = requests.post(api_url, json=data, timeout=60)
508
-
509
- if r.status_code >= 400:
510
- return f"ERROR: External API HTTP {r.status_code}: {r.text[:300]}", None
511
-
512
- answer = None
513
- try:
514
- j = r.json()
515
- answer = j.get("message") or j.get("text") or j.get("data")
516
- if isinstance(answer, (dict, list)):
517
- answer = json.dumps(answer, ensure_ascii=False)
518
- except Exception:
519
- answer = r.text
520
-
521
- answer = (answer or "").strip()
522
- message = _pretty_message(answer)
523
- parsed_obj, _ = _extract_json_from_message(answer)
524
-
525
- return message, parsed_obj
526
- except Exception as e:
527
- return f"ERROR: {type(e).__name__}: {str(e) or repr(e)}", None
528
-
529
- # -------- Router --------
530
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
531
  """
532
- Router (không Agent, không thinking):
533
- - Nếu chọn External model -> run_process_external
534
- - Ngược lại -> Gemini nội bộ (Base 1 lượt)
535
  """
536
  try:
537
- has_file = file is not None
538
- file_bytes = filename = mime = None
539
- if has_file:
540
- file_bytes = _read_file_bytes(file)
541
- filename, mime = _guess_name_and_mime(file, file_bytes)
542
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  if model_choice == EXTERNAL_MODEL_NAME:
544
  return run_process_external(
545
  file_bytes=file_bytes, filename=filename, mime=mime,
@@ -552,136 +339,27 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
552
  question=question, model_choice=model_choice,
553
  temperature=temperature, top_p=top_p
554
  )
555
- except Exception as e:
556
- return f"ERROR: {type(e).__name__}: {str(e) or repr(e)}", None
557
-
558
- def on_export_excel(parsed_obj):
559
- try:
560
- if not parsed_obj:
561
- # không có JSON để export → giữ nguyên, không hiện nút tải
562
- return gr.update(value=None, visible=False)
563
-
564
- # tạo file an toàn, giữ lại sau khi request kết thúc
565
- fd, tmp_path = tempfile.mkstemp(suffix=".xlsx")
566
- os.close(fd)
567
- _to_excel_generic(parsed_obj, tmp_path)
568
 
569
- # trả về path và bật visible để hiện link download
570
- return gr.update(value=tmp_path, visible=True)
571
  except Exception as e:
572
- print(f"Export error: {e}")
573
- return gr.update(value=None, visible=False)
574
-
575
- def clear_all():
576
- # file, preview, output_text, question, model, parsed_state, download,
577
- # temperature, top_p, external_api_url
578
- return (
579
- None, [], "", "",
580
- "Gemini 2.5 Flash", None, None,
581
- 0.2, 0.95, ""
582
- )
583
 
584
  # ================== UI ==================
585
- def _toggle_external_visibility(selected: str):
586
- return gr.update(visible=(selected == EXTERNAL_MODEL_NAME))
587
-
588
  def main():
589
- custom_css = """
590
- .gradio-container { max-width: 1400px !important; margin: 0 auto; }
591
- #main-row { display: flex; gap: 20px; align-items: flex-start; }
592
- #left-column { flex: 1; min-width: 400px; max-width: 600px; }
593
- #right-column { flex: 1; min-width: 400px; }
594
- #file-upload { border: 2px dashed #d1d5db; border-radius: 12px; padding: 20px; text-align: center; transition: border-color 0.3s ease; }
595
- #file-upload:hover { border-color: #3b82f6; }
596
- #preview-gallery { max-height: 600px; overflow-y: auto; border: 1px solid #e5e7eb; border-radius: 12px; background: #f9fafb; padding: 10px; }
597
- #preview-gallery .grid { grid-template-columns: 1fr !important; gap: 10px !important; }
598
- #preview-gallery img { width: 100% !important; height: auto !important; object-fit: contain !important; background: white; }
599
- #controls-section { background: #f8fafc; padding: 20px; border-radius: 12px; margin-bottom: 20px; }
600
- #results-section { background: #ffffff; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; }
601
- #llm-output { max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 13px; }
602
- .primary-button { background: linear-gradient(90deg, #3b82f6, #1d4ed8) !important; color: white !important; border: none !important; border-radius: 8px !important; padding: 10px 20px !important; font-weight: 500 !important; }
603
- .primary-button:hover { transform: translateY(-1px) !important; box-shadow: 0 4px 12px rgba(59, 130, 246, 0.3) !important; }
604
- .secondary-button { background: #f3f4f6 !important; color: #374151 !important; border: 1px solid #d1d5db !important; border-radius: 8px !important; padding: 8px 16px !important; }
605
- @media (max-width: 1024px) { #main-row { flex-direction: column; } #left-column, #right-column { min-width: 100%; max-width: 100%; } }
606
- """
607
-
608
- with gr.Blocks(title="OCR Multi-Agent System", css=custom_css, theme=gr.themes.Soft()) as demo:
609
- gr.HTML("""
610
- <div style="text-align: center; padding: 20px 0; margin-bottom: 30px;">
611
- <h1 style="color:#1f2937; font-size: 2.5rem; font-weight: bold; margin-bottom: 8px;">📄 OCR Extraction (LLM-first)</h1>
612
- <p style="color:#6b7280; font-size: 1.1rem; margin: 0;">Upload PDF/images → LLM produces raw text/JSON → Export Excel (schema-agnostic)</p>
613
- </div>
614
- """)
615
-
616
- last_parsed_state = gr.State(value=None)
617
-
618
- with gr.Row(elem_id="main-row"):
619
- # Left
620
- with gr.Column(elem_id="left-column"):
621
- gr.Markdown("### 📁 Upload Document")
622
- file = gr.File(
623
- label="Choose PDF or Image file",
624
- file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
625
- type="filepath",
626
- elem_id="file-upload"
627
- )
628
- gr.Markdown("### 👁️ Document Preview")
629
- preview = gr.Gallery(columns=1, height=None, show_label=False, elem_id="preview-gallery", allow_preview=True)
630
-
631
- # Right
632
- with gr.Column(elem_id="right-column"):
633
- with gr.Group(elem_id="controls-section"):
634
- gr.Markdown("### ⚙️ Processing Options")
635
- with gr.Row():
636
- model_choice = gr.Dropdown(
637
- choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
638
- value="Gemini 2.5 Flash",
639
- label="Model"
640
- )
641
-
642
- with gr.Row():
643
- temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05, label="temperature")
644
- top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="top_p")
645
-
646
- external_api_url = gr.Textbox(
647
- label="External API endpoint (URL)",
648
- placeholder="https://your-host/path/to/ocr",
649
- visible=False
650
- )
651
-
652
- question = gr.Textbox(
653
- label="Custom Prompt (optional)",
654
- placeholder="Leave blank for default OCR; or ask model to output JSON by your own schema...",
655
- lines=3
656
- )
657
- with gr.Row():
658
- run_btn = gr.Button("🚀 Process Document", elem_classes=["primary-button"])
659
- clear_btn = gr.Button("🗑️ Clear All", elem_classes=["secondary-button"])
660
-
661
- with gr.Group(elem_id="results-section"):
662
- gr.Markdown("### 📊 LLM Message (raw/pretty)")
663
- output_text = gr.Code(label="LLM Message", language="json", elem_id="llm-output")
664
- with gr.Row():
665
- export_btn = gr.Button("⬇️ Export to Excel", elem_classes=["secondary-button"])
666
- download_file = gr.File(label="Download Excel", interactive=False, visible=False)
667
-
668
- # Events
669
- file.change(preview_process, inputs=[file], outputs=[preview])
670
- model_choice.change(_toggle_external_visibility, inputs=[model_choice], outputs=[external_api_url])
671
 
672
  run_btn.click(
673
  run_process,
674
  inputs=[file, question, model_choice, temperature, top_p, external_api_url],
675
- outputs=[output_text, last_parsed_state]
676
- )
677
-
678
- export_btn.click(on_export_excel, inputs=[last_parsed_state], outputs=[download_file])
679
-
680
- clear_btn.click(
681
- clear_all,
682
- inputs=[],
683
- outputs=[file, preview, output_text, question, model_choice, last_parsed_state,
684
- download_file, temperature, top_p, external_api_url]
685
  )
686
 
687
  return demo
@@ -689,4 +367,4 @@ def main():
689
  demo = main()
690
 
691
  if __name__ == "__main__":
692
- demo.launch()
 
7
  import gradio as gr
8
  import google.generativeai as genai
9
  import requests
10
+ import pdfplumber
11
 
12
  # ================== CONFIG ==================
13
# SECURITY: never commit a real API key. Supply it through the GOOGLE_API_KEY
# environment variable; with this empty default the OCR handler's
# `if not api_key` check returns "ERROR: Missing GOOGLE_API_KEY." instead of
# silently using a leaked credential. The key previously committed here must
# be considered compromised and revoked.
DEFAULT_API_KEY = ""
 
14
 
15
  INTERNAL_MODEL_MAP = {
16
  "Gemini 2.5 Flash": "gemini-2.5-flash",
 
19
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
20
 
21
  try:
22
+ RESAMPLE = Image.Resampling.LANCZOS
23
  except AttributeError:
24
+ RESAMPLE = Image.LANCZOS
25
+
26
  PROMPT_FREIGHT_JSON = """
27
  Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
28
  {
 
73
  }
74
  ]
75
  }
 
76
  ### Date rules
77
  - valid_from format:
78
  - `DD/MM/YYYY` (if full date)
 
82
  - valid_to:
83
  - exact `DD/MM/YYYY` if present
84
  - else `UFN`
 
85
  STRICT RULES:
86
  - ONLY return a single JSON object as specified above.
87
  - All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
 
97
  - Replace commas in remarks with semicolons.
98
  - Only return JSON.
99
  """
100
+
101
  # ================== HELPERS ==================
102
  import fitz # PyMuPDF
103
 
 
126
  return upload.read()
127
  raise TypeError(f"Unsupported file object: {type(upload)}")
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
130
  if isinstance(file, (str, os.PathLike)):
131
  filename = os.path.basename(str(file))
 
145
  mime = "image/png"
146
  return filename, mime
147
 
148
+ # ================== PDF CHECK STEP ==================
149
def check_pdf_structure(file_bytes: bytes) -> str:
    """Quickly classify a PDF as a multi-page, table-like document.

    Args:
        file_bytes: Raw bytes of the PDF, opened in memory with pdfplumber.

    Returns:
        "có" (yes) when the PDF has more than two pages and either
        ``find_tables()`` reports a table on one of the first three pages, or
        the text of the first two pages contains more than 100 digit
        characters spread over more than 20 lines.
        "không" (no) in every other case, including when any exception is
        raised while inspecting the file (the error is printed).
    """
    yes, no = "có", "không"
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            pages = pdf.pages
            if len(pages) <= 2:
                return no

            # Probe each of the first three pages for detected tables.
            pages_with_tables = sum(1 for pg in pages[:3] if pg.find_tables())
            if pages_with_tables >= 1:
                return yes

            # Fallback heuristic: digit-heavy text with many lines.
            sample = "\n".join((pg.extract_text() or "") for pg in pages[:2])
            digit_count = sum(1 for ch in sample if ch.isdigit())
            has_many_lines = len(sample.splitlines()) > 20
            if digit_count > 100 and has_many_lines:
                return yes
            return no
    except Exception as err:
        print("PDF check error:", err)
        return no
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
+ # ================== OCR CORE (Gemini) ==================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
174
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
175
  if not api_key:
176
  return "ERROR: Missing GOOGLE_API_KEY.", None
177
  genai.configure(api_key=api_key)
 
178
  model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
179
+ model = genai.GenerativeModel(model_name=model_name,
180
+ generation_config={"temperature": float(temperature), "top_p": float(top_p)})
181
 
182
  if file_bytes[:4] == b"%PDF":
183
  pages = pdf_to_images(file_bytes)
 
186
 
187
  user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
188
  all_json_results, all_text_results = [], []
 
189
  previous_header_json = None
190
 
191
+ def _safe_text(resp):
192
+ try:
193
+ return resp.text
194
+ except:
195
+ return ""
196
+
197
  for i in range(0, len(pages), batch_size):
198
  batch = pages[i:i+batch_size]
199
  uploaded = []
 
201
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
202
  im.save(tmp.name)
203
  up = genai.upload_file(path=tmp.name, mime_type="image/png")
204
+ up = genai.get_file(up.name)
205
  uploaded.append(up)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
+ context_prompt = user_prompt
208
+ resp = model.generate_content([context_prompt] + uploaded)
209
+ text = _safe_text(resp)
210
+ all_text_results.append(text)
211
+ for up in uploaded:
212
+ try:
213
+ genai.delete_file(up.name)
214
+ except:
215
+ pass
216
+
217
+ return "\n\n".join(all_text_results), None
218
+
219
+ # ================== EXTERNAL API (if available) ==================
220
def run_process_external(file_bytes, filename, mime, question, api_url, temperature, top_p):
    """Send the uploaded file and prompt to an external OCR HTTP endpoint.

    The file is posted as the multipart field ``file`` together with the form
    fields ``prompt``, ``temperature`` and ``top_p`` (the latter two sent as
    strings), using a 60 second timeout.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Name reported for the uploaded file.
        mime: MIME type reported for the uploaded file.
        question: Prompt text; an empty string is sent when falsy.
        api_url: Endpoint URL. ``None``, empty and whitespace-only values are
            all treated as missing.
        temperature: Sampling temperature forwarded to the endpoint.
        top_p: Nucleus-sampling value forwarded to the endpoint.

    Returns:
        A ``(message, None)`` tuple. ``message`` is the response body on
        success, or a string starting with ``"ERROR:"`` when the endpoint is
        missing, the request fails, or the HTTP status is >= 400.
    """
    # A blank textbox can yield whitespace; reject it here instead of letting
    # requests raise an opaque URL error.
    endpoint = str(api_url).strip() if api_url else ""
    if not endpoint:
        return "ERROR: Missing external API endpoint.", None

    form_fields = {
        "prompt": question or "",
        "temperature": str(temperature),
        "top_p": str(top_p),
    }
    upload = {"file": (filename, file_bytes, mime)}

    try:
        response = requests.post(endpoint, files=upload, data=form_fields, timeout=60)
    except requests.RequestException as exc:
        # Keep the function's "ERROR: ..." tuple contract for transport failures.
        return f"ERROR: {type(exc).__name__}: {exc}", None

    if response.status_code >= 400:
        return f"ERROR: External API HTTP {response.status_code}: {response.text[:200]}", None
    return response.text, None
229
+
230
+ # ================== MAIN ROUTER (PDF check step added) ==================
 
 
 
231
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
232
  """
233
+ Router ( bước kiểm tra PDF/table trước khi xử lý):
234
+ - Nếu PDF nhiều trang/nhiều bảng -> extract trước (pdfplumber)
235
+ - Ngược lại -> OCR trực tiếp Gemini
236
  """
237
  try:
238
+ if file is None:
239
+ return "ERROR: No file uploaded.", None
 
 
 
240
 
241
+ file_bytes = _read_file_bytes(file)
242
+ filename, mime = _guess_name_and_mime(file, file_bytes)
243
+
244
+ # STEP 1️⃣: Check PDF structure
245
+ if mime == "application/pdf" or file_bytes[:4] == b"%PDF":
246
+ check_result = check_pdf_structure(file_bytes)
247
+ print(f"[PDF Check] {filename}: {check_result}")
248
+
249
+ if check_result == "có":
250
+ try:
251
+ print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
252
+ all_dfs = []
253
+ saved_header = None
254
+
255
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
256
+ for page_idx, page in enumerate(pdf.pages, start=1):
257
+ print(f"📄 Đang xử lý trang {page_idx}...")
258
+
259
+ table = page.extract_table({
260
+ "vertical_strategy": "lines",
261
+ "horizontal_strategy": "text",
262
+ "snap_tolerance": 3,
263
+ "intersection_tolerance": 5,
264
+ })
265
+
266
+ if not table or len(table) < 2:
267
+ print(f"⚠️ Trang {page_idx}: Không phát hiện bảng hợp lệ.")
268
+ continue
269
+
270
+ header = table[0]
271
+ rows = table[1:]
272
+
273
+ # Lưu header đầu tiên
274
+ if saved_header is None:
275
+ saved_header = header
276
+ print(f"✅ Trang {page_idx}: Lưu header đầu tiên: {saved_header}")
277
+
278
+ # Nếu trang sau không có header rõ → dùng header cũ
279
+ if len(header) < len(saved_header) or "REGION" not in header[0]:
280
+ print(f"↩️ Trang {page_idx}: Không có header rõ ràng, dùng lại header trước.")
281
+ header = saved_header
282
+ rows = table
283
+ else:
284
+ saved_header = header # cập nhật header hợp lệ
285
+
286
+ if len(rows) == 0:
287
+ print(f"⚠️ Trang {page_idx}: Không có dữ liệu dưới header.")
288
+ continue
289
+
290
+ try:
291
+ df = pd.DataFrame(rows, columns=header)
292
+ all_dfs.append(df)
293
+ print(f"✅ Trang {page_idx}: {len(df)} dòng được thêm.")
294
+ except Exception as e:
295
+ print(f"❌ Lỗi tạo DataFrame ở trang {page_idx}: {e}")
296
+
297
+ if all_dfs:
298
+ final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all").reset_index(drop=True)
299
+ print(f"✅ Tổng cộng {len(final_df)} dòng được trích xuất từ PDF.")
300
+
301
+ # Xuất ra file tạm (Excel + JSON)
302
+ base_name = os.path.splitext(filename)[0]
303
+ tmp_dir = tempfile.gettempdir()
304
+ # json_path = os.path.join(tmp_dir, f"{base_name}.json")
305
+ # excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
306
+
307
+ # final_df.to_json(json_path, orient="records", force_ascii=False, indent=2)
308
+ # final_df.to_excel(excel_path, index=False)
309
+
310
+ # print(f"✅ Xuất JSON: {json_path}")
311
+ # print(f"✅ Xuất Excel: {excel_path}")
312
+
313
+ # Convert bảng thành CSV text để Gemini đọc tiếp
314
+ table_text = final_df.to_csv(index=False)
315
+ print(f"✅ Đang Gen text từ file CSV")
316
+ question = (
317
+ f"{PROMPT_FREIGHT_JSON}\n"
318
+ "Below is the table text extracted from the PDF (CSV format):\n"
319
+ f"{table_text}\n\n"
320
+ "Please convert this into valid JSON as per the schema."
321
+ )
322
+ else:
323
+ print("⚠️ Không có bảng hợp lệ để extract bằng pdfplumber.")
324
+
325
+ except Exception as e:
326
+ print("❌ pdfplumber extract failed:", e)
327
+
328
+
329
+ # STEP 2️⃣: Route model
330
  if model_choice == EXTERNAL_MODEL_NAME:
331
  return run_process_external(
332
  file_bytes=file_bytes, filename=filename, mime=mime,
 
339
  question=question, model_choice=model_choice,
340
  temperature=temperature, top_p=top_p
341
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
 
 
343
  except Exception as e:
344
+ return f"ERROR: {type(e).__name__}: {str(e)}", None
 
 
 
 
 
 
 
 
 
 
345
 
346
  # ================== UI ==================
 
 
 
347
def main():
    """Build the Gradio Blocks UI and wire it to ``run_process``.

    The layout has a file upload, a prompt box, a model dropdown, two sampling
    sliders, an external-endpoint textbox and a JSON code output. Clicking the
    button calls ``run_process`` with those inputs; its first return value is
    shown in the output box and the second is stored in a ``gr.State``.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """
    with gr.Blocks(title="OCR Multi-Agent System") as demo:
        file = gr.File(label="Upload PDF/Image")
        question = gr.Textbox(label="Prompt", lines=2)
        model_choice = gr.Dropdown(
            choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
            value="Gemini 2.5 Flash",
            label="Model",
        )
        temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05, label="temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="top_p")
        external_api_url = gr.Textbox(label="External API URL", visible=False)
        output_text = gr.Code(label="Output", language="json")
        run_btn = gr.Button("🚀 Process")
        parsed_state = gr.State(value=None)

        # The URL box starts hidden; without this toggle the external model
        # could be selected but never given an endpoint.
        model_choice.change(
            lambda selected: gr.update(visible=(selected == EXTERNAL_MODEL_NAME)),
            inputs=[model_choice],
            outputs=[external_api_url],
        )

        run_btn.click(
            run_process,
            inputs=[file, question, model_choice, temperature, top_p, external_api_url],
            outputs=[output_text, parsed_state],
        )

    return demo
 
367
  demo = main()
368
 
369
  if __name__ == "__main__":
370
+ demo.launch()