vithacocf commited on
Commit
ece3c79
·
verified ·
1 Parent(s): 0fb6325

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +334 -301
app.py CHANGED
@@ -1,156 +1,221 @@
1
  from __future__ import annotations
2
- import os, io, re, json, time, mimetypes, tempfile, string
3
- from typing import List, Union, Tuple, Any, Iterable
4
 
 
 
5
  from PIL import Image
6
  import pandas as pd
7
  import gradio as gr
8
  import google.generativeai as genai
9
- import requests
10
  import pdfplumber
 
 
 
 
 
11
 
 
12
  # ================== CONFIG ==================
13
- DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"
 
 
 
 
 
 
 
 
14
 
15
  INTERNAL_MODEL_MAP = {
16
  "Gemini 2.5 Flash": "gemini-2.5-flash",
17
- "Gemini 2.5 Pro": "gemini-2.5-pro",
18
  }
19
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
20
 
21
- try:
22
- RESAMPLE = Image.Resampling.LANCZOS
23
- except AttributeError:
24
- RESAMPLE = Image.LANCZOS
25
-
26
  PROMPT_FREIGHT_JSON = """
27
- You are an expert in air freight rate extraction and normalization.
28
-
29
- The document contains rate information for multiple airlines.
30
- Please analyze all content (tables, headers, notes) and return **a list of JSON objects**, each representing a separate airline.
31
-
32
- Each airline should follow this schema:
33
 
34
  {
35
  "shipping_line": "...",
36
  "shipping_line_code": "...",
37
  "shipping_line_reason": "Why this carrier is chosen?",
38
  "fee_type": "Air Freight",
39
- "valid_from": "...",
40
- "valid_to": "...",
41
- "charges": [ ... ], # List of charge objects (see below)
42
- "local_charges": [ ... ] # Optional local charges if available
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  }
44
 
45
- Each `charges` object must follow this schema:
46
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  {
48
- "frequency": "...",
49
- "package_type": "...", # e.g. Carton, Pallet, Skid
50
- "aircraft_type": "...",
51
- "direction": "Export / Import / null",
52
- "origin": "...",
53
- "destination": "...",
54
- "charge_name": "...",
55
- "charge_code": "GCR / PER / DGR / etc.",
56
- "charge_code_reason": "...",
57
- "cargo_type": "...",
58
- "currency": "...",
59
- "transit": "...",
60
- "transit_time": "...",
61
  "weight_breaks": {
62
- "M": ...,
63
- "N": ...,
64
- "+45kg": ...,
65
- "+100kg": ...,
66
- "+300kg": ...,
67
- "+500kg": ...,
68
- "+1000kg": ...,
69
- "other": { key: value },
70
- "weight_breaks_reason": "Why chosen weight_breaks?"
71
  },
72
- "remark": "..."
73
  }
74
 
75
- Each `local_charges` object:
76
-
77
  {
78
- "charge_name": "...",
79
- "charge_code": "...",
80
- "unit": "...",
81
- "amount": ...,
82
- "remark": "..."
 
 
 
 
 
 
 
 
83
  }
84
-
85
- ---
86
-
87
- ### ✈️ Airline Separation Logic:
88
- - If multiple airlines are detected in the document, separate each section and return a distinct JSON object per airline.
89
- - Infer `shipping_line` and `shipping_line_code` from the header (e.g. "AIR CHINA CARGO (CA)" → name = "AIR CHINA CARGO", code = "CA").
90
- - Each JSON object must include only data relevant to that airline.
91
-
92
- ---
93
-
94
- ### 💡 Date rules:
95
- - valid_from:
96
- - `DD/MM/YYYY` if exact
97
- - `01/MM/YYYY` if only month/year
98
- - `01/01/YYYY` if only year
99
- - `UFN` if missing
100
- - valid_to:
101
- - exact `DD/MM/YYYY` if present
102
- - else `UFN`
103
-
104
- ---
105
-
106
- ### 📦 Package and Surcharge Logic:
107
- Apply these when the remark or note indicates such rules:
108
-
109
- 1. **Default case**: If no package mentioned → `"Carton"` is the default.
110
- 2. **“Carton = Pallet”**: Duplicate rates with `package_type="Pallet"`.
111
- 3. **“SKID shipment: add 10 cents (GEN & PER)”**: Add new charges with `+0.10 USD/kg` for GEN/PER, with `package_type="Pallet"` or `"Skid"`.
112
- 4. **EU vs Non-EU surcharges**: If different pallet surcharges by region → split charges accordingly.
113
- 5. **“All-in” or “inclusive of MY and SC”**: Record `FSC` and `WSC` as `local_charges` with `"NIL"` amount.
114
- 6. **Flight number is not a charge code**. Always use standard cargo code (GCR, PER, etc.).
115
-
116
- ---
117
-
118
- ### ⚙️ Other Business Rules:
119
- - RQ / Request → "RQST"
120
- - Combine same-rate destinations using `/`
121
- - Always use **IATA code** for origin/destination
122
- - Direction = Export if origin is in Vietnam (SGN, HAN, DAD), else Import
123
- - Frequency:
124
- - D[1-7] = day of week
125
- - "Daily" = D1234567
126
- - Remarks: Replace `,` with `;`
127
- - Add meaningful `"shipping_line_reason"` and `"charge_code_reason"`
128
-
129
- ---
130
-
131
- ### 🚨 STRICT OUTPUT:
132
- - Return **a JSON array**, where each item is a full airline object
133
- - Do NOT return markdown or explanation
134
- - All fields must be valid
135
- - All numbers = numeric types
136
- - Use `null` if value missing
137
-
138
  """
139
 
140
  # ================== HELPERS ==================
141
- import fitz # PyMuPDF
 
 
 
 
 
142
 
143
  def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
144
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
145
- pages = []
146
- for p in doc:
147
- pix = p.get_pixmap(dpi=200)
148
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
149
- pages.append(img)
150
- return pages
151
-
152
- def ensure_rgb(im: Image.Image) -> Image.Image:
153
- return im.convert("RGB") if im.mode != "RGB" else im
154
 
155
  def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
156
  if upload is None:
@@ -166,73 +231,141 @@ def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> by
166
  raise TypeError(f"Unsupported file object: {type(upload)}")
167
 
168
  def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
 
169
  if isinstance(file, (str, os.PathLike)):
170
  filename = os.path.basename(str(file))
171
- elif isinstance(file, dict) and "name" in file:
172
- filename = os.path.basename(file["name"])
173
- elif isinstance(file, dict) and "path" in file:
174
- filename = os.path.basename(file["path"])
175
- else:
176
- filename = "upload.bin"
177
  mime, _ = mimetypes.guess_type(filename)
178
- if not mime:
179
- if len(file_bytes) >= 4 and file_bytes[:4] == b"%PDF":
180
- mime = "application/pdf"
181
- if not filename.lower().endswith(".pdf"):
182
- filename += ".pdf"
183
- else:
184
- mime = "image/png"
185
- return filename, mime
186
-
187
- # ================== PDF CHECK STEP ==================
 
 
 
 
188
  def check_pdf_structure(file_bytes: bytes) -> str:
189
- """Kiểm tra nhanh file PDF có phải bảng nhiều cột, nhiều trang không."""
 
 
 
 
 
 
190
  try:
 
 
 
191
  with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
192
- if len(pdf.pages) <= 2:
193
- return "không"
194
- table_pages = 0
195
- for page in pdf.pages[:3]:
196
- tables = page.find_tables()
197
- if tables and len(tables) > 0:
198
- table_pages += 1
199
- if table_pages >= 1:
200
- return "có"
201
- text = "\n".join([(p.extract_text() or "") for p in pdf.pages[:2]])
202
- num_tokens = sum(ch.isdigit() for ch in text)
203
- line_count = len(text.splitlines())
204
- if num_tokens > 100 and line_count > 20:
205
- return "có"
206
- return "không"
207
- except Exception as e:
208
- print("PDF check error:", e)
209
- return "không"
 
210
 
211
- # ================== OCR CORE (Gemini) ==================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
213
- api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
214
- if not api_key:
215
- return "ERROR: Missing GOOGLE_API_KEY.", None
216
  genai.configure(api_key=api_key)
217
  model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
 
218
  model = genai.GenerativeModel(model_name=model_name,
219
  generation_config={"temperature": float(temperature), "top_p": float(top_p)})
220
-
221
- if file_bytes[:4] == b"%PDF":
222
- pages = pdf_to_images(file_bytes)
223
- else:
224
- pages = [Image.open(io.BytesIO(file_bytes))]
225
-
226
- user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
227
- all_json_results, all_text_results = [], []
228
- previous_header_json = None
229
-
230
- def _safe_text(resp):
231
- try:
232
- return resp.text
233
- except:
234
- return ""
235
-
236
  for i in range(0, len(pages), batch_size):
237
  batch = pages[i:i+batch_size]
238
  uploaded = []
@@ -240,145 +373,46 @@ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_cho
240
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
241
  im.save(tmp.name)
242
  up = genai.upload_file(path=tmp.name, mime_type="image/png")
243
- up = genai.get_file(up.name)
244
- uploaded.append(up)
245
-
246
- context_prompt = user_prompt
247
- resp = model.generate_content([context_prompt] + uploaded)
248
- text = _safe_text(resp)
249
- all_text_results.append(text)
250
  for up in uploaded:
251
- try:
252
- genai.delete_file(up.name)
253
- except:
254
- pass
255
-
256
  return "\n\n".join(all_text_results), None
257
 
258
- # ================== EXTERNAL API (nếu có) ==================
259
- def run_process_external(file_bytes, filename, mime, question, api_url, temperature, top_p):
260
- if not api_url:
261
- return "ERROR: Missing external API endpoint.", None
262
- data = {"prompt": question or "", "temperature": str(temperature), "top_p": str(top_p)}
263
- files = {"file": (filename, file_bytes, mime)}
264
- r = requests.post(api_url, files=files, data=data, timeout=60)
265
- if r.status_code >= 400:
266
- return f"ERROR: External API HTTP {r.status_code}: {r.text[:200]}", None
267
- return r.text, None
268
-
269
- # ================== MAIN ROUTER (đã thêm STEP CHECK) ==================
270
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
271
- """
272
- Router (có bước kiểm tra PDF/table trước khi xử lý):
273
- - Nếu PDF nhiều trang/nhiều bảng -> extract trước (pdfplumber)
274
- - Ngược lại -> OCR trực tiếp Gemini
275
- """
276
  try:
277
  if file is None:
278
  return "ERROR: No file uploaded.", None
279
-
280
  file_bytes = _read_file_bytes(file)
281
  filename, mime = _guess_name_and_mime(file, file_bytes)
282
-
283
- # STEP 1️⃣: Check PDF structure
284
- if mime == "application/pdf" or file_bytes[:4] == b"%PDF":
285
- check_result = check_pdf_structure(file_bytes)
286
- print(f"[PDF Check] {filename}: {check_result}")
287
-
288
- if check_result == "có" and 1==2: # bỏ qua if này test thử prompt nhiều hãng
289
- try:
290
- print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
291
- all_dfs = []
292
- saved_header = None
293
-
294
- with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
295
- for page_idx, page in enumerate(pdf.pages, start=1):
296
- print(f"📄 Đang xử lý trang {page_idx}...")
297
-
298
- table = page.extract_table({
299
- "vertical_strategy": "lines",
300
- "horizontal_strategy": "text",
301
- "snap_tolerance": 3,
302
- "intersection_tolerance": 5,
303
- })
304
-
305
- if not table or len(table) < 2:
306
- print(f"⚠️ Trang {page_idx}: Không phát hiện bảng hợp lệ.")
307
- continue
308
-
309
- header = table[0]
310
- rows = table[1:]
311
-
312
- # Lưu header đầu tiên
313
- if saved_header is None:
314
- saved_header = header
315
- print(f"✅ Trang {page_idx}: Lưu header đầu tiên: {saved_header}")
316
-
317
- # Nếu trang sau không có header rõ → dùng header cũ
318
- if len(header) < len(saved_header) or "REGION" not in header[0]:
319
- print(f"↩️ Trang {page_idx}: Không có header rõ ràng, dùng lại header trước.")
320
- header = saved_header
321
- rows = table
322
- else:
323
- saved_header = header # cập nhật header hợp lệ
324
-
325
- if len(rows) == 0:
326
- print(f"⚠️ Trang {page_idx}: Không có dữ liệu dưới header.")
327
- continue
328
-
329
- try:
330
- df = pd.DataFrame(rows, columns=header)
331
- all_dfs.append(df)
332
- print(f"✅ Trang {page_idx}: {len(df)} dòng được thêm.")
333
- except Exception as e:
334
- print(f"❌ Lỗi tạo DataFrame ở trang {page_idx}: {e}")
335
-
336
- if all_dfs:
337
- final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all").reset_index(drop=True)
338
- print(f"✅ Tổng cộng {len(final_df)} dòng được trích xuất từ PDF.")
339
-
340
- # Xuất ra file tạm (Excel + JSON)
341
- base_name = os.path.splitext(filename)[0]
342
- tmp_dir = tempfile.gettempdir()
343
- # json_path = os.path.join(tmp_dir, f"{base_name}.json")
344
- # excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
345
-
346
- # final_df.to_json(json_path, orient="records", force_ascii=False, indent=2)
347
- # final_df.to_excel(excel_path, index=False)
348
-
349
- # print(f"✅ Xuất JSON: {json_path}")
350
- # print(f"✅ Xuất Excel: {excel_path}")
351
-
352
- # Convert bảng thành CSV text để Gemini đọc tiếp
353
- table_text = final_df.to_csv(index=False)
354
- print(f"✅ Đang Gen text từ file CSV")
355
- question = (
356
- f"{PROMPT_FREIGHT_JSON}\n"
357
- "Below is the table text extracted from the PDF (CSV format):\n"
358
- f"{table_text}\n\n"
359
- "Please convert this into valid JSON as per the schema."
360
- )
361
- else:
362
- print("⚠️ Không có bảng hợp lệ để extract bằng pdfplumber.")
363
-
364
- except Exception as e:
365
- print("❌ pdfplumber extract failed:", e)
366
-
367
-
368
- # STEP 2️⃣: Route model
369
- if model_choice == EXTERNAL_MODEL_NAME:
370
- return run_process_external(
371
- file_bytes=file_bytes, filename=filename, mime=mime,
372
- question=question, api_url=external_api_url,
373
- temperature=temperature, top_p=top_p
374
  )
375
-
376
- return run_process_internal_base_v2(
377
- file_bytes=file_bytes, filename=filename, mime=mime,
378
- question=question, model_choice=model_choice,
379
- temperature=temperature, top_p=top_p
380
- )
381
-
 
 
382
  except Exception as e:
383
  return f"ERROR: {type(e).__name__}: {str(e)}", None
384
 
@@ -400,7 +434,6 @@ def main():
400
  inputs=[file, question, model_choice, temperature, top_p, external_api_url],
401
  outputs=[output_text, gr.State()]
402
  )
403
-
404
  return demo
405
 
406
  demo = main()
 
1
  from __future__ import annotations
 
 
2
 
3
+ import os, io, re, json, time, mimetypes, tempfile
4
+ from typing import List, Union, Tuple
5
  from PIL import Image
6
  import pandas as pd
7
  import gradio as gr
8
  import google.generativeai as genai
9
+ #import requests
10
  import pdfplumber
11
+ from pdf2image import convert_from_path
12
+ #import pytesseract
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
14
+ import fitz # PyMuPDF
15
+ import multiprocessing
16
 
17
+ num_cpus = multiprocessing.cpu_count()
18
  # ================== CONFIG ==================
19
+ DEFAULT_API_KEY = [
20
+ "AIzaSyD0qjaoOJwrLeOz9Ko8Bi9vRgTy3AefTC8",
21
+ # "AIzaSyAq7Wsi6fR0oWrJQbFkgGNdvxJTn8hWEzQ",
22
+ # "AIzaSyDRWRwwnYJktCULH8d26mzD1Lv4l0CdQws",
23
+ # "AIzaSyDW-x3kTWC7s2NJBOFDU7uC0vhKnREbANw",
24
+ # "AIzaSyAq7Wsi6fR0oWrJQbFkgGNdvxJTn8hWEzQ",
25
+ # "AIzaSyD0qjaoOJwrLeOz9Ko8Bi9vRgTy3AefTC8"
26
+ ]
27
+ key_index = 0
28
 
29
  INTERNAL_MODEL_MAP = {
30
  "Gemini 2.5 Flash": "gemini-2.5-flash",
31
+ "Gemini 2.5 Pro": "gemini-2.5-pro",
32
  }
33
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
34
 
35
+ PROMPT_FREIGHT_HEADER_JSON = """Vui lòng trích xuất tất cả thông tin metadata, tiêu đề (header), và ghi chú bên ngoài bảng giá trong tài liệu.
36
+ Trả lời bằng tiếng Việt, ngắn gọn, rõ ràng và trình bày theo dạng danh sách.
37
+ Đặc biệt, cần xác định và chuẩn hóa ngày hiệu lực (valid from / to) theo văn bản trong tài liệu, tuân thủ chính xác các quy tắc định dạng ngày như sau: DD/MM/YYYY, 01/MM/YYYY, 01/01/YYYY hoặc UFN nếu không có thông tin rõ ràng."""
 
 
38
  PROMPT_FREIGHT_JSON = """
39
+ Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
 
 
 
 
 
40
 
41
  {
42
  "shipping_line": "...",
43
  "shipping_line_code": "...",
44
  "shipping_line_reason": "Why this carrier is chosen?",
45
  "fee_type": "Air Freight",
46
+ "valid_from": ...,
47
+ "valid_to": ...,
48
+ "charges": [
49
+ {
50
+ "frequency": "...",
51
+ "package_type": "...",
52
+ "base_package_type": "...",
53
+ "aircraft_type": "...",
54
+ "direction": "Export or Import or null",
55
+ "origin": "...",
56
+ "destination": "...",
57
+ "charge_name": "...",
58
+ "charge_code": "...",
59
+ "charge_code_reason": "...",
60
+ "cargo_type": "...",
61
+ "currency": "...",
62
+ "transit": "...",
63
+ "transit_time": "...",
64
+ "additional_cost": ...,
65
+ "weight_breaks": {
66
+ "M": ...,
67
+ "N": ...,
68
+ "+45kg": ...,
69
+ "+100kg": ...,
70
+ "+300kg": ...,
71
+ "+500kg": ...,
72
+ "+1000kg": ...,
73
+ "other": { key: value },
74
+ "weight_breaks_reason": "Why chosen weight_breaks?"
75
+ },
76
+ "remark": "..."
77
+ }
78
+ ],
79
+ "local_charges": [
80
+ {
81
+ "charge_name": "...",
82
+ "charge_code": "...",
83
+ "unit": "...",
84
+ "amount": ...,
85
+ "remark": "..."
86
+ }
87
+ ]
88
  }
89
 
90
+ ============================================================
91
+ ### DATE RULES
92
+ ============================================================
93
+
94
+ - **valid_from** format:
95
+ - DD/MM/YYYY (if full date)
96
+ - 01/MM/YYYY (if month + year only)
97
+ - 01/01/YYYY (if year only)
98
+ - UFN if missing
99
+
100
+ - **valid_to**:
101
+ - exact DD/MM/YYYY if present
102
+ - else: UFN
103
+
104
+ ============================================================
105
+ ### STRICT DATA RULES
106
+ ============================================================
107
+
108
+ - ONLY return a single JSON object.
109
+ - All rates must match the weight break columns (M, N, +45kg, etc.).
110
+ - Use `null` if value is missing.
111
+ - "RQ" or similar → set as `"RQST"`.
112
+ - Group destinations with same rate using "/".
113
+ - Use IATA codes for `origin` and `destination`.
114
+ - Ignore flight numbers like "ZH118" for charge_code.
115
+ - Frequency format:
116
+ - D[1-7] (e.g. D1, D2345, D1234567)
117
+ - Local charges: must include if found.
118
+ - Validity fields (`valid_from`, `valid_to`): use rules above.
119
+ - Direction: Export if from Vietnam (SGN, HAN, DAD...), otherwise Import.
120
+ - Provide plain English for `shipping_line_reason` and `charge_code_reason`.
121
+ - Replace commas in remarks with semicolons.
122
+ - RETURN ONLY JSON — no explanations.
123
+
124
+ ============================================================
125
+ ### PACKAGE TYPE & SURCHARGE LOGIC
126
+ ============================================================
127
+
128
+ - Always treat **Carton** as the base rate.
129
+ - Generate derived **Pallet** (or SKID) surcharges if found in remarks/notes.
130
+
131
+ ▶️ Rules:
132
+
133
+ 1️⃣ **SKID shipment surcharge**
134
+ If remark says:
135
+ "SKID shipment: add 10 cents (apply for GEN & PER)"
136
+ → Add surcharge line (+0.10 USD/kg) for Pallet GEN/PER.
137
+
138
+ - Increase all weight breaks by that value.
139
+ - Keep origin, destination, etc. unchanged.
140
+ - Mention derivation in `remark`.
141
+
142
+ 2️⃣ **Regional surcharge**
143
+ E.g.:
144
+ "For SKID shipment: EU +USD0.30/kg and rest +USD0.20/kg (exclude RGN, MAA)"
145
+ → Generate 2 surcharge lines accordingly.
146
+
147
+ 3️⃣ **Carton = Pallet**
148
+ If remark says:
149
+ "Carton = Pallet"
150
+ → Copy Carton rates into Pallet.
151
+ Set `additional_cost` = 0.
152
+
153
+ 4️⃣ **As per remark**
154
+ If remark says:
155
+ "For specific route with package type: as per remark"
156
+ → Parse to determine logic.
157
+
158
+ ============================================================
159
+ ### DERIVED CHARGE GENERATION
160
+ ============================================================
161
+
162
+ - Derived charges must be appended to `"charges"` array.
163
+ - Must include:
164
+ - `"package_type": "Pallet"`
165
+ - `"base_package_type": "Carton"`
166
+ - `"additional_cost"` = numeric surcharge
167
+ - `"remark"` stating derivation
168
+ - Other fields (origin, destination...) must match base record.
169
+ - DO NOT remove the Carton base record.
170
+
171
+ ============================================================
172
+ ### EXAMPLES
173
+ ============================================================
174
+
175
+ Base:
176
  {
177
+ "package_type": "Carton",
178
+ "cargo_type": "GEN",
179
+ "origin": "SGN",
180
+ "destination": "NRT",
181
+ "currency": "USD",
 
 
 
 
 
 
 
 
182
  "weight_breaks": {
183
+ "+45kg": 6.05,
184
+ "+100kg": 5.30,
185
+ "+300kg": 4.80
 
 
 
 
 
 
186
  },
187
+ "remark": "Carton base rate"
188
  }
189
 
190
+ Derived (from SKID remark):
 
191
  {
192
+ "package_type": "Pallet",
193
+ "base_package_type": "Carton",
194
+ "cargo_type": "GEN, PER",
195
+ "currency": "USD",
196
+ "origin": "SGN",
197
+ "destination": "NRT",
198
+ "additional_cost": 0.10,
199
+ "weight_breaks": {
200
+ "+45kg": 6.15,
201
+ "+100kg": 5.40,
202
+ "+300kg": 4.90
203
+ },
204
+ "remark": "Derived from Carton; SKID shipment: add 10 cents (apply for GEN & PER)"
205
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  """
207
 
208
  # ================== HELPERS ==================
def get_next_key():
    """Return the next API key, cycling round-robin through DEFAULT_API_KEY.

    Each call returns the key at ``key_index`` (modulo the number of keys)
    and then advances the module-level ``key_index`` by one.

    Raises:
        RuntimeError: if DEFAULT_API_KEY is empty (previously this surfaced
            as an unexplained ZeroDivisionError from the modulo).
    """
    global key_index
    if not DEFAULT_API_KEY:
        raise RuntimeError("No Gemini API key configured (DEFAULT_API_KEY is empty).")
    key = DEFAULT_API_KEY[key_index % len(DEFAULT_API_KEY)]
    key_index += 1
    return key
215
 
def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Render every page of an in-memory PDF to an RGB PIL image at 200 DPI.

    Args:
        pdf_bytes: raw bytes of the PDF document.

    Returns:
        One PIL image per page, in page order.
    """
    images = []
    # The context manager closes the document once all pages are rendered.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            # Rasterise once per page; the previous one-liner called
            # get_pixmap() three times per page (width, height, samples).
            pix = page.get_pixmap(dpi=200)
            images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    return images
 
 
 
 
 
 
 
 
219
 
220
  def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
221
  if upload is None:
 
231
  raise TypeError(f"Unsupported file object: {type(upload)}")
232
 
233
  def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
234
+ filename = "upload.bin"
235
  if isinstance(file, (str, os.PathLike)):
236
  filename = os.path.basename(str(file))
237
+ elif isinstance(file, dict):
238
+ filename = os.path.basename(file.get("name") or file.get("path", filename))
 
 
 
 
239
  mime, _ = mimetypes.guess_type(filename)
240
+ if not mime and file_bytes[:4] == b"%PDF":
241
+ mime = "application/pdf"
242
+ if not filename.lower().endswith(".pdf"):
243
+ filename += ".pdf"
244
+ return filename, mime or "application/octet-stream"
245
+
def safe_parse_json(text: str):
    """Parse JSON out of a model response, tolerating Markdown code fences.

    The text is stripped of ``` / ```json fences (any letter case) and parsed.
    If that fails, the span from the first ``{`` or ``[`` to the last ``}`` or
    ``]`` is tried, which recovers JSON surrounded by explanatory prose.

    Args:
        text: raw response text; ``None`` or blank input is accepted.

    Returns:
        The decoded JSON value, or ``None`` when nothing parseable is found.
    """
    if not isinstance(text, str) or not text.strip():
        print("❌ Failed to parse JSON: empty response")
        return None

    cleaned = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        first_error = e

    # Fallback: isolate the outermost bracketed span and retry.
    openers = [pos for pos in (cleaned.find("{"), cleaned.find("[")) if pos != -1]
    closer = max(cleaned.rfind("}"), cleaned.rfind("]"))
    if openers and closer > min(openers):
        try:
            return json.loads(cleaned[min(openers):closer + 1])
        except json.JSONDecodeError:
            pass

    print(f"❌ Failed to parse JSON: {first_error}")
    print("📄 Raw text:\n", cleaned[:300])
    return None
def check_pdf_structure(file_bytes: bytes) -> Union[int, str]:
    """Classify a PDF by page count and number of distinct airline headers.

    Every text line of every page is searched (case-insensitively) for a
    header of the form ``...CARGO...RATE EX <3 letters>``; distinct matches
    are collected after upper-casing.

    Returns:
        ``2`` if more than one distinct airline header was found,
        ``1`` if not, but the PDF has more than one page,
        ``0`` otherwise (single page, at most one header),
        ``"khong_xac_dinh"`` if any error occurs while reading the PDF.
        Callers must handle the string error code before comparing
        the result numerically.
    """
    try:
        airline_pattern = re.compile(r"(.*?CARGO.*?RATE\s+EX\s+[A-Z]{3})", re.IGNORECASE)
        airline_headers = set()

        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            total_pages = len(pdf.pages)
            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    continue
                for line in text.splitlines():
                    match = airline_pattern.search(line.strip())
                    if match:
                        airline_headers.add(match.group(1).strip().upper())

        if len(airline_headers) > 1:
            return 2
        if total_pages > 1:
            return 1
        return 0

    except Exception as e:
        print(f"❌ Lỗi phân tích PDF: {e}")
        return "khong_xac_dinh"
290
+
291
+ # ================== PDF CHECK & SPLIT ==================
def split_excel_by_airline_header(excel_path, sheet_name=0):
    """Split a headerless Excel sheet into one DataFrame per airline section.

    A row starts a new section when its non-null cells, joined with spaces,
    match ``...CARGO...RATE EX <3 letters>`` (case-insensitive). This is the
    same header shape check_pdf_structure() looks for; the pattern was
    previously limited to origin ``HAN``.

    Args:
        excel_path: workbook to read.
        sheet_name: sheet to read (passed to ``pd.read_excel``).

    Returns:
        Dict mapping the header line text to the rows from that header up to
        (not including) the next header, re-indexed from 0. Rows before the
        first header are not included. If the same header text occurs more
        than once, its sections are concatenated instead of overwritten.
    """
    df = pd.read_excel(excel_path, header=None, sheet_name=sheet_name)
    pattern = re.compile(r".*CARGO.*RATE\s+EX\s+[A-Z]{3}", re.IGNORECASE)

    starts, names = [], []
    for position, (_, row) in enumerate(df.iterrows()):
        line = " ".join(str(cell) for cell in row if pd.notnull(cell))
        if pattern.match(line):
            starts.append(position)
            names.append(line.strip())

    # Each section runs up to the next header, the last one to the sheet end.
    bounds = starts + [len(df)]

    airline_chunks = {}
    for name, begin, end in zip(names, bounds, bounds[1:]):
        chunk_df = df.iloc[begin:end].reset_index(drop=True)
        if name in airline_chunks:
            chunk_df = pd.concat([airline_chunks[name], chunk_df], ignore_index=True)
        airline_chunks[name] = chunk_df
    return airline_chunks
307
+
def export_pdf_to_excel(pdf_path: str, excel_output_path: str):
    """Extract every table from a PDF and write them, stacked, to one Excel sheet.

    Each extracted table becomes a DataFrame with an extra ``__page__``
    column holding its 1-based page number; all tables are concatenated in
    page order and written without the index.

    The workbook is always written. When no table is found an empty sheet is
    produced, so a caller never reads a missing file or one left over from a
    previous run.

    Args:
        pdf_path: passed straight to ``pdfplumber.open``.
        excel_output_path: destination ``.xlsx`` path.
    """
    frames = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for table in page.extract_tables():
                df = pd.DataFrame(table)
                df["__page__"] = page_num
                frames.append(df)

    final_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    final_df.to_excel(excel_output_path, index=False)
320
+
321
+ # ================== PARALLEL ==================
def send_to_gemini_for_json(
    df_chunk: pd.DataFrame,
    prompt: str,
    header: str,
    model_choice: str = "Gemini 2.5 Flash",
    temperature: float = 0.4,
    top_p: float = 1.0,
) -> Union[dict, list, None]:
    """Send one table chunk to Gemini as CSV text and parse the JSON reply.

    Args:
        df_chunk: table rows for one section; serialised with ``to_csv``.
        prompt: extraction instructions placed first in the request.
        header: header/notes text inserted between the prompt and the table.
        model_choice: display name of the model to use.
        temperature: sampling temperature forwarded to the model.
        top_p: nucleus-sampling value forwarded to the model.
            The three defaults are the values that were previously
            hard-coded in this function.

    Returns:
        Whatever safe_parse_json() produces from the reply: the decoded JSON
        value, or ``None`` when the reply could not be parsed.
    """
    print(f'Begin process {df_chunk}')
    table_text = df_chunk.to_csv(index=False)
    full_prompt = f"{prompt}\n\n Below is header and note {header}\nBelow is the table text (CSV):\n{table_text}\nReturn the JSON."
    # file_bytes=None selects the text-only path of run_process_internal_base_v2.
    result_text, _ = run_process_internal_base_v2(
        file_bytes=None,
        filename=None,
        mime=None,
        question=full_prompt,
        model_choice=model_choice,
        temperature=temperature,
        top_p=top_p,
    )
    return safe_parse_json(result_text)
337
+
def process_all_chunks_with_threadpool(chunks: dict[str, pd.DataFrame], prompt: str, header: str, max_workers: int = 5) -> list[dict]:
    """Run send_to_gemini_for_json() on every chunk concurrently and merge the results.

    Args:
        chunks: mapping of airline label to that airline's table chunk.
        prompt: extraction prompt passed to every call.
        header: header/notes text passed to every call.
        max_workers: size of the thread pool.

    Returns:
        A flat list of results in the iteration order of ``chunks``. A list
        result is spliced in, any other truthy result is appended, and falsy
        results are skipped. A chunk whose call raises is reported on stdout
        and skipped.
    """
    all_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything first so the calls overlap, then collect in
        # submission order. The output no longer depends on which thread
        # happens to finish first.
        submitted = [
            (airline, executor.submit(send_to_gemini_for_json, chunk, prompt, header))
            for airline, chunk in chunks.items()
        ]
        for airline, future in submitted:
            try:
                result = future.result()
            except Exception as e:
                print(f"❌ Error with {airline}: {e}")
                continue
            if result:
                all_results.extend(result if isinstance(result, list) else [result])
    return all_results
354
+
355
+ # ================== GEMINI BASE ==================
356
  def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
357
+ api_key = get_next_key()
 
 
358
  genai.configure(api_key=api_key)
359
  model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
360
+ print(f'Use key: {api_key}')
361
  model = genai.GenerativeModel(model_name=model_name,
362
  generation_config={"temperature": float(temperature), "top_p": float(top_p)})
363
+ if file_bytes is None:
364
+ response = model.generate_content(question)
365
+ #print(response.text)
366
+ return response.text, None
367
+ pages = pdf_to_images(file_bytes)
368
+ all_text_results = []
 
 
 
 
 
 
 
 
 
 
369
  for i in range(0, len(pages), batch_size):
370
  batch = pages[i:i+batch_size]
371
  uploaded = []
 
373
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
374
  im.save(tmp.name)
375
  up = genai.upload_file(path=tmp.name, mime_type="image/png")
376
+ uploaded.append(genai.get_file(up.name))
377
+ resp = model.generate_content([question] + uploaded)
378
+ all_text_results.append(resp.text if hasattr(resp, "text") else "")
 
 
 
 
379
  for up in uploaded:
380
+ try: genai.delete_file(up.name)
381
+ except: pass
 
 
 
382
  return "\n\n".join(all_text_results), None
383
 
384
+ # ================== MAIN ROUTER ==================
 
 
 
 
 
 
 
 
 
 
 
def run_process(file, question, model_choice, temperature, top_p, external_api_url):
    """Entry point for the UI: extract freight rates from a multi-airline PDF.

    Steps for a PDF classified as multi-airline (code 2):
      1. dump all tables to a temporary Excel workbook,
      2. split the workbook into one chunk per airline header,
      3. ask the model for the header/notes text of the document,
      4. convert each chunk to JSON in parallel and merge the results.

    Args:
        file: uploaded file as accepted by _read_file_bytes().
        question: accepted for interface compatibility; not used here.
        model_choice: model display name used for the header extraction.
        temperature: sampling temperature used for the header extraction.
        top_p: nucleus-sampling value used for the header extraction.
        external_api_url: accepted for interface compatibility; not used here.

    Returns:
        ``(text, None)`` where ``text`` is the JSON result, a notice that the
        document type is unsupported, or an ``"ERROR: ..."`` message.
    """
    try:
        if file is None:
            return "ERROR: No file uploaded.", None

        file_bytes = _read_file_bytes(file)
        filename, mime = _guess_name_and_mime(file, file_bytes)

        check_result = check_pdf_structure(file_bytes)
        # check_pdf_structure() returns a string code on failure; comparing
        # that with an int would raise a confusing TypeError.
        if not isinstance(check_result, int):
            return "ERROR: Could not analyze the PDF structure.", None
        if check_result <= 1:
            return "Only supports multi-airline PDF for now", None

        base_name = os.path.splitext(filename)[0]
        # A private temp directory avoids clashes between concurrent requests
        # that upload files with the same name, and is removed afterwards.
        with tempfile.TemporaryDirectory() as tmp_dir:
            excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
            # `filename` is only a basename, not a readable path, so hand the
            # already-loaded bytes to pdfplumber as a file-like object.
            export_pdf_to_excel(io.BytesIO(file_bytes), excel_path)
            chunks = split_excel_by_airline_header(excel_path)

        header, _ = run_process_internal_base_v2(
            file_bytes=file_bytes,
            filename=filename,
            mime=mime,
            question=PROMPT_FREIGHT_HEADER_JSON,
            model_choice=model_choice,
            temperature=temperature,
            top_p=top_p,
        )
        print(header)
        for airline in chunks:
            print(f'airline : {airline}')

        result = process_all_chunks_with_threadpool(chunks, PROMPT_FREIGHT_JSON, header, 5)
        return json.dumps(result, ensure_ascii=False, indent=2), None
    except Exception as e:
        return f"ERROR: {type(e).__name__}: {str(e)}", None
418
 
 
434
  inputs=[file, question, model_choice, temperature, top_p, external_api_url],
435
  outputs=[output_text, gr.State()]
436
  )
 
437
  return demo
438
 
439
  demo = main()