vithacocf committed on
Commit
a76dbb6
·
verified ·
1 Parent(s): a2b9c10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -22
app.py CHANGED
@@ -25,7 +25,77 @@ except AttributeError:
25
 
26
  PROMPT_FREIGHT_JSON = """
27
  Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
28
- { ... } # (rút gọn lại vì bạn đã có)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  """
30
 
31
  # ================== HELPERS ==================
@@ -177,28 +247,83 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
177
  print(f"[PDF Check] {filename}: {check_result}")
178
 
179
  if check_result == "có":
 
180
  print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
181
- try:
182
- tables_all = []
183
- with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
184
- for page in pdf.pages:
185
- for tb in page.extract_tables():
186
- if not tb or len(tb) < 2:
187
- continue
188
- header = tb[0]
189
- df = pd.DataFrame(tb[1:], columns=header)
190
- tables_all.append(df)
191
- if tables_all:
192
- df_all = pd.concat(tables_all, ignore_index=True)
193
- table_text = df_all.to_csv(index=False)
194
- question = (
195
- f"{PROMPT_FREIGHT_JSON}\n"
196
- "Below is the table text extracted from the PDF (CSV format):\n"
197
- f"{table_text}\n\n"
198
- "Please convert this into valid JSON as per the schema."
199
- )
200
- except Exception as e:
201
- print("pdfplumber extract failed:", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  # STEP 2️⃣: Route model
204
  if model_choice == EXTERNAL_MODEL_NAME:
 
25
 
26
  PROMPT_FREIGHT_JSON = """
27
  Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
28
+ {
29
+ "shipping_line": "...",
30
+ "shipping_line_code": "...",
31
+ "shipping_line_reason": "Why this carrier is chosen?",
32
+ "fee_type": "Air Freight",
33
+ "valid_from": ...,
34
+ "valid_to": ...,
35
+ "charges": [
36
+ {
37
+ "frequency": "...",
38
+ "package_type": "...",
39
+ "aircraft_type": "...",
40
+ "direction": "Export or Import or null",
41
+ "origin": "...",
42
+ "destination": "...",
43
+ "charge_name": "...",
44
+ "charge_code": "...",
45
+ "charge_code_reason": "...",
46
+ "cargo_type": "...",
47
+ "currency": "...",
48
+ "transit": "...",
49
+ "transit_time": "...",
50
+ "weight_breaks": {
51
+ "M": ...,
52
+ "N": ...,
53
+ "+45kg": ...,
54
+ "+100kg": ...,
55
+ "+300kg": ...,
56
+ "+500kg": ...,
57
+ "+1000kg": ...,
58
+ "other": {
59
+ key: value
60
+ },
61
+ "weight_breaks_reason":"Why chosen weight_breaks?"
62
+ },
63
+ "remark": "..."
64
+ }
65
+ ],
66
+ "local_charges": [
67
+ {
68
+ "charge_name": "...",
69
+ "charge_code": "...",
70
+ "unit": "...",
71
+ "amount": ...,
72
+ "remark": "..."
73
+ }
74
+ ]
75
+ }
76
+ ### Date rules
77
+ - valid_from format:
78
+ - `DD/MM/YYYY` (if full date)
79
+ - `01/MM/YYYY` (if month+year only)
80
+ - `01/01/YYYY` (if year only)
81
+ - `UFN` if missing
82
+ - valid_to:
83
+ - exact `DD/MM/YYYY` if present
84
+ - else `UFN`
85
+ STRICT RULES:
86
+ - ONLY return a single JSON object as specified above.
87
+ - All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
88
+ - If the table shows "RQ" or similar, set value as "RQST".
89
+ - Group same-price destinations into one record separated by "/".
90
+ - Always use IATA code for origin and destination.
91
+ - Flight number (e.g. ZH118) is not charge code.
92
+ - Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
93
+ - If local charges exist, list them.
94
+ - If validity missing, set null.
95
+ - Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
96
+ - Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
97
+ - Replace commas in remarks with semicolons.
98
+ - Only return JSON.
99
  """
100
 
101
  # ================== HELPERS ==================
 
247
  print(f"[PDF Check] {filename}: {check_result}")
248
 
249
  if check_result == "có":
250
+ try:
251
  print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
252
+ all_dfs = []
253
+ saved_header = None
254
+
255
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
256
+ for page_idx, page in enumerate(pdf.pages, start=1):
257
+ print(f"📄 Đang xử trang {page_idx}...")
258
+
259
+ table = page.extract_table({
260
+ "vertical_strategy": "lines",
261
+ "horizontal_strategy": "text",
262
+ "snap_tolerance": 3,
263
+ "intersection_tolerance": 5,
264
+ })
265
+
266
+ if not table or len(table) < 2:
267
+ print(f"⚠️ Trang {page_idx}: Không phát hiện bảng hợp lệ.")
268
+ continue
269
+
270
+ header = table[0]
271
+ rows = table[1:]
272
+
273
+ # Lưu header đầu tiên
274
+ if saved_header is None:
275
+ saved_header = header
276
+ print(f"✅ Trang {page_idx}: Lưu header đầu tiên: {saved_header}")
277
+
278
+ # Nếu trang sau không có header rõ → dùng header cũ
279
+ if len(header) < len(saved_header) or "REGION" not in header[0]:
280
+ print(f"↩️ Trang {page_idx}: Không có header rõ ràng, dùng lại header trước.")
281
+ header = saved_header
282
+ rows = table
283
+ else:
284
+ saved_header = header # cập nhật header hợp lệ
285
+
286
+ if len(rows) == 0:
287
+ print(f"⚠️ Trang {page_idx}: Không có dữ liệu dưới header.")
288
+ continue
289
+
290
+ try:
291
+ df = pd.DataFrame(rows, columns=header)
292
+ all_dfs.append(df)
293
+ print(f"✅ Trang {page_idx}: {len(df)} dòng được thêm.")
294
+ except Exception as e:
295
+ print(f"❌ Lỗi tạo DataFrame ở trang {page_idx}: {e}")
296
+
297
+ if all_dfs:
298
+ final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all").reset_index(drop=True)
299
+ print(f"✅ Tổng cộng {len(final_df)} dòng được trích xuất từ PDF.")
300
+
301
+ # Xuất ra file tạm (Excel + JSON)
302
+ base_name = os.path.splitext(filename)[0]
303
+ tmp_dir = tempfile.gettempdir()
304
+ json_path = os.path.join(tmp_dir, f"{base_name}.json")
305
+ excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
306
+
307
+ final_df.to_json(json_path, orient="records", force_ascii=False, indent=2)
308
+ final_df.to_excel(excel_path, index=False)
309
+
310
+ print(f"✅ Xuất JSON: {json_path}")
311
+ print(f"✅ Xuất Excel: {excel_path}")
312
+
313
+ # Convert bảng thành CSV text để Gemini đọc tiếp
314
+ table_text = final_df.to_csv(index=False)
315
+ question = (
316
+ f"{PROMPT_FREIGHT_JSON}\n"
317
+ "Below is the table text extracted from the PDF (CSV format):\n"
318
+ f"{table_text}\n\n"
319
+ "Please convert this into valid JSON as per the schema."
320
+ )
321
+ else:
322
+ print("⚠️ Không có bảng hợp lệ để extract bằng pdfplumber.")
323
+
324
+ except Exception as e:
325
+ print("❌ pdfplumber extract failed:", e)
326
+
327
 
328
  # STEP 2️⃣: Route model
329
  if model_choice == EXTERNAL_MODEL_NAME: