Spaces:

vithacocf
/

air_flow

Sleeping

App Files Community

vithacocf committed on Nov 3

Commit

a76dbb6

verified ·

1 Parent(s): a2b9c10

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -22

app.py CHANGED Viewed

@@ -25,7 +25,77 @@ except AttributeError:
 PROMPT_FREIGHT_JSON = """
 Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
-{ ... }  # (rút gọn lại vì bạn đã có)
 """
 # ================== HELPERS ==================
@@ -177,28 +247,83 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
             print(f"[PDF Check] {filename}: {check_result}")
             if check_result == "có":
                 print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
-                try:
-                    tables_all = []
-                    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-                        for page in pdf.pages:
-                            for tb in page.extract_tables():
-                                if not tb or len(tb) < 2:
-                                    continue
-                                header = tb[0]
-                                df = pd.DataFrame(tb[1:], columns=header)
-                                tables_all.append(df)
-                    if tables_all:
-                        df_all = pd.concat(tables_all, ignore_index=True)
-                        table_text = df_all.to_csv(index=False)
-                        question = (
-                            f"{PROMPT_FREIGHT_JSON}\n"
-                            "Below is the table text extracted from the PDF (CSV format):\n"
-                            f"{table_text}\n\n"
-                            "Please convert this into valid JSON as per the schema."
-                        )
-                except Exception as e:
-                    print("pdfplumber extract failed:", e)
         # STEP 2️⃣: Route model
         if model_choice == EXTERNAL_MODEL_NAME:

 PROMPT_FREIGHT_JSON = """
 Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
+{
+  "shipping_line": "...",
+  "shipping_line_code": "...",
+  "shipping_line_reason": "Why this carrier is chosen?",
+  "fee_type": "Air Freight",
+  "valid_from": ...,
+  "valid_to": ...,
+  "charges": [
+    {
+      "frequency": "...",
+      "package_type": "...",
+      "aircraft_type": "...",
+      "direction": "Export or Import or null",
+      "origin": "...",
+      "destination": "...",
+      "charge_name": "...",
+      "charge_code": "...",
+      "charge_code_reason": "...",
+      "cargo_type": "...",
+      "currency": "...",
+      "transit": "...",
+      "transit_time": "...",
+      "weight_breaks": {
+        "M": ...,
+        "N": ...,
+        "+45kg": ...,
+        "+100kg": ...,
+        "+300kg": ...,
+        "+500kg": ...,
+        "+1000kg": ...,
+        "other": {
+          key: value
+        },
+        "weight_breaks_reason":"Why chosen weight_breaks?"
+      },
+      "remark": "..."
+    }
+  ],
+  "local_charges": [
+    {
+      "charge_name": "...",
+      "charge_code": "...",
+      "unit": "...",
+      "amount": ...,
+      "remark": "..."
+    }
+  ]
+}
+### Date rules
+- valid_from format:
+  - `DD/MM/YYYY` (if full date)
+  - `01/MM/YYYY` (if month+year only)
+  - `01/01/YYYY` (if year only)
+  - `UFN` if missing
+- valid_to:
+  - exact `DD/MM/YYYY` if present
+  - else `UFN`
+STRICT RULES:
+- ONLY return a single JSON object as specified above.
+- All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
+- If the table shows "RQ" or similar, set value as "RQST".
+- Group same-price destinations into one record separated by "/".
+- Always use IATA code for origin and destination.
+- Flight number (e.g. ZH118) is not charge code.
+- Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
+- If local charges exist, list them.
+- If validity missing, set null.
+- Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
+- Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
+- Replace commas in remarks with semicolons.
+- Only return JSON.
 """
 # ================== HELPERS ==================
             print(f"[PDF Check] {filename}: {check_result}")
             if check_result == "có":
+            try:
                 print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
+                all_dfs = []
+                saved_header = None
+                with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+                    for page_idx, page in enumerate(pdf.pages, start=1):
+                        print(f"📄 Đang xử lý trang {page_idx}...")
+                        table = page.extract_table({
+                            "vertical_strategy": "lines",
+                            "horizontal_strategy": "text",
+                            "snap_tolerance": 3,
+                            "intersection_tolerance": 5,
+                        })
+                        if not table or len(table) < 2:
+                            print(f"⚠️ Trang {page_idx}: Không phát hiện bảng hợp lệ.")
+                            continue
+                        header = table[0]
+                        rows = table[1:]
+                        # Lưu header đầu tiên
+                        if saved_header is None:
+                            saved_header = header
+                            print(f"✅ Trang {page_idx}: Lưu header đầu tiên: {saved_header}")
+                        # Nếu trang sau không có header rõ → dùng header cũ
+                        if len(header) < len(saved_header) or "REGION" not in header[0]:
+                            print(f"↩️ Trang {page_idx}: Không có header rõ ràng, dùng lại header trước.")
+                            header = saved_header
+                            rows = table
+                        else:
+                            saved_header = header  # cập nhật header hợp lệ
+                        if len(rows) == 0:
+                            print(f"⚠️ Trang {page_idx}: Không có dữ liệu dưới header.")
+                            continue
+                        try:
+                            df = pd.DataFrame(rows, columns=header)
+                            all_dfs.append(df)
+                            print(f"✅ Trang {page_idx}: {len(df)} dòng được thêm.")
+                        except Exception as e:
+                            print(f"❌ Lỗi tạo DataFrame ở trang {page_idx}: {e}")
+                if all_dfs:
+                    final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all").reset_index(drop=True)
+                    print(f"✅ Tổng cộng {len(final_df)} dòng được trích xuất từ PDF.")
+                    # Xuất ra file tạm (Excel + JSON)
+                    base_name = os.path.splitext(filename)[0]
+                    tmp_dir = tempfile.gettempdir()
+                    json_path = os.path.join(tmp_dir, f"{base_name}.json")
+                    excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
+                    final_df.to_json(json_path, orient="records", force_ascii=False, indent=2)
+                    final_df.to_excel(excel_path, index=False)
+                    print(f"✅ Xuất JSON:  {json_path}")
+                    print(f"✅ Xuất Excel: {excel_path}")
+                    # Convert bảng thành CSV text để Gemini đọc tiếp
+                    table_text = final_df.to_csv(index=False)
+                    question = (
+                        f"{PROMPT_FREIGHT_JSON}\n"
+                        "Below is the table text extracted from the PDF (CSV format):\n"
+                        f"{table_text}\n\n"
+                        "Please convert this into valid JSON as per the schema."
+                    )
+                else:
+                    print("⚠️ Không có bảng hợp lệ để extract bằng pdfplumber.")
+            except Exception as e:
+                print("❌ pdfplumber extract failed:", e)
         # STEP 2️⃣: Route model
         if model_choice == EXTERNAL_MODEL_NAME: