Spaces:

vithacocf
/

api_gemini

Paused

App Files Files Community

vithacocf committed about 1 month ago

Commit

ece3c79

verified ·

1 Parent(s): 0fb6325

Update app.py

Browse files

Files changed (1) hide show

app.py +334 -301

app.py CHANGED Viewed

@@ -1,156 +1,221 @@
 from __future__ import annotations
-import os, io, re, json, time, mimetypes, tempfile, string
-from typing import List, Union, Tuple, Any, Iterable
 from PIL import Image
 import pandas as pd
 import gradio as gr
 import google.generativeai as genai
-import requests
 import pdfplumber
 # ================== CONFIG ==================
-DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"
 INTERNAL_MODEL_MAP = {
     "Gemini 2.5 Flash": "gemini-2.5-flash",
-    "Gemini 2.5 Pro":   "gemini-2.5-pro",
 }
 EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
-try:
-    RESAMPLE = Image.Resampling.LANCZOS
-except AttributeError:
-    RESAMPLE = Image.LANCZOS
 PROMPT_FREIGHT_JSON = """
-You are an expert in air freight rate extraction and normalization.
-The document contains rate information for multiple airlines.
-Please analyze all content (tables, headers, notes) and return **a list of JSON objects**, each representing a separate airline.
-Each airline should follow this schema:
 {
   "shipping_line": "...",
   "shipping_line_code": "...",
   "shipping_line_reason": "Why this carrier is chosen?",
   "fee_type": "Air Freight",
-  "valid_from": "...",
-  "valid_to": "...",
-  "charges": [ ... ],             # List of charge objects (see below)
-  "local_charges": [ ... ]        # Optional local charges if available
 }
-Each `charges` object must follow this schema:
 {
-  "frequency": "...",
-  "package_type": "...",                  # e.g. Carton, Pallet, Skid
-  "aircraft_type": "...",
-  "direction": "Export / Import / null",
-  "origin": "...",
-  "destination": "...",
-  "charge_name": "...",
-  "charge_code": "GCR / PER / DGR / etc.",
-  "charge_code_reason": "...",
-  "cargo_type": "...",
-  "currency": "...",
-  "transit": "...",
-  "transit_time": "...",
   "weight_breaks": {
-    "M": ...,
-    "N": ...,
-    "+45kg": ...,
-    "+100kg": ...,
-    "+300kg": ...,
-    "+500kg": ...,
-    "+1000kg": ...,
-    "other": { key: value },
-    "weight_breaks_reason": "Why chosen weight_breaks?"
   },
-  "remark": "..."
 }
-Each `local_charges` object:
 {
-  "charge_name": "...",
-  "charge_code": "...",
-  "unit": "...",
-  "amount": ...,
-  "remark": "..."
 }
----
-### ✈️ Airline Separation Logic:
-- If multiple airlines are detected in the document, separate each section and return a distinct JSON object per airline.
-- Infer `shipping_line` and `shipping_line_code` from the header (e.g. "AIR CHINA CARGO (CA)" → name = "AIR CHINA CARGO", code = "CA").
-- Each JSON object must include only data relevant to that airline.
----
-### 💡 Date rules:
-- valid_from:
-  - `DD/MM/YYYY` if exact
-  - `01/MM/YYYY` if only month/year
-  - `01/01/YYYY` if only year
-  - `UFN` if missing
-- valid_to:
-  - exact `DD/MM/YYYY` if present
-  - else `UFN`
----
-### 📦 Package and Surcharge Logic:
-Apply these when the remark or note indicates such rules:
-1. **Default case**: If no package mentioned → `"Carton"` is the default.
-2. **“Carton = Pallet”**: Duplicate rates with `package_type="Pallet"`.
-3. **“SKID shipment: add 10 cents (GEN & PER)”**: Add new charges with `+0.10 USD/kg` for GEN/PER, with `package_type="Pallet"` or `"Skid"`.
-4. **EU vs Non-EU surcharges**: If different pallet surcharges by region → split charges accordingly.
-5. **“All-in” or “inclusive of MY and SC”**: Record `FSC` and `WSC` as `local_charges` with `"NIL"` amount.
-6. **Flight number is not a charge code**. Always use standard cargo code (GCR, PER, etc.).
----
-### ⚙️ Other Business Rules:
-- RQ / Request → "RQST"
-- Combine same-rate destinations using `/`
-- Always use **IATA code** for origin/destination
-- Direction = Export if origin is in Vietnam (SGN, HAN, DAD), else Import
-- Frequency:
-  - D[1-7] = day of week
-  - "Daily" = D1234567
-- Remarks: Replace `,` with `;`
-- Add meaningful `"shipping_line_reason"` and `"charge_code_reason"`
----
-### 🚨 STRICT OUTPUT:
-- Return **a JSON array**, where each item is a full airline object
-- Do NOT return markdown or explanation
-- All fields must be valid
-- All numbers = numeric types
-- Use `null` if value missing
 """
 # ================== HELPERS ==================
-import fitz  # PyMuPDF
 def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    pages = []
-    for p in doc:
-        pix = p.get_pixmap(dpi=200)
-        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        pages.append(img)
-    return pages
def ensure_rgb(im: Image.Image) -> Image.Image:
    """Return *im* itself when its mode is already RGB, else an RGB conversion."""
    if im.mode == "RGB":
        return im
    return im.convert("RGB")
 def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
     if upload is None:
@@ -166,73 +231,141 @@ def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> by
     raise TypeError(f"Unsupported file object: {type(upload)}")
 def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
     if isinstance(file, (str, os.PathLike)):
         filename = os.path.basename(str(file))
-    elif isinstance(file, dict) and "name" in file:
-        filename = os.path.basename(file["name"])
-    elif isinstance(file, dict) and "path" in file:
-        filename = os.path.basename(file["path"])
-    else:
-        filename = "upload.bin"
     mime, _ = mimetypes.guess_type(filename)
-    if not mime:
-        if len(file_bytes) >= 4 and file_bytes[:4] == b"%PDF":
-            mime = "application/pdf"
-            if not filename.lower().endswith(".pdf"):
-                filename += ".pdf"
-        else:
-            mime = "image/png"
-    return filename, mime
-# ================== PDF CHECK STEP ==================
 def check_pdf_structure(file_bytes: bytes) -> str:
-    """Kiểm tra nhanh file PDF có phải bảng nhiều cột, nhiều trang không."""
     try:
         with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-            if len(pdf.pages) <= 2:
-                return "không"
-            table_pages = 0
-            for page in pdf.pages[:3]:
-                tables = page.find_tables()
-                if tables and len(tables) > 0:
-                    table_pages += 1
-            if table_pages >= 1:
-                return "có"
-            text = "\n".join([(p.extract_text() or "") for p in pdf.pages[:2]])
-            num_tokens = sum(ch.isdigit() for ch in text)
-            line_count = len(text.splitlines())
-            if num_tokens > 100 and line_count > 20:
-                return "có"
-        return "không"
-    except Exception as e:
-        print("PDF check error:", e)
-        return "không"
-# ================== OCR CORE (Gemini) ==================
 def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
-    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
-    if not api_key:
-        return "ERROR: Missing GOOGLE_API_KEY.", None
     genai.configure(api_key=api_key)
     model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
     model = genai.GenerativeModel(model_name=model_name,
                                   generation_config={"temperature": float(temperature), "top_p": float(top_p)})
-    if file_bytes[:4] == b"%PDF":
-        pages = pdf_to_images(file_bytes)
-    else:
-        pages = [Image.open(io.BytesIO(file_bytes))]
-    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
-    all_json_results, all_text_results = [], []
-    previous_header_json = None
-    def _safe_text(resp):
-        try:
-            return resp.text
-        except:
-            return ""
     for i in range(0, len(pages), batch_size):
         batch = pages[i:i+batch_size]
         uploaded = []
@@ -240,145 +373,46 @@ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_cho
             with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                 im.save(tmp.name)
                 up = genai.upload_file(path=tmp.name, mime_type="image/png")
-                up = genai.get_file(up.name)
-                uploaded.append(up)
-        context_prompt = user_prompt
-        resp = model.generate_content([context_prompt] + uploaded)
-        text = _safe_text(resp)
-        all_text_results.append(text)
         for up in uploaded:
-            try:
-                genai.delete_file(up.name)
-            except:
-                pass
     return "\n\n".join(all_text_results), None
-# ================== EXTERNAL API (nếu có) ==================
def run_process_external(file_bytes, filename, mime, question, api_url, temperature, top_p):
    """POST the file and prompt to an external OCR endpoint.

    Returns ``(text, None)``: the response body on success, or an ``ERROR:``
    message when no endpoint is given or the endpoint answers with HTTP >= 400.
    """
    if not api_url:
        return "ERROR: Missing external API endpoint.", None
    form_fields = {
        "prompt": question or "",
        "temperature": str(temperature),
        "top_p": str(top_p),
    }
    attachment = {"file": (filename, file_bytes, mime)}
    response = requests.post(api_url, files=attachment, data=form_fields, timeout=60)
    if response.status_code < 400:
        return response.text, None
    return f"ERROR: External API HTTP {response.status_code}: {response.text[:200]}", None
-# ================== MAIN ROUTER (đã thêm STEP CHECK) ==================
 def run_process(file, question, model_choice, temperature, top_p, external_api_url):
-    """
-    Router (có bước kiểm tra PDF/table trước khi xử lý):
-      - Nếu PDF nhiều trang/nhiều bảng -> extract trước (pdfplumber)
-      - Ngược lại -> OCR trực tiếp Gemini
-    """
     try:
         if file is None:
             return "ERROR: No file uploaded.", None
         file_bytes = _read_file_bytes(file)
         filename, mime = _guess_name_and_mime(file, file_bytes)
-        # STEP 1️⃣: Check PDF structure
-        if mime == "application/pdf" or file_bytes[:4] == b"%PDF":
-            check_result = check_pdf_structure(file_bytes)
-            print(f"[PDF Check] {filename}: {check_result}")
-            if check_result == "có" and 1==2: # bỏ qua if này test thử prompt nhiều hãng
-                try:
-                    print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
-                    all_dfs = []
-                    saved_header = None
-                    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-                        for page_idx, page in enumerate(pdf.pages, start=1):
-                            print(f"📄 Đang xử lý trang {page_idx}...")
-                            table = page.extract_table({
-                                "vertical_strategy": "lines",
-                                "horizontal_strategy": "text",
-                                "snap_tolerance": 3,
-                                "intersection_tolerance": 5,
-                            })
-                            if not table or len(table) < 2:
-                                print(f"⚠️ Trang {page_idx}: Không phát hiện bảng hợp lệ.")
-                                continue
-                            header = table[0]
-                            rows = table[1:]
-                            # Lưu header đầu tiên
-                            if saved_header is None:
-                                saved_header = header
-                                print(f"✅ Trang {page_idx}: Lưu header đầu tiên: {saved_header}")
-                            # Nếu trang sau không có header rõ → dùng header cũ
-                            if len(header) < len(saved_header) or "REGION" not in header[0]:
-                                print(f"↩️ Trang {page_idx}: Không có header rõ ràng, dùng lại header trước.")
-                                header = saved_header
-                                rows = table
-                            else:
-                                saved_header = header  # cập nhật header hợp lệ
-                            if len(rows) == 0:
-                                print(f"⚠️ Trang {page_idx}: Không có dữ liệu dưới header.")
-                                continue
-                            try:
-                                df = pd.DataFrame(rows, columns=header)
-                                all_dfs.append(df)
-                                print(f"✅ Trang {page_idx}: {len(df)} dòng được thêm.")
-                            except Exception as e:
-                                print(f"❌ Lỗi tạo DataFrame ở trang {page_idx}: {e}")
-                    if all_dfs:
-                        final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all").reset_index(drop=True)
-                        print(f"✅ Tổng cộng {len(final_df)} dòng được trích xuất từ PDF.")
-                        # Xuất ra file tạm (Excel + JSON)
-                        base_name = os.path.splitext(filename)[0]
-                        tmp_dir = tempfile.gettempdir()
-                        # json_path = os.path.join(tmp_dir, f"{base_name}.json")
-                        # excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
-                        # final_df.to_json(json_path, orient="records", force_ascii=False, indent=2)
-                        # final_df.to_excel(excel_path, index=False)
-                        # print(f"✅ Xuất JSON:  {json_path}")
-                        # print(f"✅ Xuất Excel: {excel_path}")
-                        # Convert bảng thành CSV text để Gemini đọc tiếp
-                        table_text = final_df.to_csv(index=False)
-                        print(f"✅ Đang Gen text từ file CSV")
-                        question = (
-                            f"{PROMPT_FREIGHT_JSON}\n"
-                            "Below is the table text extracted from the PDF (CSV format):\n"
-                            f"{table_text}\n\n"
-                            "Please convert this into valid JSON as per the schema."
-                        )
-                    else:
-                        print("⚠️ Không có bảng hợp lệ để extract bằng pdfplumber.")
-                except Exception as e:
-                    print("❌ pdfplumber extract failed:", e)
-        # STEP 2️⃣: Route model
-        if model_choice == EXTERNAL_MODEL_NAME:
-            return run_process_external(
-                file_bytes=file_bytes, filename=filename, mime=mime,
-                question=question, api_url=external_api_url,
-                temperature=temperature, top_p=top_p
             )
-        return run_process_internal_base_v2(
-            file_bytes=file_bytes, filename=filename, mime=mime,
-            question=question, model_choice=model_choice,
-            temperature=temperature, top_p=top_p
-        )
     except Exception as e:
         return f"ERROR: {type(e).__name__}: {str(e)}", None
@@ -400,7 +434,6 @@ def main():
             inputs=[file, question, model_choice, temperature, top_p, external_api_url],
             outputs=[output_text, gr.State()]
         )
     return demo
 demo = main()

 from __future__ import annotations
+import os, io, re, json, time, mimetypes, tempfile
+from typing import List, Union, Tuple
 from PIL import Image
 import pandas as pd
 import gradio as gr
 import google.generativeai as genai
+#import requests
 import pdfplumber
+from pdf2image import convert_from_path
+#import pytesseract
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import fitz  # PyMuPDF
+import multiprocessing
+num_cpus = multiprocessing.cpu_count()
 # ================== CONFIG ==================
# SECURITY: API keys do not belong in source control. Keys are taken from the
# GOOGLE_API_KEYS environment variable (comma-separated) or, failing that,
# from GOOGLE_API_KEY. The literal below is kept only so existing deployments
# keep working; it has been published and should be revoked and removed.
_ENV_API_KEYS = [
    k.strip()
    for k in (os.environ.get("GOOGLE_API_KEYS") or os.environ.get("GOOGLE_API_KEY", "")).split(",")
    if k.strip()
]
DEFAULT_API_KEY = _ENV_API_KEYS or [
    "AIzaSyD0qjaoOJwrLeOz9Ko8Bi9vRgTy3AefTC8",
]
# Position of the next key to hand out from DEFAULT_API_KEY.
key_index = 0
 INTERNAL_MODEL_MAP = {
     "Gemini 2.5 Flash": "gemini-2.5-flash",
+    "Gemini 2.5 Pro": "gemini-2.5-pro",
 }
 EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
+PROMPT_FREIGHT_HEADER_JSON = """Vui lòng trích xuất tất cả thông tin metadata, tiêu đề (header), và ghi chú bên ngoài bảng giá trong tài liệu.
+Trả lời bằng tiếng Việt, ngắn gọn, rõ ràng và trình bày theo dạng danh sách.
+Đặc biệt, cần xác định và chuẩn hóa ngày hiệu lực (valid from / to) theo văn bản trong tài liệu, tuân thủ chính xác các quy tắc định dạng ngày như sau: DD/MM/YYYY, 01/MM/YYYY, 01/01/YYYY hoặc UFN nếu không có thông tin rõ ràng."""
 PROMPT_FREIGHT_JSON = """
+Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
 {
   "shipping_line": "...",
   "shipping_line_code": "...",
   "shipping_line_reason": "Why this carrier is chosen?",
   "fee_type": "Air Freight",
+  "valid_from": ...,
+  "valid_to": ...,
+  "charges": [
+    {
+      "frequency": "...",
+      "package_type": "...",
+      "base_package_type": "...",
+      "aircraft_type": "...",
+      "direction": "Export or Import or null",
+      "origin": "...",
+      "destination": "...",
+      "charge_name": "...",
+      "charge_code": "...",
+      "charge_code_reason": "...",
+      "cargo_type": "...",
+      "currency": "...",
+      "transit": "...",
+      "transit_time": "...",
+      "additional_cost": ...,
+      "weight_breaks": {
+        "M": ...,
+        "N": ...,
+        "+45kg": ...,
+        "+100kg": ...,
+        "+300kg": ...,
+        "+500kg": ...,
+        "+1000kg": ...,
+        "other": { key: value },
+        "weight_breaks_reason": "Why chosen weight_breaks?"
+      },
+      "remark": "..."
+    }
+  ],
+  "local_charges": [
+    {
+      "charge_name": "...",
+      "charge_code": "...",
+      "unit": "...",
+      "amount": ...,
+      "remark": "..."
+    }
+  ]
 }
+============================================================
+### DATE RULES
+============================================================
+- **valid_from** format:
+  - DD/MM/YYYY (if full date)
+  - 01/MM/YYYY (if month + year only)
+  - 01/01/YYYY (if year only)
+  - UFN if missing
+- **valid_to**:
+  - exact DD/MM/YYYY if present
+  - else: UFN
+============================================================
+### STRICT DATA RULES
+============================================================
+- ONLY return a single JSON object.
+- All rates must match the weight break columns (M, N, +45kg, etc.).
+- Use `null` if value is missing.
+- "RQ" or similar → set as `"RQST"`.
+- Group destinations with same rate using "/".
+- Use IATA codes for `origin` and `destination`.
+- Ignore flight numbers like "ZH118" for charge_code.
+- Frequency format:
+  - D[1-7] (e.g. D1, D2345, D1234567)
+- Local charges: must include if found.
+- Validity fields (`valid_from`, `valid_to`): use rules above.
+- Direction: Export if from Vietnam (SGN, HAN, DAD...), otherwise Import.
+- Provide plain English for `shipping_line_reason` and `charge_code_reason`.
+- Replace commas in remarks with semicolons.
+- RETURN ONLY JSON — no explanations.
+============================================================
+### PACKAGE TYPE & SURCHARGE LOGIC
+============================================================
+- Always treat **Carton** as the base rate.
+- Generate derived **Pallet** (or SKID) surcharges if found in remarks/notes.
+▶️ Rules:
+1️⃣ **SKID shipment surcharge**
+If remark says:
+"SKID shipment: add 10 cents (apply for GEN & PER)"
+→ Add surcharge line (+0.10 USD/kg) for Pallet GEN/PER.
+- Increase all weight breaks by that value.
+- Keep origin, destination, etc. unchanged.
+- Mention derivation in `remark`.
+2️⃣ **Regional surcharge**
+E.g.:
+"For SKID shipment: EU +USD0.30/kg and rest +USD0.20/kg (exclude RGN, MAA)"
+→ Generate 2 surcharge lines accordingly.
+3️⃣ **Carton = Pallet**
+If remark says:
+"Carton = Pallet"
+→ Copy Carton rates into Pallet.
+Set `additional_cost` = 0.
+4️⃣ **As per remark**
+If remark says:
+"For specific route with package type: as per remark"
+→ Parse to determine logic.
+============================================================
+### DERIVED CHARGE GENERATION
+============================================================
+- Derived charges must be appended to `"charges"` array.
+- Must include:
+  - `"package_type": "Pallet"`
+  - `"base_package_type": "Carton"`
+  - `"additional_cost"` = numeric surcharge
+  - `"remark"` stating derivation
+- Other fields (origin, destination...) must match base record.
+- DO NOT remove the Carton base record.
+============================================================
+### EXAMPLES
+============================================================
+Base:
 {
+  "package_type": "Carton",
+  "cargo_type": "GEN",
+  "origin": "SGN",
+  "destination": "NRT",
+  "currency": "USD",
   "weight_breaks": {
+    "+45kg": 6.05,
+    "+100kg": 5.30,
+    "+300kg": 4.80
   },
+  "remark": "Carton base rate"
 }
+Derived (from SKID remark):
 {
+  "package_type": "Pallet",
+  "base_package_type": "Carton",
+  "cargo_type": "GEN, PER",
+  "currency": "USD",
+  "origin": "SGN",
+  "destination": "NRT",
+  "additional_cost": 0.10,
+  "weight_breaks": {
+    "+45kg": 6.15,
+    "+100kg": 5.40,
+    "+300kg": 4.90
+  },
+  "remark": "Derived from Carton; SKID shipment: add 10 cents (apply for GEN & PER)"
 }
 """
 # ================== HELPERS ==================
def get_next_key():
    """Return the next API key from ``DEFAULT_API_KEY`` in round-robin order.

    Advances the module-level ``key_index`` on every call.

    Raises:
        RuntimeError: If no API key is configured (previously this surfaced
            as an opaque ``ZeroDivisionError`` from the modulo).
    """
    global key_index
    if not DEFAULT_API_KEY:
        raise RuntimeError("No Gemini API key configured.")
    key = DEFAULT_API_KEY[key_index % len(DEFAULT_API_KEY)]
    key_index += 1
    return key
 def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    return [Image.frombytes("RGB", [p.get_pixmap(dpi=200).width, p.get_pixmap(dpi=200).height], p.get_pixmap(dpi=200).samples) for p in doc]
 def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
     if upload is None:
     raise TypeError(f"Unsupported file object: {type(upload)}")
 def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
+    filename = "upload.bin"
     if isinstance(file, (str, os.PathLike)):
         filename = os.path.basename(str(file))
+    elif isinstance(file, dict):
+        filename = os.path.basename(file.get("name") or file.get("path", filename))
     mime, _ = mimetypes.guess_type(filename)
+    if not mime and file_bytes[:4] == b"%PDF":
+        mime = "application/pdf"
+        if not filename.lower().endswith(".pdf"):
+            filename += ".pdf"
+    return filename, mime or "application/octet-stream"
def safe_parse_json(text: str):
    """Parse model output as JSON, tolerating a surrounding Markdown fence.

    Args:
        text: Raw model output, optionally wrapped in a ``` or ```json fence.

    Returns:
        The decoded JSON value, or ``None`` when *text* is empty or cannot be
        parsed (the parse error and the start of the text are printed).
    """
    if not text:
        return None
    # Strip only a leading/trailing fence so that backticks inside JSON string
    # values are left intact.
    unfenced = re.sub(
        r"\A```(?:json)?\s*|\s*```\Z", "", text.strip(), flags=re.IGNORECASE
    ).strip()
    # Fall back to blanket fence removal for output with fences elsewhere.
    legacy = re.sub(r"```json|```", "", text).strip()
    last_error = None
    for candidate in dict.fromkeys((unfenced, legacy)):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            last_error = e
    print(f"❌ Failed to parse JSON: {last_error}")
    print("📄 Raw text:\n", legacy[:300])
    return None
 def check_pdf_structure(file_bytes: bytes) -> str:
+    """
+    Phân tích PDF xem thuộc loại:
+    - 0: "1_trang_1_hang"
+    - 1: "nhieu_trang_1_hang"
+    - 2: "nhieu_hang"
+    - "khong_xac_dinh": nếu có lỗi
+    """
     try:
+        airline_pattern = re.compile(r"(.*?CARGO.*?RATE\s+EX\s+[A-Z]{3})", re.IGNORECASE)
+        airline_headers = set()
         with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+            for page in pdf.pages:
+                text = page.extract_text()
+                if not text:
+                    continue
+                for line in text.splitlines():
+                    match = airline_pattern.search(line.strip())
+                    if match:
+                        airline_name = match.group(1).strip().upper()
+                        airline_headers.add(airline_name)
+            total_pages = len(pdf.pages)
+        if len(airline_headers) > 1:
+            return 2
+        elif total_pages > 1:
+            return 1
+        else:
+            return 0
+    except Exception as e:
+        print(f"❌ Lỗi phân tích PDF: {e}")
+        return "khong_xac_dinh"
+# ================== PDF CHECK & SPLIT ==================
def split_excel_by_airline_header(excel_path, sheet_name=0):
    """Split a header-less Excel sheet into one DataFrame per airline section.

    A row starts a new section when its non-null cells, joined by spaces,
    match ``...CARGO...RATE EX <3 letters>`` (case-insensitive). The origin is
    no longer restricted to ``HAN`` so that it agrees with the header pattern
    used by ``check_pdf_structure``.

    Args:
        excel_path: Path of the workbook to read.
        sheet_name: Sheet to read, passed through to ``pandas.read_excel``.

    Returns:
        A dict mapping each header line to the rows from that header up to
        (not including) the next header, re-indexed from 0. Rows before the
        first header are dropped; a repeated header line replaces the earlier
        section with the same text.
    """
    df = pd.read_excel(excel_path, header=None, sheet_name=sheet_name)
    pattern = re.compile(r".*CARGO.*RATE\s+EX\s+[A-Z]{3}", re.IGNORECASE)
    start_indices, airline_names = [], []
    for i, row in df.iterrows():
        line = " ".join(str(cell) for cell in row if pd.notnull(cell))
        if pattern.match(line):
            start_indices.append(i)
            airline_names.append(line.strip())
    # Each section ends where the next one starts; the last ends with the sheet.
    bounds = start_indices + [len(df)]
    return {
        name: df.iloc[start:stop].reset_index(drop=True)
        for name, start, stop in zip(airline_names, bounds, bounds[1:])
    }
def export_pdf_to_excel(pdf_path: str, excel_output_path: str):
    """Dump every table pdfplumber finds in a PDF into a single Excel sheet.

    Each extracted table becomes a block of rows tagged with a ``__page__``
    column holding its 1-based page number. Nothing is written when the PDF
    contains no tables.
    """
    frames = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, 1):
            for raw_table in page.extract_tables():
                frame = pd.DataFrame(raw_table)
                frame["__page__"] = page_no
                frames.append(frame)
    if not frames:
        return
    combined = pd.concat(frames, ignore_index=True)
    combined.to_excel(excel_output_path, index=False)
+# ================== PARALLEL ==================
def send_to_gemini_for_json(df_chunk: pd.DataFrame, prompt: str, header: str) -> dict:
    """Send one table chunk to Gemini as CSV text and parse the JSON reply.

    The request is text-only (no file) and always uses "Gemini 2.5 Flash" with
    temperature 0.4 and top_p 1.0. Returns whatever ``safe_parse_json`` yields
    for the model's reply.
    """
    print(f'Begin process {df_chunk}')
    table_text = df_chunk.to_csv(index=False)
    full_prompt = f"{prompt}\n\n Below is header and note {header}\nBelow is the table text (CSV):\n{table_text}\nReturn the JSON."
    request = {
        "file_bytes": None,
        "filename": None,
        "mime": None,
        "question": full_prompt,
        "model_choice": "Gemini 2.5 Flash",
        "temperature": 0.4,
        "top_p": 1.0,
    }
    reply_text, _unused = run_process_internal_base_v2(**request)
    return safe_parse_json(reply_text)
def process_all_chunks_with_threadpool(chunks: dict[str, pd.DataFrame], prompt: str, header: str, max_workers: int = 5) -> list[dict]:
    """Convert every airline chunk to JSON concurrently.

    Args:
        chunks: Mapping of airline header text to that airline's table rows.
        prompt: Extraction prompt sent with every chunk.
        header: Header/notes text sent with every chunk.
        max_workers: Maximum number of concurrent worker threads.

    Returns:
        The parsed results flattened into one list (a list result contributes
        its items, any other truthy result contributes itself), ordered like
        *chunks* so the output does not depend on thread timing. Chunks that
        fail or produce nothing are skipped; failures are printed.
    """
    if not chunks:
        return []
    all_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        submitted = [
            (airline, executor.submit(send_to_gemini_for_json, chunk, prompt, header))
            for airline, chunk in chunks.items()
        ]
        # Collect in submission order; the work itself still runs concurrently.
        for airline, future in submitted:
            try:
                result = future.result()
            except Exception as e:
                print(f"❌ Error with {airline}: {e}")
                continue
            if result:
                all_results.extend(result if isinstance(result, list) else [result])
    return all_results
+# ================== GEMINI BASE ==================
 def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
+    api_key = get_next_key()
     genai.configure(api_key=api_key)
     model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
+    print(f'Use key: {api_key}')
     model = genai.GenerativeModel(model_name=model_name,
                                   generation_config={"temperature": float(temperature), "top_p": float(top_p)})
+    if file_bytes is None:
+        response = model.generate_content(question)
+        #print(response.text)
+        return response.text, None
+    pages = pdf_to_images(file_bytes)
+    all_text_results = []
     for i in range(0, len(pages), batch_size):
         batch = pages[i:i+batch_size]
         uploaded = []
             with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                 im.save(tmp.name)
                 up = genai.upload_file(path=tmp.name, mime_type="image/png")
+                uploaded.append(genai.get_file(up.name))
+        resp = model.generate_content([question] + uploaded)
+        all_text_results.append(resp.text if hasattr(resp, "text") else "")
         for up in uploaded:
+            try: genai.delete_file(up.name)
+            except: pass
     return "\n\n".join(all_text_results), None
+# ================== MAIN ROUTER ==================
 def run_process(file, question, model_choice, temperature, top_p, external_api_url):
     try:
         if file is None:
             return "ERROR: No file uploaded.", None
         file_bytes = _read_file_bytes(file)
         filename, mime = _guess_name_and_mime(file, file_bytes)
+        check_result = check_pdf_structure(file_bytes)
+        if check_result > 1:
+            base_name = os.path.splitext(filename)[0]
+            tmp_dir = tempfile.gettempdir()
+            excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
+            export_pdf_to_excel(filename, excel_path)
+            chunks = split_excel_by_airline_header(excel_path)
+            header, _ = run_process_internal_base_v2(
+                file_bytes=file_bytes,
+                filename=filename,
+                mime=mime,
+                question=PROMPT_FREIGHT_HEADER_JSON,
+                model_choice=model_choice,
+                temperature=temperature,
+                top_p=top_p
             )
+            print(header)
+            chunk_files = []
+            for airline, df_chunk in chunks.items():
+                safe_name = re.sub(r"[^\w\s]", "", airline).replace(" ", "_")
+                print (f'airline : {airline}')
+            result = process_all_chunks_with_threadpool(chunks, PROMPT_FREIGHT_JSON, header, 5)
+            return json.dumps(result, ensure_ascii=False, indent=2), None
+        else:
+            return "Only supports multi-airline PDF for now", None
     except Exception as e:
         return f"ERROR: {type(e).__name__}: {str(e)}", None
             inputs=[file, question, model_choice, temperature, top_p, external_api_url],
             outputs=[output_text, gr.State()]
         )
     return demo
 demo = main()