Spaces:

vithacocf
/

air_flow

Sleeping

App Files Files Community

vithacocf committed on Nov 4

Commit

2e92701

verified ·

1 Parent(s): 952d402

Update app.py

Browse files

Files changed (1) hide show

app.py +189 -126

app.py CHANGED Viewed

@@ -1,18 +1,15 @@
 from __future__ import annotations
-import os, io, re, json, tempfile, mimetypes
-from typing import Union, Tuple
 from PIL import Image
 import pandas as pd
 import gradio as gr
 import google.generativeai as genai
 import fitz  # PyMuPDF
 import pdfplumber
-try:
-    import camelot
-except Exception:
-    camelot = None
 # ================== CONFIG ==================
 # SECURITY: a real Google API key was hard-coded on this line and published
 # with the repository. Credentials must never live in source control, so the
 # fallback is now empty: revoke the exposed key and provide GOOGLE_API_KEY
 # through the environment (call sites read os.environ first and only fall
 # back to this constant).
 DEFAULT_API_KEY = ""
@@ -21,27 +18,28 @@ INTERNAL_MODEL_MAP = {
     "Gemini 2.5 Pro": "gemini-2.5-pro",
 }
 EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
 PROMPT_FREIGHT_JSON = """
-Please analyze the freight rate table and convert it into JSON with this schema:
 {
   "shipping_line": "...",
   "shipping_line_code": "...",
   "fee_type": "Air Freight",
-  "valid_from": "...",
-  "valid_to": "...",
   "charges": [
     {
-      "origin": "...",
-      "destination": "...",
       "frequency": "...",
       "package_type": "...",
       "aircraft_type": "...",
-      "direction": "...",
       "charge_name": "...",
-      "charge_code": "GCR, DGR, PER, etc.",
-      "currency": "...",
       "cargo_type": "...",
       "transit": "...",
       "transit_time": "...",
       "weight_breaks": {
@@ -52,27 +50,50 @@ Please analyze the freight rate table and convert it into JSON with this schema:
         "+300kg": ...,
         "+500kg": ...,
         "+1000kg": ...,
-        "other": { key: value }
       },
-      "remark": "...",
-      "pallet_rule": "...",
-      "additional_cost": "..."
     }
   ]
 }
-### RULES
-- If remark says "SKID shipment: add 10 cents" → add surcharge line (+0.10 USD/kg) for Pallet (GEN & PER)
-- Adjust all weight breaks (+0.1) keeping other keys the same.
-- If remark says "Carton = Pallet" → same rates; no extra surcharge.
-- If remark says "EU +USD0.30/kg and rest +USD0.20/kg" → add 2 surcharge lines.
-- Always record Carton rates as base; generate Pallet rates if mentioned.
 - Group same-price destinations into one record separated by "/".
-- Frequency format: D[1-7]; "Daily" = D1234567.
-- Direction = Export if origin is Vietnam, else Import.
 - Replace commas in remarks with semicolons.
-- Only return valid JSON.
 """
 # ================== HELPERS ==================
 def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
     if upload is None:
@@ -94,56 +115,61 @@ def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
         mime = "application/pdf"
     return filename, mime or "application/octet-stream"
-# ================== PDF TABLE EXTRACT ==================
 def extract_pdf_tables(file_path: str) -> pd.DataFrame:
-    """Dùng Camelot trước, fallback pdfplumber nếu lỗi."""
     all_dfs = []
-    try:
-        total_pages = len(fitz.open(file_path))
-        print(f"📄 Tổng số trang: {total_pages}")
-    except:
-        total_pages = 1
-    if camelot is not None:
-        for page_no in range(1, total_pages + 1):
-            print(f"🔍 Đang xử lý trang {page_no}...")
-            dfs_this_page = []
             try:
-                tables = camelot.read_pdf(file_path, flavor="lattice", pages=str(page_no), line_scale=40)
                 if tables and tables.n > 0:
                     for t in tables:
                         if t.shape[0] > 0:
                             dfs_this_page.append(t.df)
-                    print(f"✅ Lattice OK ({tables.n} bảng).")
             except Exception as e:
-                print(f"⚠️ Lattice lỗi: {e}")
-            if not dfs_this_page:
-                try:
-                    tables = camelot.read_pdf(file_path, flavor="stream", pages=str(page_no), edge_tol=200)
-                    if tables and tables.n > 0:
-                        for t in tables:
-                            if t.shape[0] > 0:
-                                dfs_this_page.append(t.df)
-                        print(f"✅ Stream OK ({tables.n} bảng).")
-                except Exception as e:
-                    print(f"❌ Stream lỗi: {e}")
-            if dfs_this_page:
-                all_dfs.extend(dfs_this_page)
-    if not all_dfs:
-        print("⚠️ Camelot không tìm thấy bảng → fallback pdfplumber.")
-        with pdfplumber.open(file_path) as pdf:
-            for page in pdf.pages:
-                tables = page.extract_tables()
-                for tb in tables:
-                    if tb and len(tb) > 2:
-                        df = pd.DataFrame(tb[1:], columns=tb[0])
-                        all_dfs.append(df)
     if not all_dfs:
-        print("🚫 Không phát hiện bảng trong PDF.")
         return pd.DataFrame()
     df_final = pd.concat(all_dfs, ignore_index=True)
@@ -154,123 +180,160 @@ def extract_pdf_tables(file_path: str) -> pd.DataFrame:
     print(f"✅ Tổng hợp: {len(df_final)} dòng, {len(df_final.columns)} cột.")
     return df_final
-# ================== NOTE EXTRACTION ==================
 def extract_pdf_note(file_path: str) -> str:
     try:
         with pdfplumber.open(file_path) as pdf:
-            text = ""
-            for p in pdf.pages[-2:]:  # lấy 2 trang cuối
-                t = (p.extract_text() or "")
-                text += "\n" + t
-            lines = text.strip().splitlines()
-            note_text = "\n".join(lines[-15:])
-            print(f"📝 Note Extracted: {len(note_text)} chars")
             return note_text
     except Exception as e:
         print(f"⚠️ extract_pdf_note lỗi: {e}")
         return ""
-# ================== GEMINI CALL ==================
 def call_gemini_with_prompt(content_text: str, note_text: str, question: str, model_choice: str, temperature: float, top_p: float):
     api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
     genai.configure(api_key=api_key)
     model = genai.GenerativeModel(
         model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
-        generation_config={"temperature": float(temperature), "top_p": float(top_p)}
     )
     base_prompt = question.strip() if question and question.strip() else PROMPT_FREIGHT_JSON
     prompt = f"""
-{base_prompt}
-Below is the extracted CSV data:
-{content_text}
-Below are the notes (remark, package type, surcharges, etc.):
-{note_text}
-Please analyze everything and generate a valid JSON in the specified format.
-"""
-    print("🧠 Sending prompt to Gemini...")
-    resp = model.generate_content(prompt)
-    return getattr(resp, "text", str(resp))
-# ================== MAIN PROCESS ==================
 def run_process(file, question, model_choice, temperature, top_p, external_api_url):
     try:
         if file is None:
             return "❌ No file uploaded.", None
         file_bytes = _read_file_bytes(file)
         filename, mime = _guess_name_and_mime(file, file_bytes)
         print(f"[UPLOAD] {filename} ({mime})")
         if mime == "application/pdf":
             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                 tmp.write(file_bytes)
                 tmp_path = tmp.name
             df = extract_pdf_tables(tmp_path)
             note_text = extract_pdf_note(tmp_path)
             if not df.empty:
-                # 🔹 Nếu phát hiện nhiều carrier
-                carrier_rows = df[df.iloc[:, 0].astype(str).str.contains("CARRIER", case=False, na=False)].index.tolist()
-                results = []
-                if carrier_rows:
-                    for i, start in enumerate(carrier_rows):
-                        end = carrier_rows[i + 1] if i + 1 < len(carrier_rows) else len(df)
-                        sub_df = df.iloc[start:end]
-                        csv_text = sub_df.to_csv(index=False)
-                        print(f"🚀 Processing carrier block {i+1}/{len(carrier_rows)}...")
-                        message = call_gemini_with_prompt(csv_text, note_text, question, model_choice, temperature, top_p)
-                        results.append(message)
-                    return "\n\n".join(results), None
-                else:
-                    csv_text = df.to_csv(index=False)
-                    print("✅ Gửi Gemini để sinh JSON...")
-                    message = call_gemini_with_prompt(csv_text, note_text, question, model_choice, temperature, top_p)
-                    return message, None
             else:
                 print("⚠️ Không có bảng hợp lệ, fallback OCR Gemini.")
                 return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)
-        # fallback nếu không phải PDF
         return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)
     except Exception as e:
         return f"ERROR: {type(e).__name__}: {e}", None
-# ================== FALLBACK OCR ==================
-def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    return [Image.frombytes("RGB", [p.get_pixmap(dpi=200).width, p.get_pixmap(dpi=200).height], p.get_pixmap(dpi=200).samples) for p in doc]
-def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
-    genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY))
-    model = genai.GenerativeModel(INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
-        generation_config={"temperature": float(temperature), "top_p": float(top_p)})
-    pages = pdf_to_images(file_bytes) if file_bytes[:4] == b"%PDF" else [Image.open(io.BytesIO(file_bytes))]
-    all_text_results = []
     for i in range(0, len(pages), batch_size):
         batch = pages[i:i+batch_size]
-        uploads = [genai.upload_file(path=tempfile.NamedTemporaryFile(delete=False, suffix=".png").name) for _ in batch]
-        resp = model.generate_content([question or PROMPT_FREIGHT_JSON] + uploads)
-        all_text_results.append(getattr(resp, "text", str(resp)))
-    return "\n\n".join(all_text_results), None
 # ================== UI ==================
 def main():
-    with gr.Blocks(title="📦 Freight JSON Extractor") as demo:
         file = gr.File(label="Upload PDF/Image")
-        question = gr.Textbox(label="Prompt (optional)", lines=2)
         model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
                                    value="Gemini 2.5 Flash", label="Model")
         temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
         top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
-        output = gr.Code(label="Gemini Output", language="json")
-        btn = gr.Button("🚀 Run Extraction")
-        btn.click(run_process, [file, question, model_choice, temperature, top_p, gr.State()], outputs=[output, gr.State()])
     return demo
 demo = main()
 if __name__ == "__main__":
     demo.launch()

 from __future__ import annotations
+import os, io, re, json, time, mimetypes, tempfile
+from typing import List, Union, Tuple, Any
 from PIL import Image
 import pandas as pd
 import gradio as gr
 import google.generativeai as genai
+import requests
 import fitz  # PyMuPDF
+import camelot
 import pdfplumber
 # ================== CONFIG ==================
 # SECURITY: a real Google API key was hard-coded on this line and published
 # with the repository. Credentials must never live in source control, so the
 # fallback is now empty: revoke the exposed key and provide GOOGLE_API_KEY
 # through the environment (call sites read os.environ first and only fall
 # back to this constant).
 DEFAULT_API_KEY = ""
     "Gemini 2.5 Pro": "gemini-2.5-pro",
 }
 EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
 PROMPT_FREIGHT_JSON = """
+Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
 {
   "shipping_line": "...",
   "shipping_line_code": "...",
+  "shipping_line_reason": "Why this carrier is chosen?",
   "fee_type": "Air Freight",
+  "valid_from": ...,
+  "valid_to": ...,
   "charges": [
     {
       "frequency": "...",
       "package_type": "...",
       "aircraft_type": "...",
+      "direction": "Export or Import or null",
+      "origin": "...",
+      "destination": "...",
       "charge_name": "...",
+      "charge_code": "charge_code": "GCR, DGR, PER, etc. (Use IATA Code DO NOT use flight number)",
+      "charge_code_reason": "...",
       "cargo_type": "...",
+      "currency": "...",
       "transit": "...",
       "transit_time": "...",
       "weight_breaks": {
         "+300kg": ...,
         "+500kg": ...,
         "+1000kg": ...,
+        "other": {
+          key: value
+        },
+        "weight_breaks_reason":"Why chosen weight_breaks?"
       },
+      "remark": "..."
+    }
+  ],
+  "local_charges": [
+    {
+      "charge_name": "...",
+      "charge_code": "...",
+      "unit": "...",
+      "amount": ...,
+      "remark": "..."
     }
   ]
 }
+### Date rules
+- valid_from format:
+  - `DD/MM/YYYY` (if full date)
+  - `01/MM/YYYY` (if month+year only)
+  - `01/01/YYYY` (if year only)
+  - `UFN` if missing
+- valid_to:
+  - exact `DD/MM/YYYY` if present
+  - else `UFN`
+STRICT RULES:
+- ONLY return a single JSON object as specified above.
+- All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
+- If the table shows "RQ" or similar, set value as "RQST".
 - Group same-price destinations into one record separated by "/".
+- Always use IATA code for origin and destination.
+- Flight number (e.g. ZH118) is not charge code.
+- Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
+- If local charges exist, list them.
+- If validity missing, set null.
+- Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
+- Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
 - Replace commas in remarks with semicolons.
+- Only return JSON.
 """
 # ================== HELPERS ==================
 def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
     if upload is None:
         mime = "application/pdf"
     return filename, mime or "application/octet-stream"
 def extract_pdf_tables(file_path: str) -> pd.DataFrame:
+    """
+    Extract bảng PDF bằng Camelot (từng trang):
+      - Thử lattice
+      - Nếu thất bại → fallback stream
+      - Gộp tất cả
+    """
+    import camelot
     all_dfs = []
+    # Đếm tổng số trang
+    import fitz
+    total_pages = len(fitz.open(file_path))
+    print(f"📄 Tổng số trang: {total_pages}")
+    for page_no in range(1, total_pages + 1):
+        print(f"🔍 Đang xử lý trang {page_no}...")
+        dfs_this_page = []
+        # --- Thử lattice ---
+        try:
+            tables = camelot.read_pdf(
+                file_path, flavor="lattice",
+                pages=str(page_no), strip_text="\n", line_scale=40
+            )
+            if tables and tables.n > 0:
+                for t in tables:
+                    if t.shape[0] > 0:
+                        dfs_this_page.append(t.df)
+                print(f"✅ Trang {page_no}: Lattice thành công ({tables.n} bảng).")
+        except Exception as e:
+            print(f"⚠️ Trang {page_no} lattice lỗi: {e}")
+        # --- Fallback stream ---
+        if not dfs_this_page:
             try:
+                tables = camelot.read_pdf(
+                    file_path, flavor="stream",
+                    pages=str(page_no), edge_tol=200, row_tol=10
+                )
                 if tables and tables.n > 0:
                     for t in tables:
                         if t.shape[0] > 0:
                             dfs_this_page.append(t.df)
+                    print(f"✅ Trang {page_no}: Stream thành công ({tables.n} bảng).")
             except Exception as e:
+                print(f"❌ Trang {page_no} stream lỗi: {e}")
+        if dfs_this_page:
+            all_dfs.extend(dfs_this_page)
+        else:
+            print(f"🚫 Trang {page_no}: Không phát hiện bảng.")
     if not all_dfs:
+        print("❌ Không tìm thấy bảng trong toàn bộ PDF.")
         return pd.DataFrame()
     df_final = pd.concat(all_dfs, ignore_index=True)
     print(f"✅ Tổng hợp: {len(df_final)} dòng, {len(df_final.columns)} cột.")
     return df_final
 def extract_pdf_note(file_path: str, max_lines: int = 12) -> str:
     """Return the trailing text lines of the last page of a PDF.

     The tail of the last page is used as the document's free-text notes
     (remarks and similar) that accompany the extracted tables.

     Args:
         file_path: Path of the PDF on disk.
         max_lines: How many lines to keep, counted from the end of the last
             page. Defaults to 12, the value that used to be hard-coded.

     Returns:
         The selected lines joined with newlines, or "" when the PDF has no
         pages, the page has no extractable text, ``max_lines`` is not
         positive, or reading the file fails for any reason.
     """
     try:
         with pdfplumber.open(file_path) as pdf:
             # Explicit guards: an empty document would otherwise surface as an
             # IndexError, and a slice of [-0:] would return *every* line.
             if not pdf.pages or max_lines <= 0:
                 return ""
             text = (pdf.pages[-1].extract_text() or "").strip()
             note_text = "\n".join(text.splitlines()[-max_lines:])
             print(f"📝 Extracted note text thành công.{note_text}")
             return note_text
     except Exception as e:
         # Best effort: notes are optional context, so a failure here must not
         # abort the whole extraction run.
         print(f"⚠️ extract_pdf_note lỗi: {e}")
         return ""
 def call_gemini_with_prompt(content_text: str, note_text: str, question: str, model_choice: str, temperature: float, top_p: float):
     """Send the extracted table and notes to Gemini and return the reply text.

     Args:
         content_text: Table data serialised as CSV text.
         note_text: Free-text notes extracted from the PDF.
         question: Optional custom prompt; when blank, PROMPT_FREIGHT_JSON is
             used instead.
         model_choice: Display name looked up in INTERNAL_MODEL_MAP; unknown
             names fall back to "gemini-2.5-flash".
         temperature: Sampling temperature passed to the model.
         top_p: Nucleus-sampling value passed to the model.

     Returns:
         The response text, or ``str(response)`` when the response exposes no
         usable text.

     Raises:
         ValueError: If no API key is available from GOOGLE_API_KEY or
             DEFAULT_API_KEY.
     """
     api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
     if not api_key:
         # Same precondition the OCR fallback path checks; fail with a clear
         # message instead of an opaque authentication error later on.
         raise ValueError("Missing GOOGLE_API_KEY.")
     genai.configure(api_key=api_key)
     model = genai.GenerativeModel(
         model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
         generation_config={
             "temperature": float(temperature),
             "top_p": float(top_p)
         }
     )
     # A non-blank user prompt takes precedence over the default freight prompt.
     base_prompt = question.strip() if question and question.strip() else PROMPT_FREIGHT_JSON
     prompt = f"""
                 {base_prompt}
                 Below is the extracted CSV data:
                 {content_text}
                 Below are the notes extracted from the PDF (e.g. Valid From, Origin, Remark, Package Type rules):
                 {note_text}
                 Please analyze all data and generate the JSON output following the schema above.
                 """
     print("🧠 Sending prompt to Gemini...")
     response = model.generate_content(prompt)
     # `getattr(response, "text", default)` only guards AttributeError, but the
     # `.text` accessor raises ValueError when the response carries no text
     # part (e.g. a blocked generation), so handle both explicitly.
     try:
         return response.text
     except (AttributeError, ValueError):
         return str(response)
# ================== MAIN ROUTER ==================
def run_process(file, question, model_choice, temperature, top_p, external_api_url):
    """Route an uploaded file to table extraction or to the OCR fallback.

    PDFs are written to a temporary file, their tables and trailing notes are
    extracted, and the result is sent to Gemini as CSV text. PDFs without any
    detected table, and every non-PDF upload, go through
    run_process_internal_base_v2 instead.

    Args:
        file: The uploaded file object handed over by the UI (or None).
        question: Optional custom prompt.
        model_choice: Model display name selected in the UI.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling value.
        external_api_url: Accepted for interface compatibility; not used here.

    Returns:
        A ``(text, None)`` tuple. Failures are reported in ``text`` as
        ``"ERROR: <ExceptionType>: <message>"`` rather than raised.
    """
    tmp_path = None
    try:
        if file is None:
            return "❌ No file uploaded.", None

        file_bytes = _read_file_bytes(file)
        filename, mime = _guess_name_and_mime(file, file_bytes)
        print(f"[UPLOAD] {filename} ({mime})")

        if mime == "application/pdf":
            # Camelot and pdfplumber read from a path, so spill the bytes to disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                # Record the path before writing so a failed write is still cleaned up.
                tmp_path = tmp.name
                tmp.write(file_bytes)

            df = extract_pdf_tables(tmp_path)
            note_text = extract_pdf_note(tmp_path)

            if not df.empty:
                csv_text = df.to_csv(index=False)
                print("✅ Gửi Gemini để sinh JSON...")
                message = call_gemini_with_prompt(csv_text, note_text, question, model_choice, temperature, top_p)
                return message, None

            print("⚠️ Không có bảng hợp lệ, fallback OCR Gemini.")

        # Table-less PDFs and all other file types: OCR the content directly.
        return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)
    except Exception as e:
        return f"ERROR: {type(e).__name__}: {e}", None
    finally:
        # The temp file is created with delete=False, so it has to be removed
        # explicitly; previously one PDF was left behind per request.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
def _pdf_to_images(pdf_bytes, dpi=200):
    """Render every page of an in-memory PDF to an RGB PIL image."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)  # rasterise each page exactly once
            images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return images
    finally:
        doc.close()


def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
    """OCR fallback: send page images to Gemini in batches and join the replies.

    Args:
        file_bytes: Raw bytes of the upload. Content starting with ``%PDF`` is
            rendered page by page; anything else is opened as a single image.
        filename: Accepted for interface compatibility; not used here.
        mime: Accepted for interface compatibility; not used here.
        question: Optional custom prompt; when blank, PROMPT_FREIGHT_JSON is
            used instead.
        model_choice: Display name looked up in INTERNAL_MODEL_MAP; unknown
            names fall back to "gemini-2.5-flash".
        temperature: Sampling temperature passed to the model.
        top_p: Nucleus-sampling value passed to the model.
        batch_size: Number of page images sent per request (minimum 1).

    Returns:
        A ``(text, None)`` tuple where ``text`` is the per-batch responses
        joined by blank lines, or an error message when no API key is set.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    if not api_key:
        return "ERROR: Missing GOOGLE_API_KEY.", None
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
        generation_config={"temperature": float(temperature), "top_p": float(top_p)},
    )

    # The module-level PDF renderer this function used to call no longer
    # exists in the file, so rendering is done by the private helper above.
    if file_bytes[:4] == b"%PDF":
        pages = _pdf_to_images(file_bytes)
    else:
        pages = [Image.open(io.BytesIO(file_bytes))]

    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
    batch_size = max(1, batch_size)  # a step of 0 would make range() raise
    all_text_results = []

    for start in range(0, len(pages), batch_size):
        uploaded = []
        try:
            for im in pages[start:start + batch_size]:
                # Create, close, then write: saving into a still-open
                # NamedTemporaryFile fails on some platforms.
                fd, png_path = tempfile.mkstemp(suffix=".png")
                os.close(fd)
                try:
                    im.save(png_path, format="PNG")
                    up = genai.upload_file(path=png_path, mime_type="image/png")
                    uploaded.append(up)  # track first so it is always deleted
                    uploaded[-1] = genai.get_file(up.name)
                finally:
                    # The local copy is not needed once the upload attempt ends.
                    try:
                        os.unlink(png_path)
                    except OSError:
                        pass

            resp = model.generate_content([user_prompt] + uploaded)
            try:
                text = resp.text
            except Exception:
                # Best effort: a response without usable text contributes an
                # empty entry instead of aborting the remaining batches.
                text = ""
            all_text_results.append(text)
        finally:
            # Remove remote uploads even when generation fails.
            for up in uploaded:
                try:
                    genai.delete_file(up.name)
                except Exception as e:
                    print(f"⚠️ delete_file failed for {up.name}: {e}")

    return "\n\n".join(all_text_results), None
 # ================== UI ==================
 def main():
     """Assemble the Gradio interface and bind the run button to run_process.

     Returns:
         The constructed, not yet launched, ``gr.Blocks`` application.
     """
     model_options = list(INTERNAL_MODEL_MAP) + [EXTERNAL_MODEL_NAME]

     with gr.Blocks(title="OCR Multi-Agent System") as demo:
         # Input widgets, declared in display order.
         file = gr.File(label="Upload PDF/Image")
         question = gr.Textbox(label="Prompt", lines=2)
         model_choice = gr.Dropdown(
             choices=model_options,
             value="Gemini 2.5 Flash",
             label="Model",
         )
         temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.2, step=0.05)
         top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.01)
         external_api_url = gr.Textbox(label="External API URL", visible=False)

         # Output widget and trigger.
         output_text = gr.Code(label="Output", language="json")
         run_btn = gr.Button("🚀 Process")

         # run_process returns two values; the second one lands in a State
         # component that nothing displays.
         handler_inputs = [file, question, model_choice, temperature, top_p, external_api_url]
         handler_outputs = [output_text, gr.State()]
         run_btn.click(run_process, inputs=handler_inputs, outputs=handler_outputs)

     return demo
 # Build the UI once at import time so `demo` exists as a module-level name.
 demo = main()
 # Start the server only when this file is executed directly, not on import.
 if __name__ == "__main__":
     demo.launch()