Spaces:

vithacocf
/

air_flow

Sleeping

App Files Files Community

vithacocf committed on Nov 4

Commit

bf0f7cb

verified ·

1 Parent(s): b7af253

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -99

app.py CHANGED Viewed

@@ -1,23 +1,10 @@
-from __future__ import annotations
-import os, io, re, json, time, mimetypes, tempfile
-from typing import List, Union, Tuple, Any
-from PIL import Image
-import pandas as pd
-import gradio as gr
-import google.generativeai as genai
-import requests
-import pdfplumber
-import fitz  # PyMuPDF
-# ================== CONFIG ==================
 DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"
 INTERNAL_MODEL_MAP = {
     "Gemini 2.5 Flash": "gemini-2.5-flash",
-    "Gemini 2.5 Pro":   "gemini-2.5-pro",
 }
-EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
 PROMPT_FREIGHT_JSON = """
 Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
 {
@@ -93,99 +80,132 @@ STRICT RULES:
 - Only return JSON.
 """
-# ================== HELPERS ==================
 def _read_file_bytes(upload):
     if isinstance(upload, str):
-        with open(upload, "rb") as f:
-            return f.read()
     elif hasattr(upload, "read"):
         return upload.read()
     raise TypeError("Unsupported file input")
-def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
     filename = os.path.basename(file.name if hasattr(file, "name") else str(file))
     mime, _ = mimetypes.guess_type(filename)
-    if not mime and file_bytes[:4] == b"%PDF":
-        mime = "application/pdf"
     return filename, mime or "application/octet-stream"
-def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    return [Image.frombytes("RGB", [p.get_pixmap(dpi=200).width, p.get_pixmap(dpi=200).height], p.get_pixmap(dpi=200).samples) for p in doc]
-# ================== PDF CHECK ==================
 def check_pdf_structure(file_bytes: bytes) -> bool:
-    """Trả về True nếu PDF có nhiều trang và dạng bảng."""
     try:
         with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-            if len(pdf.pages) <= 2:
-                return False
             for page in pdf.pages[:3]:
-                tables = page.find_tables()
-                if tables:
-                    return True
         return False
     except Exception as e:
-        print("PDF check error:", e)
-        return False
-# ================== GEMINI CALL ==================
-def call_gemini_with_prompt(content_text: str, question: str, model_choice: str, temperature: float, top_p: float):
     api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
     genai.configure(api_key=api_key)
     model = genai.GenerativeModel(
         model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
         generation_config={"temperature": temperature, "top_p": top_p}
     )
-    prompt = f"{PROMPT_FREIGHT_JSON}\n{question or ''}\n\nBelow is the extracted CSV data:\n{content_text}"
-    response = model.generate_content(prompt)
-    return getattr(response, "text", str(response))
-# ================== MAIN LOGIC ==================
 def run_process(file, question, model_choice, temperature, top_p, external_api_url):
     try:
         if file is None:
             return "❌ No file uploaded.", None
         file_bytes = _read_file_bytes(file)
         filename, mime = _guess_name_and_mime(file, file_bytes)
         print(f"[UPLOAD] {filename} ({mime})")
-        # 1️⃣ Nếu là PDF và có nhiều trang dạng bảng
         if mime == "application/pdf" and check_pdf_structure(file_bytes):
-            print("➡️ PDF nhiều trang & dạng bảng → trích xuất CSV trước khi gọi Gemini.")
-            all_dfs, saved_header = [], None
-            with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-                for idx, page in enumerate(pdf.pages, start=1):
-                    table = page.extract_table({
-                        "vertical_strategy": "lines",
-                        "horizontal_strategy": "text",
-                        "snap_tolerance": 3,
-                        "intersection_tolerance": 5,
-                    })
-                    if not table or len(table) < 2:
-                        continue
-                    header, rows = table[0], table[1:]
-                    if saved_header is None:
-                        saved_header = header
-                    elif len(header) < len(saved_header):
-                        header = saved_header
-                    try:
-                        df = pd.DataFrame(rows, columns=header)
-                        all_dfs.append(df)
-                    except Exception as e:
-                        print(f"⚠️ Trang {idx} lỗi DataFrame: {e}")
-            if all_dfs:
-                final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all")
-                csv_text = final_df.to_csv(index=False)
-                print(f"✅ Trích xuất {len(final_df)} dòng, gửi Gemini xử lý JSON.")
-                message = call_gemini_with_prompt(csv_text, question, model_choice, temperature, top_p)
                 return message, None
             else:
-                print("⚠️ Không có bảng hợp lệ, fallback qua OCR bình thường.")
-        # 2️⃣ Các loại file còn lại → xử lý như cũ
         api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
         genai.configure(api_key=api_key)
         model = genai.GenerativeModel(
@@ -193,34 +213,9 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
             generation_config={"temperature": temperature, "top_p": top_p}
         )
         uploaded = genai.upload_file(path=file.name)
-        resp = model.generate_content([question or PROMPT_FREIGHT_JSON, uploaded])
         genai.delete_file(uploaded.name)
         return getattr(resp, "text", str(resp)), None
     except Exception as e:
         return f"ERROR: {type(e).__name__}: {e}", None
-# ================== UI ==================
-def main():
-    with gr.Blocks(title="OCR + Table Extraction for Gemini") as demo:
-        file = gr.File(label="📂 Upload PDF / Image / CSV")
-        question = gr.Textbox(label="Prompt", lines=2)
-        model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
-                                   value="Gemini 2.5 Flash", label="Model")
-        temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
-        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
-        external_api_url = gr.Textbox(label="External API URL", visible=False)
-        output_text = gr.Code(label="Gemini Output", language="json")
-        run_btn = gr.Button("🚀 Run Extraction")
-        run_btn.click(
-            run_process,
-            inputs=[file, question, model_choice, temperature, top_p, external_api_url],
-            outputs=[output_text, gr.State()]
-        )
-    return demo
-demo = main()
-if __name__ == "__main__":
-    demo.launch()

+import os, io, tempfile, mimetypes, camelot, pdfplumber, pandas as pd, google.generativeai as genai
+import re
 DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"
+# NOTE(review): DEFAULT_API_KEY is a credential hard-coded in source. It is used as the
+# fallback whenever the GOOGLE_API_KEY environment variable is not set. It looks like a
+# real key - confirm, revoke it, and supply the key through the environment only.
+# Maps the model label received as `model_choice` to the Gemini model identifier.
 INTERNAL_MODEL_MAP = {
     "Gemini 2.5 Flash": "gemini-2.5-flash",
+    "Gemini 2.5 Pro": "gemini-2.5-pro",
 }
 PROMPT_FREIGHT_JSON = """
 Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
 {
 - Only return JSON.
 """
+# ========== Helpers ==========
 def _read_file_bytes(upload):
+    """Return the raw content of an uploaded file.
+
+    Args:
+        upload: Either a filesystem path (``str``) or an object exposing ``read()``.
+
+    Returns:
+        The file content read in binary mode for a path, or whatever
+        ``upload.read()`` returns for a file-like object.
+
+    Raises:
+        TypeError: If ``upload`` is neither a ``str`` nor has a ``read`` attribute.
+    """
     if isinstance(upload, str):
+        with open(upload, "rb") as f: return f.read()
     elif hasattr(upload, "read"):
         return upload.read()
     raise TypeError("Unsupported file input")
+def _guess_name_and_mime(file, file_bytes):
+    """Derive a display filename and a MIME type for an upload.
+
+    Args:
+        file: The upload; its ``name`` attribute is used when present,
+            otherwise ``str(file)`` is treated as the path.
+        file_bytes: The file content, used only to sniff the ``%PDF`` signature.
+
+    Returns:
+        A ``(filename, mime)`` tuple. ``filename`` is the basename of the path.
+        ``mime`` is guessed from the filename; when that fails, content starting
+        with ``%PDF`` is reported as ``application/pdf`` and anything else as
+        ``application/octet-stream``.
+    """
     filename = os.path.basename(file.name if hasattr(file, "name") else str(file))
     mime, _ = mimetypes.guess_type(filename)
+    if not mime and file_bytes[:4] == b"%PDF": mime = "application/pdf"
     return filename, mime or "application/octet-stream"
 def check_pdf_structure(file_bytes: bytes) -> bool:
+    """Decide whether a PDF should go through the table-extraction path.
+
+    Args:
+        file_bytes: The PDF content.
+
+    Returns:
+        True only when the PDF has more than two pages and ``find_tables()``
+        reports at least one table on one of the first three pages. Any
+        exception raised while inspecting the PDF is printed and yields False.
+    """
     try:
         with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+            # PDFs with one or two pages are never routed to table extraction.
+            if len(pdf.pages) <= 2: return False
+            # Only the first three pages are probed for tables.
             for page in pdf.pages[:3]:
+                if page.find_tables(): return True
         return False
     except Exception as e:
+        print("PDF check error:", e); return False
+# ========== 1️⃣ Extract tables with Camelot ==========
+def extract_pdf_tables(file_path: str) -> pd.DataFrame:
+    """Extract every table Camelot finds in a PDF and merge them into one DataFrame.
+
+    Lattice mode is tried first; stream mode is tried only when lattice
+    produced no tables (including when it raised). The per-table frames are
+    concatenated, blank rows are removed, and - when the columns still carry
+    numeric labels - the first remaining row is promoted to the header and any
+    later rows identical to it (e.g. a header repeated on each page) are dropped.
+
+    Args:
+        file_path: Path of the PDF file handed to ``camelot.read_pdf``.
+
+    Returns:
+        The merged table with a fresh 0..n-1 index, or an empty DataFrame when
+        no table (or no non-blank row) was extracted.
+    """
+    # (flavor, start message, success label, failure label) - messages kept per flavor.
+    attempts = (
+        ("lattice", "🔍 Try lattice mode...", "✅ Lattice", "⚠️ Lattice failed"),
+        ("stream", "🔁 Try stream mode...", "✅ Stream", "❌ Stream failed"),
+    )
+    all_dfs = []
+    for flavor, start_msg, ok_label, fail_label in attempts:
+        try:
+            print(start_msg)
+            tables = camelot.read_pdf(file_path, flavor=flavor, pages="all")
+            if tables.n > 0:
+                all_dfs.extend(t.df for t in tables)
+                print(f"{ok_label}: {tables.n} tables.")
+        except Exception as e:
+            # Best effort: a failing flavor must not prevent the next attempt.
+            print(f"{fail_label}: {e}")
+        if all_dfs:
+            break
+    if not all_dfs:
+        print("🚫 No table detected.")
+        return pd.DataFrame()
+    df_final = pd.concat(all_dfs, ignore_index=True)
+    # concat pads missing columns with NaN, while text-less cells can come through
+    # as empty strings; treat both as blank so that blank rows are really removed
+    # (dropna(how="all") alone only catches all-NaN rows).
+    as_text = df_final.fillna("").astype(str)
+    blank_rows = as_text.apply(lambda col: col.str.strip().eq("")).all(axis=1)
+    df_final = df_final.loc[~blank_rows]
+    as_text = as_text.loc[~blank_rows]
+    if df_final.empty:
+        # Nothing left to promote to a header; avoids IndexError on iloc[0].
+        print("🚫 No table detected.")
+        return pd.DataFrame()
+    if all(str(c).isdigit() for c in df_final.columns):
+        print("🧠 Detected numeric headers (0,1,2..), using first row as real header.")
+        header = as_text.iloc[0]
+        # Drop the header row itself plus exact repeats of it further down.
+        repeated_header = as_text.eq(header, axis=1).all(axis=1)
+        df_final = df_final.loc[~repeated_header]
+        df_final.columns = header.tolist()
+    df_final = df_final.reset_index(drop=True)
+    print(f"✅ Total: {len(df_final)} rows × {len(df_final.columns)} columns.")
+    return df_final
+# ========== 2️⃣ Extract the Note / Header section ==========
+def extract_pdf_note(file_bytes: bytes) -> str:
+    """Collect the note/header lines printed at the top of the first PDF page.
+
+    Only the first 15 text lines of page one are inspected, to avoid pulling in
+    the table below. Lines matching one of the keywords (Start Date, Origin,
+    Expiry, ...) are stripped and joined with single spaces.
+
+    Args:
+        file_bytes: The PDF content.
+
+    Returns:
+        The joined note text, or ``""`` when nothing matched or extraction failed
+        (the failure is printed).
+    """
+    # NOTE(review): the pattern is case-insensitive and has no word boundaries, so
+    # "MY" / "SC" also match inside longer words - confirm that this is intended.
+    keyword_re = re.compile(r"(Start Date|Origin|Expiry|Product|MY|SC|All rates|Currency)", re.I)
+    try:
+        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+            page_text = pdf.pages[0].extract_text() or ""
+        head_lines = page_text.splitlines()[:15]
+        matched = [candidate.strip() for candidate in head_lines if keyword_re.search(candidate)]
+        return " ".join(matched).strip()
+    except Exception as e:
+        print(f"⚠️ Note extraction failed: {e}")
+        return ""
+# ========== 3️⃣ Call Gemini ==========
+def call_gemini_with_prompt(csv_text: str, note_text: str, model_choice: str, temperature: float, top_p: float):
+    """Send the extracted table and notes to Gemini and return its reply.
+
+    Args:
+        csv_text: The extracted table serialized as CSV; embedded verbatim in the prompt.
+        note_text: Header/note text; ``[No notes detected]`` is substituted when empty.
+        model_choice: Key into ``INTERNAL_MODEL_MAP``; unknown keys fall back to
+            ``gemini-2.5-flash``.
+        temperature: Passed through as the ``temperature`` generation setting.
+        top_p: Passed through as the ``top_p`` generation setting.
+
+    Returns:
+        The response's ``text`` attribute when present, otherwise ``str(response)``.
+    """
+    # The environment variable takes precedence over the built-in default key.
     api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
     genai.configure(api_key=api_key)
     model = genai.GenerativeModel(
         model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
         generation_config={"temperature": temperature, "top_p": top_p}
     )
+    prompt = f"""{PROMPT_FREIGHT_JSON}
+Below is the extracted freight rate table (CSV) and additional notes:
+Notes:
+{note_text or '[No notes detected]'}
+CSV:
+{csv_text}
+→ Convert to valid JSON as per schema above.
+"""
+    resp = model.generate_content(prompt)
+    return getattr(resp, "text", str(resp))
+# ========== 4️⃣ Main process ==========
 def run_process(file, question, model_choice, temperature, top_p, external_api_url):
+    """Convert an uploaded freight-rate file to JSON text via Gemini.
+
+    PDFs that pass ``check_pdf_structure`` are first run through
+    ``extract_pdf_tables``; when that yields rows, the CSV plus the notes from
+    ``extract_pdf_note`` are sent to Gemini. Every other case (non-PDF, PDF that
+    fails the check, or no table found) uploads the file itself to Gemini.
+
+    Args:
+        file: The upload - a path string or an object with ``name`` / ``read()``.
+        question: Not used by the code shown here.
+        model_choice: Model label forwarded to ``call_gemini_with_prompt``.
+        temperature: Generation temperature.
+        top_p: Generation top-p.
+        external_api_url: Not used by the code shown here.
+
+    Returns:
+        A ``(message, None)`` tuple. ``message`` is the model output, a
+        "no file" notice, or ``ERROR: <type>: <detail>`` when anything raised.
+    """
     try:
         if file is None:
             return "❌ No file uploaded.", None
         file_bytes = _read_file_bytes(file)
         filename, mime = _guess_name_and_mime(file, file_bytes)
         print(f"[UPLOAD] {filename} ({mime})")
         if mime == "application/pdf" and check_pdf_structure(file_bytes):
+            print("➡️ PDF has multi-page table → extract before Gemini.")
+            tmp_path = None
+            try:
+                # Camelot needs a real path, so the bytes are spilled to a temp file.
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                    tmp_path = tmp.name
+                    tmp.write(file_bytes)
+                df = extract_pdf_tables(tmp_path)
+            finally:
+                # delete=False leaves cleanup to us; without this every request
+                # leaks one temp file.
+                if tmp_path is not None:
+                    try:
+                        os.remove(tmp_path)
+                    except OSError:
+                        pass
+            if not df.empty:
+                note_text = extract_pdf_note(file_bytes)
+                csv_text = df.to_csv(index=False)
+                print("✅ Send table + note to Gemini...")
+                message = call_gemini_with_prompt(csv_text, note_text, model_choice, temperature, top_p)
                 return message, None
             else:
+                print("⚠️ No valid table found → fallback to OCR Gemini.")
+        # fallback OCR
         api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
         genai.configure(api_key=api_key)
         model = genai.GenerativeModel(
             generation_config={"temperature": temperature, "top_p": top_p}
         )
+        # Accept a plain path as well, consistent with _read_file_bytes / _guess_name_and_mime.
+        upload_path = file if isinstance(file, str) else file.name
+        uploaded = genai.upload_file(path=upload_path)
+        try:
+            resp = model.generate_content([PROMPT_FREIGHT_JSON, uploaded])
+        finally:
+            # Remove the remote copy even when generation fails.
+            genai.delete_file(uploaded.name)
         return getattr(resp, "text", str(resp)), None
     except Exception as e:
         return f"ERROR: {type(e).__name__}: {e}", None