Spaces:

vithacocf
/

air_flow

Sleeping

App Files Files Community

vithacocf committed on Nov 4, 2025

Commit

770523c

verified ·

1 Parent(s): 0dafa97

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -177

app.py CHANGED Viewed

@@ -1,13 +1,11 @@
 from __future__ import annotations
-import os, io, re, json, time, mimetypes, tempfile
-from typing import List, Union, Tuple, Any
 from PIL import Image
 import pandas as pd
 import gradio as gr
 import google.generativeai as genai
 import requests
-import pdfplumber
-import fitz  # PyMuPDF
 # ================== CONFIG ==================
 # SECURITY: a live Google API key must never be committed to source control.
 # The key that used to be hard-coded on this line is exposed in the repository
 # history and should be revoked. Provide the key through the GOOGLE_API_KEY
 # environment variable instead; it is read with os.environ.get at call time
 # and this constant is only the fallback when that variable is unset.
 DEFAULT_API_KEY = ""
@@ -18,21 +16,21 @@ INTERNAL_MODEL_MAP = {
 EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
 PROMPT_FREIGHT_JSON = """
-Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
 {
   "shipping_line": "...",
   "shipping_line_code": "...",
   "shipping_line_reason": "Why this carrier is chosen?",
   "fee_type": "Air Freight",
-  "valid_from": ...,
-  "valid_to": ...,
   "charges": [
     {
       "frequency": "...",
       "package_type": "...",
       "aircraft_type": "...",
       "direction": "Export or Import or null",
-      "origin": "...",
       "destination": "...",
       "charge_name": "...",
       "charge_code": "...",
@@ -49,10 +47,8 @@ Please analyze the freight rate table in the file I provide and convert it into
         "+300kg": ...,
         "+500kg": ...,
         "+1000kg": ...,
-        "other": {
-          key: value
-        },
-        "weight_breaks_reason":"Why chosen weight_breaks?"
       },
       "remark": "..."
     }
@@ -67,39 +63,17 @@ Please analyze the freight rate table in the file I provide and convert it into
     }
   ]
 }
-### Date rules
-- valid_from format:
-  - `DD/MM/YYYY` (if full date)
-  - `01/MM/YYYY` (if month+year only)
-  - `01/01/YYYY` (if year only)
-  - `UFN` if missing
-- valid_to:
-  - exact `DD/MM/YYYY` if present
-  - else `UFN`
-STRICT RULES:
-- ONLY return a single JSON object as specified above.
-- All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
-- If the table shows "RQ" or similar, set value as "RQST".
-- Group same-price destinations into one record separated by "/".
-- Always use IATA code for origin and destination.
-- Flight number (e.g. ZH118) is not charge code.
-- Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
-- If local charges exist, list them.
-- If validity missing, set null.
-- Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
-- Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
-- Replace commas in remarks with semicolons.
-- Only return JSON.
-"""
# Pick the LANCZOS filter from the Image.Resampling namespace when the
# installed PIL exposes it, otherwise from the Image module itself.
if hasattr(Image, "Resampling"):
    RESAMPLE = Image.Resampling.LANCZOS
else:
    RESAMPLE = Image.LANCZOS
 # ================== HELPERS ==================
-def _read_file_bytes(upload):
     if upload is None:
         raise ValueError("No file uploaded.")
     if isinstance(upload, (str, os.PathLike)):
@@ -123,150 +97,87 @@ def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
         filename = "upload.bin"
     mime, _ = mimetypes.guess_type(filename)
     if not mime:
-        if len(file_bytes) >= 4 and file_bytes[:4] == b"%PDF":
             mime = "application/pdf"
-            if not filename.lower().endswith(".pdf"):
-                filename += ".pdf"
         else:
             mime = "image/png"
     return filename, mime
-# ================== PDF AUTO-EXTRACT ==================
def extract_pdf_table_safely(file_bytes: bytes, filename: str):
    """Extract the tables of a multi-page PDF into one DataFrame.

    The first row of the first usable table becomes the header for every
    table that follows. A later table that cannot be built with that header
    is truncated to the column count shared with the header. The
    three-letter code following "Origin:" on page 1 (or "UNK" when absent)
    is stored in an added "ORIGIN" column. The combined frame is also
    written to a temporary .xlsx file, which the caller must delete.

    Args:
        file_bytes: Raw PDF content.
        filename: Name used only in the log output.

    Returns:
        ``(DataFrame, xlsx_path)``, or ``(None, None)`` when the PDF cannot
        be opened or no usable table is found.
    """
    print(f"[PDF Extract] {filename}: bắt đầu phân tích bằng pdfplumber...")
    try:
        pdf = pdfplumber.open(io.BytesIO(file_bytes))
    except Exception as e:
        print(f"❌ Không mở được PDF: {e}")
        return None, None

    table_settings = {
        "vertical_strategy": "lines",
        "horizontal_strategy": "text",
        "snap_tolerance": 3,
        "intersection_tolerance": 5,
    }
    table_data = []
    header = None
    origin = None
    # Use the PDF as a context manager so it is closed even when parsing a
    # page raises; previously close() was only reached on the success path.
    with pdf:
        for i, page in enumerate(pdf.pages, start=1):
            print(f"📄 Trang {i}...")
            # Look for the origin code on the first page only.
            if i == 1:
                text_page = page.extract_text() or ""
                m = re.search(r"Origin\s*:\s*([A-Z]{3})", text_page)
                if m:
                    origin = m.group(1).strip()
                    print(f"✅ Origin phát hiện: {origin}")
                else:
                    origin = "UNK"
            tables = page.extract_tables(table_settings)
            if not tables:
                print(f"⚠️ Trang {i}: không có bảng hợp lệ.")
                continue
            for table in tables:
                if not table or len(table) < 2:
                    continue
                if header is None:
                    header = table[0]
                    print(f"✅ Header đầu tiên: {header}")
                    df = pd.DataFrame(table[1:], columns=header)
                else:
                    try:
                        df = pd.DataFrame(table, columns=header)
                    except Exception as e:
                        print(f"⚠️ Trang {i}: lỗi DataFrame {e} → cân chỉnh cột lại.")
                        n_col = min(len(header), len(table[0]))
                        df = pd.DataFrame([r[:n_col] for r in table], columns=header[:n_col])
                df["ORIGIN"] = origin
                # Drop rows that merely repeat the header on continuation pages.
                df = df[df[header[0]] != header[0]]
                table_data.append(df)

    if not table_data:
        print("❌ Không có bảng hợp lệ trong PDF.")
        return None, None
    final_df = pd.concat(table_data, ignore_index=True)
    print(f"✅ Tổng cộng {len(final_df)} dòng, {len(final_df.columns)} cột.")
    # Reserve a unique path, then close the handle before pandas writes to it
    # so no file descriptor stays open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        xlsx_path = tmp.name
    final_df.to_excel(xlsx_path, index=False)
    print(f"💾 Excel tạm: {xlsx_path}")
    return final_df, xlsx_path
-# ================== OCR CORE ==================
def pdf_to_images(pdf_bytes: bytes):
    """Render every page of a PDF to an RGB PIL image at 200 dpi.

    Args:
        pdf_bytes: Raw PDF content.

    Returns:
        A list with one PIL image per page, in document order.
    """
    pages = []
    # Open the document as a context manager so it is closed even when
    # rendering a page raises; it was previously never closed.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for p in doc:
            pix = p.get_pixmap(dpi=200)
            pages.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    return pages
def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
    """Run Gemini over the pages of an upload, ``batch_size`` pages per request.

    A PDF (detected by its ``%PDF`` magic bytes) is rendered to images with
    ``pdf_to_images``; anything else is opened as a single image. Each page
    is saved as a temporary PNG, uploaded, and sent together with the prompt.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Not used by this function.
        mime: Not used by this function.
        question: Custom prompt; blank or None selects PROMPT_FREIGHT_JSON.
        model_choice: Key into INTERNAL_MODEL_MAP; unknown keys fall back to
            "gemini-2.5-flash".
        temperature: Sampling temperature, converted with ``float``.
        top_p: Nucleus sampling value, converted with ``float``.
        batch_size: Number of page images sent per request.

    Returns:
        ``(text, None)`` where ``text`` is the per-batch responses joined by
        blank lines.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    genai.configure(api_key=api_key)
    model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config={"temperature": float(temperature), "top_p": float(top_p)},
    )
    if file_bytes[:4] == b"%PDF":
        pages = pdf_to_images(file_bytes)
    else:
        pages = [Image.open(io.BytesIO(file_bytes))]
    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
    all_text_results = []
    for start in range(0, len(pages), batch_size):
        uploaded = []
        try:
            for im in pages[start:start + batch_size]:
                # Reserve a unique path and close the handle before saving.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                    tmp_path = tmp.name
                try:
                    im.save(tmp_path)
                    up = genai.upload_file(path=tmp_path, mime_type="image/png")
                    uploaded.append(genai.get_file(up.name))
                finally:
                    # The local PNG is only needed for the upload; remove it
                    # so temporary files do not pile up across requests.
                    os.unlink(tmp_path)
            resp = model.generate_content([user_prompt] + uploaded)
            all_text_results.append(resp.text)
        finally:
            # Remove the remote copies even when the request above failed.
            for up in uploaded:
                try:
                    genai.delete_file(up.name)
                except Exception:
                    # Best-effort cleanup: a failed delete must not mask the
                    # result or the original error.
                    pass
    return "\n\n".join(all_text_results), None
-# ================== ROUTER ==================
 def run_process(file, question, model_choice, temperature, top_p, external_api_url):
     """Route an upload: direct PDF table extraction first, Gemini OCR otherwise.

     Args:
         file: Uploaded file object or path, read with ``_read_file_bytes``.
         question: Custom prompt forwarded to the OCR path.
         model_choice: Model label forwarded to the OCR path.
         temperature: Sampling temperature forwarded to the OCR path.
         top_p: Nucleus sampling value forwarded to the OCR path.
         external_api_url: Not used by this function.

     Returns:
         ``(text, None)``. ``text`` is a short extraction summary, the OCR
         output, or an ``"ERROR: ..."`` string when any exception is raised.
     """
     try:
         file_bytes = _read_file_bytes(file)
         filename, mime = _guess_name_and_mime(file, file_bytes)
         # STEP 1: try to read a table straight out of the PDF.
         if mime == "application/pdf":
             print(f"[CHECK] {filename}: PDF detected → thử extract bảng trước...")
             df, tmp_path = extract_pdf_table_safely(file_bytes, filename)
             # The workbook path is never handed back to the UI, so delete the
             # file here instead of leaving one orphan per processed PDF.
             if tmp_path:
                 try:
                     os.remove(tmp_path)
                 except OSError:
                     pass
             if df is not None and len(df) > 0:
                 print("✅ PDF có bảng rõ → skip OCR.")
                 preview_text = f"Extracted {len(df)} rows from {filename}. Origin={df['ORIGIN'].iloc[0]}"
                 return preview_text, None
             else:
                 print("⚠️ PDF không rõ cấu trúc → fallback sang OCR.")
         # STEP 2: no usable table → OCR with Gemini.
         return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)
     except Exception as e:
         return f"ERROR: {type(e).__name__}: {str(e)}", None
@@ -274,18 +185,18 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
 # ================== UI ==================
 def main():
-    with gr.Blocks(title="OCR Hybrid Extractor") as demo:
-        gr.Markdown("## 📦 Hybrid OCR: pdfplumber → Gemini Fallback")
-        file = gr.File(label="Upload PDF/Image")
-        question = gr.Textbox(label="Prompt", lines=2)
         model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
                                    value="Gemini 2.5 Flash", label="Model")
         temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
         top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
         external_api_url = gr.Textbox(label="External API URL", visible=False)
-        output_text = gr.Code(label="Output", language="json")
-        run_btn = gr.Button("🚀 Process")
         run_btn.click(run_process,
                       inputs=[file, question, model_choice, temperature, top_p, external_api_url],

 from __future__ import annotations
+import os, io, re, json, mimetypes, tempfile
+from typing import List, Union, Tuple
 from PIL import Image
 import pandas as pd
 import gradio as gr
 import google.generativeai as genai
 import requests
 # ================== CONFIG ==================
 # SECURITY: a live Google API key must never be committed to source control.
 # The key that used to be hard-coded on this line is exposed in the repository
 # history and should be revoked. Provide the key through the GOOGLE_API_KEY
 # environment variable instead; it is read with os.environ.get at call time
 # and this constant is only the fallback when that variable is unset.
 DEFAULT_API_KEY = ""
 EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
 PROMPT_FREIGHT_JSON = """
+Please analyze the freight rate table in the CSV file I provide and convert it into JSON with this structure:
 {
   "shipping_line": "...",
   "shipping_line_code": "...",
   "shipping_line_reason": "Why this carrier is chosen?",
   "fee_type": "Air Freight",
+  "valid_from": "...",
+  "valid_to": "...",
   "charges": [
     {
       "frequency": "...",
       "package_type": "...",
       "aircraft_type": "...",
       "direction": "Export or Import or null",
+      "origin": "...",       # detect automatically from header, filename, or text (e.g. SGN/HAN/DAD)
       "destination": "...",
       "charge_name": "...",
       "charge_code": "...",
         "+300kg": ...,
         "+500kg": ...,
         "+1000kg": ...,
+        "other": { key: value },
+        "weight_breaks_reason": "Why chosen weight_breaks?"
       },
       "remark": "..."
     }
     }
   ]
 }
+Rules:
+- If filename or top text includes "Origin: SGN", "SGN", "HAN", or "DAD" → use as origin.
+- If missing, infer origin from file name (e.g., "TK - SGN Rate Sheet.csv" → SGN).
+- All rates must match the weight break columns (M, N, 45, 100, 300, 500, 1000, etc.).
+- No assumptions; set null if missing.
+- Only return valid JSON object as above.
+"""
 # ================== HELPERS ==================
+def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
     if upload is None:
         raise ValueError("No file uploaded.")
     if isinstance(upload, (str, os.PathLike)):
         filename = "upload.bin"
     mime, _ = mimetypes.guess_type(filename)
     if not mime:
+        if filename.lower().endswith(".csv"):
+            mime = "text/csv"
+        elif len(file_bytes) >= 4 and file_bytes[:4] == b"%PDF":
             mime = "application/pdf"
         else:
             mime = "image/png"
     return filename, mime
+# ================== GEMINI PROCESS ==================
def run_gemini_text(file_bytes, filename, mime, model_choice, question, temperature, top_p):
    """Send CSV/text content to Gemini and return the model's reply.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Upload name; included in the prompt and used to detect CSV.
        mime: Guessed MIME type; ``"text/csv"`` selects the CSV path.
        model_choice: Key into INTERNAL_MODEL_MAP; unknown keys fall back to
            "gemini-2.5-flash".
        question: Custom prompt; blank or None selects PROMPT_FREIGHT_JSON.
        temperature: Sampling temperature, converted with ``float``.
        top_p: Nucleus sampling value, converted with ``float``.

    Returns:
        ``(text, None)`` where ``text`` is the stripped model response, or an
        ``"ERROR: ..."`` message when no API key is configured.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    if not api_key:
        return "ERROR: Missing GOOGLE_API_KEY.", None
    genai.configure(api_key=api_key)
    model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config={"temperature": float(temperature), "top_p": float(top_p)},
    )

    # Decode once; utf-8-sig drops a leading byte-order mark so it does not
    # end up inside the prompt.
    raw_text = file_bytes.decode("utf-8-sig", errors="ignore")
    table_text = raw_text
    if mime == "text/csv" or filename.lower().endswith(".csv"):
        try:
            # dtype=str with keep_default_na=False keeps every cell exactly
            # as written (no "045" -> 45, no "NA" -> NaN) during the
            # normalising round trip through pandas.
            df = pd.read_csv(io.BytesIO(file_bytes), dtype=str, keep_default_na=False)
            table_text = df.to_csv(index=False) or raw_text
        except Exception:
            # Ragged or malformed CSV: send the decoded text untouched.
            table_text = raw_text

    # A blank or whitespace-only custom prompt falls back to the default one.
    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
    full_prompt = (
        f"{user_prompt}\n\n"
        f"Filename: {filename}\n\n"
        f"Below is the table text extracted from your CSV file:\n{table_text}\n\n"
        "Please analyze and return valid JSON only."
    )
    resp = model.generate_content(full_prompt)
    return resp.text.strip(), None
+# ================== EXTERNAL API (if any) ==================
def run_process_external(file_bytes, filename, mime, question, api_url, temperature, top_p):
    """POST the upload to an external endpoint and return its response body.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Name sent with the multipart file field.
        mime: MIME type sent with the multipart file field.
        question: Prompt sent as the ``prompt`` form field ("" when falsy).
        api_url: Endpoint URL; None, empty, or whitespace-only counts as
            missing.
        temperature: Sent as a string form field.
        top_p: Sent as a string form field.

    Returns:
        ``(text, None)`` where ``text`` is the response body, or an
        ``"ERROR: ..."`` message for a missing endpoint or an HTTP status of
        400 and above.
    """
    # Treat a blank URL like a missing one so the caller gets the dedicated
    # message instead of a URL-parsing exception from requests.
    endpoint = (api_url or "").strip()
    if not endpoint:
        return "ERROR: Missing external API endpoint.", None
    data = {"prompt": question or "", "temperature": str(temperature), "top_p": str(top_p)}
    files = {"file": (filename, file_bytes, mime)}
    r = requests.post(endpoint, files=files, data=data, timeout=60)
    if r.status_code >= 400:
        return f"ERROR: External API HTTP {r.status_code}: {r.text[:200]}", None
    return r.text, None
+# ================== MAIN ROUTER ==================
 def run_process(file, question, model_choice, temperature, top_p, external_api_url):
     """Dispatch an uploaded file to the matching backend.

     CSV uploads are sent to Gemini. Any other upload is forwarded to the
     external API when the external model is selected, and otherwise gets an
     "only CSV supported" notice. Any exception raised along the way is
     reported as an ``"ERROR: ..."`` string rather than propagated.

     Returns:
         ``(text, None)``.
     """
     try:
         if file is None:
             return "ERROR: No file uploaded.", None

         payload = _read_file_bytes(file)
         name, mime_type = _guess_name_and_mime(file, payload)
         print(f"[INFO] Processing {name} ({mime_type})...")

         # CSV → read the text and hand it to Gemini.
         is_csv = mime_type == "text/csv" or name.lower().endswith(".csv")
         if is_csv:
             print("🟢 Detected CSV file → Sending to Gemini for JSON conversion.")
             return run_gemini_text(payload, name, mime_type, model_choice, question, temperature, top_p)

         # Anything else (PDF / image) is only handled by the external model.
         if model_choice != EXTERNAL_MODEL_NAME:
             return "⚠️ Only CSV supported in this version. Please upload .csv file.", None

         return run_process_external(
             file_bytes=payload,
             filename=name,
             mime=mime_type,
             question=question,
             api_url=external_api_url,
             temperature=temperature,
             top_p=top_p,
         )
     except Exception as exc:
         return f"ERROR: {type(exc).__name__}: {exc!s}", None
 # ================== UI ==================
 def main():
+    with gr.Blocks(title="CSV → JSON Converter (Gemini)") as demo:
+        gr.Markdown("## 📦 Upload CSV → Gemini generates structured JSON")
+        file = gr.File(label="Upload CSV file")
+        question = gr.Textbox(label="Custom Prompt (optional)", lines=2)
         model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
                                    value="Gemini 2.5 Flash", label="Model")
         temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
         top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
         external_api_url = gr.Textbox(label="External API URL", visible=False)
+        output_text = gr.Code(label="Gemini Output", language="json")
+        run_btn = gr.Button("🚀 Convert to JSON")
         run_btn.click(run_process,
                       inputs=[file, question, model_choice, temperature, top_p, external_api_url],