Update app.py
Browse files
app.py
CHANGED
|
@@ -1,23 +1,10 @@
|
|
| 1 |
-
|
| 2 |
-
import
|
| 3 |
-
from typing import List, Union, Tuple, Any
|
| 4 |
-
from PIL import Image
|
| 5 |
-
import pandas as pd
|
| 6 |
-
import gradio as gr
|
| 7 |
-
import google.generativeai as genai
|
| 8 |
-
import requests
|
| 9 |
-
import pdfplumber
|
| 10 |
-
import fitz # PyMuPDF
|
| 11 |
-
|
| 12 |
-
# ================== CONFIG ==================
|
| 13 |
# SECURITY: a real Google API key used to be hard-coded on this line. A key
# committed to source control must be treated as leaked: revoke/rotate it and
# supply the replacement through the GOOGLE_API_KEY environment variable.
# The name is kept (as an empty fallback) because other code in this file
# passes it as the default to os.environ.get("GOOGLE_API_KEY", ...).
DEFAULT_API_KEY = ""
|
| 14 |
-
|
| 15 |
INTERNAL_MODEL_MAP = {
|
| 16 |
"Gemini 2.5 Flash": "gemini-2.5-flash",
|
| 17 |
-
"Gemini 2.5 Pro":
|
| 18 |
}
|
| 19 |
-
EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
|
| 20 |
-
|
| 21 |
PROMPT_FREIGHT_JSON = """
|
| 22 |
Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
|
| 23 |
{
|
|
@@ -93,99 +80,132 @@ STRICT RULES:
|
|
| 93 |
- Only return JSON.
|
| 94 |
"""
|
| 95 |
|
| 96 |
-
#
|
| 97 |
def _read_file_bytes(upload):
    """Return the complete raw bytes of an uploaded file.

    Accepted inputs:
      * ``bytes`` / ``bytearray`` -- returned as ``bytes``;
      * a ``str`` path -- opened in binary mode and read fully;
      * a file-like object exposing ``read()`` -- rewound to the start when
        it is seekable, then read;
      * any other ``os.PathLike`` object -- opened in binary mode and read.

    Raises:
        TypeError: if ``upload`` is none of the above.
    """
    if isinstance(upload, (bytes, bytearray)):
        return bytes(upload)
    if isinstance(upload, str):
        with open(upload, "rb") as f:
            return f.read()
    if hasattr(upload, "read"):
        # A stream that was already (partly) consumed would otherwise yield
        # truncated or empty data; rewind it when the stream allows it.
        seek = getattr(upload, "seek", None)
        if callable(seek):
            try:
                seek(0)
            except (OSError, ValueError):
                # Non-seekable stream: read from the current position.
                pass
        return upload.read()
    if hasattr(upload, "__fspath__"):
        with open(upload, "rb") as f:
            return f.read()
    raise TypeError("Unsupported file input")
|
| 104 |
|
| 105 |
-
def _guess_name_and_mime(file, file_bytes
|
| 106 |
filename = os.path.basename(file.name if hasattr(file, "name") else str(file))
|
| 107 |
mime, _ = mimetypes.guess_type(filename)
|
| 108 |
-
if not mime and file_bytes[:4] == b"%PDF":
|
| 109 |
-
mime = "application/pdf"
|
| 110 |
return filename, mime or "application/octet-stream"
|
| 111 |
|
| 112 |
-
def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Render every page of an in-memory PDF to an RGB PIL image at 200 dpi.

    Each page is rasterised exactly once (the pixmap is reused for width,
    height and pixel data), and the PyMuPDF document is closed before
    returning, even if rendering fails part-way through.

    Args:
        pdf_bytes: The raw bytes of the PDF file.

    Returns:
        One ``PIL.Image.Image`` per page, in page order.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            pix = page.get_pixmap(dpi=200)
            images.append(
                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            )
        return images
    finally:
        doc.close()
|
| 115 |
-
|
| 116 |
-
# ================== PDF CHECK ==================
|
| 117 |
def check_pdf_structure(file_bytes: bytes) -> bool:
|
| 118 |
-
"""Trả về True nếu PDF có nhiều trang và dạng bảng."""
|
| 119 |
try:
|
| 120 |
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
| 121 |
-
if len(pdf.pages) <= 2:
|
| 122 |
-
return False
|
| 123 |
for page in pdf.pages[:3]:
|
| 124 |
-
|
| 125 |
-
if tables:
|
| 126 |
-
return True
|
| 127 |
return False
|
| 128 |
except Exception as e:
|
| 129 |
-
print("PDF check error:", e)
|
| 130 |
-
return False
|
| 131 |
|
| 132 |
-
#
|
| 133 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
|
| 135 |
genai.configure(api_key=api_key)
|
| 136 |
model = genai.GenerativeModel(
|
| 137 |
model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
|
| 138 |
generation_config={"temperature": temperature, "top_p": top_p}
|
| 139 |
)
|
| 140 |
-
prompt = f"{PROMPT_FREIGHT_JSON}
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
-
#
|
| 145 |
def run_process(file, question, model_choice, temperature, top_p, external_api_url):
|
| 146 |
try:
|
| 147 |
if file is None:
|
| 148 |
return "❌ No file uploaded.", None
|
| 149 |
-
|
| 150 |
file_bytes = _read_file_bytes(file)
|
| 151 |
filename, mime = _guess_name_and_mime(file, file_bytes)
|
| 152 |
print(f"[UPLOAD] {filename} ({mime})")
|
| 153 |
|
| 154 |
-
# 1️⃣ Nếu là PDF và có nhiều trang dạng bảng
|
| 155 |
if mime == "application/pdf" and check_pdf_structure(file_bytes):
|
| 156 |
-
print("➡️ PDF
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
continue
|
| 168 |
-
header, rows = table[0], table[1:]
|
| 169 |
-
if saved_header is None:
|
| 170 |
-
saved_header = header
|
| 171 |
-
elif len(header) < len(saved_header):
|
| 172 |
-
header = saved_header
|
| 173 |
-
try:
|
| 174 |
-
df = pd.DataFrame(rows, columns=header)
|
| 175 |
-
all_dfs.append(df)
|
| 176 |
-
except Exception as e:
|
| 177 |
-
print(f"⚠️ Trang {idx} lỗi DataFrame: {e}")
|
| 178 |
-
|
| 179 |
-
if all_dfs:
|
| 180 |
-
final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all")
|
| 181 |
-
csv_text = final_df.to_csv(index=False)
|
| 182 |
-
print(f"✅ Trích xuất {len(final_df)} dòng, gửi Gemini xử lý JSON.")
|
| 183 |
-
message = call_gemini_with_prompt(csv_text, question, model_choice, temperature, top_p)
|
| 184 |
return message, None
|
| 185 |
else:
|
| 186 |
-
print("⚠️
|
| 187 |
|
| 188 |
-
#
|
| 189 |
api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
|
| 190 |
genai.configure(api_key=api_key)
|
| 191 |
model = genai.GenerativeModel(
|
|
@@ -193,34 +213,9 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
|
|
| 193 |
generation_config={"temperature": temperature, "top_p": top_p}
|
| 194 |
)
|
| 195 |
uploaded = genai.upload_file(path=file.name)
|
| 196 |
-
resp = model.generate_content([
|
| 197 |
genai.delete_file(uploaded.name)
|
| 198 |
return getattr(resp, "text", str(resp)), None
|
| 199 |
|
| 200 |
except Exception as e:
|
| 201 |
return f"ERROR: {type(e).__name__}: {e}", None
|
| 202 |
-
|
| 203 |
-
# ================== UI ==================
|
| 204 |
-
def main():
    """Build the Gradio Blocks UI and return it without launching it.

    The layout contains a file upload, a prompt textbox, a model dropdown,
    temperature and top-p sliders, a hidden external-API URL textbox, a JSON
    code viewer for the result, and a run button wired to ``run_process``.
    """
    with gr.Blocks(title="OCR + Table Extraction for Gemini") as demo:
        file = gr.File(label="📂 Upload PDF / Image / CSV")
        question = gr.Textbox(label="Prompt", lines=2)
        # Offer every internal Gemini model plus the external OCR model label.
        model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
                                   value="Gemini 2.5 Flash", label="Model")
        temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
        # Created hidden; it is still passed to run_process as an input.
        external_api_url = gr.Textbox(label="External API URL", visible=False)
        output_text = gr.Code(label="Gemini Output", language="json")
        run_btn = gr.Button("🚀 Run Extraction")

        # NOTE(review): the second output slot is an anonymous gr.State();
        # presumably a sink for the second value returned by run_process --
        # confirm against run_process.
        run_btn.click(
            run_process,
            inputs=[file, question, model_choice, temperature, top_p, external_api_url],
            outputs=[output_text, gr.State()]
        )

    return demo
|
| 223 |
-
|
| 224 |
-
# Build the UI at import time so that `demo` exists as a module-level name.
# NOTE(review): presumably the hosting environment looks up `demo` -- confirm.
demo = main()
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
| 1 |
+
import os, io, tempfile, mimetypes, camelot, pdfplumber, pandas as pd, google.generativeai as genai
|
| 2 |
+
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
# SECURITY: a real Google API key used to be hard-coded on this line. A key
# committed to source control must be treated as leaked: revoke/rotate it and
# supply the replacement through the GOOGLE_API_KEY environment variable.
# The name is kept (as an empty fallback) because other code in this file
# passes it as the default to os.environ.get("GOOGLE_API_KEY", ...).
DEFAULT_API_KEY = ""

# Maps the model label selected by the caller to the Gemini model identifier.
INTERNAL_MODEL_MAP = {
    "Gemini 2.5 Flash": "gemini-2.5-flash",
    "Gemini 2.5 Pro": "gemini-2.5-pro",
}
|
|
|
|
|
|
|
| 8 |
PROMPT_FREIGHT_JSON = """
|
| 9 |
Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
|
| 10 |
{
|
|
|
|
| 80 |
- Only return JSON.
|
| 81 |
"""
|
| 82 |
|
| 83 |
+
# ========== Helpers ==========
|
| 84 |
def _read_file_bytes(upload):
    """Return the complete raw bytes of an uploaded file.

    Accepted inputs:
      * ``bytes`` / ``bytearray`` -- returned as ``bytes``;
      * a ``str`` path -- opened in binary mode and read fully;
      * a file-like object exposing ``read()`` -- rewound to the start when
        it is seekable, then read;
      * any other ``os.PathLike`` object -- opened in binary mode and read.

    Raises:
        TypeError: if ``upload`` is none of the above.
    """
    if isinstance(upload, (bytes, bytearray)):
        return bytes(upload)
    if isinstance(upload, str):
        with open(upload, "rb") as f:
            return f.read()
    if hasattr(upload, "read"):
        # A stream that was already (partly) consumed would otherwise yield
        # truncated or empty data; rewind it when the stream allows it.
        seek = getattr(upload, "seek", None)
        if callable(seek):
            try:
                seek(0)
            except (OSError, ValueError):
                # Non-seekable stream: read from the current position.
                pass
        return upload.read()
    if hasattr(upload, "__fspath__"):
        with open(upload, "rb") as f:
            return f.read()
    raise TypeError("Unsupported file input")
|
| 90 |
|
| 91 |
+
def _guess_name_and_mime(file, file_bytes):
    """Derive a display filename and a MIME type for an uploaded file.

    The filename is the basename of ``file.name`` when that attribute is a
    string (or path-like); otherwise it is the basename of ``str(file)``.
    The MIME type is guessed from the filename; when that fails and the
    content starts with the ``%PDF`` signature it is reported as PDF.

    Args:
        file: A path string or an object that may expose a ``name`` attribute.
        file_bytes: The file content, used only for the PDF signature check.
            May be empty or ``None``.

    Returns:
        ``(filename, mime)`` where ``mime`` falls back to
        ``"application/octet-stream"`` when nothing could be determined.
    """
    raw_name = getattr(file, "name", None)
    if hasattr(raw_name, "__fspath__"):
        raw_name = os.fspath(raw_name)
    if not isinstance(raw_name, str):
        # ``name`` can be missing, None, or an int file descriptor; none of
        # those can be passed to os.path.basename.
        raw_name = str(file)
    filename = os.path.basename(raw_name)
    mime, _ = mimetypes.guess_type(filename)
    if not mime and file_bytes and file_bytes[:4] == b"%PDF":
        mime = "application/pdf"
    return filename, mime or "application/octet-stream"
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
def check_pdf_structure(file_bytes: bytes) -> bool:
    """Report whether a PDF looks like a multi-page table document.

    Returns True only when the PDF has more than two pages and pdfplumber
    finds at least one table on one of the first three pages. Any error
    raised while opening or scanning the PDF is printed and treated as False.
    """
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            pages = pdf.pages
            if len(pages) > 2:
                # Only the first three pages are probed for tables.
                return any(page.find_tables() for page in pages[:3])
            return False
    except Exception as err:
        print("PDF check error:", err)
        return False
|
|
|
|
| 106 |
|
| 107 |
+
# ========== 1️⃣ Extract tables with Camelot ==========
|
| 108 |
+
def extract_pdf_tables(file_path: str) -> pd.DataFrame:
    """Extract every table of a PDF with Camelot and merge them into one frame.

    Camelot's "lattice" flavor is tried first; "stream" is tried only when
    lattice produced no tables (or failed). Errors raised by Camelot are
    printed and treated as "no tables" for that flavor.

    When the merged frame is non-empty and its column labels are all numeric
    (0, 1, 2, ...), the first row is promoted to the header and removed from
    the data. Rows that are entirely NaN are dropped.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The merged table, or an empty ``DataFrame`` when nothing was found.
    """
    # (camelot flavor, start message, success label, failure label)
    attempts = (
        ("lattice", "🔍 Try lattice mode...", "✅ Lattice", "⚠️ Lattice failed"),
        ("stream", "🔁 Try stream mode...", "✅ Stream", "❌ Stream failed"),
    )

    all_dfs = []
    for flavor, start_msg, ok_label, fail_label in attempts:
        try:
            print(start_msg)
            tables = camelot.read_pdf(file_path, flavor=flavor, pages="all")
            if tables.n > 0:
                all_dfs.extend(t.df for t in tables)
                print(f"{ok_label}: {tables.n} tables.")
        except Exception as e:
            print(f"{fail_label}: {e}")
        if all_dfs:
            # The next flavor is only a fallback for "nothing found".
            break

    if not all_dfs:
        print("🚫 No table detected.")
        return pd.DataFrame()

    df_final = pd.concat(all_dfs, ignore_index=True)
    # Guard against zero-row frames: promoting row 0 to the header would
    # raise IndexError when there is no row 0.
    if not df_final.empty and all(str(c).isdigit() for c in df_final.columns):
        print("🧠 Detected numeric headers (0,1,2..), using first row as real header.")
        df_final.columns = df_final.iloc[0]
        df_final = df_final[1:]
    df_final = df_final.dropna(how="all").reset_index(drop=True)
    print(f"✅ Total: {len(df_final)} rows × {len(df_final.columns)} columns.")
    return df_final
|
| 141 |
+
|
| 142 |
+
# ========== 2️⃣ Extract the Note / Header section ==========
|
| 143 |
+
def extract_pdf_note(file_bytes: bytes) -> str:
    """Collect the note/header text found at the top of the first PDF page.

    Intended for items such as Start Date, Expiry Date, Origin and small
    notes, while leaving out the table area further down the page. Only
    lines matching the keyword pattern are kept; they are stripped and joined
    with single spaces. Any failure is printed and yields an empty string.
    """
    keyword_pattern = r"(Start Date|Origin|Expiry|Product|MY|SC|All rates|Currency)"
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            page_text = pdf.pages[0].extract_text() or ""
            # Scan only the first 15 lines so the table body is not pulled in.
            head_lines = page_text.splitlines()[:15]
            matched = [
                candidate.strip()
                for candidate in head_lines
                if re.search(keyword_pattern, candidate, re.I)
            ]
            return " ".join(matched).strip()
    except Exception as err:
        print(f"⚠️ Note extraction failed: {err}")
        return ""
|
| 163 |
+
|
| 164 |
+
# ========== 3️⃣ Call Gemini ==========
|
| 165 |
+
def call_gemini_with_prompt(csv_text: str, note_text: str, model_choice: str, temperature: float, top_p: float):
    """Send the extracted table and notes to Gemini and return its reply.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable,
    falling back to ``DEFAULT_API_KEY``. The prompt is ``PROMPT_FREIGHT_JSON``
    followed by the notes and the CSV text.

    Args:
        csv_text: The extracted table serialised as CSV.
        note_text: Header/note text; a placeholder is sent when it is empty.
        model_choice: Key of ``INTERNAL_MODEL_MAP``; unknown values fall back
            to ``"gemini-2.5-flash"``.
        temperature: Sampling temperature passed to the model.
        top_p: Nucleus-sampling value passed to the model.

    Returns:
        The response text, or ``str(response)`` when no text is available.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(
        model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
        generation_config={"temperature": temperature, "top_p": top_p}
    )
    prompt = f"""{PROMPT_FREIGHT_JSON}
Below is the extracted freight rate table (CSV) and additional notes:
Notes:
{note_text or '[No notes detected]'}
CSV:
{csv_text}
→ Convert to valid JSON as per schema above.
"""
    resp = model.generate_content(prompt)
    try:
        return resp.text
    except (AttributeError, ValueError):
        # The ``text`` accessor raises ValueError when the response carries
        # no valid part (for example a blocked or empty candidate); fall back
        # to the response's string form, as the getattr default intended.
        return str(resp)
|
| 182 |
|
| 183 |
+
# ========== 4️⃣ Main process ==========
|
| 184 |
def run_process(file, question, model_choice, temperature, top_p, external_api_url):
|
| 185 |
try:
|
| 186 |
if file is None:
|
| 187 |
return "❌ No file uploaded.", None
|
|
|
|
| 188 |
file_bytes = _read_file_bytes(file)
|
| 189 |
filename, mime = _guess_name_and_mime(file, file_bytes)
|
| 190 |
print(f"[UPLOAD] {filename} ({mime})")
|
| 191 |
|
|
|
|
| 192 |
if mime == "application/pdf" and check_pdf_structure(file_bytes):
|
| 193 |
+
print("➡️ PDF has multi-page table → extract before Gemini.")
|
| 194 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
| 195 |
+
tmp.write(file_bytes)
|
| 196 |
+
tmp_path = tmp.name
|
| 197 |
+
|
| 198 |
+
df = extract_pdf_tables(tmp_path)
|
| 199 |
+
if not df.empty:
|
| 200 |
+
note_text = extract_pdf_note(file_bytes)
|
| 201 |
+
csv_text = df.to_csv(index=False)
|
| 202 |
+
print("✅ Send table + note to Gemini...")
|
| 203 |
+
message = call_gemini_with_prompt(csv_text, note_text, model_choice, temperature, top_p)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
return message, None
|
| 205 |
else:
|
| 206 |
+
print("⚠️ No valid table found → fallback to OCR Gemini.")
|
| 207 |
|
| 208 |
+
# fallback OCR
|
| 209 |
api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
|
| 210 |
genai.configure(api_key=api_key)
|
| 211 |
model = genai.GenerativeModel(
|
|
|
|
| 213 |
generation_config={"temperature": temperature, "top_p": top_p}
|
| 214 |
)
|
| 215 |
uploaded = genai.upload_file(path=file.name)
|
| 216 |
+
resp = model.generate_content([PROMPT_FREIGHT_JSON, uploaded])
|
| 217 |
genai.delete_file(uploaded.name)
|
| 218 |
return getattr(resp, "text", str(resp)), None
|
| 219 |
|
| 220 |
except Exception as e:
|
| 221 |
return f"ERROR: {type(e).__name__}: {e}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|