vithacocf committed on
Commit
bf0f7cb
·
verified ·
1 Parent(s): b7af253

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -99
app.py CHANGED
@@ -1,23 +1,10 @@
1
- from __future__ import annotations
2
- import os, io, re, json, time, mimetypes, tempfile
3
- from typing import List, Union, Tuple, Any
4
- from PIL import Image
5
- import pandas as pd
6
- import gradio as gr
7
- import google.generativeai as genai
8
- import requests
9
- import pdfplumber
10
- import fitz # PyMuPDF
11
-
12
- # ================== CONFIG ==================
13
# SECURITY: a literal Google API key used to be committed on this line. Any key
# that has appeared in version control must be treated as leaked and revoked.
# Every caller resolves the key with
# os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY), so an empty fallback keeps
# the name available while forcing the credential to come from the environment.
DEFAULT_API_KEY = ""

# Display name -> model identifier handed to genai.GenerativeModel.
INTERNAL_MODEL_MAP = {
    "Gemini 2.5 Flash": "gemini-2.5-flash",
    "Gemini 2.5 Pro": "gemini-2.5-pro",
}

# Extra dropdown entry offered next to the internal Gemini models.
EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
20
-
21
  PROMPT_FREIGHT_JSON = """
22
  Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
23
  {
@@ -93,99 +80,132 @@ STRICT RULES:
93
  - Only return JSON.
94
  """
95
 
96
- # ================== HELPERS ==================
97
def _read_file_bytes(upload):
    """Return the raw bytes of an upload.

    Args:
        upload: Either a filesystem path given as ``str`` or any object that
            exposes a ``read()`` method.

    Returns:
        Whatever the file / ``read()`` call yields.

    Raises:
        TypeError: If ``upload`` is neither a ``str`` nor readable.
    """
    if not isinstance(upload, str):
        if hasattr(upload, "read"):
            return upload.read()
        raise TypeError("Unsupported file input")
    with open(upload, "rb") as handle:
        return handle.read()
104
 
105
def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
    """Derive a display filename and a MIME type for an upload.

    Args:
        file: Object with a ``name`` attribute, or anything whose ``str()`` is
            a path.
        file_bytes: Content of the upload; only its first four bytes are
            inspected, and only when the extension gives no MIME type.

    Returns:
        ``(basename, mime)``. ``mime`` falls back to ``application/pdf`` when
        the content starts with ``%PDF``, otherwise to
        ``application/octet-stream``.
    """
    source = file.name if hasattr(file, "name") else str(file)
    filename = os.path.basename(source)
    guessed = mimetypes.guess_type(filename)[0]
    if guessed:
        return filename, guessed
    # Extension was not conclusive: sniff the PDF magic number.
    if file_bytes[:4] == b"%PDF":
        return filename, "application/pdf"
    return filename, "application/octet-stream"
111
 
112
def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Render every page of a PDF to an RGB PIL image at 200 DPI.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        One ``PIL.Image.Image`` per page, in page order.
    """
    images = []
    # The context manager closes the document once all pages are rendered.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            # Rasterize once per page; the previous version called
            # get_pixmap() three times (width, height, samples).
            pix = page.get_pixmap(dpi=200)
            images.append(
                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            )
    return images
115
-
116
- # ================== PDF CHECK ==================
117
def check_pdf_structure(file_bytes: bytes) -> bool:
    """Return True if the PDF has more than two pages and looks tabular.

    Only the first three pages are probed with ``find_tables()``. Any error
    raised while opening or inspecting the document is printed and reported
    as ``False``.
    """
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            pages = pdf.pages
            if len(pages) > 2:
                # Stops at the first page that yields at least one table.
                return any(page.find_tables() for page in pages[:3])
            return False
    except Exception as e:
        print("PDF check error:", e)
        return False
131
 
132
- # ================== GEMINI CALL ==================
133
def call_gemini_with_prompt(content_text: str, question: str, model_choice: str, temperature: float, top_p: float):
    """Send the freight prompt plus extracted CSV text to Gemini.

    Args:
        content_text: CSV text appended after the prompt.
        question: Optional extra instruction inserted after the base prompt.
        model_choice: Key into ``INTERNAL_MODEL_MAP``; unknown keys fall back
            to ``"gemini-2.5-flash"``.
        temperature: Passed through in ``generation_config``.
        top_p: Passed through in ``generation_config``.

    Returns:
        The response's ``text`` attribute, or ``str(response)`` if it has none.
    """
    genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY))

    model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
    generation_config = {"temperature": temperature, "top_p": top_p}
    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config=generation_config,
    )

    prompt = f"{PROMPT_FREIGHT_JSON}\n{question or ''}\n\nBelow is the extracted CSV data:\n{content_text}"
    reply = model.generate_content(prompt)
    return getattr(reply, "text", str(reply))
 
 
 
 
 
 
 
143
 
144
- # ================== MAIN LOGIC ==================
145
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
146
  try:
147
  if file is None:
148
  return "❌ No file uploaded.", None
149
-
150
  file_bytes = _read_file_bytes(file)
151
  filename, mime = _guess_name_and_mime(file, file_bytes)
152
  print(f"[UPLOAD] {filename} ({mime})")
153
 
154
- # 1️⃣ Nếu là PDF và có nhiều trang dạng bảng
155
  if mime == "application/pdf" and check_pdf_structure(file_bytes):
156
- print("➡️ PDF nhiều trang & dạng bảng trích xuất CSV trước khi gọi Gemini.")
157
- all_dfs, saved_header = [], None
158
- with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
159
- for idx, page in enumerate(pdf.pages, start=1):
160
- table = page.extract_table({
161
- "vertical_strategy": "lines",
162
- "horizontal_strategy": "text",
163
- "snap_tolerance": 3,
164
- "intersection_tolerance": 5,
165
- })
166
- if not table or len(table) < 2:
167
- continue
168
- header, rows = table[0], table[1:]
169
- if saved_header is None:
170
- saved_header = header
171
- elif len(header) < len(saved_header):
172
- header = saved_header
173
- try:
174
- df = pd.DataFrame(rows, columns=header)
175
- all_dfs.append(df)
176
- except Exception as e:
177
- print(f"⚠️ Trang {idx} lỗi DataFrame: {e}")
178
-
179
- if all_dfs:
180
- final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all")
181
- csv_text = final_df.to_csv(index=False)
182
- print(f"✅ Trích xuất {len(final_df)} dòng, gửi Gemini xử lý JSON.")
183
- message = call_gemini_with_prompt(csv_text, question, model_choice, temperature, top_p)
184
  return message, None
185
  else:
186
- print("⚠️ Không bảng hợp lệ, fallback qua OCR bình thường.")
187
 
188
- # 2️⃣ Các loại file còn lại → xử lý như cũ
189
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
190
  genai.configure(api_key=api_key)
191
  model = genai.GenerativeModel(
@@ -193,34 +213,9 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
193
  generation_config={"temperature": temperature, "top_p": top_p}
194
  )
195
  uploaded = genai.upload_file(path=file.name)
196
- resp = model.generate_content([question or PROMPT_FREIGHT_JSON, uploaded])
197
  genai.delete_file(uploaded.name)
198
  return getattr(resp, "text", str(resp)), None
199
 
200
  except Exception as e:
201
  return f"ERROR: {type(e).__name__}: {e}", None
202
-
203
- # ================== UI ==================
204
def main():
    """Build the Gradio Blocks UI and wire the run button to ``run_process``.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    model_options = [*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME]

    with gr.Blocks(title="OCR + Table Extraction for Gemini") as app:
        # Components are created in display order.
        upload_box = gr.File(label="📂 Upload PDF / Image / CSV")
        prompt_box = gr.Textbox(label="Prompt", lines=2)
        model_picker = gr.Dropdown(
            choices=model_options,
            value="Gemini 2.5 Flash",
            label="Model",
        )
        temperature_slider = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
        top_p_slider = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
        api_url_box = gr.Textbox(label="External API URL", visible=False)
        result_view = gr.Code(label="Gemini Output", language="json")
        run_button = gr.Button("🚀 Run Extraction")

        # run_process returns a (text, extra) pair; the second value is
        # routed into a throwaway State component.
        run_button.click(
            run_process,
            inputs=[
                upload_box,
                prompt_box,
                model_picker,
                temperature_slider,
                top_p_slider,
                api_url_box,
            ],
            outputs=[result_view, gr.State()],
        )

    return app


# Built at import time so the app object exists as a module attribute.
demo = main()
if __name__ == "__main__":
    demo.launch()
 
1
+ import os, io, tempfile, mimetypes, camelot, pdfplumber, pandas as pd, google.generativeai as genai
2
+ import re
 
 
 
 
 
 
 
 
 
 
3
# SECURITY: a literal Google API key used to be committed on this line. Any key
# that has appeared in version control must be treated as leaked and revoked.
# Every caller resolves the key with
# os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY), so an empty fallback keeps
# the name available while forcing the credential to come from the environment.
DEFAULT_API_KEY = ""

# Display name -> model identifier handed to genai.GenerativeModel.
INTERNAL_MODEL_MAP = {
    "Gemini 2.5 Flash": "gemini-2.5-flash",
    "Gemini 2.5 Pro": "gemini-2.5-pro",
}
 
 
8
  PROMPT_FREIGHT_JSON = """
9
  Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
10
  {
 
80
  - Only return JSON.
81
  """
82
 
83
+ # ========== Helpers ==========
84
def _read_file_bytes(upload):
    """Return the raw bytes of an upload.

    Args:
        upload: Either a filesystem path given as ``str`` or any object that
            exposes a ``read()`` method.

    Returns:
        Whatever the file / ``read()`` call yields.

    Raises:
        TypeError: If ``upload`` is neither a ``str`` nor readable.
    """
    if not isinstance(upload, str):
        if hasattr(upload, "read"):
            return upload.read()
        raise TypeError("Unsupported file input")
    with open(upload, "rb") as handle:
        return handle.read()
90
 
91
def _guess_name_and_mime(file, file_bytes):
    """Derive a display filename and a MIME type for an upload.

    Args:
        file: Object with a ``name`` attribute, or anything whose ``str()`` is
            a path.
        file_bytes: Content of the upload; only its first four bytes are
            inspected, and only when the extension gives no MIME type.

    Returns:
        ``(basename, mime)``. ``mime`` falls back to ``application/pdf`` when
        the content starts with ``%PDF``, otherwise to
        ``application/octet-stream``.
    """
    source = file.name if hasattr(file, "name") else str(file)
    filename = os.path.basename(source)
    guessed = mimetypes.guess_type(filename)[0]
    if guessed:
        return filename, guessed
    # Extension was not conclusive: sniff the PDF magic number.
    if file_bytes[:4] == b"%PDF":
        return filename, "application/pdf"
    return filename, "application/octet-stream"
96
 
 
 
 
 
 
97
def check_pdf_structure(file_bytes: bytes) -> bool:
    """Return True if the PDF has more than two pages and looks tabular.

    Only the first three pages are probed with ``find_tables()``. Any error
    raised while opening or inspecting the document is printed and reported
    as ``False``.
    """
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            pages = pdf.pages
            if len(pages) > 2:
                # Stops at the first page that yields at least one table.
                return any(page.find_tables() for page in pages[:3])
            return False
    except Exception as e:
        print("PDF check error:", e)
        return False
 
106
 
107
+ # ========== 1️⃣ Extract tables with Camelot ==========
108
def extract_pdf_tables(file_path: str) -> pd.DataFrame:
    """Extract every table Camelot finds in a PDF into a single DataFrame.

    Lattice mode is tried first; stream mode runs only if lattice produced no
    tables (or raised). Failures of either mode are printed, not propagated.

    Args:
        file_path: Path of the PDF on disk.

    Returns:
        The concatenated tables, with the first row promoted to the header
        when the column labels are purely numeric. An empty DataFrame is
        returned when neither mode yields a table.
    """
    # (camelot flavor, start banner, label for messages, failure marker)
    attempts = (
        ("lattice", "🔍 Try lattice mode...", "Lattice", "⚠️"),
        ("stream", "🔁 Try stream mode...", "Stream", "❌"),
    )

    all_dfs = []
    for flavor, banner, label, fail_mark in attempts:
        try:
            print(banner)
            tables = camelot.read_pdf(file_path, flavor=flavor, pages="all")
            if tables.n > 0:
                all_dfs.extend(t.df for t in tables)
                print(f"✅ {label}: {tables.n} tables.")
        except Exception as e:
            print(f"{fail_mark} {label} failed: {e}")
        if all_dfs:
            # The first flavor that delivers anything wins.
            break

    if not all_dfs:
        print("🚫 No table detected.")
        return pd.DataFrame()

    df_final = pd.concat(all_dfs, ignore_index=True)
    # Positional labels (0, 1, 2, ...) mean the real header is still sitting in
    # the first data row. The length guard avoids an IndexError from iloc[0]
    # when the extracted tables contain no rows at all.
    if len(df_final) > 0 and all(str(c).isdigit() for c in df_final.columns):
        print("🧠 Detected numeric headers (0,1,2..), using first row as real header.")
        df_final.columns = df_final.iloc[0]
        df_final = df_final[1:]
    df_final = df_final.dropna(how="all").reset_index(drop=True)
    print(f"✅ Total: {len(df_final)} rows × {len(df_final.columns)} columns.")
    return df_final
141
+
142
+ # ========== 2️⃣ Extract the Note / Header section ==========
143
def extract_pdf_note(file_bytes: bytes) -> str:
    """Collect the note/header text at the top of the first PDF page.

    Looks at the first 15 text lines of page one (to avoid pulling in the
    table below) and keeps those matching one of the note keywords such as
    Start Date, Expiry or Origin, case-insensitively.

    Args:
        file_bytes: Raw content of the PDF document.

    Returns:
        The matching lines, stripped and joined with single spaces, or ``""``
        when nothing matches or extraction fails (the error is printed).
    """
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            page_text = pdf.pages[0].extract_text() or ""
        keyword = re.compile(
            r"(Start Date|Origin|Expiry|Product|MY|SC|All rates|Currency)", re.I
        )
        # Only the first 15 lines are considered so the table body is skipped.
        head_lines = page_text.splitlines()[:15]
        picked = [ln.strip() for ln in head_lines if keyword.search(ln)]
        return " ".join(picked).strip()
    except Exception as e:
        print(f"⚠️ Note extraction failed: {e}")
        return ""
163
+
164
+ # ========== 3️⃣ Call Gemini ==========
165
def call_gemini_with_prompt(csv_text: str, note_text: str, model_choice: str, temperature: float, top_p: float):
    """Ask Gemini to turn the extracted CSV table and notes into JSON.

    Args:
        csv_text: CSV rendering of the extracted table.
        note_text: Header/notes text; ``[No notes detected]`` is sent when
            this is empty.
        model_choice: Key into ``INTERNAL_MODEL_MAP``; unknown keys fall back
            to ``"gemini-2.5-flash"``.
        temperature: Passed through in ``generation_config``.
        top_p: Passed through in ``generation_config``.

    Returns:
        The response's ``text`` attribute, or ``str(response)`` if it has none.
    """
    genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY))

    model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
    generation_config = {"temperature": temperature, "top_p": top_p}
    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config=generation_config,
    )

    # NOTE(review): recovered from a diff view that strips indentation;
    # confirm the prompt lines really start at column 0 in the repository.
    prompt = f"""{PROMPT_FREIGHT_JSON}
Below is the extracted freight rate table (CSV) and additional notes:
Notes:
{note_text or '[No notes detected]'}
CSV:
{csv_text}
→ Convert to valid JSON as per schema above.
"""
    reply = model.generate_content(prompt)
    return getattr(reply, "text", str(reply))
182
 
183
+ # ========== 4️⃣ Main process ==========
184
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
185
  try:
186
  if file is None:
187
  return "❌ No file uploaded.", None
 
188
  file_bytes = _read_file_bytes(file)
189
  filename, mime = _guess_name_and_mime(file, file_bytes)
190
  print(f"[UPLOAD] {filename} ({mime})")
191
 
 
192
  if mime == "application/pdf" and check_pdf_structure(file_bytes):
193
+ print("➡️ PDF has multi-page tableextract before Gemini.")
194
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
195
+ tmp.write(file_bytes)
196
+ tmp_path = tmp.name
197
+
198
+ df = extract_pdf_tables(tmp_path)
199
+ if not df.empty:
200
+ note_text = extract_pdf_note(file_bytes)
201
+ csv_text = df.to_csv(index=False)
202
+ print("✅ Send table + note to Gemini...")
203
+ message = call_gemini_with_prompt(csv_text, note_text, model_choice, temperature, top_p)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  return message, None
205
  else:
206
+ print("⚠️ No valid table found fallback to OCR Gemini.")
207
 
208
+ # fallback OCR
209
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
210
  genai.configure(api_key=api_key)
211
  model = genai.GenerativeModel(
 
213
  generation_config={"temperature": temperature, "top_p": top_p}
214
  )
215
  uploaded = genai.upload_file(path=file.name)
216
+ resp = model.generate_content([PROMPT_FREIGHT_JSON, uploaded])
217
  genai.delete_file(uploaded.name)
218
  return getattr(resp, "text", str(resp)), None
219
 
220
  except Exception as e:
221
  return f"ERROR: {type(e).__name__}: {e}", None