vithacocf committed on
Commit
770523c
·
verified ·
1 Parent(s): 0dafa97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -177
app.py CHANGED
@@ -1,13 +1,11 @@
1
  from __future__ import annotations
2
- import os, io, re, json, time, mimetypes, tempfile
3
- from typing import List, Union, Tuple, Any
4
  from PIL import Image
5
  import pandas as pd
6
  import gradio as gr
7
  import google.generativeai as genai
8
  import requests
9
- import pdfplumber
10
- import fitz # PyMuPDF
11
 
12
  # ================== CONFIG ==================
# SECURITY: a Google API key was hard-coded here in plain text. It has been
# removed; provide the key through the GOOGLE_API_KEY environment variable.
# Treat the previously committed key as compromised and revoke it.
DEFAULT_API_KEY = ""
@@ -18,21 +16,21 @@ INTERNAL_MODEL_MAP = {
18
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
19
 
20
  PROMPT_FREIGHT_JSON = """
21
- Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
22
  {
23
  "shipping_line": "...",
24
  "shipping_line_code": "...",
25
  "shipping_line_reason": "Why this carrier is chosen?",
26
  "fee_type": "Air Freight",
27
- "valid_from": ...,
28
- "valid_to": ...,
29
  "charges": [
30
  {
31
  "frequency": "...",
32
  "package_type": "...",
33
  "aircraft_type": "...",
34
  "direction": "Export or Import or null",
35
- "origin": "...",
36
  "destination": "...",
37
  "charge_name": "...",
38
  "charge_code": "...",
@@ -49,10 +47,8 @@ Please analyze the freight rate table in the file I provide and convert it into
49
  "+300kg": ...,
50
  "+500kg": ...,
51
  "+1000kg": ...,
52
- "other": {
53
- key: value
54
- },
55
- "weight_breaks_reason":"Why chosen weight_breaks?"
56
  },
57
  "remark": "..."
58
  }
@@ -67,39 +63,17 @@ Please analyze the freight rate table in the file I provide and convert it into
67
  }
68
  ]
69
  }
70
- ### Date rules
71
- - valid_from format:
72
- - `DD/MM/YYYY` (if full date)
73
- - `01/MM/YYYY` (if month+year only)
74
- - `01/01/YYYY` (if year only)
75
- - `UFN` if missing
76
- - valid_to:
77
- - exact `DD/MM/YYYY` if present
78
- - else `UFN`
79
- STRICT RULES:
80
- - ONLY return a single JSON object as specified above.
81
- - All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
82
- - If the table shows "RQ" or similar, set value as "RQST".
83
- - Group same-price destinations into one record separated by "/".
84
- - Always use IATA code for origin and destination.
85
- - Flight number (e.g. ZH118) is not charge code.
86
- - Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
87
- - If local charges exist, list them.
88
- - If validity missing, set null.
89
- - Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
90
- - Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
91
- - Replace commas in remarks with semicolons.
92
- - Only return JSON.
93
- """
94
-
95
- try:
96
- RESAMPLE = Image.Resampling.LANCZOS
97
- except AttributeError:
98
- RESAMPLE = Image.LANCZOS
99
 
 
 
 
 
 
 
 
100
 
101
  # ================== HELPERS ==================
102
- def _read_file_bytes(upload):
103
  if upload is None:
104
  raise ValueError("No file uploaded.")
105
  if isinstance(upload, (str, os.PathLike)):
@@ -123,150 +97,87 @@ def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
123
  filename = "upload.bin"
124
  mime, _ = mimetypes.guess_type(filename)
125
  if not mime:
126
- if len(file_bytes) >= 4 and file_bytes[:4] == b"%PDF":
 
 
127
  mime = "application/pdf"
128
- if not filename.lower().endswith(".pdf"):
129
- filename += ".pdf"
130
  else:
131
  mime = "image/png"
132
  return filename, mime
133
 
134
 
135
- # ================== PDF AUTO-EXTRACT ==================
136
- def extract_pdf_table_safely(file_bytes: bytes, filename: str):
137
- """Tự động đọc PDF bảng nhiều trang, fix lệch header + Origin."""
138
- print(f"[PDF Extract] {filename}: bắt đầu phân tích bằng pdfplumber...")
139
- try:
140
- pdf = pdfplumber.open(io.BytesIO(file_bytes))
141
- except Exception as e:
142
- print(f"❌ Không mở được PDF: {e}")
143
- return None, None
144
-
145
- table_data = []
146
- header = None
147
- origin = None
148
-
149
- for i, page in enumerate(pdf.pages, start=1):
150
- print(f"📄 Trang {i}...")
151
-
152
- # tìm Origin
153
- if i == 1:
154
- text_page = page.extract_text() or ""
155
- m = re.search(r"Origin\s*:\s*([A-Z]{3})", text_page)
156
- if m:
157
- origin = m.group(1).strip()
158
- print(f"✅ Origin phát hiện: {origin}")
159
- else:
160
- origin = "UNK"
161
-
162
- tables = page.extract_tables({
163
- "vertical_strategy": "lines",
164
- "horizontal_strategy": "text",
165
- "snap_tolerance": 3,
166
- "intersection_tolerance": 5,
167
- })
168
-
169
- if not tables:
170
- print(f"⚠️ Trang {i}: không có bảng hợp lệ.")
171
- continue
172
-
173
- for table in tables:
174
- if not table or len(table) < 2:
175
- continue
176
-
177
- if header is None:
178
- header = table[0]
179
- print(f"✅ Header đầu tiên: {header}")
180
- df = pd.DataFrame(table[1:], columns=header)
181
- else:
182
- try:
183
- df = pd.DataFrame(table, columns=header)
184
- except Exception as e:
185
- print(f"⚠️ Trang {i}: lỗi DataFrame {e} → cân chỉnh cột lại.")
186
- n_col = min(len(header), len(table[0]))
187
- df = pd.DataFrame([r[:n_col] for r in table], columns=header[:n_col])
188
-
189
- df["ORIGIN"] = origin
190
- df = df[df[header[0]] != header[0]]
191
- table_data.append(df)
192
-
193
- pdf.close()
194
- if not table_data:
195
- print("❌ Không có bảng hợp lệ trong PDF.")
196
- return None, None
197
-
198
- final_df = pd.concat(table_data, ignore_index=True)
199
- print(f"✅ Tổng cộng {len(final_df)} dòng, {len(final_df.columns)} cột.")
200
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
201
- final_df.to_excel(tmp.name, index=False)
202
- print(f"💾 Excel tạm: {tmp.name}")
203
- return final_df, tmp.name
204
-
205
-
206
- # ================== OCR CORE ==================
207
- def pdf_to_images(pdf_bytes: bytes):
208
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
209
- pages = []
210
- for p in doc:
211
- pix = p.get_pixmap(dpi=200)
212
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
213
- pages.append(img)
214
- return pages
215
-
216
- def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
217
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
 
 
 
218
  genai.configure(api_key=api_key)
219
  model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
220
- model = genai.GenerativeModel(model_name=model_name, generation_config={"temperature": float(temperature), "top_p": float(top_p)})
221
-
222
- if file_bytes[:4] == b"%PDF":
223
- pages = pdf_to_images(file_bytes)
224
- else:
225
- pages = [Image.open(io.BytesIO(file_bytes))]
226
-
227
- user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
228
- all_text_results = []
229
-
230
- for i in range(0, len(pages), batch_size):
231
- batch = pages[i:i+batch_size]
232
- uploaded = []
233
- for im in batch:
234
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
235
- im.save(tmp.name)
236
- up = genai.upload_file(path=tmp.name, mime_type="image/png")
237
- up = genai.get_file(up.name)
238
- uploaded.append(up)
239
-
240
- resp = model.generate_content([user_prompt] + uploaded)
241
- all_text_results.append(resp.text)
242
- for up in uploaded:
243
- try:
244
- genai.delete_file(up.name)
245
- except:
246
- pass
247
-
248
- return "\n\n".join(all_text_results), None
249
-
250
-
251
- # ================== ROUTER ==================
 
 
 
 
 
 
252
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
253
  try:
 
 
 
254
  file_bytes = _read_file_bytes(file)
255
  filename, mime = _guess_name_and_mime(file, file_bytes)
256
 
257
- # STEP 1️⃣: Auto-detect PDF table
258
- if mime == "application/pdf":
259
- print(f"[CHECK] {filename}: PDF detectedthử extract bảng trước...")
260
- df, tmp_path = extract_pdf_table_safely(file_bytes, filename)
261
- if df is not None and len(df) > 0:
262
- print("✅ PDF bảng skip OCR.")
263
- preview_text = f"Extracted {len(df)} rows from {filename}. Origin={df['ORIGIN'].iloc[0]}"
264
- return preview_text, None
265
- else:
266
- print("⚠️ PDF không rõ cấu trúc → fallback sang OCR.")
 
 
 
 
267
 
268
- # STEP 2️⃣: Nếu không phải bảng → OCR Gemini
269
- return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)
270
 
271
  except Exception as e:
272
  return f"ERROR: {type(e).__name__}: {str(e)}", None
@@ -274,18 +185,18 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
274
 
275
  # ================== UI ==================
276
  def main():
277
- with gr.Blocks(title="OCR Hybrid Extractor") as demo:
278
- gr.Markdown("## 📦 Hybrid OCR: pdfplumber → Gemini Fallback")
279
 
280
- file = gr.File(label="Upload PDF/Image")
281
- question = gr.Textbox(label="Prompt", lines=2)
282
  model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
283
  value="Gemini 2.5 Flash", label="Model")
284
  temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
285
  top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
286
  external_api_url = gr.Textbox(label="External API URL", visible=False)
287
- output_text = gr.Code(label="Output", language="json")
288
- run_btn = gr.Button("🚀 Process")
289
 
290
  run_btn.click(run_process,
291
  inputs=[file, question, model_choice, temperature, top_p, external_api_url],
 
1
  from __future__ import annotations
2
+ import os, io, re, json, mimetypes, tempfile
3
+ from typing import List, Union, Tuple
4
  from PIL import Image
5
  import pandas as pd
6
  import gradio as gr
7
  import google.generativeai as genai
8
  import requests
 
 
9
 
10
  # ================== CONFIG ==================
# SECURITY: a Google API key was hard-coded here in plain text. It has been
# removed; provide the key through the GOOGLE_API_KEY environment variable.
# Treat the previously committed key as compromised and revoke it.
DEFAULT_API_KEY = ""
 
16
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
17
 
18
  PROMPT_FREIGHT_JSON = """
19
+ Please analyze the freight rate table in the CSV file I provide and convert it into JSON with this structure:
20
  {
21
  "shipping_line": "...",
22
  "shipping_line_code": "...",
23
  "shipping_line_reason": "Why this carrier is chosen?",
24
  "fee_type": "Air Freight",
25
+ "valid_from": "...",
26
+ "valid_to": "...",
27
  "charges": [
28
  {
29
  "frequency": "...",
30
  "package_type": "...",
31
  "aircraft_type": "...",
32
  "direction": "Export or Import or null",
33
+ "origin": "...", # detect automatically from header, filename, or text (e.g. SGN/HAN/DAD)
34
  "destination": "...",
35
  "charge_name": "...",
36
  "charge_code": "...",
 
47
  "+300kg": ...,
48
  "+500kg": ...,
49
  "+1000kg": ...,
50
+ "other": { key: value },
51
+ "weight_breaks_reason": "Why chosen weight_breaks?"
 
 
52
  },
53
  "remark": "..."
54
  }
 
63
  }
64
  ]
65
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ Rules:
68
+ - If filename or top text includes "Origin: SGN", "SGN", "HAN", or "DAD" → use as origin.
69
+ - If missing, infer origin from file name (e.g., "TK - SGN Rate Sheet.csv" → SGN).
70
+ - All rates must match the weight break columns (M, N, 45, 100, 300, 500, 1000, etc.).
71
+ - No assumptions; set null if missing.
72
+ - Only return valid JSON object as above.
73
+ """
74
 
75
  # ================== HELPERS ==================
76
+ def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
77
  if upload is None:
78
  raise ValueError("No file uploaded.")
79
  if isinstance(upload, (str, os.PathLike)):
 
97
  filename = "upload.bin"
98
  mime, _ = mimetypes.guess_type(filename)
99
  if not mime:
100
+ if filename.lower().endswith(".csv"):
101
+ mime = "text/csv"
102
+ elif len(file_bytes) >= 4 and file_bytes[:4] == b"%PDF":
103
  mime = "application/pdf"
 
 
104
  else:
105
  mime = "image/png"
106
  return filename, mime
107
 
108
 
109
+ # ================== GEMINI PROCESS ==================
def run_gemini_text(file_bytes, filename, mime, model_choice, question, temperature, top_p):
    """Send a CSV/text upload to Gemini and return the model's reply.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Upload name; included in the prompt and used to detect
            ``.csv`` files.
        mime: MIME type guessed for the upload.
        model_choice: Key looked up in ``INTERNAL_MODEL_MAP``; unknown keys
            fall back to ``"gemini-2.5-flash"``.
        question: Optional custom prompt. ``None``, empty, or whitespace-only
            values fall back to ``PROMPT_FREIGHT_JSON``.
        temperature: Sampling temperature, coerced to ``float``.
        top_p: Nucleus-sampling value, coerced to ``float``.

    Returns:
        A ``(text, None)`` tuple: the stripped model reply, or an ``ERROR:``
        message when no API key is available.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    if not api_key:
        return "ERROR: Missing GOOGLE_API_KEY.", None

    genai.configure(api_key=api_key)
    model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config={"temperature": float(temperature), "top_p": float(top_p)},
    )

    # Parse CSV uploads with pandas and re-serialise them; anything that
    # cannot be parsed is passed through as decoded text instead.
    table_text = None
    if mime == "text/csv" or filename.lower().endswith(".csv"):
        try:
            table_text = pd.read_csv(io.BytesIO(file_bytes)).to_csv(index=False)
        except Exception:
            # Deliberate best-effort: fall back to the raw text below.
            table_text = None
    if not table_text:
        table_text = file_bytes.decode("utf-8", errors="ignore")

    # A whitespace-only prompt is truthy, so strip before falling back;
    # otherwise an empty prompt would replace the default instructions.
    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
    full_prompt = (
        f"{user_prompt}\n\n"
        f"Filename: {filename}\n\n"
        f"Below is the table text extracted from your CSV file:\n{table_text}\n\n"
        "Please analyze and return valid JSON only."
    )

    resp = model.generate_content(full_prompt)
    return resp.text.strip(), None
141
+
142
+
143
+ # ================== EXTERNAL API (nếu có) ==================
def run_process_external(file_bytes, filename, mime, question, api_url, temperature, top_p, timeout=60):
    """Forward the upload to an external HTTP endpoint and return its reply.

    The file is posted as multipart form data together with the prompt and
    sampling settings as form fields.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Name sent for the multipart file part.
        mime: MIME type sent for the multipart file part.
        question: Prompt text; ``None`` or empty is sent as ``""``.
        api_url: Endpoint URL. ``None``, empty, or whitespace-only values are
            reported as a missing endpoint.
        temperature: Sampling temperature, sent as a string form field.
        top_p: Nucleus-sampling value, sent as a string form field.
        timeout: Seconds to wait for the endpoint; defaults to the 60 seconds
            that was previously hard-coded.

    Returns:
        A ``(text, None)`` tuple: the response body on success, or an
        ``ERROR:`` message for a missing endpoint or an HTTP status >= 400.
        Exceptions raised by ``requests.post`` are not caught here.
    """
    # Strip first so a blank textbox value is not handed to requests.
    endpoint = (api_url or "").strip()
    if not endpoint:
        return "ERROR: Missing external API endpoint.", None

    form_fields = {
        "prompt": question or "",
        "temperature": str(temperature),
        "top_p": str(top_p),
    }
    upload = {"file": (filename, file_bytes, mime)}
    response = requests.post(endpoint, files=upload, data=form_fields, timeout=timeout)

    if response.status_code >= 400:
        # Truncate the body so a large error page does not flood the UI.
        return f"ERROR: External API HTTP {response.status_code}: {response.text[:200]}", None
    return response.text, None
153
+
154
+
155
+ # ================== MAIN ROUTER ==================
def run_process(file, question, model_choice, temperature, top_p, external_api_url):
    """Route an uploaded file to the matching backend.

    CSV uploads go to ``run_gemini_text``. Any other upload goes to
    ``run_process_external`` when the external model is selected, and is
    otherwise rejected with a "CSV only" message.

    Args:
        file: The uploaded file object, or ``None`` when nothing was uploaded.
        question: Optional custom prompt forwarded to the backend.
        model_choice: Model name selected in the UI.
        temperature: Sampling temperature forwarded to the backend.
        top_p: Nucleus-sampling value forwarded to the backend.
        external_api_url: Endpoint used when the external model is selected.

    Returns:
        A ``(text, None)`` tuple. Any exception raised while processing is
        converted into an ``ERROR: <type>: <message>`` string rather than
        propagated, so the caller always receives a displayable result.
    """
    try:
        if file is None:
            return "ERROR: No file uploaded.", None

        file_bytes = _read_file_bytes(file)
        filename, mime = _guess_name_and_mime(file, file_bytes)

        print(f"[INFO] Processing {filename} ({mime})...")

        # CSV upload: read the text and send it to Gemini.
        if mime == "text/csv" or filename.lower().endswith(".csv"):
            print("🟢 Detected CSV file → Sending to Gemini for JSON conversion.")
            return run_gemini_text(file_bytes, filename, mime, model_choice, question, temperature, top_p)

        # Non-CSV upload with the external model selected.
        if model_choice == EXTERNAL_MODEL_NAME:
            return run_process_external(
                file_bytes=file_bytes, filename=filename, mime=mime,
                question=question, api_url=external_api_url,
                temperature=temperature, top_p=top_p,
            )

        # Fallback: PDF / image uploads are not handled in this version.
        return "⚠️ Only CSV supported in this version. Please upload .csv file.", None

    except Exception as e:
        # Top-level UI boundary: report the failure instead of raising.
        return f"ERROR: {type(e).__name__}: {str(e)}", None
 
185
 
186
  # ================== UI ==================
187
  def main():
188
+ with gr.Blocks(title="CSV JSON Converter (Gemini)") as demo:
189
+ gr.Markdown("## 📦 Upload CSV → Gemini generates structured JSON")
190
 
191
+ file = gr.File(label="Upload CSV file")
192
+ question = gr.Textbox(label="Custom Prompt (optional)", lines=2)
193
  model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
194
  value="Gemini 2.5 Flash", label="Model")
195
  temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
196
  top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
197
  external_api_url = gr.Textbox(label="External API URL", visible=False)
198
+ output_text = gr.Code(label="Gemini Output", language="json")
199
+ run_btn = gr.Button("🚀 Convert to JSON")
200
 
201
  run_btn.click(run_process,
202
  inputs=[file, question, model_choice, temperature, top_p, external_api_url],