vithacocf committed on
Commit
b7af253
·
verified ·
1 Parent(s): 770523c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -117
app.py CHANGED
@@ -1,36 +1,39 @@
1
  from __future__ import annotations
2
- import os, io, re, json, mimetypes, tempfile
3
- from typing import List, Union, Tuple
4
  from PIL import Image
5
  import pandas as pd
6
  import gradio as gr
7
  import google.generativeai as genai
8
  import requests
 
 
9
 
10
  # ================== CONFIG ==================
11
  DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"
 
12
  INTERNAL_MODEL_MAP = {
13
  "Gemini 2.5 Flash": "gemini-2.5-flash",
14
- "Gemini 2.5 Pro": "gemini-2.5-pro",
15
  }
16
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
17
 
18
  PROMPT_FREIGHT_JSON = """
19
- Please analyze the freight rate table in the CSV file I provide and convert it into JSON with this structure:
20
  {
21
  "shipping_line": "...",
22
  "shipping_line_code": "...",
23
  "shipping_line_reason": "Why this carrier is chosen?",
24
  "fee_type": "Air Freight",
25
- "valid_from": "...",
26
- "valid_to": "...",
27
  "charges": [
28
  {
29
  "frequency": "...",
30
  "package_type": "...",
31
  "aircraft_type": "...",
32
  "direction": "Export or Import or null",
33
- "origin": "...", # detect automatically from header, filename, or text (e.g. SGN/HAN/DAD)
34
  "destination": "...",
35
  "charge_name": "...",
36
  "charge_code": "...",
@@ -47,8 +50,10 @@ Please analyze the freight rate table in the CSV file I provide and convert it i
47
  "+300kg": ...,
48
  "+500kg": ...,
49
  "+1000kg": ...,
50
- "other": { key: value },
51
- "weight_breaks_reason": "Why chosen weight_breaks?"
 
 
52
  },
53
  "remark": "..."
54
  }
@@ -63,148 +68,159 @@ Please analyze the freight rate table in the CSV file I provide and convert it i
63
  }
64
  ]
65
  }
66
-
67
- Rules:
68
- - If filename or top text includes "Origin: SGN", "SGN", "HAN", or "DAD" → use as origin.
69
- - If missing, infer origin from file name (e.g., "TK - SGN Rate Sheet.csv" → SGN).
70
- - All rates must match the weight break columns (M, N, 45, 100, 300, 500, 1000, etc.).
71
- - No assumptions; set null if missing.
72
- - Only return valid JSON object as above.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  """
74
 
75
  # ================== HELPERS ==================
76
- def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
77
- if upload is None:
78
- raise ValueError("No file uploaded.")
79
- if isinstance(upload, (str, os.PathLike)):
80
  with open(upload, "rb") as f:
81
  return f.read()
82
- if isinstance(upload, dict) and "path" in upload:
83
- with open(upload["path"], "rb") as f:
84
- return f.read()
85
- if hasattr(upload, "read"):
86
  return upload.read()
87
- raise TypeError(f"Unsupported file object: {type(upload)}")
88
 
89
  def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
90
- if isinstance(file, (str, os.PathLike)):
91
- filename = os.path.basename(str(file))
92
- elif isinstance(file, dict) and "name" in file:
93
- filename = os.path.basename(file["name"])
94
- elif isinstance(file, dict) and "path" in file:
95
- filename = os.path.basename(file["path"])
96
- else:
97
- filename = "upload.bin"
98
  mime, _ = mimetypes.guess_type(filename)
99
- if not mime:
100
- if filename.lower().endswith(".csv"):
101
- mime = "text/csv"
102
- elif len(file_bytes) >= 4 and file_bytes[:4] == b"%PDF":
103
- mime = "application/pdf"
104
- else:
105
- mime = "image/png"
106
- return filename, mime
107
-
108
-
109
- # ================== GEMINI PROCESS ==================
110
def run_gemini_text(file_bytes, filename, mime, model_choice, question, temperature, top_p):
    """Send CSV/text content to Gemini and return ``(response_text, None)``.

    Args:
        file_bytes: raw bytes of the uploaded file.
        filename: name of the upload; included in the prompt.
        mime: MIME type of the upload, used to decide whether to parse as CSV.
        model_choice: UI label looked up in ``INTERNAL_MODEL_MAP``; unknown
            labels fall back to ``"gemini-2.5-flash"``.
        question: optional custom prompt. Empty or whitespace-only input falls
            back to ``PROMPT_FREIGHT_JSON``.
        temperature, top_p: generation settings, coerced to ``float``.

    Returns:
        ``(text, None)``; ``text`` is an ``ERROR: ...`` message when no API key
        is configured.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    if not api_key:
        return "ERROR: Missing GOOGLE_API_KEY.", None

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(
        model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
        generation_config={"temperature": float(temperature), "top_p": float(top_p)},
    )

    # Parse CSV uploads with pandas when possible; fall back to the raw text.
    csv_text = None
    if mime == "text/csv" or filename.lower().endswith(".csv"):
        try:
            csv_text = pd.read_csv(io.BytesIO(file_bytes)).to_csv(index=False)
        except Exception:
            csv_text = file_bytes.decode("utf-8", errors="ignore")
    table_text = csv_text or file_bytes.decode("utf-8", errors="ignore")

    # Fix: a whitespace-only question used to produce an empty prompt; treat it
    # like a missing one and use the default freight prompt.
    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
    full_prompt = (
        f"{user_prompt}\n\n"
        f"Filename: {filename}\n\n"
        f"Below is the table text extracted from your CSV file:\n{table_text}\n\n"
        "Please analyze and return valid JSON only."
    )

    resp = model.generate_content(full_prompt)
    return resp.text.strip(), None
141
 
 
 
 
142
 
143
- # ================== EXTERNAL API (if any) ==================
144
def run_process_external(file_bytes, filename, mime, question, api_url, temperature, top_p):
    """POST the upload to an external API and return ``(response_text, None)``.

    Returns an ``ERROR: ...`` message as the text when ``api_url`` is empty or
    the server answers with an HTTP status of 400 or above.
    """
    if not api_url:
        return "ERROR: Missing external API endpoint.", None

    form_fields = {
        "prompt": question or "",
        "temperature": str(temperature),
        "top_p": str(top_p),
    }
    attachment = {"file": (filename, file_bytes, mime)}
    response = requests.post(api_url, files=attachment, data=form_fields, timeout=60)

    if response.status_code < 400:
        return response.text, None
    # Only the first 200 characters of the error body are reported.
    return f"ERROR: External API HTTP {response.status_code}: {response.text[:200]}", None
 
 
 
 
 
153
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
- # ================== MAIN ROUTER ==================
156
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
157
  try:
158
  if file is None:
159
- return "ERROR: No file uploaded.", None
160
 
161
  file_bytes = _read_file_bytes(file)
162
  filename, mime = _guess_name_and_mime(file, file_bytes)
163
-
164
- print(f"[INFO] Processing {filename} ({mime})...")
165
-
166
- # Nếu CSV đọc text & gửi Gemini
167
- if mime == "text/csv" or filename.lower().endswith(".csv"):
168
- print("🟢 Detected CSV file → Sending to Gemini for JSON conversion.")
169
- return run_gemini_text(file_bytes, filename, mime, model_choice, question, temperature, top_p)
170
-
171
- # Nếu chọn external
172
- if model_choice == EXTERNAL_MODEL_NAME:
173
- return run_process_external(
174
- file_bytes=file_bytes, filename=filename, mime=mime,
175
- question=question, api_url=external_api_url,
176
- temperature=temperature, top_p=top_p
177
- )
178
-
179
- # fallback: PDF / image
180
- return "⚠️ Only CSV supported in this version. Please upload .csv file.", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  except Exception as e:
183
- return f"ERROR: {type(e).__name__}: {str(e)}", None
184
-
185
 
186
  # ================== UI ==================
187
  def main():
188
- with gr.Blocks(title="CSV JSON Converter (Gemini)") as demo:
189
- gr.Markdown("## 📦 Upload CSV Gemini generates structured JSON")
190
-
191
- file = gr.File(label="Upload CSV file")
192
- question = gr.Textbox(label="Custom Prompt (optional)", lines=2)
193
  model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
194
  value="Gemini 2.5 Flash", label="Model")
195
  temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
196
  top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
197
  external_api_url = gr.Textbox(label="External API URL", visible=False)
198
  output_text = gr.Code(label="Gemini Output", language="json")
199
- run_btn = gr.Button("🚀 Convert to JSON")
200
 
201
- run_btn.click(run_process,
202
- inputs=[file, question, model_choice, temperature, top_p, external_api_url],
203
- outputs=[output_text, gr.State()])
 
 
204
 
205
  return demo
206
 
207
-
208
  demo = main()
209
  if __name__ == "__main__":
210
  demo.launch()
 
1
  from __future__ import annotations
2
+ import os, io, re, json, time, mimetypes, tempfile
3
+ from typing import List, Union, Tuple, Any
4
  from PIL import Image
5
  import pandas as pd
6
  import gradio as gr
7
  import google.generativeai as genai
8
  import requests
9
+ import pdfplumber
10
+ import fitz # PyMuPDF
11
 
12
  # ================== CONFIG ==================
13
  DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"
14
+
15
  INTERNAL_MODEL_MAP = {
16
  "Gemini 2.5 Flash": "gemini-2.5-flash",
17
+ "Gemini 2.5 Pro": "gemini-2.5-pro",
18
  }
19
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
20
 
21
  PROMPT_FREIGHT_JSON = """
22
+ Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
23
  {
24
  "shipping_line": "...",
25
  "shipping_line_code": "...",
26
  "shipping_line_reason": "Why this carrier is chosen?",
27
  "fee_type": "Air Freight",
28
+ "valid_from": ...,
29
+ "valid_to": ...,
30
  "charges": [
31
  {
32
  "frequency": "...",
33
  "package_type": "...",
34
  "aircraft_type": "...",
35
  "direction": "Export or Import or null",
36
+ "origin": "...",
37
  "destination": "...",
38
  "charge_name": "...",
39
  "charge_code": "...",
 
50
  "+300kg": ...,
51
  "+500kg": ...,
52
  "+1000kg": ...,
53
+ "other": {
54
+ key: value
55
+ },
56
+ "weight_breaks_reason":"Why chosen weight_breaks?"
57
  },
58
  "remark": "..."
59
  }
 
68
  }
69
  ]
70
  }
71
+ ### Date rules
72
+ - valid_from format:
73
+ - `DD/MM/YYYY` (if full date)
74
+ - `01/MM/YYYY` (if month+year only)
75
+ - `01/01/YYYY` (if year only)
76
+ - `UFN` if missing
77
+ - valid_to:
78
+ - exact `DD/MM/YYYY` if present
79
+ - else `UFN`
80
+ STRICT RULES:
81
+ - ONLY return a single JSON object as specified above.
82
+ - All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
83
+ - If the table shows "RQ" or similar, set value as "RQST".
84
+ - Group same-price destinations into one record separated by "/".
85
+ - Always use IATA code for origin and destination.
86
+ - Flight number (e.g. ZH118) is not charge code.
87
+ - Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
88
+ - If local charges exist, list them.
89
+ - If validity missing, set null.
90
+ - Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
91
+ - Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
92
+ - Replace commas in remarks with semicolons.
93
+ - Only return JSON.
94
  """
95
 
96
  # ================== HELPERS ==================
97
def _read_file_bytes(upload):
    """Return the raw bytes of an uploaded file.

    Accepts a filesystem path (``str`` or ``os.PathLike``), a dict carrying a
    ``"path"`` key, or any object exposing ``read()``.

    Raises:
        TypeError: if ``upload`` is none of the supported kinds.
    """
    # Dict payloads carry the temp-file location under "path".
    if isinstance(upload, dict) and "path" in upload:
        upload = upload["path"]

    # os.PathLike (e.g. pathlib.Path) is accepted alongside plain strings.
    if isinstance(upload, (str, os.PathLike)):
        with open(upload, "rb") as f:
            return f.read()

    if hasattr(upload, "read"):
        return upload.read()

    raise TypeError("Unsupported file input")
104
 
105
  def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
106
+ filename = os.path.basename(file.name if hasattr(file, "name") else str(file))
 
 
 
 
 
 
 
107
  mime, _ = mimetypes.guess_type(filename)
108
+ if not mime and file_bytes[:4] == b"%PDF":
109
+ mime = "application/pdf"
110
+ return filename, mime or "application/octet-stream"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
def pdf_to_images(pdf_bytes: bytes, dpi: int = 200) -> list[Image.Image]:
    """Render every page of a PDF into a PIL RGB image.

    Args:
        pdf_bytes: content of the PDF file.
        dpi: rendering resolution passed to ``Page.get_pixmap`` (default 200).

    Returns:
        One image per page, in page order.
    """
    images = []
    # Open as a context manager so the document is closed when rendering ends.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            # Render once per page; the old code rasterised each page three
            # times just to read width, height and samples.
            pix = page.get_pixmap(dpi=dpi)
            # NOTE(review): assumes get_pixmap's default output is RGB without
            # alpha, matching the "RGB" mode used here — confirm.
            images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    return images
115
 
116
+ # ================== PDF CHECK ==================
117
def check_pdf_structure(file_bytes: bytes) -> bool:
    """Return True when the PDF has more than two pages and a table is found
    on one of its first three pages; False otherwise, including on any error."""
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            pages = pdf.pages
            # Documents of one or two pages are never treated as table PDFs.
            if len(pages) > 2:
                return any(pg.find_tables() for pg in pages[:3])
            return False
    except Exception as e:
        print("PDF check error:", e)
        return False
131
 
132
+ # ================== GEMINI CALL ==================
133
def call_gemini_with_prompt(content_text: str, question: str, model_choice: str, temperature: float, top_p: float):
    """Ask Gemini to convert extracted table text into freight JSON.

    The prompt is ``PROMPT_FREIGHT_JSON``, then the optional ``question``, then
    ``content_text``.

    Args:
        content_text: extracted table data, as CSV text.
        question: optional extra instructions appended to the base prompt.
        model_choice: UI label looked up in ``INTERNAL_MODEL_MAP``; unknown
            labels fall back to ``"gemini-2.5-flash"``.
        temperature, top_p: generation settings passed to the model.

    Returns:
        The response's ``text`` attribute, or ``str(response)`` if it has none.

    Raises:
        RuntimeError: if no API key is configured.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    if not api_key:
        # Fail with a clear message instead of an opaque error from the SDK.
        raise RuntimeError("Missing GOOGLE_API_KEY.")

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(
        model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
        generation_config={"temperature": temperature, "top_p": top_p},
    )
    prompt = f"{PROMPT_FREIGHT_JSON}\n{question or ''}\n\nBelow is the extracted CSV data:\n{content_text}"
    response = model.generate_content(prompt)
    return getattr(response, "text", str(response))
143
 
144
+ # ================== MAIN LOGIC ==================
145
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
146
  try:
147
  if file is None:
148
+ return " No file uploaded.", None
149
 
150
  file_bytes = _read_file_bytes(file)
151
  filename, mime = _guess_name_and_mime(file, file_bytes)
152
+ print(f"[UPLOAD] {filename} ({mime})")
153
+
154
+ # 1️⃣ Nếu là PDF và có nhiều trang dạng bảng
155
+ if mime == "application/pdf" and check_pdf_structure(file_bytes):
156
+ print("➡️ PDF nhiều trang & dạng bảng → trích xuất CSV trước khi gọi Gemini.")
157
+ all_dfs, saved_header = [], None
158
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
159
+ for idx, page in enumerate(pdf.pages, start=1):
160
+ table = page.extract_table({
161
+ "vertical_strategy": "lines",
162
+ "horizontal_strategy": "text",
163
+ "snap_tolerance": 3,
164
+ "intersection_tolerance": 5,
165
+ })
166
+ if not table or len(table) < 2:
167
+ continue
168
+ header, rows = table[0], table[1:]
169
+ if saved_header is None:
170
+ saved_header = header
171
+ elif len(header) < len(saved_header):
172
+ header = saved_header
173
+ try:
174
+ df = pd.DataFrame(rows, columns=header)
175
+ all_dfs.append(df)
176
+ except Exception as e:
177
+ print(f"⚠️ Trang {idx} lỗi DataFrame: {e}")
178
+
179
+ if all_dfs:
180
+ final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all")
181
+ csv_text = final_df.to_csv(index=False)
182
+ print(f"✅ Trích xuất {len(final_df)} dòng, gửi Gemini xử lý JSON.")
183
+ message = call_gemini_with_prompt(csv_text, question, model_choice, temperature, top_p)
184
+ return message, None
185
+ else:
186
+ print("⚠️ Không có bảng hợp lệ, fallback qua OCR bình thường.")
187
+
188
+ # 2️⃣ Các loại file còn lại → xử lý như cũ
189
+ api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
190
+ genai.configure(api_key=api_key)
191
+ model = genai.GenerativeModel(
192
+ model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
193
+ generation_config={"temperature": temperature, "top_p": top_p}
194
+ )
195
+ uploaded = genai.upload_file(path=file.name)
196
+ resp = model.generate_content([question or PROMPT_FREIGHT_JSON, uploaded])
197
+ genai.delete_file(uploaded.name)
198
+ return getattr(resp, "text", str(resp)), None
199
 
200
  except Exception as e:
201
+ return f"ERROR: {type(e).__name__}: {e}", None
 
202
 
203
  # ================== UI ==================
204
  def main():
205
+ with gr.Blocks(title="OCR + Table Extraction for Gemini") as demo:
206
+ file = gr.File(label="📂 Upload PDF / Image / CSV")
207
+ question = gr.Textbox(label="Prompt", lines=2)
 
 
208
  model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
209
  value="Gemini 2.5 Flash", label="Model")
210
  temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
211
  top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
212
  external_api_url = gr.Textbox(label="External API URL", visible=False)
213
  output_text = gr.Code(label="Gemini Output", language="json")
214
+ run_btn = gr.Button("🚀 Run Extraction")
215
 
216
+ run_btn.click(
217
+ run_process,
218
+ inputs=[file, question, model_choice, temperature, top_p, external_api_url],
219
+ outputs=[output_text, gr.State()]
220
+ )
221
 
222
  return demo
223
 
 
224
  demo = main()
225
  if __name__ == "__main__":
226
  demo.launch()