vithacocf committed on
Commit
ffe88dd
·
verified ·
1 Parent(s): 2e92701

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -19
app.py CHANGED
@@ -196,9 +196,62 @@ def extract_pdf_note(file_path: str) -> str:
196
  except Exception as e:
197
  print(f"⚠️ extract_pdf_note lỗi: {e}")
198
  return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
- def call_gemini_with_prompt(content_text: str, note_text: str, question: str, model_choice: str, temperature: float, top_p: float):
201
- """Gửi bảng + note vào Gemini (ưu tiên prompt tùy chỉnh nếu có)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
203
  genai.configure(api_key=api_key)
204
 
@@ -210,23 +263,39 @@ def call_gemini_with_prompt(content_text: str, note_text: str, question: str, mo
210
  }
211
  )
212
 
213
- # Nếu user không nhập câu hỏi riêng, dùng prompt chuẩn FREIGHT_JSON
214
  base_prompt = question.strip() if question and question.strip() else PROMPT_FREIGHT_JSON
215
 
216
- prompt = f"""
217
- {base_prompt}
218
-
219
- Below is the extracted CSV data:
220
- {content_text}
221
-
222
- Below are the notes extracted from the PDF (e.g. Valid From, Origin, Remark, Package Type rules):
223
- {note_text}
224
-
225
- Please analyze all data and generate the JSON output following the schema above.
226
- """
227
-
228
- print("🧠 Sending prompt to Gemini...")
229
- response = model.generate_content(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  result_text = getattr(response, "text", str(response))
231
 
232
  return result_text
@@ -251,11 +320,11 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
251
  # 1️⃣ Extract bảng bằng Camelot
252
  df = extract_pdf_tables(tmp_path)
253
  note_text = extract_pdf_note(tmp_path)
254
-
255
  if not df.empty:
256
  csv_text = df.to_csv(index=False)
257
  print("✅ Gửi Gemini để sinh JSON...")
258
- message = call_gemini_with_prompt(csv_text, note_text, question, model_choice, temperature, top_p)
259
  return message, None
260
  else:
261
  print("⚠️ Không có bảng hợp lệ, fallback OCR Gemini.")
 
196
  except Exception as e:
197
  print(f"⚠️ extract_pdf_note lỗi: {e}")
198
  return ""
199
def extract_airline_header_via_ocr(file_path: str) -> str:
    """
    Identify the airline header on the first page of a PDF via Gemini OCR.

    Only the first page is rendered, at 120 DPI, and re-encoded as a
    quality-60 JPEG so the upload stays small.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        The stripped text of the Gemini response (the prompt asks for JSON
        with airline_name, airline_code, title, valid_from and valid_to),
        or an empty string when the response carries no text.

    Raises:
        Whatever PyMuPDF, Pillow or the Gemini client raise; errors are not
        swallowed here, but local and remote temporary files are cleaned up
        before the exception propagates.
    """
    import contextlib
    import io
    import os
    import tempfile

    import fitz
    import google.generativeai as genai
    from PIL import Image

    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-2.5-flash")

    # Render the first page to PNG bytes at a low DPI. The context manager
    # closes the PDF handle, which the previous version leaked.
    with fitz.open(file_path) as pdf:
        png_bytes = pdf[0].get_pixmap(dpi=120).tobytes("png")
    img = Image.open(io.BytesIO(png_bytes)).convert("RGB")

    # Prompt asking the model to recognise the header fields.
    prompt = """
    Identify from this airline rate sheet:
    - Airline name (e.g. Qatar Airways, Turkish Airlines)
    - Airline code (e.g. QR, TK, EK, VN)
    - Title (e.g. SGN PRICING NOV25)
    - Validity info (e.g. Effective from 01 Nov 2025, Until Further Notice)
    Return JSON with fields: airline_name, airline_code, title, valid_from, valid_to.
    """

    # mkstemp hands back an already-open descriptor; close it first so the
    # path can be written by Pillow on every platform.
    fd, img_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        # Lower JPEG quality keeps the uploaded file small.
        img.save(img_path, format="JPEG", quality=60, optimize=True)
        uploaded = genai.upload_file(path=img_path, mime_type="image/jpeg")
        try:
            resp = model.generate_content([prompt, uploaded])
        finally:
            # Remove the remote copy even when generation fails.
            genai.delete_file(uploaded.name)
    finally:
        # Best-effort removal of the local temp image (previously never deleted).
        with contextlib.suppress(OSError):
            os.remove(img_path)

    # Guard against a missing or None text attribute before stripping.
    result = (getattr(resp, "text", "") or "").strip()
    print("🛫 OCR header (compressed):", result)
    return result
241
+ def call_gemini_with_prompt(
242
+ header: str,
243
+ content_text: str,
244
+ note_text: str,
245
+ question: str,
246
+ model_choice: str,
247
+ temperature: float,
248
+ top_p: float
249
+ ):
250
+ """
251
+ Gửi header + bảng CSV + note vào Gemini.
252
+ Ưu tiên: nếu user nhập prompt riêng → dùng prompt đó, ngược lại dùng PROMPT_FREIGHT_JSON.
253
+ Header (nếu có) sẽ được chèn thêm vào đầu để giúp model nhận diện hãng bay, thời gian hiệu lực, v.v.
254
+ """
255
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
256
  genai.configure(api_key=api_key)
257
 
 
263
  }
264
  )
265
 
266
+ # --- Xác định prompt chính ---
267
  base_prompt = question.strip() if question and question.strip() else PROMPT_FREIGHT_JSON
268
 
269
+ # --- Ghép nội dung ---
270
+ prompt_parts = [base_prompt]
271
+
272
+ if header and header.strip():
273
+ prompt_parts.append(f"""
274
+ ### Header information (from first page OCR or PDF header):
275
+ {header}
276
+ """)
277
+
278
+ prompt_parts.append(f"""
279
+ ### Extracted table data (CSV format):
280
+ {content_text}
281
+ """)
282
+
283
+ if note_text and note_text.strip():
284
+ prompt_parts.append(f"""
285
+ ### Notes or remarks extracted from the PDF:
286
+ {note_text}
287
+ """)
288
+
289
+ prompt_parts.append("""
290
+ Please analyze all data (header + table + notes) and generate the final JSON output
291
+ following the defined schema above. Ensure that any airline, date, or rule from header/note
292
+ is merged into the JSON result (e.g. shipping_line, valid_from, valid_to, remarks, etc.).
293
+ """)
294
+
295
+ full_prompt = "\n".join(prompt_parts)
296
+
297
+ print("🧠 Sending full prompt (with header if available) to Gemini...")
298
+ response = model.generate_content(full_prompt)
299
  result_text = getattr(response, "text", str(response))
300
 
301
  return result_text
 
320
  # 1️⃣ Extract bảng bằng Camelot
321
  df = extract_pdf_tables(tmp_path)
322
  note_text = extract_pdf_note(tmp_path)
323
+ header = extract_airline_header_via_ocr()
324
  if not df.empty:
325
  csv_text = df.to_csv(index=False)
326
  print("✅ Gửi Gemini để sinh JSON...")
327
+ message = call_gemini_with_prompt(header, csv_text, note_text, question, model_choice, temperature, top_p)
328
  return message, None
329
  else:
330
  print("⚠️ Không có bảng hợp lệ, fallback OCR Gemini.")