vithacocf committed on
Commit
720645e
·
verified ·
1 Parent(s): 603a332

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -37
app.py CHANGED
@@ -361,10 +361,11 @@ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_cho
361
  genai.configure(api_key=api_key)
362
  model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
363
  print(f'Use key: {api_key}')
 
364
  model = genai.GenerativeModel(model_name=model_name,
365
  generation_config={"temperature": float(temperature), "top_p": float(top_p)})
366
  if file_bytes is None:
367
- response = model.generate_content(question)
368
  #print(response.text)
369
  return response.text, None
370
  pages = pdf_to_images(file_bytes)
@@ -377,7 +378,7 @@ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_cho
377
  im.save(tmp.name)
378
  up = genai.upload_file(path=tmp.name, mime_type="image/png")
379
  uploaded.append(genai.get_file(up.name))
380
- resp = model.generate_content([question] + uploaded)
381
  all_text_results.append(resp.text if hasattr(resp, "text") else "")
382
  for up in uploaded:
383
  try: genai.delete_file(up.name)
@@ -391,41 +392,122 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
391
  return "ERROR: No file uploaded.", None
392
  file_bytes = _read_file_bytes(file)
393
  filename, mime = _guess_name_and_mime(file, file_bytes)
394
- check_result = check_pdf_structure(file_bytes)
395
- if check_result > 1:
396
- base_name = os.path.splitext(filename)[0]
397
- tmp_dir = tempfile.gettempdir()
398
-
399
- # 🔁 Ghi file PDF tạm để xử
400
- tmp_pdf_path = os.path.join(tmp_dir, f"{base_name}.pdf")
401
- with open(tmp_pdf_path, "wb") as f:
402
- f.write(file_bytes)
403
-
404
- # 🔁 Tạo đường dẫn file Excel
405
- excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
406
-
407
- # 🛠 Gọi hàm xử lý
408
- export_pdf_to_excel(tmp_pdf_path, excel_path)
409
-
410
- chunks = split_excel_by_airline_header(excel_path)
411
- header, _ = run_process_internal_base_v2(
412
- file_bytes=file_bytes,
413
- filename=filename,
414
- mime=mime,
415
- question=PROMPT_FREIGHT_HEADER_JSON,
416
- model_choice=model_choice,
417
- temperature=temperature,
418
- top_p=top_p
419
- )
420
- print(header)
421
- chunk_files = []
422
- for airline, df_chunk in chunks.items():
423
- safe_name = re.sub(r"[^\w\s]", "", airline).replace(" ", "_")
424
- print (f'airline : {airline}')
425
- result = process_all_chunks_with_threadpool(chunks, PROMPT_FREIGHT_JSON, header, 5)
426
- return json.dumps(result, ensure_ascii=False, indent=2), None
427
- else:
428
- return "Only supports multi-airline PDF for now", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  except Exception as e:
430
  return f"ERROR: {type(e).__name__}: {str(e)}", None
431
 
 
361
  genai.configure(api_key=api_key)
362
  model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
363
  print(f'Use key: {api_key}')
364
+ user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
365
  model = genai.GenerativeModel(model_name=model_name,
366
  generation_config={"temperature": float(temperature), "top_p": float(top_p)})
367
  if file_bytes is None:
368
+ response = model.generate_content(user_prompt)
369
  #print(response.text)
370
  return response.text, None
371
  pages = pdf_to_images(file_bytes)
 
378
  im.save(tmp.name)
379
  up = genai.upload_file(path=tmp.name, mime_type="image/png")
380
  uploaded.append(genai.get_file(up.name))
381
+ resp = model.generate_content([user_prompt] + uploaded)
382
  all_text_results.append(resp.text if hasattr(resp, "text") else "")
383
  for up in uploaded:
384
  try: genai.delete_file(up.name)
 
392
  return "ERROR: No file uploaded.", None
393
  file_bytes = _read_file_bytes(file)
394
  filename, mime = _guess_name_and_mime(file, file_bytes)
395
+ # STEP 1️⃣: Check PDF structure
396
+ if mime == "application/pdf" or file_bytes[:4] == b"%PDF":
397
+ check_result = check_pdf_structure(file_bytes)
398
+ all_dfs = []
399
+ if check_result > 1:
400
+ print("➡️ PDF nhiều cột/nhiều trang dùng pdfplumber extract trước rồi Gemini.")
401
+
402
+ base_name = os.path.splitext(filename)[0]
403
+ tmp_dir = tempfile.gettempdir()
404
+
405
+ # 🔁 Ghi file PDF tạm để xử lý
406
+ tmp_pdf_path = os.path.join(tmp_dir, f"{base_name}.pdf")
407
+ with open(tmp_pdf_path, "wb") as f:
408
+ f.write(file_bytes)
409
+
410
+ # 🔁 Tạo đường dẫn file Excel
411
+ excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
412
+
413
+ # 🛠 Gọi hàm xử lý
414
+ export_pdf_to_excel(tmp_pdf_path, excel_path)
415
+
416
+ chunks = split_excel_by_airline_header(excel_path)
417
+ header, _ = run_process_internal_base_v2(
418
+ file_bytes=file_bytes,
419
+ filename=filename,
420
+ mime=mime,
421
+ question=PROMPT_FREIGHT_HEADER_JSON,
422
+ model_choice=model_choice,
423
+ temperature=temperature,
424
+ top_p=top_p
425
+ )
426
+ print(header)
427
+ chunk_files = []
428
+ for airline, df_chunk in chunks.items():
429
+ safe_name = re.sub(r"[^\w\s]", "", airline).replace(" ", "_")
430
+ print (f'airline : {airline}')
431
+ result = process_all_chunks_with_threadpool(chunks, PROMPT_FREIGHT_JSON, header, 5)
432
+ return json.dumps(result, ensure_ascii=False, indent=2), None
433
+ else:
434
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
435
+ for page_idx, page in enumerate(pdf.pages, start=1):
436
+ print(f"📄 Đang xử lý trang {page_idx}...")
437
+
438
+ table = page.extract_table({
439
+ "vertical_strategy": "lines",
440
+ "horizontal_strategy": "text",
441
+ "snap_tolerance": 3,
442
+ "intersection_tolerance": 5,
443
+ })
444
+
445
+ if not table or len(table) < 2:
446
+ print(f"⚠️ Trang {page_idx}: Không phát hiện bảng hợp lệ.")
447
+ continue
448
+
449
+ header = table[0]
450
+ rows = table[1:]
451
+
452
+ # Lưu header đầu tiên
453
+ if saved_header is None:
454
+ saved_header = header
455
+ print(f"✅ Trang {page_idx}: Lưu header đầu tiên: {saved_header}")
456
+
457
+ # Nếu trang sau không có header rõ → dùng header cũ
458
+ if len(header) < len(saved_header) or "REGION" not in header[0]:
459
+ print(f"↩️ Trang {page_idx}: Không có header rõ ràng, dùng lại header trước.")
460
+ header = saved_header
461
+ rows = table
462
+ else:
463
+ saved_header = header # cập nhật header hợp lệ
464
+
465
+ if len(rows) == 0:
466
+ print(f"⚠️ Trang {page_idx}: Không có dữ liệu dưới header.")
467
+ continue
468
+
469
+ try:
470
+ df = pd.DataFrame(rows, columns=header)
471
+ all_dfs.append(df)
472
+ print(f"✅ Trang {page_idx}: {len(df)} dòng được thêm.")
473
+ except Exception as e:
474
+ print(f"❌ Lỗi tạo DataFrame ở trang {page_idx}: {e}")
475
+
476
+ if all_dfs:
477
+ final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all").reset_index(drop=True)
478
+ print(f"✅ Tổng cộng {len(final_df)} dòng được trích xuất từ PDF.")
479
+
480
+ # Xuất ra file tạm (Excel + JSON)
481
+ base_name = os.path.splitext(filename)[0]
482
+ tmp_dir = tempfile.gettempdir()
483
+ # json_path = os.path.join(tmp_dir, f"{base_name}.json")
484
+ excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
485
+
486
+ # final_df.to_json(json_path, orient="records", force_ascii=False, indent=2)
487
+ final_df.to_excel(excel_path, index=False)
488
+
489
+ # print(f"✅ Xuất JSON: {json_path}")
490
+ # print(f"✅ Xuất Excel: {excel_path}")
491
+
492
+ # Convert bảng thành CSV text để Gemini đọc tiếp
493
+ table_text = final_df.to_csv(index=False)
494
+ print(f"✅ Đang Gen text từ file CSV")
495
+ question = (
496
+ f"{PROMPT_FREIGHT_JSON}\n"
497
+ "Below is the table text extracted from the PDF (CSV format):\n"
498
+ f"{table_text}\n\n"
499
+ "Please convert this into valid JSON as per the schema."
500
+ )
501
+ else:
502
+ print("⚠️ Không có bảng hợp lệ để extract bằng pdfplumber.")
503
+
504
+
505
+ result_text, _ = run_process_internal_base_v2(
506
+ file_bytes=file_bytes, filename=filename, mime=mime,
507
+ question=question, model_choice=model_choice,
508
+ temperature=temperature, top_p=top_p
509
+ )
510
+ return result_text, None
511
  except Exception as e:
512
  return f"ERROR: {type(e).__name__}: {str(e)}", None
513