Spaces:

vithacocf
/

api_gemini

Paused

App Files Files Community

vithacocf committed about 1 month ago

Commit

720645e

verified ·

1 Parent(s): 603a332

Update app.py

Browse files

Files changed (1) hide show

app.py +119 -37

app.py CHANGED Viewed

@@ -361,10 +361,11 @@ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_cho
     genai.configure(api_key=api_key)
     model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
     print(f'Use key: {api_key}')
     model = genai.GenerativeModel(model_name=model_name,
                                   generation_config={"temperature": float(temperature), "top_p": float(top_p)})
     if file_bytes is None:
-        response = model.generate_content(question)
         #print(response.text)
         return response.text, None
     pages = pdf_to_images(file_bytes)
@@ -377,7 +378,7 @@ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_cho
                 im.save(tmp.name)
                 up = genai.upload_file(path=tmp.name, mime_type="image/png")
                 uploaded.append(genai.get_file(up.name))
-        resp = model.generate_content([question] + uploaded)
         all_text_results.append(resp.text if hasattr(resp, "text") else "")
         for up in uploaded:
             try: genai.delete_file(up.name)
@@ -391,41 +392,122 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
             return "ERROR: No file uploaded.", None
         file_bytes = _read_file_bytes(file)
         filename, mime = _guess_name_and_mime(file, file_bytes)
-        check_result = check_pdf_structure(file_bytes)
-        if check_result > 1:
-            base_name = os.path.splitext(filename)[0]
-            tmp_dir = tempfile.gettempdir()
-            # 🔁 Ghi file PDF tạm để xử lý
-            tmp_pdf_path = os.path.join(tmp_dir, f"{base_name}.pdf")
-            with open(tmp_pdf_path, "wb") as f:
-                f.write(file_bytes)
-            # 🔁 Tạo đường dẫn file Excel
-            excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
-            # 🛠 Gọi hàm xử lý
-            export_pdf_to_excel(tmp_pdf_path, excel_path)
-            chunks = split_excel_by_airline_header(excel_path)
-            header, _ = run_process_internal_base_v2(
-                file_bytes=file_bytes,
-                filename=filename,
-                mime=mime,
-                question=PROMPT_FREIGHT_HEADER_JSON,
-                model_choice=model_choice,
-                temperature=temperature,
-                top_p=top_p
-            )
-            print(header)
-            chunk_files = []
-            for airline, df_chunk in chunks.items():
-                safe_name = re.sub(r"[^\w\s]", "", airline).replace(" ", "_")
-                print (f'airline : {airline}')
-            result = process_all_chunks_with_threadpool(chunks, PROMPT_FREIGHT_JSON, header, 5)
-            return json.dumps(result, ensure_ascii=False, indent=2), None
-        else:
-            return "Only supports multi-airline PDF for now", None
     except Exception as e:
         return f"ERROR: {type(e).__name__}: {str(e)}", None

     genai.configure(api_key=api_key)
     model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
     print(f'Use key: {api_key}')
+    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
     model = genai.GenerativeModel(model_name=model_name,
                                   generation_config={"temperature": float(temperature), "top_p": float(top_p)})
     if file_bytes is None:
+        response = model.generate_content(user_prompt)
         #print(response.text)
         return response.text, None
     pages = pdf_to_images(file_bytes)
                 im.save(tmp.name)
                 up = genai.upload_file(path=tmp.name, mime_type="image/png")
                 uploaded.append(genai.get_file(up.name))
+        resp = model.generate_content([user_prompt] + uploaded)
         all_text_results.append(resp.text if hasattr(resp, "text") else "")
         for up in uploaded:
             try: genai.delete_file(up.name)
             return "ERROR: No file uploaded.", None
         file_bytes = _read_file_bytes(file)
         filename, mime = _guess_name_and_mime(file, file_bytes)
+        # STEP 1️⃣: Check PDF structure
+        if mime == "application/pdf" or file_bytes[:4] == b"%PDF":
+            check_result = check_pdf_structure(file_bytes)
+            all_dfs = []
+            if check_result > 1:
+                print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
+                base_name = os.path.splitext(filename)[0]
+                tmp_dir = tempfile.gettempdir()
+                # 🔁 Ghi file PDF tạm để xử lý
+                tmp_pdf_path = os.path.join(tmp_dir, f"{base_name}.pdf")
+                with open(tmp_pdf_path, "wb") as f:
+                    f.write(file_bytes)
+                # 🔁 Tạo đường dẫn file Excel
+                excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
+                # 🛠 Gọi hàm xử lý
+                export_pdf_to_excel(tmp_pdf_path, excel_path)
+                chunks = split_excel_by_airline_header(excel_path)
+                header, _ = run_process_internal_base_v2(
+                    file_bytes=file_bytes,
+                    filename=filename,
+                    mime=mime,
+                    question=PROMPT_FREIGHT_HEADER_JSON,
+                    model_choice=model_choice,
+                    temperature=temperature,
+                    top_p=top_p
+                )
+                print(header)
+                chunk_files = []
+                for airline, df_chunk in chunks.items():
+                    safe_name = re.sub(r"[^\w\s]", "", airline).replace(" ", "_")
+                    print (f'airline : {airline}')
+                result = process_all_chunks_with_threadpool(chunks, PROMPT_FREIGHT_JSON, header, 5)
+                return json.dumps(result, ensure_ascii=False, indent=2), None
+            else:
+                with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+                    for page_idx, page in enumerate(pdf.pages, start=1):
+                        print(f"📄 Đang xử lý trang {page_idx}...")
+                        table = page.extract_table({
+                            "vertical_strategy": "lines",
+                            "horizontal_strategy": "text",
+                            "snap_tolerance": 3,
+                            "intersection_tolerance": 5,
+                        })
+                        if not table or len(table) < 2:
+                            print(f"⚠️ Trang {page_idx}: Không phát hiện bảng hợp lệ.")
+                            continue
+                        header = table[0]
+                        rows = table[1:]
+                        # Lưu header đầu tiên
+                        if saved_header is None:
+                            saved_header = header
+                            print(f"✅ Trang {page_idx}: Lưu header đầu tiên: {saved_header}")
+                        # Nếu trang sau không có header rõ → dùng header cũ
+                        if len(header) < len(saved_header) or "REGION" not in header[0]:
+                            print(f"↩️ Trang {page_idx}: Không có header rõ ràng, dùng lại header trước.")
+                            header = saved_header
+                            rows = table
+                        else:
+                            saved_header = header  # cập nhật header hợp lệ
+                        if len(rows) == 0:
+                            print(f"⚠️ Trang {page_idx}: Không có dữ liệu dưới header.")
+                            continue
+                        try:
+                            df = pd.DataFrame(rows, columns=header)
+                            all_dfs.append(df)
+                            print(f"✅ Trang {page_idx}: {len(df)} dòng được thêm.")
+                        except Exception as e:
+                            print(f"❌ Lỗi tạo DataFrame ở trang {page_idx}: {e}")
+                if all_dfs:
+                    final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all").reset_index(drop=True)
+                    print(f"✅ Tổng cộng {len(final_df)} dòng được trích xuất từ PDF.")
+                    # Xuất ra file tạm (Excel + JSON)
+                    base_name = os.path.splitext(filename)[0]
+                    tmp_dir = tempfile.gettempdir()
+                    # json_path = os.path.join(tmp_dir, f"{base_name}.json")
+                    excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
+                    # final_df.to_json(json_path, orient="records", force_ascii=False, indent=2)
+                    final_df.to_excel(excel_path, index=False)
+                    # print(f"✅ Xuất JSON:  {json_path}")
+                    # print(f"✅ Xuất Excel: {excel_path}")
+                    # Convert bảng thành CSV text để Gemini đọc tiếp
+                    table_text = final_df.to_csv(index=False)
+                    print(f"✅ Đang Gen text từ file CSV")
+                    question = (
+                        f"{PROMPT_FREIGHT_JSON}\n"
+                        "Below is the table text extracted from the PDF (CSV format):\n"
+                        f"{table_text}\n\n"
+                        "Please convert this into valid JSON as per the schema."
+                    )
+                else:
+                    print("⚠️ Không có bảng hợp lệ để extract bằng pdfplumber.")
+        result_text, _ = run_process_internal_base_v2(
+            file_bytes=file_bytes, filename=filename, mime=mime,
+            question=question, model_choice=model_choice,
+            temperature=temperature, top_p=top_p
+        )
+        return result_text, None
     except Exception as e:
         return f"ERROR: {type(e).__name__}: {str(e)}", None