Spaces:
Running
Running
| import google.generativeai as genai | |
| import pandas as pd | |
| import fitz # PyMuPDF | |
| import json | |
| import re | |
| import os | |
def initialize_gemini(api_key, model_name):
    """Configure the Gemini SDK and build a generative model handle.

    Args:
        api_key: Key passed to ``genai.configure``.
        model_name: Name passed to ``genai.GenerativeModel``.

    Returns:
        The ``genai.GenerativeModel`` instance, or ``None`` when configuration
        or construction raises (the failure is printed, not re-raised).
    """
    try:
        genai.configure(api_key=api_key)
        return genai.GenerativeModel(model_name)
    except Exception as exc:
        # Best-effort: report the failure and let the caller see None.
        print(f"❌ Gagal mengkonfigurasi Gemini: {exc}")
    return None
def extract_text_from_pdf(file_path):
    """Read a PDF and return the concatenated text of all its pages.

    Args:
        file_path: Path handed to ``fitz.open``.

    Returns:
        ``(text, None)`` on success, or ``(None, error_message)`` when opening
        or reading the document raises.
    """
    try:
        with fitz.open(file_path) as document:
            page_texts = [page.get_text() for page in document]
            full_text = "".join(page_texts)
    except Exception as exc:
        return None, f"Gagal membaca file PDF: {exc}"
    return full_text, None
def get_invoice_data_with_gemini(pdf_text, model, prompt):
    """Send the invoice text to Gemini and parse the reply as JSON.

    Args:
        pdf_text: Text substituted for the ``{text_input}`` placeholder.
        model: Object exposing ``generate_content(prompt, request_options=...)``
            whose result has a ``text`` attribute. A falsy value yields an
            error tuple without any call being made.
        prompt: ``str.format`` template containing ``{text_input}``. Literal
            braces in the template must be escaped as ``{{`` / ``}}``.

    Returns:
        ``(data, None)`` on success, where ``data`` is whatever ``json.loads``
        produced, or ``(None, error_message)`` on any failure. This function
        reports failures through the tuple rather than raising.
    """
    if not model:
        return None, "Model Gemini tidak diinisialisasi."

    # A template with unescaped literal braces (e.g. an inline JSON example)
    # makes str.format raise; report it through the error tuple like every
    # other failure instead of letting it escape to the caller.
    try:
        full_prompt = prompt.format(text_input=pdf_text)
    except (KeyError, IndexError, ValueError) as e:
        return None, f"Template prompt tidak valid: {e!r}"

    # Pre-initialised so the JSON error handler can always slice it.
    raw_response = "Tidak ada respons"
    try:
        response = model.generate_content(full_prompt, request_options={'timeout': 120})
        raw_response = response.text
        # Strip Markdown code fences the model may wrap around the JSON.
        clean_response = re.sub(r'```json\n?|```', '', raw_response.strip())
        return json.loads(clean_response), None
    except json.JSONDecodeError:
        # Include the start of the raw reply to make parse failures debuggable.
        return None, f"Gagal mem-parsing JSON dari respons Gemini. Respons mentah: {raw_response[:200]}..."
    except Exception as e:
        return None, f"Error saat berkomunikasi dengan Gemini: {e}"
def process_single_invoice(pdf_file_path, model, prompt):
    """Process one invoice PDF into a DataFrame with a fixed column layout.

    The PDF text is extracted, sent to Gemini, and the parsed response is
    flattened into one row per entry of its ``items`` list. Invoice-level
    fields are repeated on every row. Every handled failure produces a
    single-row DataFrame whose ``error`` column carries the message.

    Args:
        pdf_file_path: Path of the invoice PDF. The first whitespace-delimited
            token of its base name is used as the brand.
        model: Gemini model handle forwarded to
            ``get_invoice_data_with_gemini``.
        prompt: Prompt template forwarded to ``get_invoice_data_with_gemini``.

    Returns:
        A ``pandas.DataFrame`` containing exactly the columns listed in
        ``desired_columns``, in that order.
    """
    file_name = os.path.basename(pdf_file_path)
    brand_match = re.match(r'^(\S+)', file_name)
    brand_name = brand_match.group(1) if brand_match else "Unknown"

    # Consistent output schema shared by success and error rows.
    desired_columns = [
        "brand", "vendor_name", "invoice_number", "estimate_time_arrival", "currency",
        "invoice_po_number", "style_code", "size", "item_qty", "pack_qty",
        "total_pack_qty", "unit_price", "amount_price", "file_name", "error"
    ]

    def create_error_row(error_message):
        """Build a one-row DataFrame that carries only the file info and error."""
        data = {col: None for col in desired_columns}
        data.update({"file_name": file_name, "error": error_message, "brand": brand_name})
        return pd.DataFrame([data])

    # 1. Extract text.
    invoice_text, err = extract_text_from_pdf(pdf_file_path)
    if err:
        return create_error_row(err)

    # 2. Extract structured data via Gemini.
    invoice_data, err = get_invoice_data_with_gemini(invoice_text, model, prompt)
    if err:
        return create_error_row(err)
    if not isinstance(invoice_data, dict):
        return create_error_row("Respons dari Gemini bukan format JSON (dict) yang valid.")

    # 3. Shape the rows. Invoice-level fields are shared by every row.
    header = {
        'brand': brand_name,
        'vendor_name': invoice_data.get('vendor_name'),
        'invoice_number': invoice_data.get('invoice_number'),
        'estimate_time_arrival': invoice_data.get('estimate_time_arrival'),
        'currency': invoice_data.get('currency'),
        'file_name': file_name,
        'error': None
    }

    items_list = invoice_data.get('items', [])
    if not items_list:
        # No items at all: keep the invoice-level data and flag the row.
        processed_rows = [dict(header, error="Tidak ada 'items' yang ditemukan dalam respons")]
    elif not isinstance(items_list, list):
        # 'items' is present but malformed (e.g. a dict or string). Flag it
        # instead of emitting a row that looks like a clean result.
        processed_rows = [dict(header, error="Data 'items' kosong atau formatnya tidak valid.")]
    else:
        # One row per item; item keys take precedence over invoice-level ones.
        # Non-dict entries are skipped.
        processed_rows = [
            {**header, **item} for item in items_list if isinstance(item, dict)
        ]
        if not processed_rows:
            return create_error_row("Data 'items' kosong atau formatnya tidak valid.")

    df = pd.DataFrame(processed_rows)

    # Add any missing schema columns as None, then drop extras and fix order.
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    return df[desired_columns]