invoice-extractor / invoice_extraction.py
Bayhaqy's picture
Update invoice_extraction.py
50311a7 verified
import google.generativeai as genai
import pandas as pd
import fitz # PyMuPDF
import json
import re
import os
def initialize_gemini(api_key, model_name):
    """Configure the Gemini client and build a generative model handle.

    Args:
        api_key: API key forwarded to ``genai.configure``.
        model_name: Model identifier forwarded to ``genai.GenerativeModel``.

    Returns:
        The constructed ``genai.GenerativeModel`` instance, or ``None`` when
        configuring the client or constructing the model raised. In that case
        the error is printed to standard output.
    """
    try:
        genai.configure(api_key=api_key)
        gemini_model = genai.GenerativeModel(model_name)
    except Exception as exc:
        # Best-effort initialisation: report the problem and signal failure
        # with None rather than propagating the exception to the caller.
        print(f"❌ Gagal mengkonfigurasi Gemini: {exc}")
        return None
    else:
        return gemini_model
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF, forwarded to ``fitz.open``.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the concatenation
        (without separator) of ``page.get_text()`` for each page and ``error``
        is ``None``. On any exception ``text`` is ``None`` and ``error`` is a
        message describing the failure.
    """
    try:
        with fitz.open(file_path) as pdf_doc:
            page_chunks = []
            for page in pdf_doc:
                page_chunks.append(page.get_text())
            # Join while still inside the try so that any failure while
            # assembling the text is reported through the error slot too.
            full_text = "".join(page_chunks)
    except Exception as exc:
        return None, f"Gagal membaca file PDF: {exc}"
    return full_text, None
def get_invoice_data_with_gemini(pdf_text, model, prompt):
    """Send the invoice text to Gemini and parse the JSON it returns.

    Args:
        pdf_text: Text extracted from the invoice; substituted for the
            ``{text_input}`` placeholder of ``prompt``.
        model: Object exposing ``generate_content(prompt, request_options=...)``
            whose result has a ``text`` attribute. A falsy value is treated
            as "model not initialised".
        prompt: Prompt template containing a ``{text_input}`` placeholder.

    Returns:
        A ``(data, error)`` tuple. On success ``data`` is the parsed JSON
        value and ``error`` is ``None``. On failure ``data`` is ``None`` and
        ``error`` is a human-readable message. This function does not raise
        for model, network, or parsing errors.
    """
    if not model:
        return None, "Model Gemini tidak diinisialisasi."

    # Prompts often embed a literal JSON example. Its unescaped braces make
    # str.format raise (KeyError/IndexError/ValueError). Fall back to a plain
    # placeholder substitution so such prompts work instead of crashing, while
    # templates that already format cleanly behave exactly as before.
    try:
        full_prompt = prompt.format(text_input=pdf_text)
    except (KeyError, IndexError, ValueError):
        full_prompt = prompt.replace("{text_input}", str(pdf_text))

    # Kept outside the try so the JSON error handler can show what was received.
    raw_text = None
    try:
        response = model.generate_content(full_prompt, request_options={'timeout': 120})
        raw_text = response.text
        # Strip Markdown code fences (```json ... ```), whatever the case of
        # the language tag.
        clean_response = re.sub(r'```json\n?|```', '', raw_text.strip(), flags=re.IGNORECASE)
        try:
            return json.loads(clean_response), None
        except json.JSONDecodeError:
            # The model sometimes wraps the object in explanatory prose;
            # retry on the outermost {...} span before giving up.
            start = clean_response.find("{")
            end = clean_response.rfind("}")
            if start == -1 or end <= start:
                raise
            return json.loads(clean_response[start:end + 1]), None
    except json.JSONDecodeError:
        # Include the start of the raw reply to help debug parsing failures.
        raw_response = raw_text if raw_text is not None else "Tidak ada respons"
        return None, f"Gagal mem-parsing JSON dari respons Gemini. Respons mentah: {raw_response[:200]}..."
    except Exception as e:
        return None, f"Error saat berkomunikasi dengan Gemini: {e}"
def process_single_invoice(pdf_file_path, model, prompt):
    """Process one invoice PDF and always return a consistently structured DataFrame.

    The PDF text is extracted, sent to Gemini together with ``prompt``, and the
    parsed response is flattened into one row per entry of its ``items`` list.
    Header-level fields (vendor, invoice number, ETA, currency) are repeated on
    every row. Any failure is reported as a single row whose ``error`` column
    holds the message, so callers can concatenate results without special cases.

    Args:
        pdf_file_path: Path to the invoice PDF.
        model: Gemini model handle forwarded to ``get_invoice_data_with_gemini``.
        prompt: Prompt template forwarded to ``get_invoice_data_with_gemini``.

    Returns:
        A ``pandas.DataFrame`` with exactly the columns listed in
        ``desired_columns`` below, in that order.
    """
    file_name = os.path.basename(pdf_file_path)
    # The brand is the first whitespace-delimited token of the file name.
    brand_match = re.match(r'^(\S+)', file_name)
    brand_name = brand_match.group(1) if brand_match else "Unknown"

    # Consistent output schema shared by success and error rows.
    desired_columns = [
        "brand", "vendor_name", "invoice_number", "estimate_time_arrival", "currency",
        "invoice_po_number", "style_code", "size", "item_qty", "pack_qty",
        "total_pack_qty", "unit_price", "amount_price", "file_name", "error"
    ]

    def create_error_row(error_message):
        """Build a one-row DataFrame carrying only brand, file name and the error."""
        data = dict.fromkeys(desired_columns)
        data.update({"file_name": file_name, "error": error_message, "brand": brand_name})
        return pd.DataFrame([data])

    def create_base_row(error_message=None):
        """Build a row dict pre-filled with the invoice's header-level fields."""
        return {
            'brand': brand_name,
            'vendor_name': invoice_data.get('vendor_name'),
            'invoice_number': invoice_data.get('invoice_number'),
            'estimate_time_arrival': invoice_data.get('estimate_time_arrival'),
            'currency': invoice_data.get('currency'),
            'file_name': file_name,
            'error': error_message,
        }

    # 1. Extract text.
    invoice_text, err = extract_text_from_pdf(pdf_file_path)
    if err:
        return create_error_row(err)
    # A PDF without a text layer yields no text; sending an empty prompt to the
    # model wastes a request and cannot produce meaningful data.
    if not invoice_text or not invoice_text.strip():
        return create_error_row("Tidak ada teks yang dapat diekstrak dari file PDF.")

    # 2. Extract structured data via Gemini.
    invoice_data, err = get_invoice_data_with_gemini(invoice_text, model, prompt)
    if err:
        return create_error_row(err)
    if not isinstance(invoice_data, dict):
        return create_error_row("Respons dari Gemini bukan format JSON (dict) yang valid.")

    # 3. Shape the rows.
    items_list = invoice_data.get('items', [])
    processed_rows = []

    if not items_list:
        # No items: keep the header-level data in a single flagged row.
        processed_rows.append(create_base_row("Tidak ada 'items' yang ditemukan dalam respons"))
    elif not isinstance(items_list, list):
        # 'items' is present but not a list: flag it instead of silently
        # emitting a clean-looking row that hides the dropped data.
        processed_rows.append(create_base_row("Data 'items' kosong atau formatnya tidak valid."))
    else:
        for item in items_list:
            if not isinstance(item, dict):
                continue
            row_data = create_base_row()
            row_data.update(item)
            # Bookkeeping columns belong to the pipeline; never let keys
            # coming from the model response overwrite them.
            row_data['file_name'] = file_name
            row_data['error'] = None
            processed_rows.append(row_data)

    if not processed_rows:
        return create_error_row("Data 'items' kosong atau formatnya tidak valid.")

    df = pd.DataFrame(processed_rows)

    # Make sure every expected column exists, filling absent ones with None.
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None

    # Return only the expected columns, in the canonical order.
    return df[desired_columns]