Spaces:

Bayhaqy
/

invoice-extractor

Running

App Files Files Community

Bayhaqy committed on Jul 5, 2025

Commit

50311a7

verified ·

1 Parent(s): ff94440

Update invoice_extraction.py

Browse files

Files changed (1) hide show

invoice_extraction.py +91 -115

invoice_extraction.py CHANGED Viewed

@@ -4,143 +4,119 @@ import fitz  # PyMuPDF
 import json
 import re
 import os
-import textwrap
-from concurrent.futures import ThreadPoolExecutor, as_completed
-# --- INISIALISASI MODEL GEMINI ---
-def initialize_gemini(api_key):
-    """Initializes the Gemini model with the provided API key."""
     try:
         genai.configure(api_key=api_key)
-        model = genai.GenerativeModel('gemini-1.5-flash-latest')
-        print("✅ Model Gemini berhasil diinisialisasi.")
         return model
     except Exception as e:
         print(f"❌ Gagal mengkonfigurasi Gemini: {e}")
         return None
 def extract_text_from_pdf(file_path):
-    """Mengekstrak seluruh teks dari file PDF menggunakan PyMuPDF."""
-    if not os.path.exists(file_path):
-        print(f"❌ File not found at: {file_path}")
-        return None
     try:
-        doc = fitz.open(file_path)
-        combined_text = ""
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            pymupdf_text = page.get_text()
-            combined_text += f"\n--- Page {page_num + 1} ---\n{pymupdf_text}"
-        return combined_text
     except Exception as e:
-        print(f"❌ Gagal mengekstrak teks dari PDF '{os.path.basename(file_path)}': {e}")
-        return None
-def get_invoice_data_with_gemini(pdf_text, model):
-    """Mengirim teks ke Gemini dan meminta ekstraksi data dalam format JSON."""
     if not model:
-        print("❌ Model Gemini tidak diinisialisasi. Melewatkan ekstraksi data.")
-        return None
-    prompt = """
-      You are a highly accurate data parser. Based on the text of the following invoice, extract the requested information.
-      Invoice Text:
-      ---
-      {text_input}
-      ---
-      Your Task:
-      1. Identify global information such as invoice number, ETA date, vendor name, and currency.
-      2. Identify each line item in the invoice.
-      3. Extract the following data and return it ONLY in a single JSON object format. Do not add ```json``` or any other text outside the JSON.
-      Desired JSON Format:
-      {{
-        "vendor_name": "INVOICE SENDER (USUALLY IN THE TOP LEFT CORNER)",
-        "invoice_number": "INVOICE NUMBER",
-        "estimate_time_arrival": "YYYY-MM-DD",
-        "currency": "APPROXIMATELY WHAT CURRENCY (e.g., USD)",
-        "items": [
-          {{
-            "invoice_po_number": "FULL PO NUMBER",
-            "style_code": "ITEM CODE/STYLE/BUYER ITEM",
-            "size": "ITEM SIZE",
-            "item_qty": QUANTITY_OF_ITEMS_INTEGER,
-            "pack_qty": QUANTITY_PER_PACK_INTEGER,
-            "total_pack_qty": TOTAL_CARTON_INTEGER,
-            "unit_price": FLOAT_UNIT_PRICE,
-            "amount_price": TOTAL_PRICE_PER_ITEM_FLOAT (IF VALUE NOT FOUND, USE item_qty * unit_price)
-          }}
-        ]
-      }}
-      Make sure all numeric values (qty, price) are integers or floats, not strings.
-      If you are unsure about a value, return null or 0 for numbers.
-    """
     try:
-        chunk_size = 15000
-        text_chunks = textwrap.wrap(pdf_text, chunk_size, break_long_words=False, replace_whitespace=False)
-        if not text_chunks:
-            return None
-        response = model.generate_content(prompt.format(text_input=text_chunks[0]))
         clean_response = re.sub(r'```json\n?|```', '', response.text.strip())
-        invoice_data = json.loads(clean_response)
-        if 'items' not in invoice_data:
-             invoice_data['items'] = [] # Ensure items key exists
-        return invoice_data
-    except (json.JSONDecodeError, Exception) as e:
-        print(f"❌ Terjadi kesalahan saat berkomunikasi dengan Gemini: {e}")
-        return None
-def process_single_invoice(pdf_file_path, model):
-    """Memproses satu file PDF invoice dan mengembalikan DataFrame."""
     file_name = os.path.basename(pdf_file_path)
-    print(f"\n--- Memproses file: {file_name} ---")
-    brand_match = re.match(r'^(\S+)', file_name)
-    brand_name = brand_match.group(1) if brand_match else "Unknown"
-    invoice_text = extract_text_from_pdf(pdf_file_path)
-    if not invoice_text:
-        return pd.DataFrame([{"file_name": file_name, "error": "Gagal mengekstrak teks dari PDF"}])
-    invoice_data = get_invoice_data_with_gemini(invoice_text, model)
-    if not invoice_data:
-        return pd.DataFrame([{"file_name": file_name, "error": "Gagal mendapatkan data dari Gemini"}])
-    items_list = invoice_data.get('items', [])
-    if not items_list:
-        # Jika tidak ada item, proses data global saja
-        global_data = {
-            "brand": brand_name,
-            "vendor_name": invoice_data.get('vendor_name'),
-            "invoice_number": invoice_data.get('invoice_number'),
-            "estimate_time_arrival": invoice_data.get('estimate_time_arrival'),
-            "currency": invoice_data.get('currency'),
-            "file_name": file_name
-        }
-        items_list.append(global_data)
-    for item in items_list:
-        item['brand'] = brand_name
-        item['vendor_name'] = invoice_data.get('vendor_name')
-        item['invoice_number'] = invoice_data.get('invoice_number')
-        item['estimate_time_arrival'] = invoice_data.get('estimate_time_arrival')
-        item['currency'] = invoice_data.get('currency')
-        item['file_name'] = file_name
-    df = pd.DataFrame(items_list)
     desired_columns = [
         "brand", "vendor_name", "invoice_number", "estimate_time_arrival", "currency",
         "invoice_po_number", "style_code", "size", "item_qty", "pack_qty",
-        "total_pack_qty", "unit_price", "amount_price", "file_name"
     ]
     for col in desired_columns:
         if col not in df.columns:
             df[col] = None
-    return df.reindex(columns=desired_columns)

 import json
 import re
 import os
def initialize_gemini(api_key, model_name='gemini-1.5-flash-latest'):
    """Configure the Gemini SDK and build a generative model handle.

    Args:
        api_key: API key forwarded to ``genai.configure``.
        model_name: Name passed to ``genai.GenerativeModel``. Defaults to the
            model name the earlier one-argument version of this function
            hard-coded, so ``initialize_gemini(api_key)`` keeps working.
            NOTE(review): confirm this default is still a served model.

    Returns:
        The ``genai.GenerativeModel`` instance, or ``None`` if configuring the
        SDK or constructing the model raised.
    """
    try:
        genai.configure(api_key=api_key)
        return genai.GenerativeModel(model_name)
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with
        # None instead of propagating SDK-specific exceptions.
        print(f"❌ Gagal mengkonfigurasi Gemini: {e}")
        return None
 def extract_text_from_pdf(file_path):
+    """Ekstrak teks dari file PDF."""
     try:
+        with fitz.open(file_path) as doc:
+            return "".join(page.get_text() for page in doc), None
     except Exception as e:
+        return None, f"Gagal membaca file PDF: {e}"
def get_invoice_data_with_gemini(pdf_text, model, prompt):
    """Send the invoice text to Gemini and parse the reply as JSON.

    Args:
        pdf_text: Invoice text substituted into the prompt template.
        model: Object exposing ``generate_content(prompt, request_options=...)``
            whose result has a ``text`` attribute; a falsy value means the
            model was never initialised.
        prompt: ``str.format`` template containing a ``{text_input}`` field.
            Literal braces in the template must be doubled (``{{`` / ``}}``).

    Returns:
        A ``(data, error)`` pair. On success ``data`` is the decoded JSON value
        and ``error`` is ``None``; on any failure ``data`` is ``None`` and
        ``error`` is a message. This function does not raise for bad
        templates, transport errors or unparsable replies.
    """
    if not model:
        return None, "Model Gemini tidak diinisialisasi."

    # The template is supplied by the caller, so a stray brace or an unknown
    # field must become an error result rather than an uncaught exception.
    try:
        full_prompt = prompt.format(text_input=pdf_text)
    except (KeyError, IndexError, ValueError) as e:
        return None, f"Template prompt tidak valid: {e!r}"

    try:
        response = model.generate_content(full_prompt, request_options={'timeout': 120})
        raw_response = response.text
        # Strip Markdown code fences the model may wrap around the JSON.
        clean_response = re.sub(r'```json\n?|```', '', raw_response.strip())
    except Exception as e:
        return None, f"Error saat berkomunikasi dengan Gemini: {e}"

    try:
        return json.loads(clean_response), None
    except json.JSONDecodeError:
        # Include the start of the raw reply to make parse failures debuggable.
        return None, f"Gagal mem-parsing JSON dari respons Gemini. Respons mentah: {raw_response[:200]}..."
def process_single_invoice(pdf_file_path, model, prompt):
    """Process one invoice PDF and always return a consistently shaped DataFrame.

    The brand is taken from the leading run of non-whitespace characters of
    the file name. Text is extracted with ``extract_text_from_pdf`` and sent
    to the model through ``get_invoice_data_with_gemini``.

    Args:
        pdf_file_path: Path of the invoice PDF.
        model: Gemini model handle passed to ``get_invoice_data_with_gemini``.
        prompt: Prompt template passed to ``get_invoice_data_with_gemini``.

    Returns:
        A ``pandas.DataFrame`` whose columns are exactly ``desired_columns``
        (in that order). There is one row per dict entry in the response's
        ``items`` list; if any step fails, or there are no usable items, a
        single row is returned with the ``error`` column filled in.
    """
    file_name = os.path.basename(pdf_file_path)
    brand_match = re.match(r'^(\S+)', file_name)
    brand_name = brand_match.group(1) if brand_match else "Unknown"

    # Single definition of the output schema shared by every return path.
    desired_columns = [
        "brand", "vendor_name", "invoice_number", "estimate_time_arrival", "currency",
        "invoice_po_number", "style_code", "size", "item_qty", "pack_qty",
        "total_pack_qty", "unit_price", "amount_price", "file_name", "error"
    ]

    def create_error_row(error_message):
        """Build a one-row frame carrying only file identity and the error."""
        data = {col: None for col in desired_columns}
        data.update({"file_name": file_name, "error": error_message, "brand": brand_name})
        return pd.DataFrame([data])

    # 1. Extract text.
    invoice_text, err = extract_text_from_pdf(pdf_file_path)
    if err:
        return create_error_row(err)

    # 2. Extract structured data via Gemini.
    invoice_data, err = get_invoice_data_with_gemini(invoice_text, model, prompt)
    if err:
        return create_error_row(err)
    if not isinstance(invoice_data, dict):
        return create_error_row("Respons dari Gemini bukan format JSON (dict) yang valid.")

    def header_fields(error_message):
        """Return the file- and invoice-level fields repeated on every row."""
        return {
            'brand': brand_name,
            'vendor_name': invoice_data.get('vendor_name'),
            'invoice_number': invoice_data.get('invoice_number'),
            'estimate_time_arrival': invoice_data.get('estimate_time_arrival'),
            'currency': invoice_data.get('currency'),
            'file_name': file_name,
            'error': error_message,
        }

    # 3. Shape the rows.
    items_list = invoice_data.get('items', [])
    processed_rows = []
    if not items_list:
        # No line items: keep the invoice-level data in a single flagged row.
        processed_rows.append(header_fields("Tidak ada 'items' yang ditemukan dalam respons"))
    elif not isinstance(items_list, list):
        # A non-list 'items' value cannot be iterated as line items; flag it
        # instead of silently discarding it with an empty error column.
        processed_rows.append(header_fields("Format 'items' dalam respons tidak valid (bukan list)."))
    else:
        for item in items_list:
            if not isinstance(item, dict):
                continue
            # Item values go in first so the header fields stay authoritative:
            # a model-returned item must not be able to overwrite file_name,
            # brand, error or the invoice-level fields.
            row_data = dict(item)
            row_data.update(header_fields(None))
            processed_rows.append(row_data)

    if not processed_rows:
        return create_error_row("Data 'items' kosong atau formatnya tidak valid.")

    df = pd.DataFrame(processed_rows)
    # Add any schema column the response did not provide, then drop extras
    # and fix the column order.
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    return df[desired_columns]