Spaces:

YAMITEK
/

pdf_extracter_using_gemini

Sleeping

App Files Files Community

YAMITEK commited on Apr 29, 2025

Commit

0b4e7f3

verified ·

1 Parent(s): e4fbfc7

Create app.py

Browse files

Files changed (1) hide show

app.py +124 -0

app.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import streamlit as st
+import fitz  # PyMuPDF
+from PIL import Image
+import io
+import pandas as pd
+import json
+import re
+import google.generativeai as genai
+from dotenv import load_dotenv
+import os
+# Load environment variables
+load_dotenv()
+genai.configure(api_key=os.getenv("api_key"))  # Secure API key loading
+# Convert PIL Image to format Gemini accepts
+def image_to_gemini_format(image):
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format="PNG")
+    return {
+        "mime_type": "image/png",
+        "data": img_byte_arr.getvalue()
+    }
+# Generate content using Gemini
+def get_response(model, image_part, user_prompt, system_instruction):
+    response = model.generate_content([
+        system_instruction,
+        image_part,
+        user_prompt
+    ])
+    return response.text
+# Convert PDF to images
+def convert_pdf_to_images(pdf_bytes):
+    images = []
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    for page in doc:
+        pix = page.get_pixmap(dpi=300)
+        img = Image.open(io.BytesIO(pix.tobytes("png")))
+        images.append(img)
+    return images
+# Streamlit UI
+st.set_page_config(page_title="Invoice Extractor", layout="centered")
+st.title("📄 Invoice Table Extractor using Gemini AI")
+uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])
+if uploaded_pdf:
+    with st.spinner("Converting PDF to images..."):
+        images = convert_pdf_to_images(uploaded_pdf.read())
+    st.image(images[0], caption="Page 1 of PDF", use_container_width=True)
+    if st.button("Extract Table from Invoice"):
+        with st.spinner("Extracting data with Gemini..."):
+            try:
+                model = genai.GenerativeModel('gemini-1.5-flash')
+                system_instruction = "You are an AI specialized in extracting structured data from invoices."
+                user_prompt = """
+Extract the invoice table from the uploaded invoice document.
+The table should include the following columns:
+- CODE ARTICLE
+- DESIGNATION
+- QTE COMMANDÉE
+- QTE LIVRÉE
+- PRIX UNIT. REF
+- PRIX UNIT. HT
+- PRIX UNIT. TTC
+- TOTAL HT
+- TVA %
+Also, extract and attach the following metadata fields to each row:
+- N° CLIENT
+- NOM CLIENT
+- N° FACTURE
+- DATE FACTURE
+- DATE DE CDE
+- Supplier/Company Name
+After extraction:
+- Create a clean pandas DataFrame containing all the above fields.
+- Drop any rows where CODE ARTICLE is empty or missing.
+- Return the data in JSON dictionary format.
+"""
+                image_part = image_to_gemini_format(images[0])
+                response_text = get_response(model, image_part, user_prompt, system_instruction)
+                # Extract JSON from Gemini response
+                json_match = re.search(r"\[\s*{.*?}\s*]", response_text, re.DOTALL)
+                if json_match:
+                    clean_json = json_match.group()
+                    data = json.loads(clean_json)
+                    df = pd.DataFrame(data)
+                    # Clean data
+                    df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")]
+                    if df.empty:
+                        st.warning("No valid rows with CODE ARTICLE found.")
+                    else:
+                        st.success("✅ Gemini responded!")
+                        st.dataframe(df)
+                        # Create Excel file in memory
+                        output = io.BytesIO()
+                        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
+                            df.to_excel(writer, index=False, sheet_name="Invoice Data")
+                        output.seek(0)
+                        # Download button
+                        st.download_button(
+                            label="📥 Download Excel",
+                            data=output,
+                            file_name="invoice_extracted.xlsx",
+                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                        )
+                else:
+                    st.error("❌ Could not find valid JSON in Gemini's response.")
+            except Exception as e:
+                st.error("⚠️ Failed to extract or parse data.")
+                st.exception(e)