Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import fitz # PyMuPDF | |
| from PIL import Image | |
| import io | |
| import pandas as pd | |
| import json | |
| import re | |
| import google.generativeai as genai | |
| from dotenv import load_dotenv | |
| import os | |
| # Load environment variables | |
| load_dotenv() | |
| genai.configure(api_key=os.getenv("api_key")) # Secure API key loading | |
| # Convert PIL Image to format Gemini accepts | |
| def image_to_gemini_format(image): | |
| img_byte_arr = io.BytesIO() | |
| image.save(img_byte_arr, format="PNG") | |
| return { | |
| "mime_type": "image/png", | |
| "data": img_byte_arr.getvalue() | |
| } | |
| # Generate content using Gemini | |
| def get_response(model, image_part, user_prompt, system_instruction): | |
| response = model.generate_content([ | |
| system_instruction, | |
| image_part, | |
| user_prompt | |
| ]) | |
| return response.text | |
| # Convert PDF to images | |
| def convert_pdf_to_images(pdf_bytes): | |
| images = [] | |
| doc = fitz.open(stream=pdf_bytes, filetype="pdf") | |
| for page in doc: | |
| pix = page.get_pixmap(dpi=300) | |
| img = Image.open(io.BytesIO(pix.tobytes("png"))) | |
| images.append(img) | |
| return images | |
| # Streamlit UI | |
| st.set_page_config(page_title="Invoice Extractor", layout="centered") | |
| st.title("📄 Invoice Table Extractor using Gemini AI") | |
| uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"]) | |
| if uploaded_pdf: | |
| with st.spinner("Converting PDF to images..."): | |
| images = convert_pdf_to_images(uploaded_pdf.read()) | |
| st.image(images[0], caption="Page 1 of PDF", use_container_width=True) | |
| if st.button("Extract Table from Invoice"): | |
| with st.spinner("Extracting data with Gemini..."): | |
| try: | |
| model = genai.GenerativeModel('gemini-1.5-flash') | |
| system_instruction = "You are an AI specialized in extracting structured data from invoices." | |
| user_prompt = """ | |
| Extract the invoice table from the uploaded invoice document. | |
| The table should include the following columns: | |
| - CODE ARTICLE | |
| - DESIGNATION | |
| - QTE COMMANDÉE | |
| - QTE LIVRÉE | |
| - PRIX UNIT. REF | |
| - PRIX UNIT. HT | |
| - PRIX UNIT. TTC | |
| - TOTAL HT | |
| - TVA % | |
| Also, extract and attach the following metadata fields to each row: | |
| - N° CLIENT | |
| - NOM CLIENT | |
| - N° FACTURE | |
| - DATE FACTURE | |
| - DATE DE CDE | |
| - Supplier/Company Name | |
| After extraction: | |
| - Create a clean pandas DataFrame containing all the above fields. | |
| - Drop any rows where CODE ARTICLE is empty or missing. | |
| - Return the data in JSON dictionary format. | |
| """ | |
| image_part = image_to_gemini_format(images[0]) | |
| response_text = get_response(model, image_part, user_prompt, system_instruction) | |
| # Extract JSON from Gemini response | |
| json_match = re.search(r"\[\s*{.*?}\s*]", response_text, re.DOTALL) | |
| if json_match: | |
| clean_json = json_match.group() | |
| data = json.loads(clean_json) | |
| df = pd.DataFrame(data) | |
| # Clean data | |
| df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")] | |
| if df.empty: | |
| st.warning("No valid rows with CODE ARTICLE found.") | |
| else: | |
| st.success("✅ Gemini responded!") | |
| st.dataframe(df) | |
| # Create Excel file in memory | |
| output = io.BytesIO() | |
| with pd.ExcelWriter(output, engine="xlsxwriter") as writer: | |
| df.to_excel(writer, index=False, sheet_name="Invoice Data") | |
| output.seek(0) | |
| # Download button | |
| st.download_button( | |
| label="📥 Download Excel", | |
| data=output, | |
| file_name="invoice_extracted.xlsx", | |
| mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
| ) | |
| else: | |
| st.error("❌ Could not find valid JSON in Gemini's response.") | |
| except Exception as e: | |
| st.error("⚠️ Failed to extract or parse data.") | |
| st.exception(e) | |