| import streamlit as st |
| import fitz |
| from PIL import Image |
| import io |
| import pandas as pd |
| import json |
| import google.generativeai as genai |
| from dotenv import load_dotenv |
| import os |
| import requests |
|
|
| |
| load_dotenv() |
|
|
| |
| genai.configure(api_key=os.getenv("apikey")) |
|
|
| def get_response(model, user_input, image, prompt): |
| """Generate response from the model using input and image data.""" |
| try: |
| |
| response = model.generate_content([user_input, image, prompt]) |
| return response.text |
| except requests.exceptions.RequestException as e: |
| st.error(f"⚠️ Error while calling the API: {e}") |
| return None |
|
|
| def convert_pdf_to_images(pdf_bytes): |
| """Convert PDF to images using fitz (PyMuPDF).""" |
| images = [] |
| doc = fitz.open(stream=pdf_bytes, filetype="pdf") |
| for page in doc: |
| pix = page.get_pixmap(dpi=300) |
| img = Image.open(io.BytesIO(pix.tobytes("png"))) |
| images.append(img) |
| return images |
|
|
| |
| st.set_page_config(page_title="Invoice Extractor", layout="centered") |
| st.title("📄 Invoice Table Extractor using Gemini AI") |
|
|
| uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"]) |
|
|
| if uploaded_pdf: |
| with st.spinner("Converting PDF to images..."): |
| images = convert_pdf_to_images(uploaded_pdf.read()) |
|
|
| st.image(images[0], caption="Page 1 of PDF", use_column_width=True) |
|
|
| if st.button("Extract Table from Invoice"): |
| with st.spinner("Extracting data with Gemini..."): |
| model = genai.GenerativeModel('gemini-1.5-flash') |
|
|
| prompt = """Extract the invoice table from the uploaded invoice document. |
| The table should include the following columns: |
| - CODE ARTICLE |
| - DESIGNATION |
| - QTE COMMANDÉE |
| - QTE LIVRÉE |
| - PRIX UNIT. REF |
| - PRIX UNIT. HT |
| - PRIX UNIT. TTC |
| - TOTAL HT |
| - TVA % |
| Also, extract and attach the following metadata fields to each row: |
| - N° CLIENT |
| - NOM CLIENT |
| - N° FACTURE |
| - DATE FACTURE |
| - DATE DE CDE |
| - Supplier/Company Name |
| After extraction: |
| - Create a clean pandas DataFrame containing all the above fields. |
| - Drop any rows where CODE ARTICLE is empty or missing. |
| - Return the data in JSON dictionary format. |
| """ |
|
|
| try: |
| response_text = get_response(model, prompt, images[0], prompt) |
| if response_text: |
| st.success("✅ Gemini responded!") |
|
|
| |
| start_index = response_text.find('[') |
| end_index = response_text.rfind(']') + 1 |
| clean_json = response_text[start_index:end_index] |
| data = json.loads(clean_json) |
| df = pd.DataFrame(data) |
|
|
| |
| df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")] |
|
|
| st.dataframe(df) |
|
|
| |
| output = io.BytesIO() |
| with pd.ExcelWriter(output, engine="xlsxwriter") as writer: |
| df.to_excel(writer, index=False, sheet_name="Invoice Data") |
| writer.save() |
| st.download_button( |
| label="📥 Download Excel", |
| data=output.getvalue(), |
| file_name="invoice_extracted.xlsx", |
| mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
| ) |
|
|
| except Exception as e: |
| st.error("⚠️ Failed to extract or parse data.") |
| st.exception(e) |