YAMITEK committed on
Commit
0b4e7f3
·
verified ·
1 Parent(s): e4fbfc7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -0
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import fitz # PyMuPDF
3
+ from PIL import Image
4
+ import io
5
+ import pandas as pd
6
+ import json
7
+ import re
8
+ import google.generativeai as genai
9
+ from dotenv import load_dotenv
10
+ import os
11
+
12
# Read a local .env file (when one exists) into the process environment,
# then pass the value of the "api_key" variable to the Gemini client.
load_dotenv()
genai.configure(api_key=os.environ.get("api_key"))
15
+
16
def image_to_gemini_format(image):
    """Encode *image* as PNG and wrap the bytes in an inline-data dict.

    Args:
        image: Object exposing a PIL-style ``save(fp, format=...)`` method.

    Returns:
        A dict with ``"mime_type"`` set to ``"image/png"`` and ``"data"``
        holding the encoded PNG bytes.
    """
    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return {"mime_type": "image/png", "data": png_bytes}
24
+
25
def get_response(model, image_part, user_prompt, system_instruction):
    """Ask *model* to generate content and return the reply text.

    The request is a single list holding, in this order, the system
    instruction, the image part and the user prompt.

    Args:
        model: Object exposing ``generate_content(parts)``.
        image_part: Image payload placed between the two text parts.
        user_prompt: Task description placed last in the request.
        system_instruction: Role description placed first in the request.

    Returns:
        The ``text`` attribute of the object returned by
        ``model.generate_content``.
    """
    request_parts = [system_instruction, image_part, user_prompt]
    reply = model.generate_content(request_parts)
    return reply.text
33
+
34
def convert_pdf_to_images(pdf_bytes, dpi=300):
    """Render every page of an in-memory PDF to a PIL image.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Rendering resolution passed to ``page.get_pixmap``; defaults to
            300, the value that was previously hard-coded.

    Returns:
        A list with one PIL image per page, in page order (empty when the
        document yields no pages).

    Raises:
        Whatever ``fitz.open`` raises when *pdf_bytes* cannot be opened as a PDF.
    """
    # Open the document as a context manager so it is closed even when a
    # page fails to render; previously the document was never closed.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # Each page is serialised to standalone PNG bytes before being handed
        # to PIL, so the images do not depend on the document staying open.
        return [
            Image.open(io.BytesIO(page.get_pixmap(dpi=dpi).tobytes("png")))
            for page in doc
        ]
43
+
44
# Streamlit UI
#
# Flow: upload a PDF -> render its pages -> preview page 1 -> on button click
# send page 1 to Gemini -> parse the JSON rows from the reply -> show them and
# offer an Excel download.

SYSTEM_INSTRUCTION = "You are an AI specialized in extracting structured data from invoices."
USER_PROMPT = """
Extract the invoice table from the uploaded invoice document.
The table should include the following columns:
- CODE ARTICLE
- DESIGNATION
- QTE COMMANDÉE
- QTE LIVRÉE
- PRIX UNIT. REF
- PRIX UNIT. HT
- PRIX UNIT. TTC
- TOTAL HT
- TVA %
Also, extract and attach the following metadata fields to each row:
- N° CLIENT
- NOM CLIENT
- N° FACTURE
- DATE FACTURE
- DATE DE CDE
- Supplier/Company Name
After extraction:
- Create a clean pandas DataFrame containing all the above fields.
- Drop any rows where CODE ARTICLE is empty or missing.
- Return the data in JSON dictionary format.
"""
CODE_COLUMN = "CODE ARTICLE"
XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


def _as_row_list(payload):
    """Return *payload* as a non-empty list of dict rows, or None.

    Accepts either a list of objects or an object that wraps such a list
    as one of its values (e.g. ``{"rows": [...]}``).
    """
    if isinstance(payload, list):
        if payload and all(isinstance(row, dict) for row in payload):
            return payload
        return None
    if isinstance(payload, dict):
        for value in payload.values():
            if isinstance(value, list):
                rows = _as_row_list(value)
                if rows:
                    return rows
    return None


def _extract_invoice_rows(response_text):
    """Return the first JSON list of row objects embedded in *response_text*.

    The reply may surround the JSON with prose or code fences, so every
    ``[`` / ``{`` is tried as a possible start and decoded with a real JSON
    parser. Unlike a non-greedy regex, this handles nested arrays and
    brackets inside string values. Returns None when nothing usable is found.
    """
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[\[{]", response_text):
        try:
            payload, _ = decoder.raw_decode(response_text, opener.start())
        except json.JSONDecodeError:
            continue  # not the start of valid JSON; try the next bracket
        rows = _as_row_list(payload)
        if rows:
            return rows
    return None


def _drop_rows_without_code(df):
    """Keep only rows whose CODE ARTICLE is present and not blank.

    A missing CODE ARTICLE column yields an empty frame instead of a KeyError.
    """
    if CODE_COLUMN not in df.columns:
        return df.iloc[0:0]
    codes = df[CODE_COLUMN]
    # astype(str) lets numeric codes through; strip() rejects whitespace-only cells.
    has_code = codes.notna() & (codes.astype(str).str.strip() != "")
    return df[has_code]


def _to_excel_bytes(df):
    """Serialise *df* to an in-memory .xlsx workbook and return its bytes."""
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Invoice Data")
    return buffer.getvalue()


st.set_page_config(page_title="Invoice Extractor", layout="centered")
st.title("📄 Invoice Table Extractor using Gemini AI")

uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])

if uploaded_pdf:
    # Rendering can fail on corrupt or encrypted uploads; report it instead of
    # letting the script crash with a raw traceback.
    try:
        with st.spinner("Converting PDF to images..."):
            images = convert_pdf_to_images(uploaded_pdf.read())
    except Exception as e:  # UI boundary: surface any rendering failure
        st.error("⚠️ Could not read the uploaded PDF.")
        st.exception(e)
        st.stop()

    if not images:
        st.warning("The uploaded PDF contains no pages.")
        st.stop()

    st.image(images[0], caption="Page 1 of PDF", use_container_width=True)
    if len(images) > 1:
        # Only the first page is sent to Gemini below; make that visible.
        st.info(f"This PDF has {len(images)} pages; only page 1 is analysed.")

    # NOTE(review): clicking the download button reruns the script, after which
    # st.button() is False again and the table disappears. Persist the frame in
    # st.session_state if results should survive the download click.
    if st.button("Extract Table from Invoice"):
        with st.spinner("Extracting data with Gemini..."):
            try:
                model = genai.GenerativeModel('gemini-1.5-flash')
                image_part = image_to_gemini_format(images[0])
                response_text = get_response(
                    model, image_part, USER_PROMPT, SYSTEM_INSTRUCTION
                )

                rows = _extract_invoice_rows(response_text)
                if rows is None:
                    st.error("❌ Could not find valid JSON in Gemini's response.")
                else:
                    df = _drop_rows_without_code(pd.DataFrame(rows))

                    if df.empty:
                        st.warning("No valid rows with CODE ARTICLE found.")
                    else:
                        st.success("✅ Gemini responded!")
                        st.dataframe(df)

                        st.download_button(
                            label="📥 Download Excel",
                            data=_to_excel_bytes(df),
                            file_name="invoice_extracted.xlsx",
                            mime=XLSX_MIME,
                        )

            except Exception as e:  # UI boundary: API, parsing or export failure
                st.error("⚠️ Failed to extract or parse data.")
                st.exception(e)