Spaces:

YAMITEK
/

pdf_extracter_using_gemini

Sleeping

App Files Files Community

pdf_extracter_using_gemini / app.py

YAMITEK

Create app.py

0b4e7f3 verified about 1 year ago

raw

history blame contribute delete

4.21 kB

	import streamlit as st
	import fitz # PyMuPDF
	from PIL import Image
	import io
	import pandas as pd
	import json
	import re
	import google.generativeai as genai
	from dotenv import load_dotenv
	import os

	# Load environment variables
	load_dotenv()
	genai.configure(api_key=os.getenv("api_key")) # Secure API key loading

	# Convert PIL Image to format Gemini accepts
	def image_to_gemini_format(image):
	img_byte_arr = io.BytesIO()
	image.save(img_byte_arr, format="PNG")
	return {
	"mime_type": "image/png",
	"data": img_byte_arr.getvalue()
	}

	# Generate content using Gemini
	def get_response(model, image_part, user_prompt, system_instruction):
	response = model.generate_content([
	system_instruction,
	image_part,
	user_prompt
	])
	return response.text

	# Convert PDF to images
	def convert_pdf_to_images(pdf_bytes):
	images = []
	doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	for page in doc:
	pix = page.get_pixmap(dpi=300)
	img = Image.open(io.BytesIO(pix.tobytes("png")))
	images.append(img)
	return images

	# Streamlit UI
	st.set_page_config(page_title="Invoice Extractor", layout="centered")
	st.title("📄 Invoice Table Extractor using Gemini AI")

	uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])

	if uploaded_pdf:
	with st.spinner("Converting PDF to images..."):
	images = convert_pdf_to_images(uploaded_pdf.read())

	st.image(images[0], caption="Page 1 of PDF", use_container_width=True)

	if st.button("Extract Table from Invoice"):
	with st.spinner("Extracting data with Gemini..."):
	try:
	model = genai.GenerativeModel('gemini-1.5-flash')

	system_instruction = "You are an AI specialized in extracting structured data from invoices."
	user_prompt = """
	Extract the invoice table from the uploaded invoice document.
	The table should include the following columns:
	- CODE ARTICLE
	- DESIGNATION
	- QTE COMMANDÉE
	- QTE LIVRÉE
	- PRIX UNIT. REF
	- PRIX UNIT. HT
	- PRIX UNIT. TTC
	- TOTAL HT
	- TVA %
	Also, extract and attach the following metadata fields to each row:
	- N° CLIENT
	- NOM CLIENT
	- N° FACTURE
	- DATE FACTURE
	- DATE DE CDE
	- Supplier/Company Name
	After extraction:
	- Create a clean pandas DataFrame containing all the above fields.
	- Drop any rows where CODE ARTICLE is empty or missing.
	- Return the data in JSON dictionary format.
	"""

	image_part = image_to_gemini_format(images[0])
	response_text = get_response(model, image_part, user_prompt, system_instruction)

	# Extract JSON from Gemini response
	json_match = re.search(r"\[\s{.?}\s*]", response_text, re.DOTALL)
	if json_match:
	clean_json = json_match.group()
	data = json.loads(clean_json)
	df = pd.DataFrame(data)

	# Clean data
	df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")]

	if df.empty:
	st.warning("No valid rows with CODE ARTICLE found.")
	else:
	st.success("✅ Gemini responded!")
	st.dataframe(df)

	# Create Excel file in memory
	output = io.BytesIO()
	with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
	df.to_excel(writer, index=False, sheet_name="Invoice Data")
	output.seek(0)

	# Download button
	st.download_button(
	label="📥 Download Excel",
	data=output,
	file_name="invoice_extracted.xlsx",
	mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	)
	else:
	st.error("❌ Could not find valid JSON in Gemini's response.")

	except Exception as e:
	st.error("⚠️ Failed to extract or parse data.")
	st.exception(e)