Spaces:

jitubutwal1441
/

invoice-data-extractor-germany

Sleeping

App Files Files Community

invoice-data-extractor-germany / src /streamlit_app1.py

jitubutwal1441

Rename src/streamlit_app.py to src/streamlit_app1.py

1a32909 verified 11 months ago

raw

history blame contribute delete

7.4 kB


	import os


	# Create the Streamlit config directory under /tmp and redirect HOME and
	# STREAMLIT_CONFIG_DIR there before streamlit is imported below
	# (presumably because the default home is not writable where this app
	# is hosted - TODO confirm).
	config_dir = "/tmp/.streamlit"
	os.makedirs(config_dir, exist_ok=True)
	os.environ.update(HOME="/tmp", STREAMLIT_CONFIG_DIR=config_dir)


	import streamlit as st
	import pandas as pd
	import pdfplumber
	import io
	import json
	from google import genai

	import re
	import tempfile
	import shutil
	# import openai
	# import anthropic

	# The Gemini API key is read from the environment (None when unset).
	# In production, st.secrets.get('GOOGLE_API_KEY', '') is the suggested source.
	GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

	# --- Session State ---
	# Give both session entries an (independent) empty list on the first run
	# so later code can read them unconditionally.
	for _state_key in ("uploaded_files", "extracted_data"):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = []

	# --- Extraction Functions ---
	def extract_with_python(pdf_file):
	    """Extract the plain text of a PDF with pdfplumber.

	    Args:
	        pdf_file: Whatever ``pdfplumber.open`` accepts; the caller in this
	            file passes the objects returned by ``st.file_uploader``.

	    Returns:
	        The text of every page that yielded text, joined by newlines.
	        An empty string when no page yields any text.
	    """
	    with pdfplumber.open(pdf_file) as pdf:
	        # Extract while the document is still open.
	        page_texts = [page.extract_text() for page in pdf.pages]
	    # Pages without extractable text give a falsy value and are skipped.
	    # A newline separator keeps the last line of one page from being glued
	    # to the first line of the next, which would confuse later matching.
	    return "\n".join(page_text for page_text in page_texts if page_text)

	def extract_with_gemini(text, property_list):
	    """Use the Gemini API to extract the requested fields from invoice text.

	    Args:
	        text: Invoice text that is embedded verbatim in the prompt.
	        property_list: Names of the fields to extract; they become the keys
	            of the returned dictionary.

	    Returns:
	        A dict with exactly the keys in ``property_list``. A key the model
	        did not return, or returned as null, maps to "". If no JSON object
	        can be parsed from the reply, every key maps to "".

	    Raises:
	        Whatever the client constructor or ``generate_content`` raises; the
	        API call itself is not guarded.
	    """
	    client = genai.Client(api_key=GOOGLE_API_KEY)
	    # Build the prompt dynamically from the requested field names.
	    json_keys = ", ".join(f'"{p}"' for p in property_list)
	    prompt = (
	        f"Extract the following fields from the German invoice text below: {', '.join(property_list)}.\n"
	        f"Return the result as a single compact JSON object with exactly these keys ({json_keys}). "
	        f"If a value is missing, return an empty string for that key.\n"
	        f"Here is the invoice text:\n{text}"
	    )
	    response = client.models.generate_content(
	        model="gemini-2.0-flash",
	        contents=[prompt],
	    )

	    empty_result = {key: "" for key in property_list}
	    try:
	        raw = response.text
	        # The reply may wrap the JSON in other text, so parse only the span
	        # from the first "{" to the last "}".
	        start = raw.find("{")
	        end = raw.rfind("}") + 1
	        data = json.loads(raw[start:end])
	    except (AttributeError, TypeError, ValueError):
	        # No usable text (e.g. ``text`` is None) or invalid JSON
	        # (json.JSONDecodeError is a ValueError): fall back to empty fields.
	        return empty_result
	    if not isinstance(data, dict):
	        return empty_result

	    # Keep only the requested keys and normalise missing/null values to "".
	    result = {}
	    for key in property_list:
	        value = data.get(key, "")
	        result[key] = "" if value is None else value
	    return result

	def extract_with_chatgpt(text):
	    """Placeholder for a ChatGPT-based extractor.

	    No API call is made; the argument is handed back unchanged.
	    """
	    unchanged = text  # demonstration stub: nothing is extracted yet
	    return unchanged

	def extract_with_claude(text):
	    """Placeholder for a Claude-based extractor.

	    No API call is made; the argument is handed back unchanged.
	    """
	    unchanged = text  # demonstration stub: nothing is extracted yet
	    return unchanged

	def extract_without_llm(text, property_list):
	    """Extract invoice fields from text with regular expressions only.

	    Properties with a predefined pattern (``invoice_number``,
	    ``invoice_date``, ``total_amount``, ``customer_name``, ``iban``,
	    ``bic``, ``due_date``) use that pattern. Any other property falls back
	    to searching for the property name itself, optionally followed by a
	    colon, and taking the rest of that line. All matching is
	    case-insensitive.

	    Args:
	        text: The invoice text to search.
	        property_list: Names of the properties to extract.

	    Returns:
	        A dict mapping every name in ``property_list`` to the extracted,
	        whitespace-stripped value, or to "" when nothing was found.
	    """
	    # Regex patterns for common invoice fields (expand as needed). The
	    # label alternatives are separated by a plain "|"; an escaped "\|"
	    # would only match a literal pipe character.
	    patterns = {
	        "invoice_number": r"(?:Rechnungsnummer|Invoice No\.?|Nr\.?):?\s*([A-Za-z0-9\-\/]+)",
	        "invoice_date": r"(?:Rechnungsdatum|Datum|Invoice Date):?\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})",
	        "total_amount": r"(?:Gesamtbetrag|Total Amount|Betrag):?\s*([0-9\.,]+ ?(?:EUR|€))",
	        "customer_name": r"(?:Kunde|Customer|Empfänger):?\s*([A-Za-zÄÖÜäöüß \-]+)",
	        "iban": r"(?:IBAN):?\s*([A-Z]{2}[0-9]{2}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4,})",
	        "bic": r"(?:BIC|SWIFT):?\s*([A-Z0-9]{8,11})",
	        "due_date": r"(?:Fälligkeitsdatum|Due Date):?\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})",
	    }

	    result = {}
	    for prop in property_list:
	        pattern = patterns.get(prop)
	        if pattern is None:
	            # Fallback: look for the property name and capture what follows
	            # it. The name is escaped so characters such as "(" or "." in a
	            # user-supplied field name are matched literally instead of
	            # being interpreted as regex syntax.
	            pattern = rf"{re.escape(prop)}:?\s*(.+)"
	        match = re.search(pattern, text, re.IGNORECASE)
	        # None of the capture groups can span a newline ("." does not match
	        # "\n" here), so stripping is all the clean-up needed.
	        result[prop] = match.group(1).strip() if match else ""
	    return result

	# --- Streamlit UI ---
	# --- Streamlit UI ---
	st.title("🇩🇪 German Invoice Processor")
	st.sidebar.header("Configuration")

	uploaded_files = st.file_uploader(
	    "Upload PDF invoices",
	    type="pdf",
	    accept_multiple_files=True,
	    help="Up to 50 PDF files at once"
	)

	if uploaded_files:
	    st.session_state.uploaded_files = uploaded_files
	    st.subheader("Uploaded Files:")
	    st.write([f.name for f in uploaded_files])

	    # Save a copy of every upload in the system temp directory.
	    temp_dir = tempfile.gettempdir()
	    saved_paths = []

	    for uploaded_file in uploaded_files:
	        # The file name comes from the client; keep only its final path
	        # component so it cannot point outside the temp directory.
	        safe_name = os.path.basename(uploaded_file.name)
	        save_path = os.path.join(temp_dir, safe_name)
	        # getvalue() returns the whole upload regardless of the current
	        # stream position and leaves that position untouched, so the copy
	        # is complete on every script rerun and later readers still start
	        # where they expect.
	        with open(save_path, "wb") as f:
	            f.write(uploaded_file.getvalue())
	        saved_paths.append(save_path)

	    st.success(f"Files saved to temp directory: {temp_dir}")
	    st.write(saved_paths)

	# Sidebar: extraction backend. Only these two are offered; Python-only,
	# ChatGPT, Claude and Combined were options in an earlier version.
	extraction_method = st.sidebar.radio(
	    label="Extraction Method:",
	    options=["Gemini API", "Without LLM"],
	)

	# Sidebar: field names to extract, as one comma-separated string.
	properties = st.sidebar.text_area(
	    label="Fields to extract (comma separated):",
	    value="Datum,Rechnungsnummer,Betrag,Steuer,Auftragsnummer",
	)

	# Split on commas, trim whitespace and drop entries that end up empty.
	_stripped_names = (raw_name.strip() for raw_name in properties.split(","))
	property_list = [name for name in _stripped_names if name]

	# Run the extraction for every uploaded PDF when the button is pressed and
	# keep the per-file results in the session state.
	if st.button("Extract Data"):
	    all_data = []
	    with st.spinner("Extracting data, please wait..."):
	        for pdf_file in uploaded_files:
	            # Rewind first: the upload stream may already have been read
	            # elsewhere in this script run, and parsing must start at
	            # byte 0.
	            pdf_file.seek(0)
	            base_text = extract_with_python(pdf_file)

	            # The sidebar radio offers exactly two methods, so anything
	            # other than "Gemini API" is the regex-based extractor.
	            if extraction_method == "Gemini API":
	                result_dict = extract_with_gemini(base_text, property_list)
	            else:
	                result_dict = extract_without_llm(base_text, property_list)

	            # Record which file the row came from.
	            result_dict["Filename"] = pdf_file.name
	            all_data.append(result_dict)

	    st.session_state.extracted_data = all_data
	    st.success(f"Processed {len(all_data)} files!")

	# Show the stored extraction results and offer them as an Excel download.
	if st.session_state.extracted_data:
	    # Column order: "Filename" first, then the currently requested fields;
	    # names that are not present in the data are left out.
	    wanted_columns = ["Filename", *property_list]
	    table = pd.DataFrame(st.session_state.extracted_data)
	    table = table[[name for name in wanted_columns if name in table.columns]]

	    st.subheader("Extracted Data Preview")
	    st.dataframe(table, use_container_width=True)

	    # Write the workbook into memory, then rewind so the download starts
	    # at the first byte.
	    excel_buffer = io.BytesIO()
	    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
	        table.to_excel(writer, index=False)
	    excel_buffer.seek(0)

	    st.download_button(
	        label="Export to Excel",
	        data=excel_buffer,
	        file_name="invoice_data.xlsx",
	        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	    )