invoice-data-extractor-germany / src /streamlit_app1.py
jitubutwal1441's picture
Rename src/streamlit_app.py to src/streamlit_app1.py
1a32909 verified
import os
# Streamlit configuration directory; created up front so it already exists
# by the time streamlit is imported further down.
config_dir = "/tmp/.streamlit"
os.makedirs(config_dir, exist_ok=True)
# Redirect HOME and the Streamlit config location to /tmp *before* the
# streamlit import below.
# NOTE(review): presumably required because the default home directory is not
# writable in the deployment environment — confirm.
os.environ["HOME"] = "/tmp"
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"
import streamlit as st
import pandas as pd
import pdfplumber
import io
import json
from google import genai
import re
import tempfile
import shutil
# import openai
# import anthropic
# The Gemini API key is taken from the process environment. In production,
# prefer Streamlit secrets, e.g. st.secrets.get('GOOGLE_API_KEY', '').
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# --- Session State ---
# Seed the per-session containers once; Streamlit re-runs this script on every
# interaction, so values that are already present must be left untouched.
for _state_key in ("uploaded_files", "extracted_data"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = []
# --- Extraction Functions ---
def extract_with_python(pdf_file):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_file: Passed straight to ``pdfplumber.open``; the UI hands in the
            uploaded file object.

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        An empty string if no page produced text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Join with a newline: plain concatenation glued the last line of one page
    # to the first line of the next, which breaks line-based field extraction.
    return "\n".join(page_text for page_text in page_texts if page_text)
def extract_with_gemini(text, property_list):
    """Use the Gemini API to extract the requested fields from invoice text.

    Args:
        text: Plain text of the invoice.
        property_list: Names of the fields to extract.

    Returns:
        A dict with exactly one entry per name in ``property_list``. Fields
        the model did not return (or returned as JSON ``null``) are set to
        an empty string. If the response contains no parseable JSON object,
        every field is an empty string.

    Raises:
        Whatever the Gemini client raises while creating the client or
        generating content; those calls are intentionally not wrapped here.
    """
    client = genai.Client(api_key=GOOGLE_API_KEY)
    # Build the prompt dynamically
    json_keys = ', '.join(f'"{p}"' for p in property_list)
    prompt = (
        f"Extract the following fields from the German invoice text below: {', '.join(property_list)}.\n"
        f"Return the result as a single compact JSON object with exactly these keys ({json_keys}). "
        f"If a value is missing, return an empty string for that key.\n"
        f"Here is the invoice text:\n{text}"
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt]
    )

    empty_result = {key: "" for key in property_list}
    try:
        # The response text may be None (no usable candidate); treat as empty.
        raw = response.text or ""
        # Take the outermost {...} span so code fences or prose around the
        # JSON object are ignored.
        start = raw.find('{')
        end = raw.rfind('}') + 1
        if start == -1 or end <= start:
            return empty_result
        data = json.loads(raw[start:end])
    except (ValueError, TypeError):
        # Best effort: an unreadable or malformed answer yields empty fields.
        return empty_result
    if not isinstance(data, dict):
        return empty_result

    # Return only the requested keys, in the requested order; missing or
    # null values become empty strings.
    result = {}
    for key in property_list:
        value = data.get(key)
        result[key] = "" if value is None else value
    return result
def extract_with_chatgpt(text):
    """Placeholder for a ChatGPT-based extractor.

    No API call is made yet: the input text is handed back unchanged.
    """
    # Demonstration only: pass the text through untouched.
    return text
def extract_with_claude(text):
    """Placeholder for a Claude-based extractor.

    No API call is made yet: the input text is handed back unchanged.
    """
    # Demonstration only: pass the text through untouched.
    return text
def extract_without_llm(text, property_list):
    """
    Extract invoice fields from text using regular expressions only.

    For a property that has a predefined pattern (see ``patterns`` below), the
    first capture group of the first case-insensitive match is used. For any
    other property, the property name itself is searched for literally
    (case-insensitively), optionally followed by a colon, and the rest of the
    line after it is taken as the value.

    Args:
        text: Plain text of the invoice.
        property_list: Names of the fields to extract.

    Returns:
        A dict mapping every name in ``property_list`` to the extracted value,
        or to an empty string when nothing was found.
    """
    # Define regex patterns for common invoice fields (expand as needed)
    patterns = {
        "invoice_number": r"(?:Rechnungsnummer|Invoice No\.?|Nr\.?):?\s*([A-Za-z0-9\-\/]+)",
        "invoice_date": r"(?:Rechnungsdatum|Datum|Invoice Date):?\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})",
        "total_amount": r"(?:Gesamtbetrag|Total Amount|Betrag):?\s*([0-9\.,]+ ?(?:EUR|€))",
        "customer_name": r"(?:Kunde|Customer|Empfänger):?\s*([A-Za-zÄÖÜäöüß \-]+)",
        "iban": r"(?:IBAN):?\s*([A-Z]{2}[0-9]{2}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4,})",
        "bic": r"(?:BIC|SWIFT):?\s*([A-Z0-9]{8,11})",
        "due_date": r"(?:Fälligkeitsdatum|Due Date):?\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})",
        # Add more patterns as needed
    }

    result = {}
    for prop in property_list:
        if prop in patterns:
            match = re.search(patterns[prop], text, re.IGNORECASE)
        else:
            # Fallback: look for the property name followed by its value.
            # The name is user input, so it must be escaped; otherwise names
            # such as "Betrag (netto)" are read as regex syntax and either
            # fail to match or raise re.error.
            fallback = rf"{re.escape(prop)}:?\s*(.+)"
            match = re.search(fallback, text, re.IGNORECASE)
        # "." never matches a newline, so group 1 is already a single line.
        result[prop] = match.group(1).strip() if match else ""
    return result
# --- Streamlit UI ---
# Top-level page script: upload PDFs, choose an extraction method and the
# fields to extract, run the extraction, then preview and export the results.
st.title("🇩🇪 German Invoice Processor")
st.sidebar.header("Configuration")

uploaded_files = st.file_uploader(
    "Upload PDF invoices",
    type="pdf",
    accept_multiple_files=True,
    help="Up to 50 PDF files at once"
)

if uploaded_files:
    st.session_state.uploaded_files = uploaded_files
    st.subheader("Uploaded Files:")
    st.write([f.name for f in uploaded_files])

    # Keep a copy of every upload in the system temp directory.
    temp_dir = tempfile.gettempdir()
    saved_paths = []
    for uploaded_file in uploaded_files:
        # basename() keeps a client-supplied name with path components from
        # escaping the temp directory.
        save_path = os.path.join(temp_dir, os.path.basename(uploaded_file.name))
        # getvalue() returns the whole upload regardless of the current stream
        # position and does not move it, so the later PDF parsing step still
        # sees the file from the start.
        with open(save_path, "wb") as f:
            f.write(uploaded_file.getvalue())
        saved_paths.append(save_path)
    st.success(f"Files saved to temp directory: {temp_dir}")
    st.write(saved_paths)

extraction_method = st.sidebar.radio(
    "Extraction Method:",
    ["Gemini API", "Without LLM"]
)
properties = st.sidebar.text_area(
    "Fields to extract (comma separated):",
    "Datum,Rechnungsnummer,Betrag,Steuer,Auftragsnummer"
)
property_list = [p.strip() for p in properties.split(",") if p.strip()]

if st.button("Extract Data"):
    if not uploaded_files:
        st.warning("Please upload at least one PDF invoice first.")
    else:
        all_data = []
        with st.spinner("Extracting data, please wait..."):
            for pdf_file in uploaded_files:
                # UI boundary: a single unreadable PDF or failed API call is
                # reported and skipped instead of aborting the whole batch.
                try:
                    pdf_file.seek(0)
                    base_text = extract_with_python(pdf_file)
                    # Only the two methods offered by the radio above are
                    # handled; anything other than Gemini uses the regex path.
                    if extraction_method == "Gemini API":
                        result_dict = extract_with_gemini(base_text, property_list)
                    else:
                        result_dict = extract_without_llm(base_text, property_list)
                except Exception as exc:
                    st.error(f"Could not process {pdf_file.name}: {exc}")
                    continue
                # Add file name
                result_dict["Filename"] = pdf_file.name
                all_data.append(result_dict)
        st.session_state.extracted_data = all_data
        st.success(f"Processed {len(all_data)} files!")

if st.session_state.extracted_data:
    df = pd.DataFrame(st.session_state.extracted_data)
    # Column order: "Filename" first, then the user's fields. dict.fromkeys
    # drops repeated names (including a user field called "Filename") so the
    # frame never ends up with duplicate columns.
    cols = list(dict.fromkeys(["Filename", *property_list]))
    df = df[[col for col in cols if col in df.columns]]
    st.subheader("Extracted Data Preview")
    st.dataframe(df, use_container_width=True)

    # Build the Excel workbook in memory and offer it for download.
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    output.seek(0)
    st.download_button(
        label="Export to Excel",
        data=output,
        file_name="invoice_data.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )