|
|
| import os |
|
|
|
|
# Streamlit needs a writable config location; point it (and HOME) at /tmp
# so the app can run in sandboxed / read-only-home container environments.
config_dir = "/tmp/.streamlit"
os.makedirs(config_dir, exist_ok=True)

os.environ["HOME"] = "/tmp"
os.environ["STREAMLIT_CONFIG_DIR"] = config_dir
|
|
|
|
| import streamlit as st |
| import pandas as pd |
| import pdfplumber |
| import io |
| import json |
| from google import genai |
|
|
| import re |
| import tempfile |
| import shutil |
| |
| |
|
|
| |
| GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") |
|
|
| |
| if "uploaded_files" not in st.session_state: |
| st.session_state.uploaded_files = [] |
| if "extracted_data" not in st.session_state: |
| st.session_state.extracted_data = [] |
|
|
| |
def extract_with_python(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Pages for which pdfplumber yields no text (e.g. image-only pages)
    are skipped. The pages are joined without separators, exactly as
    they come out of ``page.extract_text()``.
    """
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        return "".join(chunk for chunk in page_texts if chunk)
|
|
def extract_with_gemini(text, property_list):
    """Use the Gemini API for extraction, returning only required fields as JSON.

    Parameters
    ----------
    text : str
        Raw invoice text (typically produced by ``extract_with_python``).
    property_list : list[str]
        Field names to extract; each becomes a key in the returned dict.

    Returns
    -------
    dict
        Maps every name in ``property_list`` to its extracted value, or to
        an empty string when the model omitted it or the response could not
        be parsed as JSON.
    """
    client = genai.Client(api_key=GOOGLE_API_KEY)

    json_keys = ', '.join([f'"{p}"' for p in property_list])
    prompt = (
        f"Extract the following fields from the German invoice text below: {', '.join(property_list)}.\n"
        f"Return the result as a single compact JSON object with exactly these keys ({json_keys}). "
        f"If a value is missing, return an empty string for that key.\n"
        f"Here is the invoice text:\n{text}"
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt]
    )

    try:
        # The model may wrap the JSON in prose or markdown fences; slice out
        # the outermost {...} span before parsing. ``response.text`` can be
        # None for an empty candidate, hence the ``or ""`` guard.
        raw = response.text or ""
        start = raw.find('{')
        end = raw.rfind('}') + 1
        if start == -1 or end == 0:
            raise ValueError("no JSON object found in model response")
        data = json.loads(raw[start:end])
    except (ValueError, TypeError):
        # json.JSONDecodeError subclasses ValueError. Best-effort fallback:
        # return empty strings rather than crash the Streamlit app.
        return {key: "" for key in property_list}

    # Guarantee every requested key is present even if the model dropped some.
    for key in property_list:
        data.setdefault(key, "")
    return data
|
|
def extract_with_chatgpt(text):
    """Stub: Use ChatGPT API for extraction"""
    # Placeholder backend — echoes the input until a real integration exists.
    return text
|
|
def extract_with_claude(text):
    """Stub: Use Claude API for extraction"""
    # Placeholder backend — echoes the input until a real integration exists.
    return text
|
|
def extract_without_llm(text, property_list):
    """
    Extracts invoice data from text based on a list of properties.
    Returns a dictionary with the property names as keys and extracted values as values.
    If a value is not found, the value is set to an empty string.

    Known field names (``invoice_number``, ``invoice_date``, ``total_amount``,
    ``customer_name``, ``iban``, ``bic``, ``due_date``) use tuned German/English
    label patterns; any other name falls back to a generic ``"<name>: value"``
    lookup.
    """
    # Tuned patterns for common invoice fields (German and English labels).
    patterns = {
        "invoice_number": r"(?:Rechnungsnummer|Invoice No\.?|Nr\.?):?\s*([A-Za-z0-9\-\/]+)",
        "invoice_date": r"(?:Rechnungsdatum|Datum|Invoice Date):?\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})",
        "total_amount": r"(?:Gesamtbetrag|Total Amount|Betrag):?\s*([0-9\.,]+ ?(?:EUR|€))",
        "customer_name": r"(?:Kunde|Customer|Empfänger):?\s*([A-Za-zÄÖÜäöüß \-]+)",
        "iban": r"(?:IBAN):?\s*([A-Z]{2}[0-9]{2}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4}[ ]?[A-Z0-9]{4,})",
        "bic": r"(?:BIC|SWIFT):?\s*([A-Z0-9]{8,11})",
        "due_date": r"(?:Fälligkeitsdatum|Due Date):?\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})",
    }

    result = {}

    for prop in property_list:
        value = ""
        if prop in patterns:
            match = re.search(patterns[prop], text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
        else:
            # Generic fallback: "<prop>: value". Escape the property name so
            # user-supplied names containing regex metacharacters (e.g. "(",
            # "[") cannot raise re.error or shift the capture group.
            pattern = rf"{re.escape(prop)}:?\s*(.+)"
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # Keep only the first line of the captured text.
                value = match.group(1).split('\n')[0].strip()
        result[prop] = value

    return result
|
|
| |
# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("🇩🇪 German Invoice Processor")
st.sidebar.header("Configuration")

# Multi-file PDF upload widget; yields a list of uploaded file objects
# (empty/None until the user uploads something).
uploaded_files = st.file_uploader(
    "Upload PDF invoices",
    type="pdf",
    accept_multiple_files=True,
    help="Up to 50 PDF files at once"
)
|
|
# Persist the uploads and show what was received.
if uploaded_files:
    st.session_state.uploaded_files = uploaded_files
    st.subheader("Uploaded Files:")
    st.write([f.name for f in uploaded_files])

    # Copy each upload into the system temp directory so later steps can
    # work with real file paths.
    temp_dir = tempfile.gettempdir()
    saved_paths = []

    for uploaded_file in uploaded_files:
        # Security: the uploaded filename is untrusted input. Keep only the
        # base name so a crafted name like "../../etc/cron.d/x" cannot
        # escape temp_dir (path traversal).
        safe_name = os.path.basename(uploaded_file.name)
        save_path = os.path.join(temp_dir, safe_name)

        with open(save_path, "wb") as f:
            shutil.copyfileobj(uploaded_file, f)
        saved_paths.append(save_path)

    st.success(f"Files saved to temp directory: {temp_dir}")
    st.write(saved_paths)
|
|
# Sidebar controls: extraction backend and the comma-separated field list.
extraction_method = st.sidebar.radio(
    "Extraction Method:",
    ["Gemini API", "Without LLM"]
)
properties = st.sidebar.text_area(
    "Fields to extract (comma separated):",
    "Datum,Rechnungsnummer,Betrag,Steuer,Auftragsnummer"
)
# Normalize: strip whitespace around each name and drop empty entries.
property_list = [p.strip() for p in properties.split(",") if p.strip()]
|
|
# Run extraction over every uploaded PDF using the selected backend.
if st.button("Extract Data"):
    all_data = []
    with st.spinner("Extracting data, please wait..."):
        # Guard with ``or []``: file_uploader returns None before any upload,
        # and iterating None would raise TypeError when the button is clicked
        # without files.
        for pdf_file in (uploaded_files or []):
            # Always pull the raw text first; LLM methods build on it.
            base_text = extract_with_python(pdf_file)
            if extraction_method == "Gemini API":
                result_dict = extract_with_gemini(base_text, property_list)
            elif extraction_method == "ChatGPT":
                # NOTE(review): dead branch — "ChatGPT" is not offered by the
                # radio widget above; kept as scaffolding for future backends.
                result_dict = {prop: base_text for prop in property_list}
            elif extraction_method == "Claude":
                # NOTE(review): dead branch, see above.
                result_dict = {prop: base_text for prop in property_list}
            elif extraction_method == "Combined":
                # NOTE(review): dead branch, see above.
                llm_result = extract_with_gemini(base_text, property_list)
                result_dict = llm_result
            else:
                # "Without LLM": pure regex extraction.
                result_dict = extract_without_llm(base_text, property_list)

            # Tag each row with its source file for the results table.
            result_dict["Filename"] = pdf_file.name
            all_data.append(result_dict)

    st.session_state.extracted_data = all_data
    st.success(f"Processed {len(all_data)} files!")
|
|
# Display results and offer an Excel export once extraction has run.
if st.session_state.extracted_data:
    df = pd.DataFrame(st.session_state.extracted_data)
    # Column order: filename first, then the requested fields; any requested
    # field that no row produced is silently dropped.
    cols = ["Filename"] + property_list
    df = df[[col for col in cols if col in df.columns]]

    st.subheader("Extracted Data Preview")
    st.dataframe(df, use_container_width=True)

    # Build the .xlsx entirely in memory so it can feed the download button.
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    output.seek(0)  # rewind so the full buffer is served

    st.download_button(
        label="Export to Excel",
        data=output,
        file_name="invoice_data.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
| |
|
|