"""Streamlit app that extracts key invoice fields from uploaded PDFs using Mistral-7B-Instruct."""

import ast
import re

import pandas as pd
import streamlit as st
import torch
from pypdf import PdfReader
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

# Columns of the result table, in display order.
INVOICE_COLUMNS = [
    'Invoice no.', 'Description', 'Quantity', 'Date', 'Unit price',
    'Amount', 'Total', 'Email', 'Phone number', 'Address',
]

# Non-greedy so that the first flat {...} literal is matched on its own; a
# greedy match would run from the first "{" to the last "}" in the text.
_DICT_PATTERN = re.compile(r"\{.*?\}", re.DOTALL)


# Streamlit re-executes the script on every widget interaction; caching the
# loader keeps the model from being reloaded on each rerun.
# show_spinner=False so that no Streamlit element is emitted before
# st.set_page_config() runs in main().
@st.cache_resource(show_spinner=False)
def _load_model():
    """Load the tokenizer and the model (in eval mode) once per server process."""
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    lm.eval()
    return tok, lm


# Load Mistral model from Hugging Face
tokenizer, model = _load_model()


# Read PDF text
def get_pdf_text(pdf_doc):
    """Return the concatenated text of every page in *pdf_doc*.

    Args:
        pdf_doc: Anything ``PdfReader`` accepts (path or binary file-like object).

    Returns:
        The text of all pages joined together, in page order.
    """
    reader = PdfReader(pdf_doc)
    # `or ""` guards against a page yielding None/empty text (seen with some
    # PDF library versions on image-only pages), which would break the join.
    return "".join(page.extract_text() or "" for page in reader.pages)


# Extract invoice data using the model
def extracted_data(pages_data):
    """Ask the model to extract the invoice fields from *pages_data*.

    Args:
        pages_data: Raw invoice text.

    Returns:
        The text generated by the model, without the echoed prompt.
    """
    prompt = f"""Extract the following values from the text: invoice no., Description, Quantity, date, Unit price, Amount, Total, email, phone number, and address.

Text: {pages_data}

Output format:
{{
    'Invoice no.': '1001329',
    'Description': 'Office Chair',
    'Quantity': '2',
    'Date': '5/4/2023',
    'Unit price': '1100.00',
    'Amount': '2200.00',
    'Total': '2200.00',
    'Email': 'example@email.com',
    'Phone number': '9999999999',
    'Address': 'Mumbai, India'
}}
"""
    # NOTE(review): the prompt is truncated to 2048 tokens; for very long
    # invoices this presumably cuts off the trailing output-format example.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=512)

    # A causal LM returns prompt tokens followed by the continuation. Decode
    # only the continuation; otherwise the example dict in the prompt is
    # what the caller's parser finds first.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)


# Process PDF list and build DataFrame
def create_docs(user_pdf_list):
    """Extract invoice fields from each PDF and collect them in a DataFrame.

    Args:
        user_pdf_list: Iterable of PDF files (paths or file-like objects).

    Returns:
        A DataFrame with one row per successfully parsed invoice. The columns
        are ``INVOICE_COLUMNS`` followed by any extra keys the model produced.
        Files whose model output cannot be parsed are skipped and a message
        is printed.
    """
    rows = []
    for file in user_pdf_list:
        raw_text = get_pdf_text(file)
        llm_output = extracted_data(raw_text)

        # Try extracting dict-like data from output
        match = _DICT_PATTERN.search(llm_output)
        if match is None:
            print("Model response format issue.")
            continue

        try:
            # SECURITY: the model output is derived from user-uploaded PDFs,
            # so it must never go through eval(). literal_eval only accepts
            # Python literals and cannot execute code.
            data_dict = ast.literal_eval(match.group(0))
        except (ValueError, TypeError, SyntaxError) as e:
            print("Parsing error:", e)
            continue

        # "{'a'}" is a valid literal too, but a set, not a record.
        if not isinstance(data_dict, dict):
            print("Parsing error: expected a dict, got", type(data_dict).__name__)
            continue

        rows.append(data_dict)

    if not rows:
        return pd.DataFrame(columns=INVOICE_COLUMNS)

    # DataFrame.append was removed in pandas 2.0; build the frame in one go.
    df = pd.DataFrame(rows)
    extra_columns = [col for col in df.columns if col not in INVOICE_COLUMNS]
    return df.reindex(columns=INVOICE_COLUMNS + extra_columns)


def main():
    """Render the upload form, run the extraction and offer the result as CSV."""
    st.set_page_config(page_title="Invoice Extraction Bot")
    st.title("Invoice Extraction Bot 🤖")
    st.subheader("Upload your PDF invoices to extract key information!")

    pdf_files = st.file_uploader(
        "Upload PDF invoices", type=["pdf"], accept_multiple_files=True
    )
    submit = st.button("Extract Data")

    if not submit:
        return

    if not pdf_files:
        st.warning("Please upload at least one PDF invoice.")
        return

    with st.spinner("Extracting data from invoices..."):
        df = create_docs(pdf_files)

    st.write(df)

    if df.empty:
        st.warning("No invoice data could be extracted from the uploaded files.")
        return

    csv_data = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download CSV",
        csv_data,
        "invoice_data.csv",
        "text/csv",
        key="download-csv",
    )
    st.success("Data extraction completed! 🎉")


if __name__ == "__main__":
    main()