Spaces:
Runtime error
Runtime error
import ast
import re

import pandas as pd
import streamlit as st
import torch
from pypdf import PdfReader
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load Mistral model from Hugging Face (cached across Streamlit reruns)
# NOTE(review): show_spinner=False is intended to keep this cached call from
# emitting any UI element before main() calls st.set_page_config — confirm
# against the Streamlit version the app is deployed with.
@st.cache_resource(show_spinner=False)
def load_model(model_name="mistralai/Mistral-7B-Instruct-v0.1"):
    """Load the tokenizer and causal LM once and reuse them on later reruns.

    Streamlit re-executes the whole script on every widget interaction, so
    loading the checkpoint with plain module-level statements repeats the
    load on each rerun. ``st.cache_resource`` keeps one shared instance.

    Args:
        model_name: Hugging Face model id passed to both ``from_pretrained``
            calls.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        # Half precision only when a CUDA device is available.
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    loaded_model.eval()
    return loaded_tokenizer, loaded_model


# Module-level names used by extracted_data().
tokenizer, model = load_model()
# Read PDF text
def get_pdf_text(pdf_doc):
    """Return the extracted text of every page of *pdf_doc*.

    Args:
        pdf_doc: Whatever ``PdfReader`` accepts as its first argument; the
            caller in this file passes Streamlit uploaded-file objects.

    Returns:
        The page texts joined with a newline, so that the last token of one
        page is not fused with the first token of the next.
    """
    reader = PdfReader(pdf_doc)
    # ``or ""`` keeps the join safe if extraction yields nothing for a page.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
# Extract invoice data using the model
def extracted_data(pages_data):
    """Ask the model to extract invoice fields from *pages_data*.

    Builds a prompt containing the invoice text and an example of the
    expected dict-style output, runs ``model.generate`` and returns the
    decoded continuation.

    Args:
        pages_data: Raw invoice text to embed in the prompt.

    Returns:
        The text generated by the model, without the echoed prompt. Decoding
        ``outputs[0]`` in full would also return the prompt (including its
        example dict), which the caller's brace-matching would then pick up
        instead of, or together with, the real answer.
    """
    prompt = f"""Extract the following values from the text:
invoice no., Description, Quantity, date, Unit price, Amount, Total, email, phone number, and address.
Text: {pages_data}
Output format:
{{
'Invoice no.': '1001329',
'Description': 'Office Chair',
'Quantity': '2',
'Date': '5/4/2023',
'Unit price': '1100.00',
'Amount': '2200.00',
'Total': '2200.00',
'Email': 'example@email.com',
'Phone number': '9999999999',
'Address': 'Mumbai, India'
}}
"""
    # NOTE(review): the prompt is truncated to 2048 tokens and the invoice
    # text precedes the "Output format" section, so with the tokenizer's
    # usual right-side truncation a long invoice would cut the format example
    # off — confirm the tokenizer's truncation side and typical input length.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=512)
    # The generated sequence starts with the prompt tokens; keep only the
    # newly generated tail.
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)
# Process PDF list and build DataFrame
def create_docs(user_pdf_list):
    """Extract invoice fields from each PDF and collect them in a DataFrame.

    For every file the text is read with ``get_pdf_text``, sent to
    ``extracted_data``, and the last flat ``{...}`` span of the model output
    is parsed as a Python dict literal. Files whose output cannot be parsed
    are reported on stdout and skipped.

    Args:
        user_pdf_list: Iterable of PDF inputs accepted by ``get_pdf_text``.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed invoice.
        The ten expected columns always come first; any additional keys the
        model returned follow them. The frame is empty when nothing parsed.
    """
    columns = [
        'Invoice no.', 'Description', 'Quantity', 'Date',
        'Unit price', 'Amount', 'Total', 'Email',
        'Phone number', 'Address',
    ]
    # Non-nested brace-delimited spans. A greedy ``{(.+)}`` with DOTALL would
    # run from the first "{" to the last "}" and swallow everything between.
    dict_pattern = re.compile(r"\{[^{}]+\}")

    rows = []
    for file in user_pdf_list:
        raw_text = get_pdf_text(file)
        llm_output = extracted_data(raw_text)

        candidates = dict_pattern.findall(llm_output)
        if not candidates:
            print("Model response format issue.")
            continue

        try:
            # SECURITY: the model output is derived from uploaded documents,
            # so it must not be passed to eval(); literal_eval only accepts
            # Python literals and cannot execute code.
            data_dict = ast.literal_eval(candidates[-1])
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            print("Parsing error:", e)
            continue

        if not isinstance(data_dict, dict):
            # e.g. "{'a', 'b'}" parses as a set, which is not a usable row.
            print("Parsing error:", f"expected a dict, got {type(data_dict).__name__}")
            continue
        rows.append(data_dict)

    if not rows:
        return pd.DataFrame(columns=columns)

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    frame = pd.DataFrame(rows)
    extra_columns = [name for name in frame.columns if name not in columns]
    return frame.reindex(columns=columns + extra_columns)
def main():
    """Render the Streamlit page and run extraction when the user asks for it.

    Shows the uploader and the "Extract Data" button. After a click the
    uploaded PDFs are passed to ``create_docs``; the resulting table is
    displayed and, when it has rows, offered as a CSV download. Clicking
    without any upload, or getting no rows back, produces a warning instead
    of silently doing nothing or reporting success.
    """
    st.set_page_config(page_title="Invoice Extraction Bot")
    st.title("Invoice Extraction Bot 🤖")
    st.subheader("Upload your PDF invoices to extract key information!")
    pdf_files = st.file_uploader("Upload PDF invoices", type=["pdf"], accept_multiple_files=True)
    submit = st.button("Extract Data")

    if not submit:
        return
    if not pdf_files:
        st.warning("Please upload at least one PDF invoice first.")
        return

    with st.spinner("Extracting data from invoices..."):
        df = create_docs(pdf_files)
    st.write(df)

    if df.empty:
        st.warning("No invoice data could be extracted from the uploaded files.")
        return

    csv_data = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download CSV",
        csv_data,
        "invoice_data.csv",
        "text/csv",
        key="download-csv",
    )
    st.success("Data extraction completed! 🎉")


if __name__ == "__main__":
    main()