InvoiceBot / utils.py
Balaprime's picture
Create utils.py
ce5499f verified
import ast
import json
import re

import pandas as pd
import streamlit as st
import torch
from pypdf import PdfReader
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hugging Face model id, shared by the tokenizer and the model so the two
# cannot drift apart.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"


@st.cache_resource(show_spinner=False)
def _load_model():
    """Load the Mistral tokenizer and model once per server process.

    Streamlit re-executes the app script on every widget interaction; caching
    the loader keeps the weights from being loaded again on each rerun.
    ``show_spinner=False`` keeps this call from drawing a Streamlit element
    before ``main`` has called ``st.set_page_config``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        # Half precision only when CUDA is available, full precision otherwise.
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    loaded_model.eval()
    return loaded_tokenizer, loaded_model


# Module-level names used by the rest of this file.
tokenizer, model = _load_model()
# Read PDF text
def get_pdf_text(pdf_doc):
    """Return the extracted text of every page of *pdf_doc*, in page order.

    Args:
        pdf_doc: Passed unchanged to ``PdfReader``.

    Returns:
        The per-page ``extract_text()`` results concatenated with no separator.
    """
    pdf = PdfReader(pdf_doc)
    return "".join(pg.extract_text() for pg in pdf.pages)
# Extract invoice data using the model
def extracted_data(pages_data):
    """Ask the model to extract invoice fields from invoice text.

    Args:
        pages_data: The invoice text to embed in the prompt.

    Returns:
        The text the model generated *after* the prompt. The prompt is not
        echoed back, so the example dict in its "Output format" section cannot
        be mistaken for the model's answer.
    """
    prompt = f"""Extract the following values from the text:
invoice no., Description, Quantity, date, Unit price, Amount, Total, email, phone number, and address.
Text: {pages_data}
Output format:
{{
'Invoice no.': '1001329',
'Description': 'Office Chair',
'Quantity': '2',
'Date': '5/4/2023',
'Unit price': '1100.00',
'Amount': '2200.00',
'Total': '2200.00',
'Email': 'example@email.com',
'Phone number': '9999999999',
'Address': 'Mumbai, India'
}}
"""
    # NOTE(review): the whole prompt is capped at 2048 tokens. If the
    # tokenizer truncates on the right (presumably its default - confirm),
    # a long invoice pushes the "Output format" example out of the prompt.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    prompt_len = encoded["input_ids"].shape[1]

    with torch.no_grad():
        generated = model.generate(**encoded, max_new_tokens=512)

    # For a causal LM, generate() returns the prompt tokens followed by the
    # new tokens; keep only the newly generated part.
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
# Process PDF list and build DataFrame
# Column order of the result table (the fields the extraction prompt asks for).
_INVOICE_COLUMNS = [
    'Invoice no.', 'Description', 'Quantity', 'Date',
    'Unit price', 'Amount', 'Total', 'Email',
    'Phone number', 'Address',
]

# One brace-delimited span with no nested braces, i.e. a single flat dict.
_FLAT_DICT_RE = re.compile(r'\{[^{}]*\}')

# Everything ast.literal_eval / json.loads may raise on malformed input.
_PARSE_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)


def _load_literal(text):
    """Parse *text* as a Python literal, falling back to JSON."""
    try:
        return ast.literal_eval(text)
    except _PARSE_ERRORS:
        # JSON-only tokens such as null/true/false are not Python literals.
        return json.loads(text)


def _parse_invoice_dict(llm_output):
    """Return the last parseable dict literal in *llm_output*.

    The last candidate is preferred because the model's answer follows any
    text that precedes it in the output.

    Returns:
        The parsed dict, or ``None`` when the output has no ``{...}`` span.

    Raises:
        ValueError: ``{...}`` spans exist but none parses to a non-empty dict.
    """
    candidates = _FLAT_DICT_RE.findall(llm_output)
    if not candidates:
        return None
    last_error = "no candidate evaluated to a non-empty dict"
    for candidate in reversed(candidates):
        try:
            parsed = _load_literal(candidate)
        except _PARSE_ERRORS as exc:
            last_error = exc
            continue
        if isinstance(parsed, dict) and parsed:
            return parsed
    raise ValueError(last_error)


def create_docs(user_pdf_list):
    """Extract invoice fields from each PDF and collect them in a DataFrame.

    Args:
        user_pdf_list: Iterable of PDF inputs, each handed to ``get_pdf_text``.

    Returns:
        A DataFrame with one row per invoice whose model output could be
        parsed. The ten invoice columns come first; any extra keys the model
        returned follow. With no parsed rows the frame is empty but still has
        the ten invoice columns.
    """
    rows = []
    for file in user_pdf_list:
        raw_text = get_pdf_text(file)
        llm_output = extracted_data(raw_text)
        # SECURITY: the model output is derived from uploaded PDFs, so it is
        # parsed as a literal (ast.literal_eval / json.loads) and never passed
        # to eval(), which would execute arbitrary code embedded in it.
        try:
            data_dict = _parse_invoice_dict(llm_output)
        except ValueError as e:
            print("Parsing error:", e)
            continue
        if data_dict is None:
            print("Model response format issue.")
            continue
        rows.append(data_dict)

    if not rows:
        return pd.DataFrame(columns=_INVOICE_COLUMNS)

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x,
    # and appending per row copies the frame every time.
    frame = pd.DataFrame(rows)
    extra_columns = [col for col in frame.columns if col not in _INVOICE_COLUMNS]
    return frame.reindex(columns=_INVOICE_COLUMNS + extra_columns)
def main():
    """Render the Streamlit page: upload PDFs, extract fields, offer a CSV.

    Nothing is processed until the "Extract Data" button is pressed. The user
    is warned when the button is pressed without any uploaded file, or when
    extraction produced no rows, instead of the page staying silent.
    """
    st.set_page_config(page_title="Invoice Extraction Bot")
    st.title("Invoice Extraction Bot 🤖")
    st.subheader("Upload your PDF invoices to extract key information!")

    pdf_files = st.file_uploader("Upload PDF invoices", type=["pdf"], accept_multiple_files=True)
    submit = st.button("Extract Data")
    if not submit:
        return
    if not pdf_files:
        st.warning("Please upload at least one PDF invoice.")
        return

    with st.spinner("Extracting data from invoices..."):
        df = create_docs(pdf_files)
    st.write(df)

    if df.empty:
        # No CSV to offer and nothing to report as a success.
        st.warning("No invoice data could be extracted from the uploaded files.")
        return

    csv_data = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download CSV",
        csv_data,
        "invoice_data.csv",
        "text/csv",
        key="download-csv",
    )
    st.success("Data extraction completed! 🎉")


if __name__ == "__main__":
    main()