File size: 3,312 Bytes
ce5499f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from pypdf import PdfReader
import pandas as pd
import re
import torch

# Load Mistral model from Hugging Face.
# Streamlit re-executes this script on every interaction; @st.cache_resource
# ensures the 7B model is loaded once per server process, not per rerun.
@st.cache_resource
def _load_model():
    """Load and cache the Mistral tokenizer and model (eval mode)."""
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
    model = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.1",
        device_map="auto",
        # fp16 only when a GPU is present; fp32 on CPU to avoid slow/unsupported half ops
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    )
    model.eval()  # inference only — disable dropout etc.
    return tokenizer, model

# Keep the original module-level names so the rest of the file is unchanged.
tokenizer, model = _load_model()

# Read PDF text
# Read PDF text
def get_pdf_text(pdf_doc):
    """Return the concatenated text of every page in *pdf_doc*.

    Args:
        pdf_doc: A file path or file-like object accepted by pypdf.PdfReader
                 (Streamlit's UploadedFile works here).

    Returns:
        str: All extracted page text joined together; empty string for a
        PDF with no extractable text.
    """
    reader = PdfReader(pdf_doc)
    # extract_text() may return None for image-only / empty pages in some
    # pypdf versions — `or ""` prevents a TypeError when concatenating.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Extract invoice data using the model
def extracted_data(pages_data):
    prompt = f"""Extract the following values from the text:
invoice no., Description, Quantity, date, Unit price, Amount, Total, email, phone number, and address.

Text: {pages_data}

Output format:
{{
    'Invoice no.': '1001329',
    'Description': 'Office Chair',
    'Quantity': '2',
    'Date': '5/4/2023',
    'Unit price': '1100.00',
    'Amount': '2200.00',
    'Total': '2200.00',
    'Email': 'example@email.com',
    'Phone number': '9999999999',
    'Address': 'Mumbai, India'
}}
"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=512)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Process PDF list and build DataFrame
def create_docs(user_pdf_list):
    df = pd.DataFrame(columns=[
        'Invoice no.', 'Description', 'Quantity', 'Date',
        'Unit price', 'Amount', 'Total', 'Email',
        'Phone number', 'Address'
    ])

    for file in user_pdf_list:
        raw_text = get_pdf_text(file)
        llm_output = extracted_data(raw_text)

        # Try extracting JSON-like data from output
        pattern = r'{(.+)}'
        match = re.search(pattern, llm_output, re.DOTALL)
        if match:
            extracted = match.group(1)
            try:
                data_dict = eval("{" + extracted + "}")
                df = df.append([data_dict], ignore_index=True)
            except Exception as e:
                print("Parsing error:", e)
        else:
            print("Model response format issue.")

    return df

def main():
    """Streamlit entry point: upload invoice PDFs, extract fields, offer CSV."""
    st.set_page_config(page_title="Invoice Extraction Bot")
    st.title("Invoice Extraction Bot 🤖")
    st.subheader("Upload your PDF invoices to extract key information!")

    uploaded = st.file_uploader("Upload PDF invoices", type=["pdf"], accept_multiple_files=True)
    clicked = st.button("Extract Data")

    # Nothing to do until the user has both uploaded files and pressed the button.
    if not (clicked and uploaded):
        return

    with st.spinner("Extracting data from invoices..."):
        table = create_docs(uploaded)
        st.write(table)

        if not table.empty:
            st.download_button(
                "Download CSV",
                table.to_csv(index=False).encode("utf-8"),
                "invoice_data.csv",
                "text/csv",
                key="download-csv"
            )
    st.success("Data extraction completed! 🎉")

if __name__ == "__main__":
    main()