File size: 3,472 Bytes
fb58988
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import streamlit as st
from PyPDF2 import PdfReader
import re
from transformers import pipeline
import pandas as pd

# ----------------------------
# Helper functions
# ----------------------------
def extract_text_from_pdf(pdf_file):
    """Extract and whitespace-normalize all text from an uploaded PDF.

    Args:
        pdf_file: A file-like object readable by PyPDF2's PdfReader
            (e.g. a Streamlit UploadedFile).

    Returns:
        str: The concatenated text of every page, with runs of whitespace
        collapsed to single spaces and leading/trailing whitespace stripped.
    """
    reader = PdfReader(pdf_file)
    # extract_text() may return None for pages with no extractable text
    # (e.g. scanned images); substitute "" so join() cannot raise TypeError.
    text = " ".join(page.extract_text() or "" for page in reader.pages)
    return re.sub(r'\s+', ' ', text).strip()

@st.cache_resource
def load_qa_pipeline():
    """Load the question-answering pipeline from the local model directory.

    Cached by Streamlit so the model is loaded once per session, not on
    every rerun of the script.
    """
    local_model_dir = "./model"  # folder containing the saved QA model + tokenizer
    qa = pipeline("question-answering", model=local_model_dir, tokenizer=local_model_dir)
    return qa

def extract_fields_with_qa(text, qa_pipeline):
    """Ask a fixed set of extraction questions against the statement text.

    Args:
        text: The full statement text used as the QA context.
        qa_pipeline: A callable accepting ``question=``/``context=`` keyword
            arguments and returning a dict with an "answer" key (e.g. a
            transformers question-answering pipeline).

    Returns:
        dict: Maps each field key (bank_name, card_last4, billing_cycle,
        payment_due_date, total_amount_due) to the model's answer string,
        or "Not found" when the pipeline errored or returned no answer.
    """
    questions = {
        "bank_name": "Which bank issued this credit card statement?",
        "card_last4": "What are the last 4 digits of the credit card?",
        "billing_cycle": "What is the billing cycle or statement period?",
        "payment_due_date": "What is the payment due date?",
        "total_amount_due": "What is the total amount due?"
    }
    answers = {}
    for key, question in questions.items():
        try:
            result = qa_pipeline(question=question, context=text)
            answers[key] = result.get("answer", "Not found")
        except Exception:
            # A bare except would also swallow KeyboardInterrupt/SystemExit;
            # catch only ordinary errors and fall back to the sentinel value.
            answers[key] = "Not found"
    return answers

def clean_text(s):
    """Collapse runs of whitespace to single spaces and trim the ends.

    Falsy input (None or empty string) yields the sentinel "Not found".
    """
    if not s:
        return "Not found"
    # str.split()/join collapses any whitespace runs and drops the ends,
    # matching re.sub(r'\s+', ' ', s).strip().
    return " ".join(s.split())

def normalize_amount(amount):
    """Strip currency symbols and thousands separators; return the amount as a string.

    Args:
        amount: Raw amount text, e.g. "₹1,234.56" or "$500". May be None/empty.

    Returns:
        str: The first number found (digits with an optional decimal part),
        or "0" when the input is empty or contains no number.
    """
    if not amount:
        return "0"
    cleaned = amount.replace('₹', '').replace('$', '').replace(',', '').strip()
    # \d+(?:\.\d+)? requires at least one digit; the previous [\d\.]+ could
    # match a lone "." (or dot runs) and return it as a bogus "amount".
    match = re.search(r'\d+(?:\.\d+)?', cleaned)
    return match.group(0) if match else "0"

def normalize_date(date_str):
    """Whitespace-normalize a raw date string.

    Dates are kept as free text: whitespace runs are collapsed to single
    spaces and the ends trimmed; falsy input becomes "Not found".
    """
    if not date_str:
        return "Not found"
    return re.sub(r'\s+', ' ', date_str).strip()

def clean_extracted_data(data):
    """Map raw QA answers to display-ready fields.

    Text fields are whitespace-cleaned, the due date goes through date
    normalization and the amount through numeric normalization. Keys in
    the result are the human-readable column names used by the UI table.
    """
    field_spec = (
        ("Bank Name", "bank_name", clean_text),
        ("Card Last 4", "card_last4", clean_text),
        ("Billing Cycle", "billing_cycle", clean_text),
        ("Payment Due Date", "payment_due_date", normalize_date),
        ("Total Amount Due", "total_amount_due", normalize_amount),
    )
    return {label: convert(data.get(key, "")) for label, key, convert in field_spec}

# ----------------------------
# Streamlit UI
# ----------------------------
st.set_page_config(page_title="Credit Card Statement Extractor", page_icon="💳", layout="wide")
st.title("💳 Credit Card Statement Extractor")

uploaded_files = st.file_uploader(
    "Upload one or more credit card statement PDFs",
    type="pdf",
    accept_multiple_files=True
)

if uploaded_files:
    qa = load_qa_pipeline()
    rows = []

    # One row of extracted fields per uploaded statement.
    for statement in uploaded_files:
        with st.spinner(f"Processing {statement.name}..."):
            raw_text = extract_text_from_pdf(statement)
            fields = extract_fields_with_qa(raw_text, qa)
            row = clean_extracted_data(fields)
            row["File Name"] = statement.name
            rows.append(row)

    # Show every statement's fields in a single table.
    st.subheader("Extracted Information for All PDFs")
    results_df = pd.DataFrame(rows)
    st.dataframe(results_df.style.format({"Total Amount Due": "${}"}))

    # Offer the combined results as a CSV download.
    st.download_button(
        label="⬇️ Download All Extracted Data as CSV",
        data=results_df.to_csv(index=False).encode('utf-8'),
        file_name="all_credit_statements_data.csv",
        mime="text/csv",
    )