# fintech-assignment / src/streamlit_app.py
# yakul259 - Rename src/app.py to src/streamlit_app.py
# commit 4d3700e (verified)
import streamlit as st
from PyPDF2 import PdfReader
import re
from transformers import pipeline
import pandas as pd
# ----------------------------
# Helper functions
# ----------------------------
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one normalized string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts; the app passes the
            objects returned by ``st.file_uploader``.

    Returns:
        The page texts joined with single spaces, with every run of
        whitespace collapsed to one space and the ends stripped. An empty
        string when no page yields any text.
    """
    reader = PdfReader(pdf_file)
    # A page that yields None instead of a string (e.g. an image-only page
    # in some extractor versions) would make str.join raise TypeError, so
    # treat it as empty text.
    text = " ".join(page.extract_text() or "" for page in reader.pages)
    return re.sub(r"\s+", " ", text).strip()
@st.cache_resource
def load_qa_pipeline():
    """Build the question-answering pipeline from the local ``./model`` folder.

    The same directory supplies both the model and the tokenizer. The
    function is wrapped in ``st.cache_resource``.
    NOTE(review): presumably so the model is loaded once and reused across
    Streamlit reruns -- confirm against the Streamlit docs.
    """
    local_model_dir = "./model"  # saved QA model folder
    qa = pipeline(
        "question-answering",
        model=local_model_dir,
        tokenizer=local_model_dir,
    )
    return qa
def extract_fields_with_qa(text, qa_pipeline):
    """Ask one question per statement field and collect the answers.

    Args:
        text: Statement text used as the QA context.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)`` whose result exposes
            ``.get("answer", ...)``.

    Returns:
        A dict with the keys ``bank_name``, ``card_last4``,
        ``billing_cycle``, ``payment_due_date`` and ``total_amount_due``.
        A field is ``"Not found"`` when the context is empty, when the
        result has no ``"answer"`` entry, or when the call fails.
    """
    import logging

    questions = {
        "bank_name": "Which bank issued this credit card statement?",
        "card_last4": "What are the last 4 digits of the credit card?",
        "billing_cycle": "What is the billing cycle or statement period?",
        "payment_due_date": "What is the payment due date?",
        "total_amount_due": "What is the total amount due?",
    }
    answers = dict.fromkeys(questions, "Not found")

    # An empty context cannot contain an answer; skip the model calls.
    if not text:
        return answers

    for key, question in questions.items():
        try:
            result = qa_pipeline(question=question, context=text)
            answers[key] = result.get("answer", "Not found")
        except Exception:
            # Best effort per field: keep the "Not found" default, but leave
            # a trace instead of failing silently. KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            logging.getLogger(__name__).warning(
                "QA extraction failed for field %r", key, exc_info=True
            )
    return answers
def clean_text(s):
    """Collapse whitespace in an extracted value, with a placeholder for blanks.

    Args:
        s: The raw extracted string, or a falsy value such as ``None``/``""``.

    Returns:
        ``s`` with every whitespace run replaced by one space and the ends
        stripped, or ``"Not found"`` when ``s`` is falsy or contains only
        whitespace.
    """
    if not s:
        return "Not found"
    collapsed = re.sub(r"\s+", " ", s).strip()
    # A whitespace-only value is as empty as "" and gets the same placeholder.
    return collapsed or "Not found"
def normalize_amount(amount):
    """Extract the first numeric amount from a string as plain digits.

    The rupee sign, dollar sign and thousands separators (commas) are
    removed before searching.

    Args:
        amount: The raw extracted amount text, or a falsy value.

    Returns:
        The first number found, as a string such as ``"1234.56"``, or
        ``"0"`` when ``amount`` is falsy or contains no digits.
    """
    if not amount:
        return "0"
    stripped = amount.replace('₹', '').replace('$', '').replace(',', '').strip()
    # Require at least one digit: a bare "." (as in "Rs. 1234") or a
    # sentence-ending period (as in "500.") must not be taken as, or glued
    # onto, the amount.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", stripped)
    return match.group(0) if match else "0"
def normalize_date(date_str):
    """Tidy an extracted date string.

    No date parsing is done here: the value is passed to ``clean_text`` and
    that function's result is returned as is.
    """
    tidied = clean_text(date_str)
    return tidied
def clean_extracted_data(data):
    """Map raw QA answers to display-ready columns.

    Each raw key is read with ``data.get(key, "")`` and passed through its
    cleaner; the result is keyed by the human-readable column label, in the
    order listed below.
    """
    # (column label, raw key, cleaner) -- the order fixes the output order.
    field_map = (
        ("Bank Name", "bank_name", clean_text),
        ("Card Last 4", "card_last4", clean_text),
        ("Billing Cycle", "billing_cycle", clean_text),
        ("Payment Due Date", "payment_due_date", normalize_date),
        ("Total Amount Due", "total_amount_due", normalize_amount),
    )
    return {label: cleaner(data.get(raw_key, "")) for label, raw_key, cleaner in field_map}
# ----------------------------
# Streamlit UI
# ----------------------------
# Page setup, issued before any other Streamlit element is drawn.
st.set_page_config(page_title="Credit Card Statement Extractor", page_icon="💳", layout="wide")
st.title("💳 Credit Card Statement Extractor")

uploaded_files = st.file_uploader(
    "Upload one or more credit card statement PDFs",
    type="pdf",
    accept_multiple_files=True,
)

if uploaded_files:
    qa_pipeline = load_qa_pipeline()
    all_extracted_data = []

    for pdf_file in uploaded_files:
        with st.spinner(f"Processing {pdf_file.name}..."):
            # One unreadable or corrupt PDF must not abort the whole batch:
            # report it and move on to the next upload.
            try:
                pdf_text = extract_text_from_pdf(pdf_file)
                extracted_data = extract_fields_with_qa(pdf_text, qa_pipeline)
                cleaned_data = clean_extracted_data(extracted_data)
            except Exception as exc:
                st.error(f"Could not process {pdf_file.name}: {exc}")
                continue
        cleaned_data["File Name"] = pdf_file.name
        all_extracted_data.append(cleaned_data)

    # Only render the table and the download when at least one file succeeded.
    if all_extracted_data:
        # Display in a dataframe
        st.subheader("Extracted Information for All PDFs")
        df = pd.DataFrame(all_extracted_data)
        # NOTE(review): the "$" prefix is display-only; the amount has had
        # its currency symbol stripped earlier, so the real currency is
        # unknown at this point.
        st.dataframe(df.style.format({"Total Amount Due": "${}"}))

        # Download combined CSV
        csv_file = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="⬇️ Download All Extracted Data as CSV",
            data=csv_file,
            file_name="all_credit_statements_data.csv",
            mime="text/csv",
        )