"""Streamlit app: extract key fields from credit card statement PDFs with a local QA model."""

import re

import pandas as pd
import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline

# ----------------------------
# Helper functions
# ----------------------------


def extract_text_from_pdf(pdf_file):
    """Return the full text of *pdf_file* with runs of whitespace collapsed.

    FIX: PyPDF2's ``PageObject.extract_text()`` can return ``None`` for
    image-only or malformed pages; coalesce to ``""`` so the join never
    raises ``TypeError``.
    """
    reader = PdfReader(pdf_file)
    text = " ".join(page.extract_text() or "" for page in reader.pages)
    return re.sub(r'\s+', ' ', text).strip()


@st.cache_resource
def load_qa_pipeline():
    """Load the saved question-answering model once per Streamlit session."""
    model_path = "./model"  # your saved QA model folder
    return pipeline("question-answering", model=model_path, tokenizer=model_path)


def extract_fields_with_qa(text, qa_pipeline):
    """Ask the QA model one question per statement field.

    Returns a dict keyed by field name; a field whose query fails (or whose
    result lacks an ``"answer"`` key) maps to the sentinel ``"Not found"``.
    """
    questions = {
        "bank_name": "Which bank issued this credit card statement?",
        "card_last4": "What are the last 4 digits of the credit card?",
        "billing_cycle": "What is the billing cycle or statement period?",
        "payment_due_date": "What is the payment due date?",
        "total_amount_due": "What is the total amount due?",
    }
    answers = {}
    for key, question in questions.items():
        try:
            result = qa_pipeline(question=question, context=text)
            answers[key] = result.get("answer", "Not found")
        # FIX: bare ``except:`` also swallowed SystemExit/KeyboardInterrupt;
        # best-effort behavior (fall back to the sentinel) is preserved.
        except Exception:
            answers[key] = "Not found"
    return answers


def clean_text(s):
    """Collapse internal whitespace; map empty/None input to ``"Not found"``."""
    if not s:
        return "Not found"
    return re.sub(r'\s+', ' ', s).strip()


def normalize_amount(amount):
    """Strip currency symbols and thousands separators; return the number as a string.

    Falls back to ``"0"`` when no numeric part can be found.
    """
    if not amount:
        return "0"
    amount = amount.replace('₹', '').replace('$', '').replace(',', '').strip()
    # FIX: the previous pattern r'[\d\.]+' could match a lone '.' (or
    # dot-runs like '1.2.3'); this one requires digits and at most one
    # decimal point.
    match = re.search(r'\d+(?:\.\d+)?', amount)
    return match.group(0) if match else "0"


def normalize_date(date_str):
    """Dates are left as cleaned free text — the model's output format varies."""
    return clean_text(date_str)


def clean_extracted_data(data):
    """Map raw QA answers to display-ready, normalized column values."""
    return {
        "Bank Name": clean_text(data.get("bank_name", "")),
        "Card Last 4": clean_text(data.get("card_last4", "")),
        "Billing Cycle": clean_text(data.get("billing_cycle", "")),
        "Payment Due Date": normalize_date(data.get("payment_due_date", "")),
        "Total Amount Due": normalize_amount(data.get("total_amount_due", "")),
    }


# ----------------------------
# Streamlit UI
# ----------------------------
st.set_page_config(page_title="Credit Card Statement Extractor", page_icon="💳", layout="wide")
st.title("💳 Credit Card Statement Extractor")

uploaded_files = st.file_uploader(
    "Upload one or more credit card statement PDFs",
    type="pdf",
    accept_multiple_files=True,
)

if uploaded_files:
    qa_pipeline = load_qa_pipeline()
    all_extracted_data = []

    for pdf_file in uploaded_files:
        with st.spinner(f"Processing {pdf_file.name}..."):
            # FIX: a single corrupt or unreadable PDF previously raised an
            # uncaught exception and aborted the whole batch; report it and
            # keep processing the remaining files instead.
            try:
                pdf_text = extract_text_from_pdf(pdf_file)
                extracted_data = extract_fields_with_qa(pdf_text, qa_pipeline)
                cleaned_data = clean_extracted_data(extracted_data)
            except Exception as exc:
                st.error(f"Could not process {pdf_file.name}: {exc}")
                continue
            cleaned_data["File Name"] = pdf_file.name
            all_extracted_data.append(cleaned_data)

    # Only render results when at least one file was processed successfully.
    if all_extracted_data:
        st.subheader("Extracted Information for All PDFs")
        df = pd.DataFrame(all_extracted_data)
        # Put the file name first so each row is identifiable at a glance
        # (it was appended last above and would otherwise be the last column).
        df = df[["File Name"] + [c for c in df.columns if c != "File Name"]]
        st.dataframe(df.style.format({"Total Amount Due": "${}"}))

        # Download combined CSV
        csv_file = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="⬇️ Download All Extracted Data as CSV",
            data=csv_file,
            file_name="all_credit_statements_data.csv",
            mime="text/csv",
        )