Spaces:

yakul259
/

fintech-assignment

Build error

App Files Files Community

yakul259 commited on Oct 15, 2025

Commit

fb58988

verified ·

1 Parent(s): 76dd5be

Upload 3 files

Browse files

Files changed (3) hide show

src/.gitattributes +1 -0
src/app.py +99 -0
src/requirements.txt +9 -0

src/.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ model/*.safetensors filter=lfs diff=lfs merge=lfs -text

src/app.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import streamlit as st
+from PyPDF2 import PdfReader
+import re
+from transformers import pipeline
+import pandas as pd
+# ----------------------------
+# Helper functions
+# ----------------------------
+def extract_text_from_pdf(pdf_file):
+    reader = PdfReader(pdf_file)
+    text = " ".join(page.extract_text() for page in reader.pages)
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+@st.cache_resource
+def load_qa_pipeline():
+    model_path = "./model"  # your saved QA model folder
+    return pipeline("question-answering", model=model_path, tokenizer=model_path)
+def extract_fields_with_qa(text, qa_pipeline):
+    questions = {
+        "bank_name": "Which bank issued this credit card statement?",
+        "card_last4": "What are the last 4 digits of the credit card?",
+        "billing_cycle": "What is the billing cycle or statement period?",
+        "payment_due_date": "What is the payment due date?",
+        "total_amount_due": "What is the total amount due?"
+    }
+    answers = {}
+    for key, question in questions.items():
+        try:
+            result = qa_pipeline(question=question, context=text)
+            answers[key] = result.get("answer", "Not found")
+        except:
+            answers[key] = "Not found"
+    return answers
+def clean_text(s):
+    if not s:
+        return "Not found"
+    s = re.sub(r'\s+', ' ', s).strip()
+    return s
+def normalize_amount(amount):
+    if not amount:
+        return "0"
+    amount = amount.replace('₹','').replace('$','').replace(',','').strip()
+    match = re.search(r'[\d\.]+', amount)
+    return match.group(0) if match else "0"
+def normalize_date(date_str):
+    return clean_text(date_str)
+def clean_extracted_data(data):
+    return {
+        "Bank Name": clean_text(data.get("bank_name","")),
+        "Card Last 4": clean_text(data.get("card_last4","")),
+        "Billing Cycle": clean_text(data.get("billing_cycle","")),
+        "Payment Due Date": normalize_date(data.get("payment_due_date","")),
+        "Total Amount Due": normalize_amount(data.get("total_amount_due",""))
+    }
+# ----------------------------
+# Streamlit UI
+# ----------------------------
+st.set_page_config(page_title="Credit Card Statement Extractor", page_icon="💳", layout="wide")
+st.title("💳 Credit Card Statement Extractor")
+uploaded_files = st.file_uploader(
+    "Upload one or more credit card statement PDFs",
+    type="pdf",
+    accept_multiple_files=True
+)
+if uploaded_files:
+    qa_pipeline = load_qa_pipeline()
+    all_extracted_data = []
+    for pdf_file in uploaded_files:
+        with st.spinner(f"Processing {pdf_file.name}..."):
+            pdf_text = extract_text_from_pdf(pdf_file)
+            extracted_data = extract_fields_with_qa(pdf_text, qa_pipeline)
+            cleaned_data = clean_extracted_data(extracted_data)
+            cleaned_data["File Name"] = pdf_file.name
+            all_extracted_data.append(cleaned_data)
+    # Display in a dataframe
+    st.subheader("Extracted Information for All PDFs")
+    df = pd.DataFrame(all_extracted_data)
+    st.dataframe(df.style.format({"Total Amount Due": "${}"}))
+    # Download combined CSV
+    csv_file = df.to_csv(index=False).encode('utf-8')
+    st.download_button(
+        label="⬇️ Download All Extracted Data as CSV",
+        data=csv_file,
+        file_name="all_credit_statements_data.csv",
+        mime="text/csv",
+    )

src/requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+torch>=2.0.0
+transformers>=5.0.0
+PyPDF2>=3.0.0
+streamlit>=1.29.0
+pandas>=2.0.0
+regex
+datasets
+seqeval
+streamlit