yakul259 committed on
Commit
fb58988
·
verified ·
1 Parent(s): 76dd5be

Upload 3 files

Browse files
Files changed (3) hide show
  1. src/.gitattributes +1 -0
  2. src/app.py +99 -0
  3. src/requirements.txt +9 -0
src/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ model/*.safetensors filter=lfs diff=lfs merge=lfs -text
src/app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ import re
4
+ from transformers import pipeline
5
+ import pandas as pd
6
+
7
+ # ----------------------------
8
+ # Helper functions
9
+ # ----------------------------
10
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one normalized string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source (the app
            passes a Streamlit uploaded-file object).

    Returns:
        The page texts joined with single spaces, with every run of
        whitespace collapsed to one space and the ends stripped. The
        result is an empty string when no page yields any text.
    """
    reader = PdfReader(pdf_file)
    # Defensive: if a page yields no text (None or ""), contribute an empty
    # string so that str.join never receives a non-string item.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return re.sub(r"\s+", " ", " ".join(page_texts)).strip()
15
+
16
@st.cache_resource
def load_qa_pipeline():
    """Create the question-answering pipeline from the local model folder.

    The same ``./model`` directory supplies both the model weights and the
    tokenizer. The function is decorated with ``st.cache_resource`` so the
    pipeline object is built once and reused rather than reloaded.

    Returns:
        The object returned by ``pipeline("question-answering", ...)``.
    """
    model_dir = "./model"  # your saved QA model folder
    qa = pipeline("question-answering", model=model_dir, tokenizer=model_dir)
    return qa
20
+
21
def extract_fields_with_qa(text, qa_pipeline):
    """Ask the QA model one question per statement field and collect answers.

    Args:
        text: Statement text used as the context for every question.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)`` and expected to
            return a mapping that has an ``"answer"`` entry.

    Returns:
        A dict with the keys ``bank_name``, ``card_last4``,
        ``billing_cycle``, ``payment_due_date`` and ``total_amount_due``.
        A field maps to ``"Not found"`` when the context is empty, when the
        lookup raises, or when the result carries no ``"answer"``.
    """
    questions = {
        "bank_name": "Which bank issued this credit card statement?",
        "card_last4": "What are the last 4 digits of the credit card?",
        "billing_cycle": "What is the billing cycle or statement period?",
        "payment_due_date": "What is the payment due date?",
        "total_amount_due": "What is the total amount due?",
    }

    # With no context there is nothing to search, so skip the model calls.
    if not text or not text.strip():
        return dict.fromkeys(questions, "Not found")

    answers = {}
    for key, question in questions.items():
        try:
            result = qa_pipeline(question=question, context=text)
            answers[key] = result.get("answer", "Not found")
        except Exception:
            # Deliberately best-effort: one failed field must not abort the
            # others. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            answers[key] = "Not found"
    return answers
37
+
38
def clean_text(s):
    """Collapse whitespace in an extracted value, or report it as missing.

    Args:
        s: The extracted string; may be empty or ``None``.

    Returns:
        ``s`` with every run of whitespace replaced by a single space and
        the ends stripped. Returns ``"Not found"`` when ``s`` is falsy or
        consists only of whitespace.
    """
    if not s:
        return "Not found"
    collapsed = re.sub(r"\s+", " ", s).strip()
    # A whitespace-only value carries no information; use the same
    # placeholder as a missing one instead of returning "".
    return collapsed or "Not found"
43
+
44
def normalize_amount(amount):
    """Extract the first plain number from an amount string.

    The ``₹`` and ``$`` symbols and thousands-separator commas are removed
    before searching.

    Args:
        amount: The extracted amount text; may be empty or ``None``.

    Returns:
        The first number found, as a string of digits with an optional
        decimal part (for example ``"1234.56"``). Returns ``"0"`` when
        ``amount`` is falsy or contains no digits.
    """
    if not amount:
        return "0"
    stripped = amount.replace("₹", "").replace("$", "").replace(",", "").strip()
    # Require at least one digit so that a stray "." (as in "Rs. 500") is not
    # mistaken for the amount, and so a trailing "." is left out of the result.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", stripped)
    return match.group(0) if match else "0"
50
+
51
def normalize_date(date_str):
    """Tidy an extracted date string.

    No date parsing or reformatting happens here; the value is simply
    passed through ``clean_text`` and that result is returned.
    """
    tidied = clean_text(date_str)
    return tidied
53
+
54
def clean_extracted_data(data):
    """Map raw QA answers onto display-ready columns.

    Args:
        data: Dict of raw answers keyed by ``bank_name``, ``card_last4``,
            ``billing_cycle``, ``payment_due_date`` and
            ``total_amount_due``. A missing key is treated as ``""``.

    Returns:
        A new dict keyed by the human-readable column labels, with each
        value run through the cleaner appropriate to its field.
    """
    # (column label, raw key, cleaner) in the order the columns should appear.
    field_table = (
        ("Bank Name", "bank_name", clean_text),
        ("Card Last 4", "card_last4", clean_text),
        ("Billing Cycle", "billing_cycle", clean_text),
        ("Payment Due Date", "payment_due_date", normalize_date),
        ("Total Amount Due", "total_amount_due", normalize_amount),
    )
    cleaned = {}
    for label, raw_key, cleaner in field_table:
        cleaned[label] = cleaner(data.get(raw_key, ""))
    return cleaned
62
+
63
+ # ----------------------------
64
+ # Streamlit UI
65
+ # ----------------------------
66
# Page setup and upload widget.
st.set_page_config(page_title="Credit Card Statement Extractor", page_icon="💳", layout="wide")
st.title("💳 Credit Card Statement Extractor")

uploaded_files = st.file_uploader(
    "Upload one or more credit card statement PDFs",
    type="pdf",
    accept_multiple_files=True,
)

if uploaded_files:
    qa_pipeline = load_qa_pipeline()
    all_extracted_data = []

    for pdf_file in uploaded_files:
        with st.spinner(f"Processing {pdf_file.name}..."):
            try:
                pdf_text = extract_text_from_pdf(pdf_file)
                extracted_data = extract_fields_with_qa(pdf_text, qa_pipeline)
                cleaned_data = clean_extracted_data(extracted_data)
            except Exception as exc:
                # Top-level UI boundary: report the unreadable file and carry
                # on, so one bad PDF does not abort the rest of the batch.
                st.error(f"Could not process {pdf_file.name}: {exc}")
                continue
            cleaned_data["File Name"] = pdf_file.name
            all_extracted_data.append(cleaned_data)

    # Only render the table and download when at least one file succeeded;
    # styling a column that does not exist would fail on an empty frame.
    if all_extracted_data:
        # Display in a dataframe
        st.subheader("Extracted Information for All PDFs")
        df = pd.DataFrame(all_extracted_data)
        # NOTE(review): every amount is shown with a "$" prefix even though
        # normalize_amount also strips "₹" -- confirm the intended currency.
        st.dataframe(df.style.format({"Total Amount Due": "${}"}))

        # Download combined CSV
        csv_file = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="⬇️ Download All Extracted Data as CSV",
            data=csv_file,
            file_name="all_credit_statements_data.csv",
            mime="text/csv",
        )
src/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers>=5.0.0
3
+ PyPDF2>=3.0.0
4
+ streamlit>=1.29.0
5
+ pandas>=2.0.0
6
+ regex
7
+ datasets
8
+ seqeval
9
+ streamlit