rairo committed on
Commit
35994d7
·
verified ·
1 Parent(s): da1091c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -0
app.py CHANGED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ import streamlit as st
4
+ from datetime import datetime
5
+ import pypdf
6
+
7
def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        A list of strings, one per page, in page order. Pages for which
        ``extract_text()`` returns an empty/falsy value are omitted.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = pypdf.PdfReader(file)
        # Iterate the pages directly rather than indexing via range(len(...)).
        # The list is built inside the ``with`` so the file is still open
        # while the page text is being extracted.
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        return [text for text in page_texts if text]
17
+
18
def preprocess_text(text_pages):
    """Merge per-page text into a single string.

    Args:
        text_pages: Iterable of page strings.

    Returns:
        The pages joined with a newline between consecutive pages.
    """
    return '\n'.join(text_pages)
21
+
22
def parse_amount(amount_str):
    """Convert a textual monetary amount into a float.

    Whitespace used as a thousands separator is removed, and a minus sign
    anywhere in the string (leading or trailing) makes the result negative.
    When only one kind of separator is present, a comma is treated as the
    decimal separator. When both ``,`` and ``.`` are present, the one that
    appears last is taken as the decimal separator and the other is dropped
    as a thousands separator (so ``1,234.56`` and ``1.234,56`` both parse).

    Args:
        amount_str: The amount text, e.g. ``"1 234,56"`` or ``"100.00-"``.
            ``None``, empty and whitespace-only values yield ``0.0``.

    Returns:
        The parsed amount as a float.

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    if not amount_str:
        return 0.0

    # str.split() with no argument splits on any Unicode whitespace, so this
    # also removes non-breaking spaces, not just ASCII spaces.
    cleaned = ''.join(amount_str.split())
    if not cleaned:
        return 0.0

    is_negative = '-' in cleaned
    cleaned = cleaned.replace('-', '')

    if ',' in cleaned and '.' in cleaned:
        # Both separators present: the right-most one is the decimal point.
        if cleaned.rfind(',') > cleaned.rfind('.'):
            cleaned = cleaned.replace('.', '').replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    else:
        cleaned = cleaned.replace(',', '.')

    value = float(cleaned)
    return -value if is_negative else value
30
+
31
# A transaction line starts with a D/M/YYYY or DD/MM/YYYY date.
_DATE_PATTERN = re.compile(r'^(\d{1,2}/\d{2}/\d{4})')
# Tokens made only of digits, dots, commas and minus signs mark the amounts.
_NUMERIC_PATTERN = re.compile(r'^[\d\.,-]+$')
# Single-letter tokens recognised as a charge code.
_CHARGE_CODES = frozenset(('A', 'C', 'M', 'S', 'T', 'V'))
_OUTPUT_COLUMNS = ['Date', 'Description', 'Amount', 'Type']


def _parse_transaction_line(line):
    """Parse one statement line into a transaction dict, or return None.

    None is returned when the line does not start with a valid date, so the
    caller can treat it as a continuation of the previous description.
    """
    date_match = _DATE_PATTERN.match(line)
    if not date_match:
        return None

    date_str = date_match.group(1)
    try:
        date = datetime.strptime(date_str, '%d/%m/%Y').strftime('%d/%m/%Y')
    except ValueError:
        # Looks like a date but is not one (e.g. 99/99/2020): not a
        # transaction line, so do not abort the whole parse over it.
        return None

    parts = line[len(date_str):].split()

    # Everything up to the charge code or the first numeric token is the
    # description; what follows is the list of amount candidates.
    charge_code = None
    description_parts = []
    amounts_start = len(parts)
    for index, part in enumerate(parts):
        if part in _CHARGE_CODES:
            charge_code = part
            amounts_start = index + 1
            break
        if _NUMERIC_PATTERN.match(part):
            amounts_start = index
            break
        description_parts.append(part)
    amount_parts = parts[amounts_start:]

    # The last token is taken as the running balance when it parses.
    balance = None
    if amount_parts:
        try:
            balance = parse_amount(amount_parts[-1])
        except ValueError:
            balance = None
        else:
            amount_parts = amount_parts[:-1]

    # NOTE(review): the first decimal-looking amount is always assigned to
    # the debit and any later one to the credit; the column position is not
    # available in the extracted text - confirm this suits the statement
    # layout being parsed.
    debit = None
    credit = None
    for token in amount_parts:
        if ',' not in token and '.' not in token:
            continue
        try:
            value = parse_amount(token)
        except ValueError:
            # Dotted words such as "J.SMITH" are not amounts; skip them.
            continue
        if debit is None:
            debit = value
        else:
            credit = value

    return {
        'Date': date,
        'Description': ' '.join(description_parts).strip(),
        'Charge Code': charge_code,
        'Debit': debit if debit != 0 else None,
        'Credit': credit if credit != 0 else None,
        'Balance': balance,
    }


def _transaction_rows(transaction):
    """Turn one parsed transaction into its output rows (zero, one or two)."""
    base = {
        'Date': transaction['Date'],
        'Description': transaction['Description'],
    }
    debit = transaction['Debit']
    credit = transaction['Credit']
    rows = []

    if transaction['Charge Code']:
        # Lines carrying a charge code only ever produce a bank-charge row.
        if debit is not None:
            rows.append({**base, 'Amount': -abs(debit), 'Type': 'bank charge'})
        return rows

    if debit is not None:
        # Debits are always reported as negative amounts.
        rows.append({**base, 'Amount': -abs(debit), 'Type': 'debit amount'})
    if credit is not None and credit > 0:
        rows.append({**base, 'Amount': credit, 'Type': 'credit amount'})
    return rows


def extract_transactions(text):
    """Parse bank-statement text into a DataFrame of transactions.

    A line that starts with a valid ``D/M/YYYY`` date opens a new
    transaction; any other non-blank line is appended to the description of
    the transaction currently being built. Lines seen before the first
    transaction are ignored.

    Args:
        text: The full statement text, with lines separated by ``\\n``.

    Returns:
        A pandas DataFrame with the columns ``Date`` (``DD/MM/YYYY``),
        ``Description``, ``Amount`` (negative for debits and bank charges,
        positive for credits) and ``Type`` (``'bank charge'``,
        ``'debit amount'`` or ``'credit amount'``). The columns are present
        even when no transaction is found.
    """
    transactions = []
    current_transaction = None

    for line in text.split('\n'):
        parsed = _parse_transaction_line(line)
        if parsed is not None:
            if current_transaction:
                transactions.append(current_transaction)
            current_transaction = parsed
            continue

        continuation = line.strip()
        # Blank lines would only add stray spaces to the description.
        if current_transaction and continuation:
            current_transaction['Description'] += ' ' + continuation

    if current_transaction:
        transactions.append(current_transaction)

    rows = []
    for transaction in transactions:
        rows.extend(_transaction_rows(transaction))

    return pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)
135
+
136
def main():
    """Streamlit entry point: upload a PDF statement and show its transactions.

    The uploaded bytes are written to a uniquely named temporary file so
    that ``read_pdf`` can open them by path; the file is removed once the
    text has been extracted. (A fixed ``temp.pdf`` in the working directory
    would be shared by every run and never cleaned up.)
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import block.
    import os
    import tempfile

    st.title("Bank Statement Parser")
    uploaded_file = st.file_uploader("Upload a PDF bank statement", type="pdf")

    if uploaded_file is None:
        return

    # delete=False lets the file be closed and then reopened by path, which
    # is required on platforms that forbid opening an already-open file.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        temp_path = tmp.name

    try:
        text_content = read_pdf(temp_path)
    finally:
        os.remove(temp_path)

    processed_text = preprocess_text(text_content)
    transactions_df = extract_transactions(processed_text)

    if not transactions_df.empty:
        st.write("### Extracted Transactions")
        st.dataframe(transactions_df)
    else:
        st.write("No transactions found.")
153
+
154
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()