rairo committed on
Commit
4d6b816
·
verified ·
1 Parent(s): b182938

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -135
app.py CHANGED
@@ -1,155 +1,98 @@
1
  import re
2
  import pandas as pd
3
  import streamlit as st
4
- from datetime import datetime
5
  import pypdf
 
 
 
6
 
 
 
 
 
 
 
 
7
  def read_pdf(file_path):
8
  text_content = []
9
  with open(file_path, 'rb') as file:
10
  pdf_reader = pypdf.PdfReader(file)
11
- for page_num in range(len(pdf_reader.pages)):
12
- page = pdf_reader.pages[page_num]
13
  text = page.extract_text()
14
  if text:
15
  text_content.append(text)
16
- return text_content
17
-
18
- def preprocess_text(text_pages):
19
- full_text = '\n'.join(text_pages)
20
- return full_text
21
-
22
- def parse_amount(amount_str):
23
- if not amount_str:
24
- return 0.0
25
- amount_str = amount_str.replace(' ', '').replace(',', '.')
26
- if '-' in amount_str:
27
- amount_str = amount_str.replace('-', '')
28
- return -float(amount_str)
29
- return float(amount_str)
30
-
31
- def extract_transactions(text):
32
- lines = text.split('\n')
33
- transactions = []
34
- current_transaction = None
35
-
36
- for line in lines:
37
- date_match = re.match(r'^(\d{1,2}/\d{2}/\d{4})', line)
38
- if date_match:
39
- if current_transaction:
40
- transactions.append(current_transaction)
41
- date_str = date_match.group(1)
42
- remaining_line = line[len(date_str):].strip()
43
- parts = remaining_line.split()
44
- charge_code = None
45
- debit = None
46
- credit = None
47
- balance = None
48
- description_parts = []
49
-
50
- i = 0
51
- while i < len(parts):
52
- part = parts[i]
53
- if part in ('A', 'C', 'M', 'S', 'T', 'V'):
54
- charge_code = part
55
- i += 1
56
- break
57
- if re.match(r'^[\d\.,-]+$', part):
58
- break
59
- description_parts.append(part)
60
- i += 1
61
-
62
- description = ' '.join(description_parts).strip()
63
-
64
- amount_parts = parts[i:]
65
- if amount_parts:
66
- try:
67
- balance = parse_amount(amount_parts[-1])
68
- amount_parts = amount_parts[:-1]
69
- except:
70
- balance = None
71
-
72
- for amt in amount_parts:
73
- if ' ' in amt or ',' in amt or '.' in amt:
74
- if debit is None:
75
- debit = parse_amount(amt)
76
- else:
77
- credit = parse_amount(amt)
78
-
79
- current_transaction = {
80
- 'Date': date_str,
81
- 'Description': description,
82
- 'Charge Code': charge_code,
83
- 'Debit': debit if debit != 0 else None,
84
- 'Credit': credit if credit != 0 else None,
85
- 'Balance': balance
86
  }
87
- else:
88
- if current_transaction:
89
- current_transaction['Description'] += ' ' + line.strip()
90
-
91
- if current_transaction:
92
- transactions.append(current_transaction)
93
-
94
- data = []
95
- for t in transactions:
96
- date = datetime.strptime(t['Date'], '%d/%m/%Y').strftime('%d/%m/%Y')
97
- desc = t['Description']
98
- charge_code = t['Charge Code']
99
- debit = t['Debit']
100
- credit = t['Credit']
101
-
102
- if charge_code:
103
- if debit is not None:
104
- data.append({
105
- 'Date': date,
106
- 'Description': desc,
107
- 'Amount': -abs(debit),
108
- 'Type': 'bank charge'
109
- })
110
- else:
111
- if debit is not None and debit < 0:
112
- data.append({
113
- 'Date': date,
114
- 'Description': desc,
115
- 'Amount': debit,
116
- 'Type': 'debit amount'
117
- })
118
- elif debit is not None and debit > 0:
119
- data.append({
120
- 'Date': date,
121
- 'Description': desc,
122
- 'Amount': -debit,
123
- 'Type': 'debit amount'
124
- })
125
- if credit is not None and credit > 0:
126
- data.append({
127
- 'Date': date,
128
- 'Description': desc,
129
- 'Amount': credit,
130
- 'Type': 'credit amount'
131
- })
132
-
133
- df = pd.DataFrame(data)
134
- return df
135
 
 
136
  def main():
137
- st.title("Bank Statement Parser")
 
 
 
 
138
  uploaded_file = st.file_uploader("Upload a PDF bank statement", type="pdf")
139
-
140
- if uploaded_file is not None:
141
- with open("temp.pdf", "wb") as f:
142
- f.write(uploaded_file.getbuffer())
143
-
144
- text_content = read_pdf("temp.pdf")
145
- processed_text = preprocess_text(text_content)
146
- transactions_df = extract_transactions(processed_text)
147
-
148
- if not transactions_df.empty:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  st.write("### Extracted Transactions")
150
- st.dataframe(transactions_df)
151
- else:
152
- st.write("No transactions found.")
 
 
153
 
154
  if __name__ == "__main__":
155
  main()
 
1
  import re
2
  import pandas as pd
3
  import streamlit as st
4
+ import google.generativeai as genai
5
  import pypdf
6
+ import json
7
+ from datetime import datetime
8
+
9
 
10
+ api_key = os.environ['Gemini']
11
+ # Configure Gemini
12
+ def configure_gemini(api_key):
13
+ genai.configure(api_key=api_key)
14
+ return genai.GenerativeModel('gemini-2.0-flash-exp')
15
+
16
+ # Read PDF content
17
  def read_pdf(file_path):
18
  text_content = []
19
  with open(file_path, 'rb') as file:
20
  pdf_reader = pypdf.PdfReader(file)
21
+ for page in pdf_reader.pages:
 
22
  text = page.extract_text()
23
  if text:
24
  text_content.append(text)
25
+ return "\n".join(text_content)
26
+
27
+ # Process text with Gemini
28
+ def process_with_gemini(model, text):
29
+ prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
30
+ - Date (format DD/MM/YYYY)
31
+ - Description
32
+ - Amount (positive for credits, negative for debits)
33
+ - Type (either 'debit amount', 'credit amount', or 'bank charge')
34
+
35
+ Return ONLY valid JSON with this structure:
36
+ {
37
+ "transactions": [
38
+ {
39
+ "Date": "string",
40
+ "Description": "string",
41
+ "Amount": number,
42
+ "Type": "string"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  }
44
+ ]
45
+ }"""
46
+
47
+ response = model.generate_content([prompt, text])
48
+ return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ # Main Streamlit app
51
  def main():
52
+ st.title("Bank Statement Parser with Gemini AI")
53
+
54
+ # API key input
55
+ api_key = st.text_input("Enter your Gemini API key:", type="password")
56
+
57
  uploaded_file = st.file_uploader("Upload a PDF bank statement", type="pdf")
58
+
59
+ if uploaded_file and api_key:
60
+ try:
61
+ # Configure Gemini
62
+ model = configure_gemini(api_key)
63
+
64
+ # Save and read PDF
65
+ with open("temp.pdf", "wb") as f:
66
+ f.write(uploaded_file.getbuffer())
67
+
68
+ pdf_text = read_pdf("temp.pdf")
69
+
70
+ # Process with Gemini
71
+ with st.spinner("Analyzing statement with Gemini AI..."):
72
+ json_response = process_with_gemini(model, pdf_text)
73
+
74
+ # Clean JSON response
75
+ json_str = json_response[json_response.find('{'):json_response.rfind('}')+1]
76
+ json_str = json_str.replace('```json', '').replace('```', '')
77
+
78
+ data = json.loads(json_str)
79
+ transactions = data.get('transactions', [])
80
+
81
+ # Create DataFrame
82
+ df = pd.DataFrame(transactions)
83
+
84
+ # Format amounts
85
+ if not df.empty:
86
+ df['Amount'] = df['Amount'].apply(lambda x: f"R {x:,.2f}" if x >= 0 else f"R ({abs(x):,.2f})")
87
+ df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y').dt.strftime('%d/%m/%Y')
88
+
89
+ st.success("Analysis complete!")
90
  st.write("### Extracted Transactions")
91
+ st.dataframe(df)
92
+
93
+ except Exception as e:
94
+ st.error(f"Error processing document: {str(e)}")
95
+ st.error("Please ensure you're using a valid bank statement PDF and API key")
96
 
97
  if __name__ == "__main__":
98
  main()