SHIKARICHACHA committed on
Commit
c68a3b2
·
verified ·
1 Parent(s): efe3e48

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +947 -0
app.py ADDED
@@ -0,0 +1,947 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pdfplumber
3
+ import re
4
+ import pandas as pd
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ from plotly.subplots import make_subplots
8
+ import numpy as np
9
+ from datetime import datetime, timedelta
10
+ import sqlite3
11
+ import hashlib
12
+ import json
13
+ from http.server import HTTPServer, SimpleHTTPRequestHandler
14
+
15
+ # Advanced ML and NLP imports
16
+ try:
17
+ import xgboost as xgb
18
+ from sklearn.ensemble import RandomForestClassifier, IsolationForest
19
+ from sklearn.model_selection import train_test_split
20
+ from sklearn.preprocessing import StandardScaler
21
+ from sklearn.cluster import KMeans
22
+ import nltk
23
+ from textblob import TextBlob
24
+ except ImportError as e:
25
+ st.warning(f"Some advanced features may not be available. Missing: {str(e)}")
26
+
27
+ # Configuration
28
# Configuration
_PAGE_SETTINGS = {
    'page_title': 'Enhanced Bank Statement Analysis',
    'page_icon': '🏦',
    'layout': 'wide',
    'initial_sidebar_state': 'expanded',
}
st.set_page_config(**_PAGE_SETTINGS)

# Initialize session state: seed each slot only when it is not already
# present, so values stored earlier in the session are left untouched.
for _state_key, _state_default in (
    ('user_profile', None),
    ('transactions_df', None),
    ('analysis_complete', False),
):
    if _state_key not in st.session_state:
        setattr(st.session_state, _state_key, _state_default)
42
+
43
class DatabaseManager:
    """Handles all database operations for user profiles and financial data.

    Every method opens its own short-lived SQLite connection to
    ``self.db_path`` and closes it before returning, including when a query
    raises.

    Passwords are stored as salted PBKDF2-HMAC-SHA256 hashes in the form
    ``pbkdf2_sha256$<iterations>$<salt hex>$<digest hex>``. Rows written by
    earlier versions of this class (a bare, unsalted SHA-256 hex digest) are
    still accepted by :meth:`authenticate_user`.
    """

    # Parameters used when hashing NEW passwords. The iteration count is
    # stored inside each hash, so changing it does not invalidate old rows.
    _PBKDF2_PREFIX = 'pbkdf2_sha256'
    _PBKDF2_ITERATIONS = 200_000
    _SALT_BYTES = 16

    def __init__(self):
        self.db_path = 'financial_analysis.db'
        self.init_database()

    def init_database(self):
        """Create the tables if missing, recreating the file if it is corrupt.

        Runs ``PRAGMA integrity_check`` first; if the check fails or SQLite
        raises ``DatabaseError``, the database file is deleted and a fresh
        one is created in its place.
        """
        import os

        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            integrity_result = conn.execute("PRAGMA integrity_check;").fetchone()
            if integrity_result[0] != "ok":
                raise sqlite3.DatabaseError("Database integrity check failed")
        except sqlite3.DatabaseError:
            # Release the handle before deleting the file; an open handle can
            # block the removal on some platforms.
            if conn is not None:
                conn.close()
            if os.path.exists(self.db_path):
                os.remove(self.db_path)
            conn = sqlite3.connect(self.db_path)

        try:
            cursor = conn.cursor()

            # User profiles table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS user_profiles (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT UNIQUE NOT NULL,
                    name TEXT NOT NULL,
                    email TEXT UNIQUE NOT NULL,
                    password_hash TEXT NOT NULL,
                    financial_goals TEXT,
                    risk_tolerance TEXT,
                    monthly_income REAL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # Financial data table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS financial_data (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT NOT NULL,
                    date TEXT NOT NULL,
                    description TEXT,
                    amount REAL,
                    category TEXT,
                    balance REAL,
                    analysis_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (user_id) REFERENCES user_profiles (user_id)
                )
            ''')

            # Recommendations table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS recommendations (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT NOT NULL,
                    recommendation_type TEXT,
                    title TEXT,
                    description TEXT,
                    priority INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (user_id) REFERENCES user_profiles (user_id)
                )
            ''')

            conn.commit()
        finally:
            conn.close()

    @classmethod
    def _hash_password(cls, password):
        """Return a salted PBKDF2 hash string for ``password``."""
        import os

        salt = os.urandom(cls._SALT_BYTES)
        digest = hashlib.pbkdf2_hmac(
            'sha256', password.encode(), salt, cls._PBKDF2_ITERATIONS
        )
        return f"{cls._PBKDF2_PREFIX}${cls._PBKDF2_ITERATIONS}${salt.hex()}${digest.hex()}"

    @classmethod
    def _verify_password(cls, password, stored_hash):
        """Check ``password`` against a stored PBKDF2 or legacy SHA-256 hash."""
        import hmac

        if stored_hash.startswith(cls._PBKDF2_PREFIX + '$'):
            try:
                _, iterations, salt_hex, digest_hex = stored_hash.split('$')
                salt = bytes.fromhex(salt_hex)
                expected = bytes.fromhex(digest_hex)
                iterations = int(iterations)
            except ValueError:
                # Malformed stored value: treat as a failed login.
                return False
            candidate = hashlib.pbkdf2_hmac(
                'sha256', password.encode(), salt, iterations
            )
            return hmac.compare_digest(candidate, expected)

        # Legacy rows: unsalted SHA-256 hex digest.
        legacy = hashlib.sha256(password.encode()).hexdigest()
        return hmac.compare_digest(legacy.encode(), stored_hash.encode())

    def create_user(self, user_id, name, email, password, financial_goals="", risk_tolerance="moderate", monthly_income=0):
        """Create a new user profile.

        Returns:
            True when the row was inserted, False when the insert violated a
            constraint (``user_id`` and ``email`` are both UNIQUE).
        """
        password_hash = self._hash_password(password)

        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                INSERT INTO user_profiles (user_id, name, email, password_hash, financial_goals, risk_tolerance, monthly_income)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (user_id, name, email, password_hash, financial_goals, risk_tolerance, monthly_income))
            conn.commit()
            return True
        except sqlite3.IntegrityError:
            return False
        finally:
            conn.close()

    def authenticate_user(self, email, password):
        """Authenticate user login.

        Returns:
            ``(user_id, name)`` when the email exists and the password
            matches its stored hash, otherwise ``None``.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute(
                'SELECT user_id, name, password_hash FROM user_profiles WHERE email = ?',
                (email,),
            ).fetchone()
        finally:
            conn.close()

        if row is None:
            return None

        user_id, name, stored_hash = row
        if not self._verify_password(password, stored_hash):
            return None
        return (user_id, name)

    def get_user_profile(self, user_id):
        """Get user profile data.

        Returns:
            A dict mapping every ``user_profiles`` column name to its value
            for the matching row, or ``None`` when no row matches.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            # sqlite3.Row exposes the real column names, so the result keys
            # cannot drift from the table schema.
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                'SELECT * FROM user_profiles WHERE user_id = ?', (user_id,)
            ).fetchone()
        finally:
            conn.close()

        return dict(row) if row is not None else None
164
+
165
class PersonalizationEngine:
    """Advanced personalization and recommendation system.

    Works on a transactions DataFrame with ``Date``, ``Amount``, ``Category``
    and ``Balance`` columns, where negative amounts are treated as spending
    and positive amounts as credits.
    """

    def __init__(self, user_profile=None):
        self.user_profile = user_profile
        self.scaler = StandardScaler()

    def _monthly_income(self):
        """Return the profile's monthly income, or 0 when absent or NULL."""
        if not self.user_profile:
            return 0
        return self.user_profile.get('monthly_income') or 0

    def analyze_spending_patterns(self, df):
        """Summarise spending by category, weekday/weekend and month.

        Note: this adds ``DayOfWeek``, ``Month`` and ``IsWeekend`` columns to
        ``df`` in place and converts ``Date`` to datetime.

        Returns:
            ``{}`` for an empty frame, otherwise a dict with the keys
            ``spending_by_category`` (``{category: {'sum', 'mean', 'count',
            'std'}}``), ``weekend_vs_weekday``, ``monthly_trends``,
            ``transaction_frequency`` and ``balance_trend``.
        """
        if df.empty:
            return {}

        # Calculate advanced metrics
        spending_by_category = df.groupby('Category')['Amount'].agg(['sum', 'mean', 'count', 'std'])

        # Time-based analysis
        df['Date'] = pd.to_datetime(df['Date'])
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['Month'] = df['Date'].dt.month
        df['IsWeekend'] = df['DayOfWeek'].isin([5, 6])

        span_days = (df['Date'].max() - df['Date'].min()).days

        patterns = {
            # orient='index' keys the result by category. The default
            # orientation keys it by statistic name, which made every
            # per-category consumer read zeros.
            'spending_by_category': spending_by_category.to_dict(orient='index'),
            'weekend_vs_weekday': {
                'weekend_avg': df[df['IsWeekend']]['Amount'].mean(),
                'weekday_avg': df[~df['IsWeekend']]['Amount'].mean()
            },
            'monthly_trends': df.groupby('Month')['Amount'].mean().to_dict(),
            # Transactions per day; the span is floored at one day.
            'transaction_frequency': len(df) / max(1, span_days),
            'balance_trend': df['Balance'].iloc[-1] - df['Balance'].iloc[0] if len(df) > 1 else 0
        }

        return patterns

    def generate_personalized_recommendations(self, df, patterns):
        """Generate personalized financial recommendations.

        Args:
            df: Transactions frame.
            patterns: Result of :meth:`analyze_spending_patterns`.

        Returns:
            A list of dicts with ``type``, ``priority``, ``title`` and
            ``description`` keys; empty when ``df`` is empty.
        """
        recommendations = []

        if df.empty:
            return recommendations

        # Budget recommendations
        total_spending = abs(df[df['Amount'] < 0]['Amount'].sum())
        monthly_income = self._monthly_income()

        if monthly_income > 0:
            spending_ratio = total_spending / monthly_income

            if spending_ratio > 0.8:
                recommendations.append({
                    'type': 'budget',
                    'priority': 'high',
                    'title': 'Reduce Monthly Spending',
                    'description': f'Your spending is {spending_ratio:.1%} of your income. Consider reducing expenses by 20%.'
                })
            elif spending_ratio < 0.5:
                recommendations.append({
                    'type': 'savings',
                    'priority': 'medium',
                    'title': 'Increase Savings',
                    'description': f'Great job! You\'re only spending {spending_ratio:.1%} of your income. Consider increasing your savings rate.'
                })

        # Category-specific recommendations
        for category, data in patterns.get('spending_by_category', {}).items():
            category_total = data.get('sum', 0)
            if category_total < 0 and abs(category_total) > 1000:  # Significant spending categories
                recommendations.append({
                    'type': 'category',
                    'priority': 'medium',
                    'title': f'Optimize {category} Spending',
                    'description': f'You spent R{abs(category_total):.2f} on {category}. Consider reviewing these expenses.'
                })

        # Investment recommendations
        if patterns.get('balance_trend', 0) > 5000:
            recommendations.append({
                'type': 'investment',
                'priority': 'medium',
                'title': 'Consider Investment Options',
                'description': 'Your balance is growing steadily. Consider investing excess funds for better returns.'
            })

        return recommendations

    def calculate_financial_health_score(self, df, patterns):
        """Calculate comprehensive financial health score.

        Returns:
            ``(total_score, score_components)``. ``total_score`` is clamped
            to 0-100 and ``score_components`` maps each component name to
            its points. An empty frame yields ``(0, <all-zero components>)``
            so callers can always unpack two values.
        """
        score_components = {
            'income_stability': 0,
            'spending_control': 0,
            'savings_rate': 0,
            'transaction_diversity': 0,
            'balance_growth': 0
        }

        if df.empty:
            return 0, score_components

        # Income stability (based on regular credits), up to 25 points.
        credits = df[df['Amount'] > 0]
        if len(credits) > 0:
            credit_std = credits['Amount'].std()
            credit_mean = credits['Amount'].mean()
            # With a single credit the std is NaN; max(0, NaN) yields 0.
            stability = max(0, 1 - (credit_std / credit_mean if credit_mean > 0 else 1))
            score_components['income_stability'] = min(stability * 25, 25)

        # Spending control (consistent spending patterns), up to 20 points.
        debits = df[df['Amount'] < 0]
        if len(debits) > 0:
            debit_mean = debits['Amount'].mean()
            spending_consistency = 1 - (debits['Amount'].std() / abs(debit_mean) if debit_mean != 0 else 1)
            score_components['spending_control'] = max(0, spending_consistency * 20)

        # Savings rate, up to 25 points; needs a positive monthly income.
        monthly_income = self._monthly_income()
        if monthly_income > 0:
            total_spending = abs(debits['Amount'].sum())
            savings_rate = 1 - (total_spending / monthly_income)
            score_components['savings_rate'] = max(0, min(savings_rate * 25, 25))

        # Transaction diversity: max 15 points for 8+ categories.
        unique_categories = df['Category'].nunique()
        score_components['transaction_diversity'] = min(unique_categories / 8 * 15, 15)

        # Balance growth: max 15 points, reached at a gain of 5000.
        balance_trend = patterns.get('balance_trend', 0)
        if balance_trend > 0:
            score_components['balance_growth'] = min(balance_trend / 5000 * 15, 15)

        total_score = sum(score_components.values())
        return min(100, max(0, total_score)), score_components
293
+
294
class AdvancedAnalytics:
    """Advanced analytics and ML models for financial analysis."""

    # Feature columns read from the training CSV; the remaining columns built
    # by _build_loan_features are not read from that file.
    _BASE_FEATURES = (
        'total_credits', 'total_debits', 'num_transactions',
        'avg_transaction_amount', 'transaction_variability', 'balance_trend',
    )

    def __init__(self):
        self.models = {}
        self.scaler = StandardScaler()

    def detect_anomalies(self, df):
        """Detect unusual transactions using Isolation Forest.

        Returns:
            A copy of ``df`` with a boolean ``IsAnomaly`` column. With fewer
            than 10 rows no model is fitted and every row is marked
            ``False``, so callers can always index ``IsAnomaly``.
        """
        df_copy = df.copy()

        if len(df) < 10:
            df_copy['IsAnomaly'] = False
            return df_copy

        # Prepare features for anomaly detection
        features = df[['Amount']].copy()
        features['DayOfWeek'] = pd.to_datetime(df['Date']).dt.dayofweek
        features['IsWeekend'] = features['DayOfWeek'].isin([5, 6]).astype(int)

        # Scale features
        features_scaled = self.scaler.fit_transform(features)

        # Detect anomalies; fit_predict labels outliers as -1.
        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        labels = iso_forest.fit_predict(features_scaled)

        df_copy['IsAnomaly'] = labels == -1
        return df_copy

    def predict_future_spending(self, df, days_ahead=30):
        """Predict future daily totals by extending a linear trend.

        Returns:
            A DataFrame with ``Date`` and ``Predicted_Amount`` columns for
            the ``days_ahead`` days after the last observed date, or
            ``None`` when there are fewer than 10 transactions or fewer than
            two distinct days (a line cannot be fitted through one point).
        """
        if len(df) < 10:
            return None

        # Prepare time series data
        df_daily = df.groupby(pd.to_datetime(df['Date']).dt.date)['Amount'].sum().reset_index()
        df_daily['Date'] = pd.to_datetime(df_daily['Date'])
        df_daily = df_daily.sort_values('Date')

        if len(df_daily) < 2:
            return None

        # Simple linear trend prediction over the day index
        x = np.arange(len(df_daily))
        y = df_daily['Amount'].values
        coeffs = np.polyfit(x, y, 1)

        # Predict future values
        future_x = np.arange(len(df_daily), len(df_daily) + days_ahead)
        future_predictions = np.polyval(coeffs, future_x)

        last_date = df_daily['Date'].max()
        future_dates = [last_date + timedelta(days=i + 1) for i in range(days_ahead)]

        return pd.DataFrame({
            'Date': future_dates,
            'Predicted_Amount': future_predictions
        })

    @staticmethod
    def _build_loan_features(df):
        """Build the single-row feature frame used for loan prediction."""
        credits = df[df['Amount'] > 0]['Amount']
        debits = df[df['Amount'] < 0]['Amount']

        features = pd.DataFrame({
            'total_credits': [credits.sum()],
            'total_debits': [abs(debits.sum())],
            'num_transactions': [len(df)],
            'avg_transaction_amount': [df['Amount'].mean()],
            'transaction_variability': [df['Amount'].std()],
            'balance_trend': [df['Balance'].iloc[-1] - df['Balance'].iloc[0] if len(df) > 1 else 0],
            'credit_frequency': [len(credits) / max(1, len(df))],
            'max_single_debit': [abs(debits.min()) if len(debits) > 0 else 0],
            'balance_volatility': [df['Balance'].std()]
        })
        # std() is NaN for a single row; treat that as zero variability so
        # models that reject NaN input can still produce a prediction.
        return features.fillna(0)

    def enhanced_loan_prediction(self, df):
        """Enhanced loan eligibility prediction using XGBoost.

        Trains on ``absa.csv`` on every call, using XGBoost when ``xgb`` was
        imported and a Random Forest otherwise.

        Returns:
            A dict with ``eligible`` (bool), ``confidence`` (float) and
            ``model_type``. Any exception is reported through ``st.error``
            and yields ``{'eligible': False, 'confidence': 0.0,
            'model_type': 'Error'}``.
        """
        try:
            # Load training data
            training_data = pd.read_csv('absa.csv')
            features = self._build_loan_features(df)

            base_columns = list(self._BASE_FEATURES)
            y_train = training_data['Eligibility (y)']

            if 'xgb' in globals():
                # Copy so adding columns never writes into a slice of
                # training_data.
                X_train = training_data[base_columns].copy()

                # Add missing features with defaults
                for col in features.columns:
                    if col not in X_train.columns:
                        X_train[col] = 0

                model = xgb.XGBClassifier(random_state=42)
                model.fit(X_train[features.columns], y_train)
                model_input = features
                model_type = 'XGBoost'
            else:
                # Fallback to Random Forest on the base columns only
                model = RandomForestClassifier(n_estimators=100, random_state=42)
                model.fit(training_data[base_columns], y_train)
                model_input = features[base_columns]
                model_type = 'Random Forest'

            prediction = model.predict(model_input)[0]
            prediction_proba = model.predict_proba(model_input)[0]

            return {
                'eligible': bool(prediction),
                'confidence': float(max(prediction_proba)),
                'model_type': model_type
            }

        except Exception as e:
            st.error(f"Error in loan prediction: {str(e)}")
            return {'eligible': False, 'confidence': 0.0, 'model_type': 'Error'}
428
+
429
def parse_pdf_enhanced(file):
    """Extract the text of every page of a PDF as one newline-joined string.

    Pages are joined with a newline so the last line of one page is never
    fused with the first line of the next; downstream parsing is line-based.
    Pages for which ``extract_text()`` returns nothing are skipped.

    Args:
        file: Anything ``pdfplumber.open`` accepts.

    Returns:
        The extracted text, or ``""`` if any exception occurs (the error is
        reported through ``st.error``).
    """
    try:
        with pdfplumber.open(file) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        return '\n'.join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        st.error(f"Error parsing PDF: {str(e)}")
        return ""
442
+
443
# Date, free-text description, amount, balance. Kept exactly as in
# app_original.py; compiled once at import time.
_TRANSACTION_PATTERN = re.compile(r'(\d{4}-\d{2}-\d{2})\s+(.+?)\s+(-?R?\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\s+(-?R?\d{1,3}(?:,\d{3})*(?:\.\d{2})?)')

# Characters removed from a money string before float(): thousands
# separators, the "R" currency marker and spaces.
_MONEY_STRIP_TABLE = str.maketrans('', '', ',R ')

_TRANSACTION_COLUMNS = ['Date', 'Description', 'Amount', 'Balance']


def process_text_to_df_enhanced(text):
    """Parse statement text into a transactions DataFrame.

    Each line is searched for ``<YYYY-MM-DD> <description> <amount>
    <balance>``. Lines that do not match, carry an impossible calendar date
    (e.g. ``2023-13-45``) or have a non-numeric amount are skipped instead
    of aborting the whole statement.

    Args:
        text: Statement text with one transaction per line.

    Returns:
        A DataFrame with ``Date`` (datetime), ``Description``, ``Amount``
        and ``Balance`` columns sorted by date. Rows sharing a date keep
        their order of appearance. An empty frame with those columns is
        returned when nothing parses.
    """
    transactions = []

    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue

        match = _TRANSACTION_PATTERN.search(line)
        if not match:
            continue

        date_str, description, amount_str, balance_str = match.groups()
        try:
            # The regex only checks the digit layout; reject impossible dates.
            datetime.strptime(date_str, '%Y-%m-%d')
            amount = float(amount_str.translate(_MONEY_STRIP_TABLE))
            balance = float(balance_str.translate(_MONEY_STRIP_TABLE))
        except ValueError:
            continue

        transactions.append([date_str, description.strip(), amount, balance])

    if not transactions:
        return pd.DataFrame(columns=_TRANSACTION_COLUMNS)

    df = pd.DataFrame(transactions, columns=_TRANSACTION_COLUMNS)
    # Coerce so a date outside pandas' supported range drops one row rather
    # than raising.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date'])
    # Stable sort keeps same-day transactions in statement order.
    df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)

    return df
476
+
477
# Ordered (category, keywords) rules; the first rule with a keyword contained
# in the lower-cased description wins, so order is significant.
_CATEGORY_RULES = (
    ('Salary/Income', ('salary', 'wage', 'income', 'payroll', 'refund')),
    ('Groceries', ('grocery', 'supermarket', 'food', 'spar', 'checkers', 'woolworths')),
    ('Transport', ('fuel', 'petrol', 'uber', 'taxi', 'transport', 'car payment')),
    ('Utilities', ('electricity', 'water', 'municipal', 'rates', 'internet', 'phone')),
    ('Entertainment', ('restaurant', 'movie', 'entertainment', 'netflix', 'spotify')),
    ('Healthcare', ('medical', 'doctor', 'hospital', 'pharmacy', 'health')),
    ('Shopping', ('retail', 'clothing', 'amazon', 'takealot', 'mall')),
    ('Investments', ('investment', 'shares', 'unit trust', 'retirement')),
    ('Insurance', ('insurance', 'medical aid', 'life cover', 'short term')),
    ('POS Purchases', ('cashsend mobile', 'pos purchase')),
    # 'immediate trf cr' must be tested before the shorter 'immediate trf'
    # under Payments, which would otherwise always match first.
    ('Credits', ('immediate trf cr',)),
    ('Payments', ('immediate trf', 'digital payment', 'payment')),
    ('Credits', ('acb credit', 'immediate trf cr', 'credit')),
    ('Bank Charges', ('fees', 'charge', 'commission')),
    ('Cash Transactions', ('atm', 'cash deposit', 'withdrawal')),
    ('Cellular', ('airtime', 'data', 'vodacom', 'mtn', 'cell c')),
    ('Interest', ('interest',)),
    ('Failed Transactions', ('unsuccessful', 'declined', 'failed')),
)


def categorize_expense_enhanced(description):
    """Map a transaction description to a category by keyword matching.

    Matching is a case-insensitive substring test against the ordered rules
    in ``_CATEGORY_RULES``.

    Args:
        description: Transaction description text.

    Returns:
        The category of the first matching rule, or ``'Others'`` when no
        keyword is found.
    """
    description_lower = description.lower()

    for category, keywords in _CATEGORY_RULES:
        if any(keyword in description_lower for keyword in keywords):
            return category

    return 'Others'
517
+
518
def create_advanced_visualizations(df, patterns, recommendations):
    """Render the overview metrics and charts into the current Streamlit page.

    Draws four headline metrics, a per-category treemap, a daily totals line
    chart (with a linear trend line when at least two days are present) and
    a stacked monthly bar chart by category.

    Note: this overwrites ``df['Month']`` in place with monthly periods.

    Args:
        df: Transactions frame with datetime ``Date``, ``Amount`` and
            ``Category`` columns.
        patterns: Accepted for interface compatibility; not read here.
        recommendations: Accepted for interface compatibility; not read here.
    """

    # 1. Financial Health Dashboard
    col1, col2, col3, col4 = st.columns(4)

    total_income = df[df['Amount'] > 0]['Amount'].sum()
    total_expenses = abs(df[df['Amount'] < 0]['Amount'].sum())
    net_flow = total_income - total_expenses
    transaction_count = len(df)

    with col1:
        st.metric("Total Income", f"R{total_income:,.2f}", delta=None)
    with col2:
        st.metric("Total Expenses", f"R{total_expenses:,.2f}", delta=None)
    with col3:
        # A string delta without a leading minus sign is rendered as an
        # increase, so "Negative" would show in the positive colour. Invert
        # the colour scheme for that case so it is shown as unfavourable.
        st.metric(
            "Net Cash Flow",
            f"R{net_flow:,.2f}",
            delta=f"{'Positive' if net_flow > 0 else 'Negative'}",
            delta_color="normal" if net_flow > 0 else "inverse",
        )
    with col4:
        st.metric("Transactions", f"{transaction_count}", delta=None)

    # 2. Enhanced spending analysis: absolute net amount per category
    category_totals = df.groupby('Category')['Amount'].sum().abs().reset_index()
    fig_treemap = px.treemap(
        category_totals,
        path=['Category'],
        values='Amount',
        title='Spending Distribution by Category (Treemap)'
    )
    st.plotly_chart(fig_treemap, use_container_width=True)

    # 3. Time series analysis with trend line
    daily_spending = df.groupby(df['Date'].dt.date)['Amount'].sum().reset_index()
    daily_spending['Date'] = pd.to_datetime(daily_spending['Date'])

    fig_timeseries = go.Figure()
    fig_timeseries.add_trace(go.Scatter(
        x=daily_spending['Date'],
        y=daily_spending['Amount'],
        mode='lines+markers',
        name='Actual Spending',
        line=dict(color='blue')
    ))

    # A first-degree fit needs at least two points; skip the trend line
    # otherwise.
    if len(daily_spending) >= 2:
        x_numeric = np.arange(len(daily_spending))
        trend = np.poly1d(np.polyfit(x_numeric, daily_spending['Amount'], 1))
        fig_timeseries.add_trace(go.Scatter(
            x=daily_spending['Date'],
            y=trend(x_numeric),
            mode='lines',
            name='Trend',
            line=dict(color='red', dash='dash')
        ))

    fig_timeseries.update_layout(title='Daily Spending Trend with Projection')
    st.plotly_chart(fig_timeseries, use_container_width=True)

    # 4. Category-wise monthly analysis
    df['Month'] = df['Date'].dt.to_period('M')
    monthly_category = df.groupby(['Month', 'Category'])['Amount'].sum().abs().reset_index()
    # Convert the periods to strings before handing them to the chart.
    monthly_category['Month'] = monthly_category['Month'].astype(str)

    fig_monthly = px.bar(
        monthly_category,
        x='Month',
        y='Amount',
        color='Category',
        title='Monthly Spending by Category'
    )
    st.plotly_chart(fig_monthly, use_container_width=True)
589
+
590
+ def main():
591
+ """Main application function"""
592
+
593
+ # Initialize database
594
+ db = DatabaseManager()
595
+
596
+ # Sidebar for user authentication
597
+ with st.sidebar:
598
+ st.title("🏦 Financial Analysis")
599
+
600
+ if st.session_state.user_profile is None:
601
+ tab1, tab2 = st.tabs(["Login", "Sign Up"])
602
+
603
+ with tab1:
604
+ st.subheader("Login")
605
+ email = st.text_input("Email", key="login_email")
606
+ password = st.text_input("Password", type="password", key="login_password")
607
+
608
+ if st.button("Login", key="login_btn"):
609
+ user_data = db.authenticate_user(email, password)
610
+ if user_data:
611
+ user_id, name = user_data
612
+ st.session_state.user_profile = db.get_user_profile(user_id)
613
+ st.success(f"Welcome back, {name}!")
614
+ st.rerun()
615
+ else:
616
+ st.error("Invalid credentials")
617
+
618
+ with tab2:
619
+ st.subheader("Create Account")
620
+ new_name = st.text_input("Full Name", key="signup_name")
621
+ new_email = st.text_input("Email", key="signup_email")
622
+ new_password = st.text_input("Password", type="password", key="signup_password")
623
+ monthly_income = st.number_input("Monthly Income (R)", min_value=0.0, key="signup_income")
624
+ risk_tolerance = st.selectbox("Risk Tolerance", ["conservative", "moderate", "aggressive"], key="signup_risk")
625
+ financial_goals = st.text_area("Financial Goals", key="signup_goals")
626
+
627
+ if st.button("Create Account", key="signup_btn"):
628
+ if new_name and new_email and new_password:
629
+ user_id = hashlib.md5(new_email.encode()).hexdigest()[:8]
630
+ if db.create_user(user_id, new_name, new_email, new_password, financial_goals, risk_tolerance, monthly_income):
631
+ st.success("Account created successfully! Please login.")
632
+ else:
633
+ st.error("Email already exists")
634
+ else:
635
+ st.error("Please fill all required fields")
636
+ st.rerun()
637
+
638
+ else:
639
+ st.success(f"Welcome, {st.session_state.user_profile['name']}!")
640
+ if st.button("Logout"):
641
+ st.session_state.user_profile = None
642
+ st.session_state.transactions_df = None
643
+ st.session_state.analysis_complete = False
644
+ st.experimental_rerun()
645
+
646
+ # Main content area
647
+ if st.session_state.user_profile is None:
648
+ st.markdown("""
649
+ # 🏦 Enhanced Bank Statement Analysis
650
+
651
+ ### Welcome to the next generation of financial analysis!
652
+
653
+ **Key Features:**
654
+ - 🤖 **AI-Powered Insights**: Advanced machine learning for personalized recommendations
655
+ - 📊 **Comprehensive Analytics**: Deep dive into your spending patterns
656
+ - 🎯 **Goal Tracking**: Set and monitor your financial objectives
657
+ - 🔮 **Predictive Analysis**: Forecast future spending trends
658
+ - 🛡️ **Anomaly Detection**: Identify unusual transactions
659
+ - 💡 **Smart Recommendations**: Personalized financial advice
660
+
661
+ **Please login or create an account to get started.**
662
+ """)
663
+
664
+ return
665
+
666
+ # Main analysis interface
667
+ st.title(f"🏦 Financial Analysis Dashboard - {st.session_state.user_profile['name']}")
668
+
669
+ # File upload section
670
+ st.markdown("### 📄 Upload Your Bank Statement")
671
+ uploaded_file = st.file_uploader(
672
+ "Choose a PDF bank statement",
673
+ type="pdf",
674
+ help="Upload your ABSA bank statement in PDF format for analysis"
675
+ )
676
+
677
+ if uploaded_file is not None:
678
+ try:
679
+ # Parse PDF
680
+ with st.spinner("🔍 Parsing bank statement..."):
681
+ text = parse_pdf_enhanced(uploaded_file)
682
+ df = process_text_to_df_enhanced(text)
683
+
684
+ if df.empty:
685
+ st.warning("⚠️ No transactions found in the uploaded statement. Please check the file format.")
686
+ return
687
+
688
+ # Store in session state
689
+ st.session_state.transactions_df = df
690
+
691
+ # Enhance data with categories
692
+ df['Category'] = df['Description'].apply(categorize_expense_enhanced)
693
+
694
+ # Initialize analytics engines
695
+ personalization = PersonalizationEngine(st.session_state.user_profile)
696
+ analytics = AdvancedAnalytics()
697
+
698
+ # Perform analysis
699
+ with st.spinner("🧠 Analyzing your financial data..."):
700
+ patterns = personalization.analyze_spending_patterns(df)
701
+ recommendations = personalization.generate_personalized_recommendations(df, patterns)
702
+ health_score, score_components = personalization.calculate_financial_health_score(df, patterns)
703
+ loan_prediction = analytics.enhanced_loan_prediction(df)
704
+ df_with_anomalies = analytics.detect_anomalies(df)
705
+
706
+ st.session_state.analysis_complete = True
707
+
708
+ # Display results in tabs
709
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
710
+ "📊 Overview", "💡 Recommendations", "🏥 Health Score",
711
+ "🔍 Detailed Analysis", "🚨 Anomalies", "💰 Loan Eligibility"
712
+ ])
713
+
714
+ with tab1:
715
+ st.markdown("### 📈 Financial Overview")
716
+ create_advanced_visualizations(df, patterns, recommendations)
717
+
718
+ # Transaction table with enhanced features
719
+ st.markdown("### 📋 Transaction History")
720
+ st.dataframe(
721
+ df[['Date', 'Description', 'Category', 'Amount', 'Balance']],
722
+ use_container_width=True
723
+ )
724
+
725
+ with tab2:
726
+ st.markdown("### 💡 Personalized Recommendations")
727
+
728
+ if recommendations:
729
+ for i, rec in enumerate(recommendations):
730
+ priority_color = {
731
+ 'high': '🔴',
732
+ 'medium': '🟡',
733
+ 'low': '🟢'
734
+ }.get(rec['priority'], '⚪')
735
+
736
+ st.markdown(f"""
737
+ **{priority_color} {rec['title']}**
738
+
739
+ {rec['description']}
740
+
741
+ *Category: {rec['type'].title()} | Priority: {rec['priority'].title()}*
742
+ """)
743
+ st.divider()
744
+ else:
745
+ st.info("💫 Great job! Your financial habits look healthy. Keep up the good work!")
746
+
747
+ with tab3:
748
+ st.markdown("### 🏥 Financial Health Score")
749
+
750
+ # Display overall score
751
+ col1, col2 = st.columns([1, 2])
752
+
753
+ with col1:
754
+ # Create gauge chart for health score
755
+ fig_gauge = go.Figure(go.Indicator(
756
+ mode = "gauge+number+delta",
757
+ value = health_score,
758
+ domain = {'x': [0, 1], 'y': [0, 1]},
759
+ title = {'text': "Financial Health Score"},
760
+ delta = {'reference': 75},
761
+ gauge = {
762
+ 'axis': {'range': [None, 100]},
763
+ 'bar': {'color': "darkblue"},
764
+ 'steps': [
765
+ {'range': [0, 25], 'color': "lightgray"},
766
+ {'range': [25, 50], 'color': "gray"},
767
+ {'range': [50, 75], 'color': "lightgreen"},
768
+ {'range': [75, 100], 'color': "green"}
769
+ ],
770
+ 'threshold': {
771
+ 'line': {'color': "red", 'width': 4},
772
+ 'thickness': 0.75,
773
+ 'value': 90
774
+ }
775
+ }
776
+ ))
777
+ fig_gauge.update_layout(height=300)
778
+ st.plotly_chart(fig_gauge, use_container_width=True)
779
+
780
+ with col2:
781
+ st.markdown("#### Score Breakdown")
782
+ for component, score in score_components.items():
783
+ component_name = component.replace('_', ' ').title()
784
+ st.progress(score/25, text=f"{component_name}: {score:.1f}/25")
785
+
786
+ # Health recommendations
787
+ if health_score < 60:
788
+ st.warning("⚠️ Your financial health needs attention. Consider the recommendations above.")
789
+ elif health_score < 80:
790
+ st.info("💡 Good financial health! A few improvements could boost your score.")
791
+ else:
792
+ st.success("🎉 Excellent financial health! You're doing great!")
793
+
794
+ with tab4:
795
+ st.markdown("### 🔍 Detailed Financial Analysis")
796
+
797
+ # Advanced metrics
798
+ col1, col2 = st.columns(2)
799
+
800
+ with col1:
801
+ st.markdown("#### Spending Patterns")
802
+ weekend_avg = patterns.get('weekend_vs_weekday', {}).get('weekend_avg', 0)
803
+ weekday_avg = patterns.get('weekend_vs_weekday', {}).get('weekday_avg', 0)
804
+
805
+ st.write(f"Weekend Average: R{weekend_avg:.2f}")
806
+ st.write(f"Weekday Average: R{weekday_avg:.2f}")
807
+ st.write(f"Transaction Frequency: {patterns.get('transaction_frequency', 0):.2f} per day")
808
+
809
+ with col2:
810
+ st.markdown("#### Monthly Trends")
811
+ monthly_trends = patterns.get('monthly_trends', {})
812
+ for month, avg_spending in monthly_trends.items():
813
+ month_name = pd.to_datetime(f"2023-{month:02d}-01").strftime("%B")
814
+ st.write(f"{month_name}: R{avg_spending:.2f}")
815
+
816
+ # Category analysis
817
+ st.markdown("#### Category Analysis")
818
+ category_data = []
819
+ for category, data in patterns.get('spending_by_category', {}).items():
820
+ if isinstance(data, dict):
821
+ category_data.append({
822
+ 'Category': category,
823
+ 'Total': data.get('sum', 0),
824
+ 'Average': data.get('mean', 0),
825
+ 'Count': data.get('count', 0),
826
+ 'Std Dev': data.get('std', 0)
827
+ })
828
+
829
+ if category_data:
830
+ category_df = pd.DataFrame(category_data)
831
+ st.dataframe(category_df, use_container_width=True)
832
+
833
+ with tab5:
834
+ st.markdown("### 🚨 Anomaly Detection")
835
+
836
+ anomalies = df_with_anomalies[df_with_anomalies['IsAnomaly']]
837
+
838
+ if not anomalies.empty:
839
+ st.warning(f"⚠️ Found {len(anomalies)} unusual transactions:")
840
+ st.dataframe(
841
+ anomalies[['Date', 'Description', 'Amount', 'Category']],
842
+ use_container_width=True
843
+ )
844
+
845
+ # Visualization
846
+ fig_anomaly = px.scatter(
847
+ df_with_anomalies,
848
+ x='Date',
849
+ y='Amount',
850
+ color='IsAnomaly',
851
+ title='Transaction Anomalies',
852
+ color_discrete_map={True: 'red', False: 'blue'}
853
+ )
854
+ st.plotly_chart(fig_anomaly, use_container_width=True)
855
+ else:
856
+ st.success("✅ No unusual transactions detected. Your spending patterns look normal!")
857
+
858
+ with tab6:
859
+ st.markdown("### 💰 Loan Eligibility Assessment")
860
+
861
+ # Display loan prediction results
862
+ if loan_prediction['eligible']:
863
+ st.success(f"✅ **Congratulations!** You are eligible for a loan.")
864
+ else:
865
+ st.error(f"❌ **Unfortunately,** you are not currently eligible for a loan.")
866
+
867
+ col1, col2 = st.columns(2)
868
+ with col1:
869
+ st.metric("Confidence Score", f"{loan_prediction['confidence']:.1%}")
870
+ with col2:
871
+ st.metric("Model Used", loan_prediction['model_type'])
872
+
873
+ # Detailed loan analysis
874
+ st.markdown("#### Loan Assessment Factors")
875
+
876
+ total_credits = df[df['Amount'] > 0]['Amount'].sum()
877
+ total_debits = abs(df[df['Amount'] < 0]['Amount'].sum())
878
+ debt_to_income = total_debits / total_credits if total_credits > 0 else float('inf')
879
+
880
+ factors = {
881
+ "Total Income": f"R{total_credits:,.2f}",
882
+ "Total Expenses": f"R{total_debits:,.2f}",
883
+ "Debt-to-Income Ratio": f"{debt_to_income:.2%}",
884
+ "Net Cash Flow": f"R{total_credits - total_debits:,.2f}",
885
+ "Transaction Count": str(len(df)),
886
+ "Account Balance Trend": f"R{patterns.get('balance_trend', 0):,.2f}"
887
+ }
888
+
889
+ for factor, value in factors.items():
890
+ st.write(f"**{factor}:** {value}")
891
+
892
+ # Improvement suggestions for loan eligibility
893
+ if not loan_prediction['eligible']:
894
+ st.markdown("#### 💡 How to Improve Your Loan Eligibility")
895
+ st.markdown("""
896
+ - **Increase Income**: Look for ways to boost your monthly income
897
+ - **Reduce Expenses**: Cut down on non-essential spending
898
+ - **Build Savings**: Maintain a higher account balance
899
+ - **Regular Transactions**: Show consistent financial activity
900
+ - **Improve Cash Flow**: Ensure more money comes in than goes out
901
+ """)
902
+
903
+ except Exception as e:
904
+ st.error(f"❌ An error occurred while processing your statement: {str(e)}")
905
+ st.info("Please ensure your PDF is a valid ABSA bank statement and try again.")
906
+
907
if __name__ == "__main__":
    main()

# Page-wide stylesheet for the dashboard. It defines two card containers
# (.metric-card, .recommendation-card) and restyles the Streamlit tab strip:
# spacing between tabs, the shape/colour of each tab, and the highlighted
# colours of the selected tab.
#
# This stays a module-level statement placed after the entry-point guard, so
# when the file is executed as a script the stylesheet is emitted once main()
# has returned.
_DASHBOARD_CSS = """
<style>
.metric-card {
background-color: #f0f2f6;
padding: 1rem;
border-radius: 0.5rem;
border-left: 4px solid #1f77b4;
}

.recommendation-card {
background-color: #f8f9fa;
padding: 1rem;
border-radius: 0.5rem;
margin: 0.5rem 0;
border-left: 4px solid #28a745;
}

.stTabs [data-baseweb="tab-list"] {
gap: 24px;
}

.stTabs [data-baseweb="tab"] {
height: 50px;
white-space: pre-wrap;
background-color: #f0f2f6;
border-radius: 4px 4px 0px 0px;
gap: 1px;
padding-top: 10px;
padding-bottom: 10px;
}

.stTabs [aria-selected="true"] {
background-color: #1f77b4;
color: white;
}
</style>
"""

# unsafe_allow_html is required so the <style> element is passed through as
# HTML instead of being escaped and shown as text.
st.markdown(_DASHBOARD_CSS, unsafe_allow_html=True)