import os

from dotenv import load_dotenv

# Load .env for local testing only; in hosted deployments secrets come from env.
if os.path.exists(".env"):
    load_dotenv()

import streamlit as st
import pandas as pd
import plotly.express as px
import time
from datetime import datetime

# Initialize database connections properly.
# Assuming database uses Supabase; if Supabase client init fails due to
# missing secrets, the app will handle it gracefully.
import modules.database as db
from modules.etl import load_and_validate, engineer_features
from modules.detection import apply_detection
from modules.risk_profiling import build_customer_profiles, assign_kyc_tier
import modules.visualizations as viz
from modules.ai_agent import stream_compliance_report
from modules.pdf_report import build_pdf

st.set_page_config(
    page_title="AML Shield",
    page_icon="đŸ›Ąī¸",
    layout="wide",
    initial_sidebar_state="expanded",
)

# --- CSS Overrides ---
st.markdown("""
""", unsafe_allow_html=True)

# --- Sidebar ---
st.sidebar.title("đŸ›Ąī¸ AML Shield Navigation")
tabs = ["Upload & Analyze", "Dashboard", "Customer Profiles", "AI Report",
        "Global Analytics", "About"]
page = st.sidebar.radio("Go to", tabs)


# --- Helper logic for analysis ---
def _build_summary(df, profile_df, filename, total_tx, flagged, high_risk,
                   med_risk, avg_score, date_range):
    """Assemble the JSON-serializable summary dict used for reporting & session state."""
    # Structuring & international stats for the compliance report.
    struct_attempts = profile_df['structuring_attempts'].sum()
    intl_high = len(df[(df['is_international'] == 1) & (df['amount'] > 25000)])
    kyc_counts = profile_df['kyc_tier'].value_counts().to_dict()

    # Top flagged rules: rule_flags holds a list per row; flatten then count.
    rules_flat = [rule for sublist in df['rule_flags']
                  if isinstance(sublist, list) for rule in sublist]
    top_rules = pd.Series(rules_flat).value_counts().head(3).to_dict() if rules_flat else {}
    top_customers = (profile_df.sort_values('avg_risk_score', ascending=False)
                     ['customer_id'].head(3).tolist())

    # Cast numpy scalars to native types so the dict is JSON-serializable.
    return {
        "filename": filename,
        "total_transactions": int(total_tx),
        "flagged_count": int(flagged),
        "high_risk_count": int(high_risk),
        "medium_risk_count": int(med_risk),
        "avg_risk_score": float(avg_score),
        "date_range": date_range,
        "structuring_attempts": int(struct_attempts),
        "international_high_value_count": int(intl_high),
        "kyc_tier_breakdown": kyc_counts,
        "top_rules_triggered": top_rules,
        "top_flagged_customers": top_customers,
    }


def _persist_results(df, profile_df, filename, total_tx, flagged, high_risk,
                     med_risk, avg_score, date_range):
    """Save the upload record plus per-row data to the database; returns upload_id (may be falsy)."""
    upload_id = db.save_upload(
        filename=filename,
        total=total_tx,
        flagged=flagged,
        high_risk=high_risk,
        medium_risk=med_risk,
        avg_score=avg_score,
        date_range=date_range,
    )
    if upload_id:
        # Batch insert chunks
        db.save_transactions(df, upload_id)
        db.save_customer_profiles(profile_df, upload_id)
    return upload_id


def _render_model_evaluation(df):
    """Render accuracy/precision/recall metrics when ground-truth `is_fraud` labels exist."""
    st.markdown("---")
    st.subheader("đŸŽ¯ Model Performance Evaluation")

    # Normalize fraud labels: accept TRUE/FALSE, 1/0, 1.0/0.0 (as strings too).
    df["is_fraud_numeric"] = df["is_fraud"].astype(str).str.upper().map(
        {"TRUE": 1, "FALSE": 0, "1": 1, "0": 0, "1.0": 1, "0.0": 0}
    ).fillna(0).astype(int)

    from sklearn.metrics import classification_report

    # ml_anomaly_flag comes from detection.py (it's 1 if scoring high);
    # in our case the 'is_flagged' column is the prediction.
    y_true = df["is_fraud_numeric"]
    y_pred = df["is_flagged"]

    # zero_division=0 avoids warnings when a class is never predicted;
    # .get('1', ...) guards against datasets with no positive labels at all
    # (sklearn omits the class from the dict in that case).
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    fraud_metrics = report.get('1', {'precision': 0.0, 'recall': 0.0})

    c1, c2, c3 = st.columns(3)
    c1.metric("Accuracy", f"{report['accuracy']:.2%}")
    c2.metric("Precision (Fraud)", f"{fraud_metrics['precision']:.2%}")
    c3.metric("Recall (Fraud)", f"{fraud_metrics['recall']:.2%}")

    with st.expander("Detailed Classification Report"):
        st.code(classification_report(y_true, y_pred, zero_division=0))


def run_pipeline(file_obj, filename="Uploaded Data"):
    """Run the full ETL -> detection -> profiling pipeline on an uploaded CSV.

    Persists results to the database and Streamlit session state, shows a
    summary banner, an optional model evaluation (when `is_fraud` labels are
    present), and a top-risk preview table.

    Args:
        file_obj: file-like object or path accepted by load_and_validate.
        filename: display name stored with the upload record.

    Returns:
        True on success, False when validation fails.
    """
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1
    status_text.text("[▓░░░░] Loading & validating data...")
    df, msg = load_and_validate(file_obj)
    if df is None:
        st.error(msg)
        progress_bar.empty()
        status_text.empty()
        return False
    progress_bar.progress(25)
    time.sleep(0.5)

    # Step 2
    status_text.text("[▓▓▓░░] Engineering features...")
    df = engineer_features(df)
    progress_bar.progress(50)
    time.sleep(0.5)

    # Step 3
    status_text.text("[▓▓▓▓░] Running anomaly detection...")
    df = apply_detection(df)
    progress_bar.progress(75)
    time.sleep(0.5)

    # Step 4
    status_text.text("[▓▓▓▓▓] Building customer profiles...")
    profile_df = build_customer_profiles(df)
    profile_df = assign_kyc_tier(profile_df)
    progress_bar.progress(100)
    time.sleep(0.5)

    status_text.empty()
    progress_bar.empty()

    # --- Summary metrics (cast numpy scalars to native types for DB/JSON) ---
    total_tx = len(df)
    flagged = int(df['is_flagged'].sum())
    high_risk = len(df[df['risk_level'] == 'High'])
    med_risk = len(df[df['risk_level'] == 'Medium'])
    avg_score = float(df['risk_score'].mean())
    date_range = f"{df['timestamp'].dt.date.min()} to {df['timestamp'].dt.date.max()}"

    upload_id = _persist_results(df, profile_df, filename, total_tx, flagged,
                                 high_risk, med_risk, avg_score, date_range)
    summary_data = _build_summary(df, profile_df, filename, total_tx, flagged,
                                  high_risk, med_risk, avg_score, date_range)

    # --- Session state ---
    st.session_state.df_raw = df.copy()
    st.session_state.df_scored = df.copy()
    st.session_state.profile_df = profile_df.copy()
    st.session_state.upload_id = upload_id
    st.session_state.summary_data = summary_data
    st.session_state.ai_report = None

    st.success(f"✅ {total_tx} transactions analyzed | âš ī¸ {flagged} flagged | 🔴 {high_risk} high risk | 📊 Avg risk score: {avg_score:.1f}")

    # --- Model evaluation (only possible when ground truth exists) ---
    if "is_fraud" in df.columns:
        _render_model_evaluation(df)

    st.markdown("---")
    st.subheader("Top 5 Highest Risk Transactions Preview")
    preview = df.sort_values('risk_score', ascending=False).head(5)
    cols = ['transaction_id', 'customer_id', 'amount', 'transaction_type',
            'risk_score', 'risk_level', 'rule_flags']
    st.dataframe(preview[cols])
    return True


# --- PAGE ROUTING ---
if page == "Upload & Analyze":
    st.title("đŸ›Ąī¸ AML Shield")
    st.write("AI-Powered Anti-Money Laundering Transaction Intelligence Platform")

    col1, col2 = st.columns(2)
    with col1:
        uploaded_file = st.file_uploader("Upload CSV Transactions", type=['csv'])
        if uploaded_file is not None:
            if st.button("Analyze Uploaded File"):
                run_pipeline(uploaded_file, filename=uploaded_file.name)
    with col2:
        st.write("Or test with pre-generated synthetic data:")
        if st.button("Use Sample Dataset"):
            sample_path = "sample_data/sample_transactions.csv"
            if os.path.exists(sample_path):
                run_pipeline(sample_path, filename="sample_transactions.csv")
            else:
                st.error("Sample dataset not found. Please ensure it was generated.")

elif page == "Dashboard":
    if 'df_scored' not in st.session_state:
        st.warning("Please upload or load sample data first in the 'Upload & Analyze' tab.")
    else:
        df = st.session_state.df_scored.copy()
        summ = st.session_state.summary_data

        # Dashboard filters live in the sidebar so charts keep the full width.
        st.sidebar.markdown("---")
        st.sidebar.subheader("Dashboard Filters")
        risk_filter = st.sidebar.multiselect(
            "Risk Level", options=['High', 'Medium', 'Low'],
            default=['High', 'Medium', 'Low'])
        type_filter = st.sidebar.multiselect(
            "Transaction Type", options=df['transaction_type'].unique(),
            default=df['transaction_type'].unique())
        min_date = df['timestamp'].min().date()
        max_date = df['timestamp'].max().date()
        date_filter = st.sidebar.slider(
            "Date Range", min_value=min_date, max_value=max_date,
            value=(min_date, max_date))

        # Apply filters
        df_filtered = df[
            (df['risk_level'].isin(risk_filter)) &
            (df['transaction_type'].isin(type_filter)) &
            (df['timestamp'].dt.date >= date_filter[0]) &
            (df['timestamp'].dt.date <= date_filter[1])
        ]

        # KPIs
        c1, c2, c3, c4 = st.columns(4)
        c1.metric("Total Transactions", summ['total_transactions'])
        flagged_pct = (summ['flagged_count'] / summ['total_transactions']) * 100 if summ['total_transactions'] > 0 else 0
        c2.metric("Flagged", summ['flagged_count'], delta=f"{flagged_pct:.1f}%")
        c3.metric("High Risk", summ['high_risk_count'])
        c4.metric("Avg Risk Score", f"{summ['avg_risk_score']:.1f}")

        # Charts Row 1
        r1c1, r1c2 = st.columns(2)
        with r1c1:
            st.subheader("Risk Distribution")
            st.plotly_chart(viz.risk_distribution_chart(df_filtered), use_container_width=True)
        with r1c2:
            st.subheader("Daily Flagged Transactions")
            st.plotly_chart(viz.flagged_transactions_timeline(df_filtered), use_container_width=True)

        # Charts Row 2
        st.subheader("Amount vs Risk Score Scatter")
        st.plotly_chart(viz.amount_vs_risk_scatter(df_filtered), use_container_width=True)

        # Charts Row 3
        r3c1, r3c2 = st.columns(2)
        with r3c1:
            st.subheader("Transaction Types (Flagged vs Clean)")
            st.plotly_chart(viz.transaction_type_breakdown(df_filtered), use_container_width=True)
        with r3c2:
            st.subheader("Rule Trigger Frequency")
            st.plotly_chart(viz.rule_trigger_frequency(df_filtered), use_container_width=True)

        # Charts Row 4
        st.subheader("Top Flagged Customers")
        st.plotly_chart(viz.top_flagged_customers_chart(df_filtered), use_container_width=True)

        # Table
        st.subheader("Flagged Transactions Explorer")
        flagged_df = df_filtered[df_filtered['is_flagged'] == 1].copy()
        # Convert rule_flags list to string for display/CSV.
        flagged_df['rule_flags_str'] = flagged_df['rule_flags'].apply(
            lambda x: ", ".join(x) if isinstance(x, list) else str(x))
        disp_cols = ['transaction_id', 'customer_id', 'amount', 'transaction_type',
                     'risk_score', 'risk_level', 'rule_flags_str']
        st.dataframe(flagged_df[disp_cols])
        csv_data = flagged_df[disp_cols].to_csv(index=False).encode('utf-8')
        st.download_button("Download Flagged Transactions CSV", data=csv_data,
                           file_name="flagged_transactions.csv", mime="text/csv")

elif page == "Customer Profiles":
    if 'profile_df' not in st.session_state:
        st.warning("Please upload data first to analyze customer profiles.")
    else:
        profile_df = st.session_state.profile_df.copy()
        df = st.session_state.df_scored

        st.title("Customer KYC Profiles")
        col1, col2 = st.columns([1, 2])
        with col1:
            st.subheader("KYC Tier Distribution")
            st.plotly_chart(viz.kyc_tier_distribution(profile_df), use_container_width=True)
        with col2:
            st.subheader("All Customer Profiles")
            st.dataframe(profile_df)

        st.markdown("---")
        st.subheader("Customer Drill-down")
        selected_cust = st.selectbox("Select Customer ID",
                                     options=profile_df['customer_id'].unique())
        cust_profile = profile_df[profile_df['customer_id'] == selected_cust].iloc[0]
        cust_tx = df[df['customer_id'] == selected_cust].sort_values('timestamp', ascending=False)
        cust_flags = cust_tx[cust_tx['is_flagged'] == 1]

        c1, c2, c3 = st.columns(3)
        c1.metric("KYC Tier", cust_profile['kyc_tier'])
        c2.metric("Total Volume", f"${cust_profile['total_volume']:,.2f}")
        c3.metric("Avg Risk Score", f"{cust_profile['avg_risk_score']:.1f}")

        st.write("### Transaction History")
        st.dataframe(cust_tx[['transaction_id', 'timestamp', 'amount',
                              'transaction_type', 'risk_score', 'risk_level']])

        st.write("### Repeated Suspicious Behavior")
        if len(cust_flags) > 0:
            st.dataframe(cust_flags[['transaction_id', 'amount', 'rule_flags']])
        else:
            st.write("None detected.")

elif page == "AI Report":
    if 'summary_data' not in st.session_state:
        st.warning("Please upload data first to generate an AI report.")
    else:
        st.title("🤖 AI Compliance Report Generation")
        summ = st.session_state.summary_data
        st.info(f"**Dataset loaded:** {summ['filename']} | **Total Transactions:** {summ['total_transactions']} | **Flagged:** {summ['flagged_count']}")

        if st.button("🤖 Generate AI Compliance Report", type="primary"):
            if not os.environ.get("BYTEZ_API_KEY"):
                st.error("BYTEZ_API_KEY must be set to generate the AI report.")
            else:
                with st.spinner("Connecting to AI analyst..."):
                    placeholder = st.empty()
                    report_text = stream_compliance_report(summ, placeholder)
                    if report_text and not report_text.startswith("Error"):
                        st.success("✅ Report generated using meta-llama/Llama-3.1-8B-Instruct via Bytez")
                        st.session_state.ai_report = report_text
                        if st.session_state.upload_id:
                            db.save_ai_report(st.session_state.upload_id, report_text,
                                              "meta-llama/Llama-3.1-8B-Instruct")

        if st.session_state.get('ai_report'):
            st.markdown("---")
            st.write("### Actions")
            # PDF generation
            flagged_df = st.session_state.df_scored[
                st.session_state.df_scored['is_flagged'] == 1].copy()
            pdf_bytes = build_pdf(st.session_state.ai_report, summ, flagged_df)
            date_str = datetime.now().strftime("%Y%m%d_%H%M")
            st.download_button("📄 Download PDF Report", data=pdf_bytes,
                               file_name=f"AML_Shield_Report_{date_str}.pdf",
                               mime="application/pdf")
            st.markdown("---")
            st.markdown(st.session_state.ai_report)

elif page == "Global Analytics":
    st.title("🌍 Global Analytics")
    with st.spinner("Fetching global stats from Supabase..."):
        try:
            stats = db.get_global_stats()
            uploads = db.get_all_uploads()
            uploads_df = pd.DataFrame(uploads)
        except Exception as e:
            st.error(f"Could not connect to Supabase: {e}")
            stats = None
            uploads_df = pd.DataFrame()

    if stats:
        c1, c2, c3, c4 = st.columns(4)
        c1.metric("All-time Transactions", stats['total_transactions_ever'])
        c2.metric("Total Uploads", stats['total_uploads'])
        c3.metric("All-time Flagged", stats['total_flagged_ever'])
        c4.metric("Global Avg Risk", f"{stats['avg_risk_score_global']:.1f}")

        st.markdown("---")
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Global Trend: Flagged per Upload")
            if not uploads_df.empty:
                if 'uploaded_at' in uploads_df.columns:
                    uploads_df['date'] = pd.to_datetime(uploads_df['uploaded_at']).dt.date
                    trend_df = uploads_df.groupby('date')['flagged_count'].sum().reset_index()
                    fig = px.line(trend_df, x='date', y='flagged_count', markers=True)
                    fig.update_traces(line_color=viz.COLOR_MED)
                    st.plotly_chart(viz.apply_theme(fig), use_container_width=True)
            else:
                st.write("No historical data available.")
        with col2:
            st.subheader("Most Common Rule Triggered")
            st.info(stats.get('most_common_rule_triggered', 'N/A'))
            st.write("*(Approximation based on available metric patterns)*")

        st.subheader("Uploads History")
        if not uploads_df.empty:
            st.dataframe(uploads_df[['filename', 'uploaded_at', 'total_transactions',
                                     'flagged_count', 'high_risk_count', 'avg_risk_score']])

elif page == "About":
    st.title("â„šī¸ About AML Shield")
    st.write("""
    ### AI-Powered Anti-Money Laundering Transaction Intelligence Platform

    AML Shield is built to demonstrate production-grade AML compliance analytics skills for financial services roles.

    #### How it works:
    1. **Upload CSV** → ETL validation & pre-processing.
    2. **Rule-based AML flags** → applied to all inputs.
    3. **Isolation Forest ML** → anomaly detection logic.
    4. **Risk scoring (0-100)** → deterministic algorithm based on flags+ML.
    5. **KYC customer profiling** → KMeans clustering into tiers.
    6. **LangChain + Bytez** → streams a formal regulatory compliance report utilizing meta-llama/Llama-3.1-8B-Instruct.
    7. **ReportLab** → renders professional downloadable PDF.
    8. **Supabase** → All data natively persisted.
    """)

    with st.expander("AML Rules Explained"):
        st.write("""
        - **Structuring**: Transactions intentionally sizing just beneath the $10,000 CTR reporting requirement ($9000 - $9999).
        - **Rapid Fire Transactions**: Accounts showing an abnormally high transaction velocity.
        - **Large Cash Out**: Immediate cash liquidations above $50,000.
        - **Dormant Account Spike**: High amounts triggered by newly created or previously dormant accounts (< 30 days).
        - **International High Value**: Large wire transfers sent outside of the domestic region.
        - **Suspicious Round Amount**: High net round payments generally uncharacteristic of organic spending.
        """)

    st.write("""
    **Regulatory Frameworks Considered:**
    - BSA (Bank Secrecy Act)
    - FinCEN SAR (Suspicious Activity Report) requirements
    - FATF Recommendation 16 (Wire transfers)
    """)

    st.markdown("---")
    st.write("**Tech Stack:** Streamlit | Pandas | Scikit-learn | Plotly | ReportLab | LangChain | Bytez | Supabase")
    st.write("**Deployments:** Live on Hugging Face Spaces")