"""S.T.A.R.K AI — UIDAI fraud-detection dashboard (Streamlit).

Loads (or synthesizes) per-center Aadhaar activity data, assigns stable
pseudo-coordinates per (state, district), and renders a filterable map,
priority list, and pattern-analytics charts.
"""

import zlib

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from datetime import datetime

# 1. PAGE CONFIGURATION
st.set_page_config(
    page_title="S.T.A.R.K AI | UIDAI Fraud Detection",
    page_icon="",
    layout="wide",
    initial_sidebar_state="expanded",
)

# 2. PROFESSIONAL STYLING (THEME OVERRIDE)
# NOTE(review): the CSS payload appears to have been lost from this string —
# confirm against the original stylesheet before shipping.
st.markdown(""" """, unsafe_allow_html=True)


# 3. SMART DATA LOADING (MAPPING)
@st.cache_data
def load_data():
    """Load the analyzed CSV (or generate dummy data), attach coordinates
    and a risk category, and return the DataFrame.

    Returns:
        pd.DataFrame with added columns: lat, lon, risk_category.
    """
    # 1. Load or Generate Data
    try:
        df = pd.read_csv('analyzed_aadhaar_data.csv')
    except FileNotFoundError:
        # Dummy Data Generator if file missing
        dates = pd.date_range(start="2025-01-01", periods=200)
        df = pd.DataFrame({
            'date': dates,
            'state': np.random.choice(
                ['Maharashtra', 'Uttar Pradesh', 'Bihar', 'Karnataka', 'Delhi',
                 'West Bengal', 'Tamil Nadu', 'Gujarat', 'Rajasthan', 'Kerala'],
                200),
            'district': np.random.choice(
                ['North', 'South', 'East', 'West', 'Central', 'Rural A', 'Urban B'],
                200),
            'pincode': np.random.randint(110001, 800000, 200),
            'RISK_SCORE': np.random.uniform(15, 99, 200),
            'total_activity': np.random.randint(50, 800, 200),
            'enrol_adult': np.random.randint(10, 400, 200),
            'ratio_deviation': np.random.uniform(-0.15, 0.6, 200),
            'is_weekend': np.random.choice([0, 1], 200, p=[0.7, 0.3]),
        })

    # Standardize Date
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])

    # SMART GEO-CLUSTERING LOGIC
    # Comprehensive Center Points for Indian States & UTs
    state_centers = {
        'Andaman and Nicobar Islands': (11.7401, 92.6586),
        'Andhra Pradesh': (15.9129, 79.7400),
        'Arunachal Pradesh': (28.2180, 94.7278),
        'Assam': (26.2006, 92.9376),
        'Bihar': (25.0961, 85.3131),
        'Chandigarh': (30.7333, 76.7794),
        'Chhattisgarh': (21.2787, 81.8661),
        'Dadra and Nagar Haveli and Daman and Diu': (20.4283, 72.8397),
        'Delhi': (28.7041, 77.1025),
        'Goa': (15.2993, 74.1240),
        'Gujarat': (22.2587, 71.1924),
        'Haryana': (29.0588, 76.0856),
        'Himachal Pradesh': (31.9579, 77.1095),
        'Jammu and Kashmir': (33.7782, 76.5762),
        'Jharkhand': (23.6102, 85.2799),
        'Karnataka': (15.3173, 75.7139),
        'Kerala': (10.8505, 76.2711),
        'Ladakh': (34.1526, 77.5770),
        'Lakshadweep': (10.5667, 72.6417),
        'Madhya Pradesh': (22.9734, 78.6569),
        'Maharashtra': (19.7515, 75.7139),
        'Manipur': (24.6637, 93.9063),
        'Meghalaya': (25.4670, 91.3662),
        'Mizoram': (23.1645, 92.9376),
        'Nagaland': (26.1584, 94.5624),
        'Odisha': (20.9517, 85.0985),
        'Puducherry': (11.9416, 79.8083),
        'Punjab': (31.1471, 75.3412),
        'Rajasthan': (27.0238, 74.2179),
        'Sikkim': (27.5330, 88.5122),
        'Tamil Nadu': (11.1271, 78.6569),
        'Telangana': (18.1124, 79.0193),
        'Tripura': (23.9408, 91.9882),
        'Uttar Pradesh': (26.8467, 80.9462),
        'Uttarakhand': (30.0668, 79.0193),
        'West Bengal': (22.9868, 87.8550),
    }

    def get_coords(row):
        """Return a stable (lat, lon) for a row: state center + deterministic
        district offset + small random jitter so points don't stack."""
        state = row.get('state', 'Delhi')
        district = str(row.get('district', 'Unknown'))

        # 1. Get State Base Coordinates (default to the center of India)
        base_lat, base_lon = state_centers.get(state, (20.5937, 78.9629))

        # 2. DETERMINISTIC HASHING FOR DISTRICT
        # BUGFIX: the original used builtin hash(), which is salted per
        # process (PYTHONHASHSEED), so district positions shifted between
        # runs. zlib.crc32 is stable across processes and platforms.
        # A local Generator also avoids re-seeding NumPy's global RNG per row.
        district_seed = zlib.crc32((state + district).encode("utf-8"))
        rng = np.random.default_rng(district_seed)

        # Offset the district center by up to 1.5 degrees (~150km)
        dist_lat_offset = rng.uniform(-1.5, 1.5)
        dist_lon_offset = rng.uniform(-1.5, 1.5)

        # 3. INDIVIDUAL CENTER JITTER
        # Tiny unseeded noise (~4km) drawn from the global RNG so individual
        # points within the same district don't stack perfectly.
        noise_lat = np.random.normal(0, 0.04)
        noise_lon = np.random.normal(0, 0.04)

        return pd.Series({
            'lat': base_lat + dist_lat_offset + noise_lat,
            'lon': base_lon + dist_lon_offset + noise_lon,
        })

    # Apply coordinates
    coords = df.apply(get_coords, axis=1)
    df['lat'] = coords['lat']
    df['lon'] = coords['lon']

    # Risk Categories (bins chosen so 0 falls in 'Low', 100 in 'Critical')
    df['risk_category'] = pd.cut(
        df['RISK_SCORE'],
        bins=[-1, 50, 75, 85, 100],
        labels=['Low', 'Medium', 'High', 'Critical'],
    )
    return df


# Load Data
df = load_data()

# 4. SIDEBAR & FILTERS
with st.sidebar:
    st.markdown("### S.T.A.R.K AI Control")
    st.markdown("---")

    # State Filter
    state_list = ['All'] + sorted(df['state'].unique().tolist())
    selected_state = st.selectbox("Select State", state_list)

    # District Filter (only populated once a state is chosen)
    if selected_state != 'All':
        filtered_df = df[df['state'] == selected_state]
        district_list = ['All'] + sorted(filtered_df['district'].unique().tolist())
    else:
        filtered_df = df.copy()
        district_list = ['All']

    selected_district = st.selectbox("Select District", district_list)
    if selected_district != 'All':
        filtered_df = filtered_df[filtered_df['district'] == selected_district]

    st.markdown("---")

    # Risk Filter
    risk_filter = st.multiselect(
        "Risk Level",
        options=['Low', 'Medium', 'High', 'Critical'],
        default=['High', 'Critical'],
    )
    if risk_filter:
        filtered_df = filtered_df[filtered_df['risk_category'].isin(risk_filter)]

    st.markdown("---")

    # Links
    st.markdown("**Resources**")
    st.link_button(
        "Open Notebook in Colab",
        "https://colab.research.google.com/drive/1YAQ4nfxltvG_cts3fmGc_zi2JQc4oPOT?usp=sharing",
    )
    st.markdown("---")
    st.info(f"**User:** UIDAI_Officer\n\n**Team:** UIDAI_4571")

# 5. HEADER & KPI METRICS
col1, col2 = st.columns([3, 1])
with col1:
    st.title("Project S.T.A.R.K AI Dashboard")
    st.markdown("Context-Aware Fraud Detection System")
with col2:
    # NOTE(review): the status-badge HTML markup appears to have been
    # stripped from this string — confirm against the original template.
    st.markdown("""
System Online
Live Monitor
""", unsafe_allow_html=True)

st.markdown("---")

# METRICS ROW
m1, m2, m3, m4 = st.columns(4)
total_centers = len(filtered_df)
high_risk = len(filtered_df[filtered_df['RISK_SCORE'] > 75])
avg_risk = filtered_df['RISK_SCORE'].mean() if not filtered_df.empty else 0
weekend_alerts = len(
    filtered_df[(filtered_df['is_weekend'] == 1) & (filtered_df['RISK_SCORE'] > 70)]
)

m1.metric("Total Centers", f"{total_centers:,}", border=True)
m2.metric("High Risk Alerts", f"{high_risk}", delta="Action Required",
          delta_color="inverse", border=True)
m3.metric("Avg. Risk Score", f"{avg_risk:.1f}/100", border=True)
m4.metric("Weekend Spikes", f"{weekend_alerts}", "Unauthorized",
          delta_color="off", border=True)

st.markdown("##")  # Spacer

# 6. MAIN TABS
tab_map, tab_list, tab_charts = st.tabs(
    ["Geographic Risk", "Priority List", "Pattern Analytics"]
)

# TAB 1: GEOGRAPHIC RISK (MAP)
with tab_map:
    col_map, col_details = st.columns([3, 1])
    with col_map:
        if not filtered_df.empty:
            # Using Open-Street-Map for better contrast and no-token requirement
            fig_map = px.scatter_mapbox(
                filtered_df,
                lat="lat",
                lon="lon",
                color="RISK_SCORE",
                size="total_activity",
                # Traffic Light Colors: Green -> Yellow -> Red
                color_continuous_scale=["#22c55e", "#eab308", "#ef4444"],
                size_max=20,
                zoom=4.5 if selected_state != 'All' else 3.5,
                center={"lat": 22.0, "lon": 80.0},  # Center of India
                hover_name="pincode",
                hover_data={"district": True, "state": True,
                            "RISK_SCORE": True, "lat": False, "lon": False},
                mapbox_style="open-street-map",
                height=600,
                title="Live Fraud Risk Heatmap",
            )
            fig_map.update_layout(margin={"r": 0, "t": 40, "l": 0, "b": 0})
            st.plotly_chart(fig_map, use_container_width=True)
        else:
            st.warning("No data matches current filters.")

    with col_details:
        st.subheader("Top Hotspots")
        if not filtered_df.empty:
            top_districts = (
                filtered_df.groupby('district')['RISK_SCORE']
                .mean()
                .sort_values(ascending=False)
                .head(5)
            )
            for district, score in top_districts.items():
                # Color code the side bar.
                # NOTE(review): `color` looks unused — the inline-HTML card it
                # styled seems to have been lost from the f-string below;
                # confirm against the original markup.
                color = "#ef4444" if score > 80 else "#f59e0b"
                st.markdown(f"""
{district}
Avg Risk: {score:.1f}
""", unsafe_allow_html=True)

# TAB 2: PRIORITY LIST (DATAFRAME)
with tab_list:
    st.subheader("Target Investigation List")
    st.markdown("Filter: *Showing centers with Risk Score > 75*")

    target_list = (
        filtered_df[filtered_df['RISK_SCORE'] > 75]
        .sort_values('RISK_SCORE', ascending=False)
    )

    st.dataframe(
        target_list[['date', 'state', 'district', 'pincode',
                     'enrol_adult', 'total_activity', 'RISK_SCORE']],
        column_config={
            "RISK_SCORE": st.column_config.ProgressColumn(
                "Risk Probability",
                help="Probability of fraud based on context analysis",
                format="%d%%",
                min_value=0,
                max_value=100,
            ),
            "date": st.column_config.DateColumn("Date", format="DD MMM YYYY"),
            "total_activity": st.column_config.NumberColumn("Volume"),
            "enrol_adult": st.column_config.NumberColumn("Adult Enrols"),
        },
        use_container_width=True,
        hide_index=True,
        height=400,
    )

    # Export Button
    csv = target_list.to_csv(index=False).encode('utf-8')
    st.download_button(
        "Download CSV",
        data=csv,
        file_name="uidai_S.T.A.R.K AI_priority_list.csv",
        mime="text/csv",
        type="primary",
    )

# --- TAB 3: CHARTS ---
with tab_charts:
    c1, c2 = st.columns(2)
    with c1:
        st.subheader("Ghost ID Pattern (Ratio Deviation)")
        # Scatter Plot
        fig_scatter = px.scatter(
            filtered_df,
            x="total_activity",
            y="ratio_deviation",
            color="risk_category",
            color_discrete_map={'Critical': '#ef4444', 'High': '#f97316',
                                'Medium': '#eab308', 'Low': '#22c55e'},
            title="Deviation from District Baseline",
            labels={"ratio_deviation": "Deviation Score",
                    "total_activity": "Daily Transactions"},
            hover_data=['pincode', 'district'],
        )
        fig_scatter.add_hline(y=0.2, line_dash="dash", line_color="red",
                              annotation_text="Fraud Threshold")
        st.plotly_chart(fig_scatter, use_container_width=True)

    with c2:
        st.subheader("Risk Distribution")
        # Histogram
        fig_hist = px.histogram(
            filtered_df,
            x="RISK_SCORE",
            nbins=20,
            color_discrete_sequence=['#3b82f6'],
            title="Frequency of Risk Scores",
        )
        fig_hist.update_layout(bargap=0.1)
        st.plotly_chart(fig_hist, use_container_width=True)

# 7. FOOTER
st.markdown("---")
st.markdown("""
Project S.T.A.R.K AI | UIDAI Hackathon 2026 | Team UIDAI_4571
Confidential - For Official Use Only
""", unsafe_allow_html=True)