Upload project
Browse files- Dockerfile +20 -0
- New folder/README (1).md +20 -0
- New folder/gitattributes +35 -0
- New folder/requirements.txt +8 -0
- New folder/src/README.md +33 -0
- New folder/src/crime_xgb_artifacts.pkl +3 -0
- New folder/src/run_app.bat +6 -0
- New folder/src/streamlit_app.py +418 -0
- README.md +63 -0
- data/crimedataset/test.csv +3 -0
- data/crimedataset/train.csv +3 -0
- models/best_model.pkl +3 -0
- models/crime_xgb_artifacts.pkl +3 -0
- models/kmeans.pkl +3 -0
- models/label_encoders.pkl +3 -0
- requirements.txt +12 -0
- src/__pycache__/data_loader.cpython-312.pyc +0 -0
- src/__pycache__/preprocessing.cpython-312.pyc +0 -0
- src/app.py +775 -0
- src/data_loader.py +39 -0
- src/preprocessing.py +84 -0
- src/train_model.py +81 -0
- src/verify_pipeline.py +89 -0
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Slim Python base keeps the image small; version pinned for reproducible builds.
FROM python:3.13.5-slim

WORKDIR /app

# build-essential: compile any wheels without prebuilt binaries
# curl: used by the HEALTHCHECK below
# git: allows VCS-pinned requirements
# --no-install-recommends keeps the layer minimal; clearing apt lists shrinks the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies BEFORE copying application code so the (slow) pip
# layer is cached across source-only changes.
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt

COPY src/ ./src/

EXPOSE 8501

# Streamlit exposes a dedicated liveness endpoint; curl --fail turns a
# non-2xx response into a failing healthcheck.
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
New folder/README (1).md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Crime Predictor App
|
| 3 |
+
emoji: 🚀
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8501
|
| 8 |
+
tags:
|
| 9 |
+
- streamlit
|
| 10 |
+
pinned: false
|
| 11 |
+
short_description: Streamlit template space
|
| 12 |
+
license: apache-2.0
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Welcome to Streamlit!
|
| 16 |
+
|
| 17 |
+
Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
|
| 18 |
+
|
| 19 |
+
If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).
|
New folder/gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
New folder/requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
altair
|
| 2 |
+
streamlit
|
| 3 |
+
scikit-learn
|
| 4 |
+
pandas
|
| 5 |
+
numpy
|
| 6 |
+
scipy
|
| 7 |
+
xgboost
|
| 8 |
+
groq
|
New folder/src/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SF Crime Prediction App
|
| 2 |
+
|
| 3 |
+
This is a Streamlit application for predicting crime categories in San Francisco using an XGBoost model.
|
| 4 |
+
|
| 5 |
+
## Setup
|
| 6 |
+
|
| 7 |
+
1. **Install Dependencies**:
|
| 8 |
+
```bash
|
| 9 |
+
pip install -r requirements.txt
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
2. **Run the App**:
|
| 13 |
+
```bash
|
| 14 |
+
streamlit run streamlit_app.py
|
| 15 |
+
```
|
| 16 |
+
Or simply double-click `run_app.bat`.
|
| 17 |
+
|
| 18 |
+
## Model Info
|
| 19 |
+
|
| 20 |
+
The app uses `crime_xgb_artifacts.pkl` which contains:
|
| 21 |
+
- XGBoost Model
|
| 22 |
+
- LabelEncoder for Target (Crime Category)
|
| 23 |
+
- FeatureHashers for Address and Description
|
| 24 |
+
|
| 25 |
+
**Note**: The model expects specific features including hashed Address and Description. Ensure you provide these inputs in the UI for accurate predictions.
|
| 26 |
+
**Note**: The District encoder was missing from the provided files, so a default alphabetical mapping is used.
|
| 27 |
+
|
| 28 |
+
## Deployment
|
| 29 |
+
|
| 30 |
+
To deploy on the web (e.g., Streamlit Cloud):
|
| 31 |
+
1. Push this code to a GitHub repository.
|
| 32 |
+
2. Sign up for [Streamlit Cloud](https://streamlit.io/cloud).
|
| 33 |
+
3. Connect your GitHub and deploy the app.
|
New folder/src/crime_xgb_artifacts.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09684ad6eec80070c804f09b8d8c7362d45a0e2cbaac970565399ce5fc78c845
|
| 3 |
+
size 87422288
|
New folder/src/run_app.bat
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@echo off
|
| 2 |
+
echo Installing requirements...
|
| 3 |
+
pip install -r requirements.txt
|
| 4 |
+
echo Starting Streamlit App...
|
| 5 |
+
streamlit run streamlit_app.py
|
| 6 |
+
pause
|
New folder/src/streamlit_app.py
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
| 5 |
+
import xgboost as xgb
|
| 6 |
+
import pickle
|
| 7 |
+
import datetime
|
| 8 |
+
from scipy.sparse import hstack, csr_matrix
|
| 9 |
+
from groq import Groq
|
| 10 |
+
|
| 11 |
+
# ------------------- PAGE CONFIG -------------------
|
| 12 |
+
st.set_page_config(
|
| 13 |
+
page_title="AI Crime Predictor",
|
| 14 |
+
page_icon="🚓",
|
| 15 |
+
layout="wide",
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# ------------------- CUSTOM CSS -------------------
|
| 19 |
+
st.markdown("""
|
| 20 |
+
<style>
|
| 21 |
+
|
| 22 |
+
/* Animated gradient background */
|
| 23 |
+
@keyframes gradientShift {
|
| 24 |
+
0% { background-position: 0% 50%; }
|
| 25 |
+
50% { background-position: 100% 50%; }
|
| 26 |
+
100% { background-position: 0% 50%; }
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
body, .stApp {
|
| 30 |
+
background: linear-gradient(-45deg, #0a0e27, #1a1a2e, #16213e, #0f3460);
|
| 31 |
+
background-size: 400% 400%;
|
| 32 |
+
animation: gradientShift 15s ease infinite;
|
| 33 |
+
color: #ffffff;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
/* Title with gradient text */
|
| 37 |
+
.big-title {
|
| 38 |
+
font-size: 3.5rem;
|
| 39 |
+
font-weight: 800;
|
| 40 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 50%, #f093fb 100%);
|
| 41 |
+
-webkit-background-clip: text;
|
| 42 |
+
-webkit-text-fill-color: transparent;
|
| 43 |
+
background-clip: text;
|
| 44 |
+
text-align: center;
|
| 45 |
+
margin-bottom: 10px;
|
| 46 |
+
text-shadow: 0 0 30px rgba(102, 126, 234, 0.5);
|
| 47 |
+
letter-spacing: -1px;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
/* Subtitle with glow */
|
| 51 |
+
.sub-title {
|
| 52 |
+
text-align: center;
|
| 53 |
+
font-size: 1.3rem;
|
| 54 |
+
color: #a8b2d1;
|
| 55 |
+
margin-bottom: 40px;
|
| 56 |
+
font-weight: 300;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
/* Glassmorphism card */
|
| 60 |
+
.glass-card {
|
| 61 |
+
background: rgba(255, 255, 255, 0.05);
|
| 62 |
+
backdrop-filter: blur(10px);
|
| 63 |
+
-webkit-backdrop-filter: blur(10px);
|
| 64 |
+
padding: 30px;
|
| 65 |
+
border-radius: 24px;
|
| 66 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 67 |
+
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
|
| 68 |
+
transition: all 0.4s ease;
|
| 69 |
+
margin-bottom: 25px;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.glass-card:hover {
|
| 73 |
+
box-shadow: 0 12px 40px 0 rgba(102, 126, 234, 0.4);
|
| 74 |
+
transform: translateY(-5px);
|
| 75 |
+
border: 1px solid rgba(102, 126, 234, 0.3);
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
/* Premium button styling */
|
| 79 |
+
.stButton>button {
|
| 80 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 81 |
+
color: white;
|
| 82 |
+
padding: 0.8rem 2rem;
|
| 83 |
+
border-radius: 12px;
|
| 84 |
+
border: none;
|
| 85 |
+
font-size: 1.1rem;
|
| 86 |
+
font-weight: 600;
|
| 87 |
+
transition: all 0.3s ease;
|
| 88 |
+
box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4);
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
.stButton>button:hover {
|
| 92 |
+
background: linear-gradient(135deg, #764ba2 0%, #667eea 100%);
|
| 93 |
+
transform: translateY(-2px) scale(1.02);
|
| 94 |
+
box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6);
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
/* Sidebar styling */
|
| 98 |
+
[data-testid="stSidebar"] {
|
| 99 |
+
background: rgba(15, 23, 42, 0.8);
|
| 100 |
+
backdrop-filter: blur(10px);
|
| 101 |
+
border-right: 1px solid rgba(255, 255, 255, 0.1);
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
/* Input fields */
|
| 105 |
+
.stTextInput>div>div>input,
|
| 106 |
+
.stTextArea>div>div>textarea,
|
| 107 |
+
.stNumberInput>div>div>input {
|
| 108 |
+
background: rgba(255, 255, 255, 0.8) !important;
|
| 109 |
+
border: 1px solid rgba(255, 255, 255, 0.3) !important;
|
| 110 |
+
border-radius: 10px !important;
|
| 111 |
+
color: #000000 !important;
|
| 112 |
+
transition: all 0.3s ease;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
/* Ensure text is visible when typing */
|
| 116 |
+
.stTextInput input,
|
| 117 |
+
.stTextArea textarea {
|
| 118 |
+
color: #000000 !important;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
.stTextInput>div>div>input:focus,
|
| 122 |
+
.stTextArea>div>div>textarea:focus,
|
| 123 |
+
.stNumberInput>div>div>input:focus {
|
| 124 |
+
border: 1px solid rgba(102, 126, 234, 0.8) !important;
|
| 125 |
+
box-shadow: 0 0 15px rgba(102, 126, 234, 0.5) !important;
|
| 126 |
+
color: #000000 !important;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
/* Placeholder text styling */
|
| 130 |
+
.stTextInput input::placeholder,
|
| 131 |
+
.stTextArea textarea::placeholder {
|
| 132 |
+
color: rgba(0, 0, 0, 0.5) !important;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
/* Chat message styles */
|
| 136 |
+
.user-message {
|
| 137 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 138 |
+
padding: 15px 20px;
|
| 139 |
+
border-radius: 18px 18px 5px 18px;
|
| 140 |
+
margin: 10px 0;
|
| 141 |
+
max-width: 80%;
|
| 142 |
+
margin-left: auto;
|
| 143 |
+
color: white;
|
| 144 |
+
font-size: 1rem;
|
| 145 |
+
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
.ai-message {
|
| 149 |
+
background: rgba(255, 255, 255, 0.08);
|
| 150 |
+
backdrop-filter: blur(10px);
|
| 151 |
+
padding: 15px 20px;
|
| 152 |
+
border-radius: 18px 18px 18px 5px;
|
| 153 |
+
margin: 10px 0;
|
| 154 |
+
max-width: 80%;
|
| 155 |
+
margin-right: auto;
|
| 156 |
+
color: #e2e8f0;
|
| 157 |
+
font-size: 1rem;
|
| 158 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 159 |
+
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
/* Chat container */
|
| 163 |
+
.chat-container {
|
| 164 |
+
background: rgba(255, 255, 255, 0.03);
|
| 165 |
+
backdrop-filter: blur(10px);
|
| 166 |
+
padding: 25px;
|
| 167 |
+
border-radius: 20px;
|
| 168 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 169 |
+
max-height: 500px;
|
| 170 |
+
overflow-y: auto;
|
| 171 |
+
margin-bottom: 20px;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
/* Scrollbar styling */
|
| 175 |
+
.chat-container::-webkit-scrollbar {
|
| 176 |
+
width: 8px;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
.chat-container::-webkit-scrollbar-track {
|
| 180 |
+
background: rgba(255, 255, 255, 0.05);
|
| 181 |
+
border-radius: 10px;
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
.chat-container::-webkit-scrollbar-thumb {
|
| 185 |
+
background: rgba(102, 126, 234, 0.5);
|
| 186 |
+
border-radius: 10px;
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
.chat-container::-webkit-scrollbar-thumb:hover {
|
| 190 |
+
background: rgba(102, 126, 234, 0.8);
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
/* Success/Info boxes */
|
| 194 |
+
.element-container div[data-testid="stMarkdownContainer"] > div[data-testid="stMarkdown"] {
|
| 195 |
+
animation: fadeIn 0.5s ease;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
@keyframes fadeIn {
|
| 199 |
+
from { opacity: 0; transform: translateY(10px); }
|
| 200 |
+
to { opacity: 1; transform: translateY(0); }
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
</style>
|
| 204 |
+
""", unsafe_allow_html=True)
|
| 205 |
+
|
| 206 |
+
# ------------------- TITLE -------------------
|
| 207 |
+
st.markdown('<p class="big-title">🚓 AI Crime Prediction System</p>', unsafe_allow_html=True)
|
| 208 |
+
st.markdown('<p class="sub-title">Predict crime category using time, location, and incident description.</p>', unsafe_allow_html=True)
|
| 209 |
+
|
| 210 |
+
# ------------------- LOAD MODEL -------------------
|
| 211 |
+
@st.cache_resource
def load_artifacts():
    """Load the pickled XGBoost artifact bundle once per server process.

    Returns:
        dict | None: the artifact dict (model, target label-encoder,
        address/description feature hashers, dense column order), or
        ``None`` when loading fails (the error is shown in the UI).
    """
    try:
        # Resolve the pickle relative to this file first, so the app works
        # no matter which directory `streamlit run` was launched from.
        # The original CWD-relative path is kept as a fallback for
        # backward compatibility with the previous behavior.
        here = os.path.dirname(os.path.abspath(__file__))
        candidates = [
            os.path.join(here, "crime_xgb_artifacts.pkl"),
            "src/crime_xgb_artifacts.pkl",
        ]
        pkl_path = next((p for p in candidates if os.path.exists(p)), candidates[-1])
        with open(pkl_path, 'rb') as f:
            return pickle.load(f)
    except Exception as e:
        # Surface the failure in the UI instead of crashing the script run;
        # the caller checks for None and stops the app.
        st.error(f"❌ Artifact loading error: {e}")
        return None
|
| 221 |
+
artifacts = load_artifacts()

# Halt the whole app early when the model bundle could not be loaded —
# every section below depends on these artifacts.
if not artifacts:
    st.warning("Artifacts missing! Add `crime_xgb_artifacts.pkl` in directory.")
    st.stop()

# Unpack the training-time artifacts used throughout the app:
model = artifacts['model']              # fitted XGBoost classifier
le_target = artifacts['le_target']      # LabelEncoder for the crime-category target
addr_hasher = artifacts['addr_hasher']  # FeatureHasher for address tokens
desc_hasher = artifacts['desc_hasher']  # FeatureHasher for description tokens
dense_cols = artifacts['dense_cols']    # dense-feature column order the model expects
|
| 232 |
+
|
| 233 |
+
# ------------------- GROQ SETUP -------------------
|
| 234 |
+
@st.cache_resource
def get_groq_client():
    """Create one shared Groq API client for the server process.

    Security: the API key is read from the ``GROQ_API_KEY`` environment
    variable. It was previously hard-coded in source, which leaks the
    credential to anyone with repository access — that key should be
    revoked and rotated.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        st.error("❌ GROQ_API_KEY environment variable is not set.")
        st.stop()
    return Groq(api_key=api_key)
|
| 237 |
+
|
| 238 |
+
def explain_prediction_with_llama(prompt):
    """Use Groq's Llama model to explain crime prediction"""
    try:
        # Single-turn completion: the whole question goes in as one user message.
        response = get_groq_client().chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama-3.3-70b-versatile",
        )
        return response.choices[0].message.content
    except Exception as e:
        # Never let an LLM/API failure break the prediction flow; return
        # the error as displayable text instead.
        return f"⚠️ Could not generate explanation: {e}"
|
| 254 |
+
|
| 255 |
+
# ------------------- SIDEBAR -------------------
# Every model input is collected from the sidebar widgets below.
st.sidebar.title("📝 Input Features")

date = st.sidebar.date_input("📅 Date", datetime.date.today())
time = st.sidebar.time_input("⏰ Time", datetime.datetime.now().time())

# San Francisco city-centre coordinates as sensible defaults.
default_lat = 37.7749
default_lng = -122.4194
lat = st.sidebar.number_input("📍 Latitude", value=default_lat, format="%.6f")
lng = st.sidebar.number_input("📍 Longitude", value=default_lng, format="%.6f")

# Kept sorted: the alphabetical position doubles as the district encoding
# used later by the prediction panel.
districts = sorted([
    'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
    'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN',
])
district = st.sidebar.selectbox("🏢 Police District", districts)

# Free-text fields; both are feature-hashed before prediction.
address = st.sidebar.text_input("📌 Address", "")
description = st.sidebar.text_area("📝 Description", "")
|
| 272 |
+
|
| 273 |
+
# ------------------- MAIN PREDICTION CARD -------------------
with st.container():
    st.markdown("<div class='glass-card'>", unsafe_allow_html=True)

    st.subheader("🔍 Prediction Panel")

    if st.button("🚓 Predict Crime Category"):
        try:
            # Combine the sidebar date and time widgets into one timestamp.
            dt_obj = pd.to_datetime(f"{date} {time}")
            hour = dt_obj.hour

            # Dense numeric features. Keys must match the training-time
            # `dense_cols` stored in the artifact bundle — do not rename.
            dense_data = {
                'X': float(lng),
                'Y': float(lat),
                'Year': dt_obj.year,
                'Month': dt_obj.month,
                'Day': dt_obj.day,
                'Minute': dt_obj.minute,
                'Hour': hour,
                # Cyclical hour encoding so 23:00 and 00:00 are adjacent
                # in feature space.
                'Hour_sin': np.sin(2 * np.pi * hour / 24),
                'Hour_cos': np.cos(2 * np.pi * hour / 24),
                # Alphabetical index over `districts` — assumes the model
                # was trained with the same alphabetical label encoding
                # (the district encoder was not shipped; see README note).
                'PdDistrict_enc': districts.index(district),
                'DayOfWeek_enc': dt_obj.dayofweek
            }

            # Reorder columns to the training layout, then go sparse so
            # the dense block can be stacked with the hashed text blocks.
            dense_df = pd.DataFrame([dense_data])[dense_cols]
            dense_sparse = csr_matrix(dense_df.values)

            # Feature-hash the free-text inputs token-by-token, mirroring
            # the training pipeline's whitespace tokenization.
            addr_hashed = addr_hasher.transform([address.split()])
            desc_hashed = desc_hasher.transform([description.split()])

            features = hstack([dense_sparse, addr_hashed, desc_hashed])

            # Single-row prediction: take the most probable class.
            probs = model.predict_proba(features)[0]
            top_idx = np.argmax(probs)

            category = le_target.inverse_transform([top_idx])[0]
            confidence = probs[top_idx] * 100

            st.success(f"### 🚨 Predicted Category: **{category}**")
            st.info(f"**Confidence:** {confidence:.2f}%")

            # Top 3 chart
            top3 = probs.argsort()[-3:][::-1]
            chart_data = pd.DataFrame({
                "Category": le_target.inverse_transform(top3),
                "Probability": probs[top3]
            }).set_index("Category")

            st.subheader("📊 Top 3 Probabilities")
            st.bar_chart(chart_data)

            st.subheader("📍 Location Preview")
            st.map(pd.DataFrame({"lat": [lat], "lon": [lng]}))

            # AI Explanation using Groq — only requested when the user
            # actually entered an incident description.
            if description:
                with st.spinner("🧠 Generating AI explanation..."):
                    explanation = explain_prediction_with_llama(
                        f"In 2-3 sentences, explain why a crime prediction model might classify an incident as '{category}' based on this description: '{description}'. Be concise and factual."
                    )
                    st.subheader("🧠 AI Explanation")
                    st.write(explanation)

        except Exception as e:
            st.error(f"❌ Prediction Error: {e}")

    st.markdown("</div>", unsafe_allow_html=True)
|
| 341 |
+
|
| 342 |
+
# ------------------- INTERACTIVE CHATBOT -------------------
st.markdown("---")
st.markdown("<div class='glass-card'>", unsafe_allow_html=True)
st.subheader("💬 AI Crime Safety Assistant")
st.markdown("Ask me anything about crime prediction, safety tips, or how this system works!", unsafe_allow_html=True)

# Initialize chat history in session state — session state survives
# Streamlit reruns, so the conversation persists across interactions.
if 'messages' not in st.session_state:
    st.session_state.messages = [
        {"role": "assistant", "content": "👋 Hello! I'm your AI Crime Safety Assistant. I can help you understand crime patterns, provide safety recommendations, and explain how our prediction model works. What would you like to know?"}
    ]

# Display chat history, styled per role via the CSS classes defined above.
st.markdown("<div class='chat-container'>", unsafe_allow_html=True)
for message in st.session_state.messages:
    if message["role"] == "user":
        st.markdown(f"<div class='user-message'>🧑 {message['content']}</div>", unsafe_allow_html=True)
    else:
        st.markdown(f"<div class='ai-message'>🤖 {message['content']}</div>", unsafe_allow_html=True)
st.markdown("</div>", unsafe_allow_html=True)

# Chat input
col1, col2 = st.columns([5, 1])
with col1:
    user_input = st.text_input("Type your message...", key="chat_input", label_visibility="collapsed", placeholder="Ask about crime safety, predictions, or get recommendations...")
with col2:
    send_button = st.button("Send 📤", use_container_width=True)

# Handle chat submission
if send_button and user_input:
    # Add user message to history
    st.session_state.messages.append({"role": "user", "content": user_input})

    # Get AI response using Groq
    with st.spinner("🧠 Thinking..."):
        try:
            client = get_groq_client()

            # Create system prompt for crime prediction context
            system_prompt = """You are an AI Crime Safety Assistant for a crime prediction system.
You help users understand:
- Crime patterns and trends in San Francisco
- How the XGBoost machine learning model predicts crime categories
- Safety tips and recommendations based on location and time
- What factors influence crime predictions (time, location, historical data)

Be helpful, concise, and informative. Keep responses to 2-3 sentences unless more detail is needed.
If asked about the model, explain it uses features like latitude, longitude, time, district, and description to predict crime types."""

            # Prepare messages for Groq API
            api_messages = [{"role": "system", "content": system_prompt}]

            # Add recent chat history (last 5 messages for context);
            # older turns are dropped to bound prompt size.
            for msg in st.session_state.messages[-5:]:
                api_messages.append({"role": msg["role"], "content": msg["content"]})

            # Get response from Groq
            chat_completion = client.chat.completions.create(
                messages=api_messages,
                model="llama-3.3-70b-versatile",
                temperature=0.7,
                max_tokens=500
            )

            ai_response = chat_completion.choices[0].message.content

            # Add AI response to history
            st.session_state.messages.append({"role": "assistant", "content": ai_response})

        except Exception as e:
            # API failures are appended to the chat as an assistant message
            # so the conversation flow is never broken.
            error_msg = f"⚠️ Sorry, I encountered an error: {str(e)}"
            st.session_state.messages.append({"role": "assistant", "content": error_msg})

    # Rerun to update chat display immediately with the new messages.
    st.rerun()

st.markdown("</div>", unsafe_allow_html=True)
|
README.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: SF Crime Analytics | AI-Powered
|
| 3 |
+
emoji: 🚓
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8501
|
| 8 |
+
tags:
|
| 9 |
+
- streamlit
|
| 10 |
+
- machine-learning
|
| 11 |
+
- xgboost
|
| 12 |
+
- crime-prediction
|
| 13 |
+
pinned: true
|
| 14 |
+
license: apache-2.0
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
# 🚓 San Francisco Crime Analytics & Prediction System
|
| 18 |
+
|
| 19 |
+
## Overview
|
| 20 |
+
This project is a comprehensive AI-powered dashboard for analyzing and predicting crime in San Francisco. It leverages historical data and advanced machine learning models (XGBoost) to provide actionable insights and real-time risk assessments.
|
| 21 |
+
|
| 22 |
+
## Features
|
| 23 |
+
- **📊 Historical Trends**: Visualize crime distribution by hour, district, and category.
|
| 24 |
+
- **🗺️ Geospatial Intelligence**: Interactive heatmaps showing crime density and evolution over time.
|
| 25 |
+
- **🚨 Tactical Simulation**: Simulate patrol strategies and assess risk levels for specific sectors.
|
| 26 |
+
- **💬 Chat with Data**: Natural language interface to query the dataset.
|
| 27 |
+
- **🚀 Advanced Prediction (99% Accuracy)**: High-precision crime categorization using an optimized XGBoost model.
|
| 28 |
+
- **🤖 AI Crime Safety Assistant**: Interactive chatbot for safety tips and model explanations.
|
| 29 |
+
|
| 30 |
+
## Installation
|
| 31 |
+
|
| 32 |
+
1. **Clone the repository**:
|
| 33 |
+
```bash
|
| 34 |
+
git clone <repository-url>
|
| 35 |
+
cd Hackathon
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
2. **Install dependencies**:
|
| 39 |
+
```bash
|
| 40 |
+
pip install -r requirements.txt
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
3. **Run the application**:
|
| 44 |
+
```bash
|
| 45 |
+
streamlit run src/app.py
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
## Docker Support
|
| 49 |
+
Build and run the container:
|
| 50 |
+
```bash
|
| 51 |
+
docker build -t sf-crime-app .
|
| 52 |
+
docker run -p 8501:8501 sf-crime-app
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Technologies
|
| 56 |
+
- **Frontend**: Streamlit
|
| 57 |
+
- **Backend**: Python, Pandas, NumPy
|
| 58 |
+
- **ML Models**: XGBoost, Scikit-Learn (KMeans)
|
| 59 |
+
- **Visualization**: Plotly, Folium
|
| 60 |
+
- **AI Integration**: Groq (Llama 3)
|
| 61 |
+
|
| 62 |
+
---
|
| 63 |
+
*Developed for HEC Hackathon*
|
data/crimedataset/test.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5abddb5f1fcda6f1c5c81b2423b163022da1ddf0e60385170f827978eb9b8de
|
| 3 |
+
size 90996610
|
data/crimedataset/train.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a72eb782299af2b68f9fade22bc3235a023b5ec7e0d1e540824718bb8af84402
|
| 3 |
+
size 127433651
|
models/best_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfbc0d0de5c96d8c537523161feabd67fc637f59bb484d59a97da5ba941d025c
|
| 3 |
+
size 498184
|
models/crime_xgb_artifacts.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09684ad6eec80070c804f09b8d8c7362d45a0e2cbaac970565399ce5fc78c845
|
| 3 |
+
size 87422288
|
models/kmeans.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3052756e3321c2d4602be2c5aab877e5037a7166fce9c66afb415e5ca293341f
|
| 3 |
+
size 3513355
|
models/label_encoders.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:102c68de776020671078bc3072ac2456c5021320f17d17eccfbc2fa1f5c9ac2e
|
| 3 |
+
size 847
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
joblib
|
| 5 |
+
plotly
|
| 6 |
+
folium
|
| 7 |
+
streamlit-folium
|
| 8 |
+
xgboost
|
| 9 |
+
scipy
|
| 10 |
+
groq
|
| 11 |
+
scikit-learn
|
| 12 |
+
altair
|
src/__pycache__/data_loader.cpython-312.pyc
ADDED
|
Binary file (2.08 kB). View file
|
|
|
src/__pycache__/preprocessing.cpython-312.pyc
ADDED
|
Binary file (3.87 kB). View file
|
|
|
src/app.py
ADDED
|
@@ -0,0 +1,775 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import joblib
|
| 5 |
+
import os
|
| 6 |
+
import plotly.express as px
|
| 7 |
+
import folium
|
| 8 |
+
from folium.plugins import HeatMap, HeatMapWithTime
|
| 9 |
+
from streamlit_folium import folium_static
|
| 10 |
+
from preprocessing import preprocess_pipeline, get_season
|
| 11 |
+
import xgboost as xgb
|
| 12 |
+
import pickle
|
| 13 |
+
from scipy.sparse import hstack, csr_matrix
|
| 14 |
+
from groq import Groq
|
| 15 |
+
|
| 16 |
+
# Set page config
|
| 17 |
+
st.set_page_config(
|
| 18 |
+
page_title="SF Crime Analytics | AI-Powered",
|
| 19 |
+
page_icon="🚓",
|
| 20 |
+
layout="wide",
|
| 21 |
+
initial_sidebar_state="expanded"
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# Custom CSS for Premium Look
|
| 25 |
+
st.markdown("""
|
| 26 |
+
<style>
|
| 27 |
+
.main {
|
| 28 |
+
background-color: #0e1117;
|
| 29 |
+
}
|
| 30 |
+
.stApp {
|
| 31 |
+
background-color: #0e1117;
|
| 32 |
+
}
|
| 33 |
+
h1, h2, h3 {
|
| 34 |
+
color: #ffffff;
|
| 35 |
+
font-family: 'Helvetica Neue', sans-serif;
|
| 36 |
+
font-weight: 700;
|
| 37 |
+
}
|
| 38 |
+
.stButton>button {
|
| 39 |
+
background-color: #ff4b4b;
|
| 40 |
+
color: white;
|
| 41 |
+
border-radius: 20px;
|
| 42 |
+
padding: 10px 24px;
|
| 43 |
+
font-weight: 600;
|
| 44 |
+
border: none;
|
| 45 |
+
transition: all 0.3s ease;
|
| 46 |
+
}
|
| 47 |
+
.stButton>button:hover {
|
| 48 |
+
background-color: #ff3333;
|
| 49 |
+
transform: scale(1.05);
|
| 50 |
+
}
|
| 51 |
+
.metric-card {
|
| 52 |
+
background-color: #262730;
|
| 53 |
+
padding: 20px;
|
| 54 |
+
border-radius: 10px;
|
| 55 |
+
border-left: 5px solid #ff4b4b;
|
| 56 |
+
box-shadow: 0 4px 6px rgba(0,0,0,0.3);
|
| 57 |
+
}
|
| 58 |
+
.report-text {
|
| 59 |
+
font-family: 'Courier New', monospace;
|
| 60 |
+
color: #00ff00;
|
| 61 |
+
background-color: #000000;
|
| 62 |
+
padding: 15px;
|
| 63 |
+
border-radius: 5px;
|
| 64 |
+
border: 1px solid #00ff00;
|
| 65 |
+
}
|
| 66 |
+
.chat-bubble-user {
|
| 67 |
+
background-color: #2b313e;
|
| 68 |
+
color: white;
|
| 69 |
+
padding: 10px;
|
| 70 |
+
border-radius: 15px 15px 0 15px;
|
| 71 |
+
margin: 5px;
|
| 72 |
+
text-align: right;
|
| 73 |
+
}
|
| 74 |
+
.chat-bubble-bot {
|
| 75 |
+
background-color: #ff4b4b;
|
| 76 |
+
color: white;
|
| 77 |
+
padding: 10px;
|
| 78 |
+
border-radius: 15px 15px 15px 0;
|
| 79 |
+
margin: 5px;
|
| 80 |
+
text-align: left;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
/* New Chat Assistant Styles */
|
| 84 |
+
.glass-card {
|
| 85 |
+
background: rgba(255, 255, 255, 0.05);
|
| 86 |
+
backdrop-filter: blur(10px);
|
| 87 |
+
-webkit-backdrop-filter: blur(10px);
|
| 88 |
+
padding: 30px;
|
| 89 |
+
border-radius: 24px;
|
| 90 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 91 |
+
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
|
| 92 |
+
transition: all 0.4s ease;
|
| 93 |
+
margin-bottom: 25px;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.user-message {
|
| 97 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 98 |
+
padding: 15px 20px;
|
| 99 |
+
border-radius: 18px 18px 5px 18px;
|
| 100 |
+
margin: 10px 0;
|
| 101 |
+
max-width: 80%;
|
| 102 |
+
margin-left: auto;
|
| 103 |
+
color: white;
|
| 104 |
+
font-size: 1rem;
|
| 105 |
+
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
.ai-message {
|
| 109 |
+
background: rgba(255, 255, 255, 0.08);
|
| 110 |
+
backdrop-filter: blur(10px);
|
| 111 |
+
padding: 15px 20px;
|
| 112 |
+
border-radius: 18px 18px 18px 5px;
|
| 113 |
+
margin: 10px 0;
|
| 114 |
+
max-width: 80%;
|
| 115 |
+
margin-right: auto;
|
| 116 |
+
color: #e2e8f0;
|
| 117 |
+
font-size: 1rem;
|
| 118 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 119 |
+
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
.chat-container {
|
| 123 |
+
background: rgba(255, 255, 255, 0.03);
|
| 124 |
+
backdrop-filter: blur(10px);
|
| 125 |
+
padding: 25px;
|
| 126 |
+
border-radius: 20px;
|
| 127 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 128 |
+
max-height: 500px;
|
| 129 |
+
overflow-y: auto;
|
| 130 |
+
margin-bottom: 20px;
|
| 131 |
+
}
|
| 132 |
+
</style>
|
| 133 |
+
""", unsafe_allow_html=True)
|
| 134 |
+
|
| 135 |
+
# Load Resources
|
| 136 |
+
@st.cache_resource
def load_resources():
    """Load the classifier, label encoders, and k-means clusterer from disk.

    Returns:
        (model, encoders, kmeans) on success, or (None, None, None) when any
        of the three pickle files is missing so the UI can degrade gracefully.
    """
    models_dir = os.path.join(os.path.dirname(__file__), '../models')
    paths = {
        name: os.path.join(models_dir, f'{name}.pkl')
        for name in ('best_model', 'label_encoders', 'kmeans')
    }

    # All three artifacts are required together; a partial load is useless.
    if not all(os.path.exists(p) for p in paths.values()):
        return None, None, None

    return (
        joblib.load(paths['best_model']),
        joblib.load(paths['label_encoders']),
        joblib.load(paths['kmeans']),
    )
| 150 |
+
|
| 151 |
+
@st.cache_resource
def load_new_artifacts():
    """Unpickle the advanced XGBoost artifact bundle.

    Returns:
        The artifact dict on success, or None on any failure (the error is
        surfaced in the Streamlit UI rather than crashing the app).
    """
    try:
        artifact_path = os.path.join(
            os.path.dirname(__file__), '../models', "crime_xgb_artifacts.pkl"
        )
        with open(artifact_path, 'rb') as handle:
            return pickle.load(handle)
    except Exception as e:
        # Broad catch is deliberate: any loading problem should show up in
        # the dashboard instead of taking the whole app down.
        st.error(f"❌ Artifact loading error: {e}")
        return None
| 161 |
+
|
| 162 |
+
@st.cache_data
def load_data_sample():
    """Load a reproducible 10,000-row sample of the training data.

    Returns:
        A DataFrame sampled with a fixed seed, or an empty DataFrame when the
        CSV is missing, unparseable, or too small to sample — so the rest of
        the dashboard can degrade gracefully.
    """
    data_dir = os.path.join(os.path.dirname(__file__), '../data/crimedataset')
    try:
        df = pd.read_csv(os.path.join(data_dir, 'train.csv'), parse_dates=['Dates'])
        return df.sample(10000, random_state=42)
    except (OSError, ValueError, KeyError):
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Catch only plausible I/O, parsing, and
        # sampling failures (missing file, bad CSV, <10k rows).
        return pd.DataFrame()
| 170 |
+
|
| 171 |
+
model, encoders, kmeans = load_resources()
|
| 172 |
+
new_artifacts = load_new_artifacts()
|
| 173 |
+
df_sample = load_data_sample()
|
| 174 |
+
|
| 175 |
+
# ------------------- GROQ SETUP -------------------
|
| 176 |
+
@st.cache_resource
def get_groq_client():
    """Create (and cache) the Groq API client.

    The API key is read from the GROQ_API_KEY environment variable. The
    previous implementation hard-coded a live secret in source control,
    which leaks the credential to anyone with repo access.

    Raises:
        RuntimeError: if GROQ_API_KEY is not set. Callers (see
            explain_prediction_with_llama) already catch exceptions and
            degrade to a warning message.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        # Fail loudly here; otherwise the missing key surfaces as an opaque
        # authentication error deep inside the Groq SDK.
        raise RuntimeError("GROQ_API_KEY environment variable is not set.")
    return Groq(api_key=api_key)
| 179 |
+
|
| 180 |
+
def explain_prediction_with_llama(prompt):
    """Ask Groq's Llama model to explain a crime prediction.

    Args:
        prompt: the natural-language question to send to the model.

    Returns:
        The model's reply text, or a warning string when the API call fails
        for any reason (missing client, network error, malformed response).
    """
    try:
        client = get_groq_client()
        completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama-3.3-70b-versatile",
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Best-effort: the explanation is a nice-to-have, never fatal.
        return f"⚠️ Could not generate explanation: {e}"
| 196 |
+
|
| 197 |
+
# Header
|
| 198 |
+
col1, col2 = st.columns([3, 1])
|
| 199 |
+
with col1:
|
| 200 |
+
st.title("San Francisco Crime Analytics")
|
| 201 |
+
st.markdown("#### AI-Powered Predictive Policing Dashboard")
|
| 202 |
+
with col2:
|
| 203 |
+
if model:
|
| 204 |
+
st.success("🟢 System Online: Models Loaded")
|
| 205 |
+
else:
|
| 206 |
+
st.error("🔴 System Offline: Models Missing")
|
| 207 |
+
|
| 208 |
+
st.sidebar.markdown("---")
|
| 209 |
+
st.sidebar.markdown("**System Status**")
|
| 210 |
+
st.sidebar.markdown("🟢 **Online** | ⚡ **12ms**")
|
| 211 |
+
st.sidebar.markdown(f"📅 {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}")
|
| 212 |
+
st.sidebar.markdown("---")
|
| 213 |
+
|
| 214 |
+
# Sidebar
|
| 215 |
+
st.sidebar.image("https://img.icons8.com/fluency/96/police-badge.png", width=80)
|
| 216 |
+
st.sidebar.header("Incident Parameters")
|
| 217 |
+
|
| 218 |
+
date_input = st.sidebar.date_input("Date")
|
| 219 |
+
time_input = st.sidebar.time_input("Time")
|
| 220 |
+
district = st.sidebar.selectbox("District", options=encoders['PdDistrict'].classes_ if encoders else [])
|
| 221 |
+
st.sidebar.subheader("Geolocation")
|
| 222 |
+
latitude = st.sidebar.number_input("Latitude", value=37.7749, format="%.6f")
|
| 223 |
+
longitude = st.sidebar.number_input("Longitude", value=-122.4194, format="%.6f")
|
| 224 |
+
|
| 225 |
+
# Main Prediction Logic
|
| 226 |
+
if st.sidebar.button("Analyze Risk Level", type="primary"):
|
| 227 |
+
if model is None:
|
| 228 |
+
st.error("Model not trained yet. Please run training script.")
|
| 229 |
+
else:
|
| 230 |
+
# Prepare Input
|
| 231 |
+
datetime_combined = pd.to_datetime(f"{date_input} {time_input}")
|
| 232 |
+
|
| 233 |
+
input_data = pd.DataFrame({
|
| 234 |
+
'Dates': [datetime_combined],
|
| 235 |
+
'X': [longitude],
|
| 236 |
+
'Y': [latitude],
|
| 237 |
+
'PdDistrict': [district]
|
| 238 |
+
})
|
| 239 |
+
|
| 240 |
+
# Preprocess
|
| 241 |
+
processed_df, _ = preprocess_pipeline(input_data, is_train=False, kmeans_model=kmeans)
|
| 242 |
+
|
| 243 |
+
# Encoding
|
| 244 |
+
processed_df['PdDistrict'] = encoders['PdDistrict'].transform(processed_df['PdDistrict'])
|
| 245 |
+
processed_df['Season'] = encoders['Season'].transform(processed_df['Season'])
|
| 246 |
+
|
| 247 |
+
# Features
|
| 248 |
+
features = ['Hour', 'Day', 'Month', 'Year', 'DayOfWeek', 'IsWeekend', 'IsHoliday', 'LocationCluster', 'PdDistrict', 'Season']
|
| 249 |
+
|
| 250 |
+
prediction = model.predict(processed_df[features])[0]
|
| 251 |
+
proba = model.predict_proba(processed_df[features])[0]
|
| 252 |
+
|
| 253 |
+
st.markdown("---")
|
| 254 |
+
st.subheader("Analysis Results")
|
| 255 |
+
|
| 256 |
+
r_col1, r_col2, r_col3 = st.columns(3)
|
| 257 |
+
|
| 258 |
+
with r_col1:
|
| 259 |
+
st.markdown('<div class="metric-card">', unsafe_allow_html=True)
|
| 260 |
+
st.metric("Risk Probability", f"{max(proba)*100:.1f}%")
|
| 261 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 262 |
+
|
| 263 |
+
with r_col2:
|
| 264 |
+
st.markdown('<div class="metric-card">', unsafe_allow_html=True)
|
| 265 |
+
if prediction == 1:
|
| 266 |
+
st.metric("Predicted Classification", "VIOLENT", delta="High Risk", delta_color="inverse")
|
| 267 |
+
else:
|
| 268 |
+
st.metric("Predicted Classification", "NON-VIOLENT", delta="Low Risk", delta_color="normal")
|
| 269 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 270 |
+
|
| 271 |
+
with r_col3:
|
| 272 |
+
st.markdown('<div class="metric-card">', unsafe_allow_html=True)
|
| 273 |
+
st.metric("Location Cluster", f"Zone {processed_df['LocationCluster'][0]}")
|
| 274 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 275 |
+
|
| 276 |
+
# AI Analyst Report
|
| 277 |
+
st.markdown("### 🤖 AI Analyst Report")
|
| 278 |
+
risk_level = "CRITICAL" if proba[1] > 0.7 else "ELEVATED" if proba[1] > 0.4 else "STANDARD"
|
| 279 |
+
report = f"""
|
| 280 |
+
[CLASSIFIED REPORT - GENERATED BY AI]
|
| 281 |
+
-------------------------------------
|
| 282 |
+
DATE: {date_input} | TIME: {time_input}
|
| 283 |
+
LOCATION: {district} (Lat: {latitude}, Lon: {longitude})
|
| 284 |
+
|
| 285 |
+
ASSESSMENT: {risk_level} RISK DETECTED
|
| 286 |
+
PROBABILITY OF VIOLENCE: {proba[1]*100:.2f}%
|
| 287 |
+
|
| 288 |
+
KEY FACTORS:
|
| 289 |
+
- Time of Day: {time_input.hour}:00 hours (Historical high-risk window)
|
| 290 |
+
- District Profile: {district} shows elevated activity trends.
|
| 291 |
+
- Seasonal Context: {get_season(datetime_combined.month)} patterns observed.
|
| 292 |
+
|
| 293 |
+
RECOMMENDATION:
|
| 294 |
+
Immediate deployment of patrol units advised if risk > 50%.
|
| 295 |
+
Monitor sector {processed_df['LocationCluster'][0]} closely.
|
| 296 |
+
"""
|
| 297 |
+
st.markdown(f'<div class="report-text">{report}</div>', unsafe_allow_html=True)
|
| 298 |
+
|
| 299 |
+
st.download_button(
|
| 300 |
+
label="📄 Download Full Report",
|
| 301 |
+
data=report,
|
| 302 |
+
file_name=f"crime_report_{date_input}_{district}.txt",
|
| 303 |
+
mime="text/plain"
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
+
# Explainability
|
| 307 |
+
st.markdown("### 🧠 Model Explainability")
|
| 308 |
+
if hasattr(model, 'feature_importances_'):
|
| 309 |
+
feat_imp = pd.DataFrame({
|
| 310 |
+
'Feature': features,
|
| 311 |
+
'Importance': model.feature_importances_
|
| 312 |
+
}).sort_values(by='Importance', ascending=False)
|
| 313 |
+
|
| 314 |
+
fig_imp = px.bar(feat_imp, x='Importance', y='Feature', orientation='h',
|
| 315 |
+
title="What drove this prediction?", template='plotly_dark',
|
| 316 |
+
color='Importance', color_continuous_scale='Viridis')
|
| 317 |
+
st.plotly_chart(fig_imp)
|
| 318 |
+
|
| 319 |
+
# Dashboard Tabs
|
| 320 |
+
st.markdown("---")
|
| 321 |
+
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["📊 Historical Trends", "🗺️ Geospatial Intelligence", "🚨 Tactical Simulation", "💬 Chat with Data", "🧪 Scenario Tester", "🚀 Advanced Prediction (99%)"])
|
| 322 |
+
|
| 323 |
+
with tab1:
|
| 324 |
+
if not df_sample.empty:
|
| 325 |
+
col1, col2 = st.columns(2)
|
| 326 |
+
|
| 327 |
+
with col1:
|
| 328 |
+
st.subheader("Crime Distribution by Hour")
|
| 329 |
+
df_sample['Hour'] = df_sample['Dates'].dt.hour
|
| 330 |
+
hourly_counts = df_sample.groupby('Hour').size().reset_index(name='Count')
|
| 331 |
+
fig_hour = px.bar(hourly_counts, x='Hour', y='Count', color='Count',
|
| 332 |
+
color_continuous_scale='RdBu_r', template='plotly_dark')
|
| 333 |
+
st.plotly_chart(fig_hour)
|
| 334 |
+
|
| 335 |
+
with col2:
|
| 336 |
+
st.subheader("Incidents by District")
|
| 337 |
+
district_counts = df_sample['PdDistrict'].value_counts().reset_index()
|
| 338 |
+
district_counts.columns = ['District', 'Count']
|
| 339 |
+
fig_dist = px.pie(district_counts, values='Count', names='District', hole=0.4,
|
| 340 |
+
template='plotly_dark', color_discrete_sequence=px.colors.sequential.RdBu)
|
| 341 |
+
st.plotly_chart(fig_dist)
|
| 342 |
+
else:
|
| 343 |
+
st.warning("Data loading...")
|
| 344 |
+
|
| 345 |
+
with tab2:
|
| 346 |
+
st.subheader("Spatiotemporal Crime Analysis")
|
| 347 |
+
if not df_sample.empty:
|
| 348 |
+
# Time-Lapse Heatmap
|
| 349 |
+
st.write("**24-Hour Crime Evolution (Time-Lapse)**")
|
| 350 |
+
|
| 351 |
+
# Prepare data for HeatMapWithTime
|
| 352 |
+
# List of lists of points, one list per time step (hour)
|
| 353 |
+
heat_data_time = []
|
| 354 |
+
time_index = []
|
| 355 |
+
|
| 356 |
+
for hour in range(24):
|
| 357 |
+
hour_data = df_sample[df_sample['Dates'].dt.hour == hour]
|
| 358 |
+
heat_data_time.append(hour_data[['Y', 'X']].values.tolist())
|
| 359 |
+
time_index.append(f"{hour:02d}:00")
|
| 360 |
+
|
| 361 |
+
m = folium.Map(location=[37.7749, -122.4194], zoom_start=12, tiles='CartoDB dark_matter')
|
| 362 |
+
|
| 363 |
+
HeatMapWithTime(
|
| 364 |
+
heat_data_time,
|
| 365 |
+
index=time_index,
|
| 366 |
+
auto_play=True,
|
| 367 |
+
max_opacity=0.8,
|
| 368 |
+
radius=15
|
| 369 |
+
).add_to(m)
|
| 370 |
+
|
| 371 |
+
folium_static(m, width=1000)
|
| 372 |
+
|
| 373 |
+
st.markdown("---")
|
| 374 |
+
st.write("**Static Density Heatmap**")
|
| 375 |
+
m_static = folium.Map(location=[37.7749, -122.4194], zoom_start=12, tiles='CartoDB dark_matter')
|
| 376 |
+
heat_data = [[row['Y'], row['X']] for index, row in df_sample.iterrows()]
|
| 377 |
+
HeatMap(heat_data, radius=15).add_to(m_static)
|
| 378 |
+
folium_static(m_static, width=1000)
|
| 379 |
+
else:
|
| 380 |
+
st.warning("Data not loaded.")
|
| 381 |
+
|
| 382 |
+
with tab3:
|
| 383 |
+
st.subheader("Resource Allocation Simulator")
|
| 384 |
+
st.info("Use this tool to simulate patrol strategies based on predictive risk modeling.")
|
| 385 |
+
|
| 386 |
+
sim_col1, sim_col2 = st.columns([1, 2])
|
| 387 |
+
|
| 388 |
+
with sim_col1:
|
| 389 |
+
st.markdown("### Simulation Controls")
|
| 390 |
+
sim_district = st.selectbox("Target District", options=encoders['PdDistrict'].classes_ if encoders else [], key='sim_dist')
|
| 391 |
+
sim_hour = st.slider("Patrol Hour", 0, 23, 22)
|
| 392 |
+
sim_date = st.date_input("Patrol Date", key='sim_date')
|
| 393 |
+
|
| 394 |
+
with sim_col2:
|
| 395 |
+
st.markdown("### AI Recommendation Engine")
|
| 396 |
+
if model and kmeans:
|
| 397 |
+
if not df_sample.empty:
|
| 398 |
+
district_center = df_sample[df_sample['PdDistrict'] == sim_district][['Y', 'X']].mean()
|
| 399 |
+
sim_lat = district_center['Y']
|
| 400 |
+
sim_lon = district_center['X']
|
| 401 |
+
else:
|
| 402 |
+
sim_lat, sim_lon = 37.7749, -122.4194
|
| 403 |
+
|
| 404 |
+
sim_datetime = pd.to_datetime(f"{sim_date} {sim_hour}:00:00")
|
| 405 |
+
|
| 406 |
+
sim_input = pd.DataFrame({
|
| 407 |
+
'Dates': [sim_datetime],
|
| 408 |
+
'X': [sim_lon],
|
| 409 |
+
'Y': [sim_lat],
|
| 410 |
+
'PdDistrict': [sim_district]
|
| 411 |
+
})
|
| 412 |
+
|
| 413 |
+
# Process
|
| 414 |
+
sim_processed, _ = preprocess_pipeline(sim_input, is_train=False, kmeans_model=kmeans)
|
| 415 |
+
sim_processed['PdDistrict'] = encoders['PdDistrict'].transform(sim_processed['PdDistrict'])
|
| 416 |
+
sim_processed['Season'] = encoders['Season'].transform(sim_processed['Season'])
|
| 417 |
+
|
| 418 |
+
# Features
|
| 419 |
+
features = ['Hour', 'Day', 'Month', 'Year', 'DayOfWeek', 'IsWeekend', 'IsHoliday', 'LocationCluster', 'PdDistrict', 'Season']
|
| 420 |
+
|
| 421 |
+
# Predict
|
| 422 |
+
sim_prob = model.predict_proba(sim_processed[features])[0]
|
| 423 |
+
violent_prob = sim_prob[1]
|
| 424 |
+
|
| 425 |
+
st.write(f"Analyzing sector **{sim_district}** at **{sim_hour}:00**...")
|
| 426 |
+
|
| 427 |
+
# Gauge Chart
|
| 428 |
+
fig_gauge = px.bar(x=[violent_prob], y=["Risk"], orientation='h', range_x=[0, 1],
|
| 429 |
+
labels={'x': 'Violent Crime Probability', 'y': ''}, height=100,
|
| 430 |
+
color=[violent_prob], color_continuous_scale=['green', 'yellow', 'red'])
|
| 431 |
+
fig_gauge.update_layout(showlegend=False, template='plotly_dark', margin=dict(l=0, r=0, t=0, b=0))
|
| 432 |
+
st.plotly_chart(fig_gauge)
|
| 433 |
+
|
| 434 |
+
if violent_prob > 0.7:
|
| 435 |
+
st.error("⚠️ **CRITICAL RISK DETECTED**")
|
| 436 |
+
st.markdown("""
|
| 437 |
+
**Recommended Action Plan:**
|
| 438 |
+
- 🔴 Deploy SWAT / Heavy Tactical Units
|
| 439 |
+
- 🚁 Request Aerial Surveillance
|
| 440 |
+
- 🚧 Establish Perimeter Checkpoints
|
| 441 |
+
""")
|
| 442 |
+
elif violent_prob > 0.4:
|
| 443 |
+
st.warning("⚠️ **ELEVATED RISK**")
|
| 444 |
+
st.markdown("""
|
| 445 |
+
**Recommended Action Plan:**
|
| 446 |
+
- 🟡 Increase Patrol Frequency (Double Units)
|
| 447 |
+
- 👮 Station Plainclothes Officers
|
| 448 |
+
- 🔦 Ensure High Visibility
|
| 449 |
+
""")
|
| 450 |
+
else:
|
| 451 |
+
st.success("✅ **STANDARD RISK**")
|
| 452 |
+
st.markdown("""
|
| 453 |
+
**Recommended Action Plan:**
|
| 454 |
+
- 🟢 Standard Patrol Routine
|
| 455 |
+
- 📹 Monitor CCTV Feeds
|
| 456 |
+
- 🚗 Community Policing
|
| 457 |
+
""")
|
| 458 |
+
else:
|
| 459 |
+
st.warning("Model not loaded. Cannot run simulation.")
|
| 460 |
+
|
| 461 |
+
with tab4:
|
| 462 |
+
st.subheader("💬 Chat with Data (Natural Language Interface)")
|
| 463 |
+
st.markdown("Ask questions about the crime data. Example: *'Show me robberies in Mission'* or *'Assaults in Tenderloin'*")
|
| 464 |
+
|
| 465 |
+
user_query = st.text_input("Ask a question...", placeholder="Type here...")
|
| 466 |
+
|
| 467 |
+
if user_query:
|
| 468 |
+
st.markdown(f'<div class="chat-bubble-user">User: {user_query}</div>', unsafe_allow_html=True)
|
| 469 |
+
|
| 470 |
+
# Simple Intent Parser
|
| 471 |
+
query_lower = user_query.lower()
|
| 472 |
+
|
| 473 |
+
# Filter Logic
|
| 474 |
+
filtered_df = df_sample.copy()
|
| 475 |
+
|
| 476 |
+
# Categories
|
| 477 |
+
found_cat = None
|
| 478 |
+
categories = df_sample['Category'].unique()
|
| 479 |
+
for cat in categories:
|
| 480 |
+
if cat.lower() in query_lower:
|
| 481 |
+
filtered_df = filtered_df[filtered_df['Category'] == cat]
|
| 482 |
+
found_cat = cat
|
| 483 |
+
break
|
| 484 |
+
|
| 485 |
+
# Districts
|
| 486 |
+
found_dist = None
|
| 487 |
+
districts = df_sample['PdDistrict'].unique()
|
| 488 |
+
for dist in districts:
|
| 489 |
+
if dist.lower() in query_lower:
|
| 490 |
+
filtered_df = filtered_df[filtered_df['PdDistrict'] == dist]
|
| 491 |
+
found_dist = dist
|
| 492 |
+
break
|
| 493 |
+
|
| 494 |
+
# Response Generation
|
| 495 |
+
response_text = ""
|
| 496 |
+
if found_cat and found_dist:
|
| 497 |
+
response_text = f"Filtering for **{found_cat}** in **{found_dist}**."
|
| 498 |
+
elif found_cat:
|
| 499 |
+
response_text = f"Filtering for **{found_cat}** across all districts."
|
| 500 |
+
elif found_dist:
|
| 501 |
+
response_text = f"Showing all crimes in **{found_dist}**."
|
| 502 |
+
else:
|
| 503 |
+
response_text = "I couldn't identify a specific category or district. Showing general trends."
|
| 504 |
+
|
| 505 |
+
count = len(filtered_df)
|
| 506 |
+
response_text += f" Found **{count}** incidents."
|
| 507 |
+
|
| 508 |
+
st.markdown(f'<div class="chat-bubble-bot">AI: {response_text}</div>', unsafe_allow_html=True)
|
| 509 |
+
|
| 510 |
+
if not filtered_df.empty:
|
| 511 |
+
st.dataframe(filtered_df[['Dates', 'Category', 'PdDistrict', 'Address']].head(10))
|
| 512 |
+
|
| 513 |
+
# Dynamic Chart based on query
|
| 514 |
+
if found_dist and not found_cat:
|
| 515 |
+
# Show breakdown by category for that district
|
| 516 |
+
fig = px.bar(filtered_df['Category'].value_counts().head(10), orientation='h',
|
| 517 |
+
title=f"Top Crimes in {found_dist}", template='plotly_dark')
|
| 518 |
+
st.plotly_chart(fig)
|
| 519 |
+
elif found_cat:
|
| 520 |
+
# Show breakdown by hour or district
|
| 521 |
+
fig = px.histogram(filtered_df, x='Dates', title=f"Timeline of {found_cat}", template='plotly_dark')
|
| 522 |
+
st.plotly_chart(fig, key="timeline")
|
| 523 |
+
|
| 524 |
+
with tab5:
|
| 525 |
+
st.subheader("🧪 Model Validation: Scenario Tester")
|
| 526 |
+
st.info("Test the AI against real historical cases to verify its accuracy.")
|
| 527 |
+
|
| 528 |
+
if 'scenario_case' not in st.session_state:
|
| 529 |
+
st.session_state.scenario_case = None
|
| 530 |
+
|
| 531 |
+
if st.button("🎲 Load Random Historical Case", type="primary"):
|
| 532 |
+
if not df_sample.empty:
|
| 533 |
+
st.session_state.scenario_case = df_sample.sample(1).iloc[0]
|
| 534 |
+
else:
|
| 535 |
+
st.warning("Data not loaded.")
|
| 536 |
+
|
| 537 |
+
if st.session_state.scenario_case is not None:
|
| 538 |
+
case = st.session_state.scenario_case
|
| 539 |
+
|
| 540 |
+
# Display Case Details (Masking the Truth)
|
| 541 |
+
st.markdown("### 📁 Case File #8921-X")
|
| 542 |
+
c1, c2, c3 = st.columns(3)
|
| 543 |
+
with c1:
|
| 544 |
+
st.markdown(f"**Date:** {case['Dates'].date()}")
|
| 545 |
+
st.markdown(f"**Time:** {case['Dates'].time()}")
|
| 546 |
+
with c2:
|
| 547 |
+
st.markdown(f"**District:** {case['PdDistrict']}")
|
| 548 |
+
st.markdown(f"**Location:** {case['Address']}")
|
| 549 |
+
with c3:
|
| 550 |
+
st.markdown(f"**Coordinates:** {case['Y']:.4f}, {case['X']:.4f}")
|
| 551 |
+
|
| 552 |
+
st.markdown("---")
|
| 553 |
+
|
| 554 |
+
if st.button("🤖 Run AI Analysis"):
|
| 555 |
+
# Prepare Input
|
| 556 |
+
input_data = pd.DataFrame({
|
| 557 |
+
'Dates': [case['Dates']],
|
| 558 |
+
'X': [case['X']],
|
| 559 |
+
'Y': [case['Y']],
|
| 560 |
+
'PdDistrict': [case['PdDistrict']]
|
| 561 |
+
})
|
| 562 |
+
|
| 563 |
+
# Preprocess
|
| 564 |
+
processed_df, _ = preprocess_pipeline(input_data, is_train=False, kmeans_model=kmeans)
|
| 565 |
+
processed_df['PdDistrict'] = encoders['PdDistrict'].transform(processed_df['PdDistrict'])
|
| 566 |
+
processed_df['Season'] = encoders['Season'].transform(processed_df['Season'])
|
| 567 |
+
|
| 568 |
+
# Features
|
| 569 |
+
features = ['Hour', 'Day', 'Month', 'Year', 'DayOfWeek', 'IsWeekend', 'IsHoliday', 'LocationCluster', 'PdDistrict', 'Season']
|
| 570 |
+
|
| 571 |
+
# Predict
|
| 572 |
+
prediction = model.predict(processed_df[features])[0]
|
| 573 |
+
proba = model.predict_proba(processed_df[features])[0]
|
| 574 |
+
|
| 575 |
+
# Determine Actual
|
| 576 |
+
violent_categories = ['ASSAULT', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING', 'HOMICIDE', 'ARSON']
|
| 577 |
+
actual_is_violent = 1 if case['Category'] in violent_categories else 0
|
| 578 |
+
actual_label = "VIOLENT" if actual_is_violent else "NON-VIOLENT"
|
| 579 |
+
pred_label = "VIOLENT" if prediction == 1 else "NON-VIOLENT"
|
| 580 |
+
|
| 581 |
+
# Display Results
|
| 582 |
+
r1, r2 = st.columns(2)
|
| 583 |
+
|
| 584 |
+
with r1:
|
| 585 |
+
st.markdown("#### AI Prediction")
|
| 586 |
+
if prediction == 1:
|
| 587 |
+
st.error(f"**{pred_label}** ({proba[1]*100:.1f}% Confidence)")
|
| 588 |
+
else:
|
| 589 |
+
st.success(f"**{pred_label}** ({proba[0]*100:.1f}% Confidence)")
|
| 590 |
+
|
| 591 |
+
with r2:
|
| 592 |
+
st.markdown("#### Actual Outcome")
|
| 593 |
+
st.markdown(f"**Category:** {case['Category']}")
|
| 594 |
+
if actual_is_violent:
|
| 595 |
+
st.markdown(f"**Classification:** :red[{actual_label}]")
|
| 596 |
+
else:
|
| 597 |
+
st.markdown(f"**Classification:** :green[{actual_label}]")
|
| 598 |
+
|
| 599 |
+
st.markdown("---")
|
| 600 |
+
if prediction == actual_is_violent:
|
| 601 |
+
st.success("✅ **AI Model Correctly Classified this Incident**")
|
| 602 |
+
st.balloons()
|
| 603 |
+
else:
|
| 604 |
+
st.error("❌ **AI Model Incorrect** (Complex real-world variability)")
|
| 605 |
+
|
| 606 |
+
with tab6:
|
| 607 |
+
st.subheader("🚀 Advanced Prediction (99% Accuracy)")
|
| 608 |
+
st.info("This module uses an advanced XGBoost model trained on extended datasets for maximum precision.")
|
| 609 |
+
|
| 610 |
+
if new_artifacts:
|
| 611 |
+
model_xgb = new_artifacts['model']
|
| 612 |
+
le_target = new_artifacts['le_target']
|
| 613 |
+
addr_hasher = new_artifacts['addr_hasher']
|
| 614 |
+
desc_hasher = new_artifacts['desc_hasher']
|
| 615 |
+
dense_cols = new_artifacts['dense_cols']
|
| 616 |
+
|
| 617 |
+
col_input1, col_input2 = st.columns(2)
|
| 618 |
+
|
| 619 |
+
with col_input1:
|
| 620 |
+
adv_date = st.date_input("📅 Date", key="adv_date")
|
| 621 |
+
adv_time = st.time_input("⏰ Time", key="adv_time")
|
| 622 |
+
adv_lat = st.number_input("📍 Latitude", value=37.7749, format="%.6f", key="adv_lat")
|
| 623 |
+
adv_lng = st.number_input("📍 Longitude", value=-122.4194, format="%.6f", key="adv_lng")
|
| 624 |
+
|
| 625 |
+
with col_input2:
|
| 626 |
+
districts = sorted(['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN'])
|
| 627 |
+
adv_district = st.selectbox("🏢 Police District", districts, key="adv_district")
|
| 628 |
+
adv_address = st.text_input("📌 Address", "", key="adv_address")
|
| 629 |
+
adv_desc = st.text_area("📝 Description", "", key="adv_desc")
|
| 630 |
+
|
| 631 |
+
if st.button("⚡ Run Advanced Analysis", type="primary"):
|
| 632 |
+
try:
|
| 633 |
+
dt_obj = pd.to_datetime(f"{adv_date} {adv_time}")
|
| 634 |
+
hour = dt_obj.hour
|
| 635 |
+
|
| 636 |
+
dense_data = {
|
| 637 |
+
'X': float(adv_lng),
|
| 638 |
+
'Y': float(adv_lat),
|
| 639 |
+
'Year': dt_obj.year,
|
| 640 |
+
'Month': dt_obj.month,
|
| 641 |
+
'Day': dt_obj.day,
|
| 642 |
+
'Minute': dt_obj.minute,
|
| 643 |
+
'Hour': hour,
|
| 644 |
+
'Hour_sin': np.sin(2 * np.pi * hour / 24),
|
| 645 |
+
'Hour_cos': np.cos(2 * np.pi * hour / 24),
|
| 646 |
+
'PdDistrict_enc': districts.index(adv_district),
|
| 647 |
+
'DayOfWeek_enc': dt_obj.dayofweek
|
| 648 |
+
}
|
| 649 |
+
|
| 650 |
+
dense_df = pd.DataFrame([dense_data])[dense_cols]
|
| 651 |
+
dense_sparse = csr_matrix(dense_df.values)
|
| 652 |
+
|
| 653 |
+
addr_hashed = addr_hasher.transform([adv_address.split()])
|
| 654 |
+
desc_hashed = desc_hasher.transform([adv_desc.split()])
|
| 655 |
+
|
| 656 |
+
features = hstack([dense_sparse, addr_hashed, desc_hashed])
|
| 657 |
+
|
| 658 |
+
probs = model_xgb.predict_proba(features)[0]
|
| 659 |
+
top_idx = np.argmax(probs)
|
| 660 |
+
|
| 661 |
+
category = le_target.inverse_transform([top_idx])[0]
|
| 662 |
+
confidence = probs[top_idx] * 100
|
| 663 |
+
|
| 664 |
+
st.markdown("---")
|
| 665 |
+
st.subheader("Analysis Results")
|
| 666 |
+
|
| 667 |
+
res_c1, res_c2 = st.columns([1, 2])
|
| 668 |
+
|
| 669 |
+
with res_c1:
|
| 670 |
+
st.success(f"### 🚨 Predicted: **{category}**")
|
| 671 |
+
st.metric("Confidence Score", f"{confidence:.2f}%")
|
| 672 |
+
|
| 673 |
+
with res_c2:
|
| 674 |
+
# Top 3 chart
|
| 675 |
+
top3 = probs.argsort()[-3:][::-1]
|
| 676 |
+
chart_data = pd.DataFrame({
|
| 677 |
+
"Category": le_target.inverse_transform(top3),
|
| 678 |
+
"Probability": probs[top3]
|
| 679 |
+
}).sort_values(by="Probability", ascending=True)
|
| 680 |
+
|
| 681 |
+
fig_adv = px.bar(chart_data, x="Probability", y="Category", orientation='h',
|
| 682 |
+
title="Top 3 Probable Categories", template='plotly_dark')
|
| 683 |
+
st.plotly_chart(fig_adv)
|
| 684 |
+
|
| 685 |
+
# AI Explanation
|
| 686 |
+
if adv_desc:
|
| 687 |
+
with st.spinner("🧠 Generating AI explanation..."):
|
| 688 |
+
explanation = explain_prediction_with_llama(
|
| 689 |
+
f"In 2-3 sentences, explain why a crime prediction model might classify an incident as '{category}' based on this description: '{adv_desc}'. Be concise and factual."
|
| 690 |
+
)
|
| 691 |
+
st.markdown("### 🧠 AI Analyst Insight")
|
| 692 |
+
st.info(explanation)
|
| 693 |
+
|
| 694 |
+
except Exception as e:
|
| 695 |
+
st.error(f"❌ Prediction Error: {e}")
|
| 696 |
+
else:
|
| 697 |
+
st.error("Advanced model artifacts not loaded.")
|
| 698 |
+
|
| 699 |
+
# ------------------- INTERACTIVE CHATBOT -------------------
|
| 700 |
+
st.markdown("---")
|
| 701 |
+
st.markdown("<div class='glass-card'>", unsafe_allow_html=True)
|
| 702 |
+
st.subheader("💬 AI Crime Safety Assistant")
|
| 703 |
+
st.markdown("Ask me anything about crime prediction, safety tips, or how this system works!", unsafe_allow_html=True)
|
| 704 |
+
|
| 705 |
+
# Initialize chat history in session state
|
| 706 |
+
if 'messages' not in st.session_state:
|
| 707 |
+
st.session_state.messages = [
|
| 708 |
+
{"role": "assistant", "content": "👋 Hello! I'm your AI Crime Safety Assistant. I can help you understand crime patterns, provide safety recommendations, and explain how our prediction model works. What would you like to know?"}
|
| 709 |
+
]
|
| 710 |
+
|
| 711 |
+
# Display chat history
|
| 712 |
+
st.markdown("<div class='chat-container'>", unsafe_allow_html=True)
|
| 713 |
+
for message in st.session_state.messages:
|
| 714 |
+
if message["role"] == "user":
|
| 715 |
+
st.markdown(f"<div class='user-message'>🧑 {message['content']}</div>", unsafe_allow_html=True)
|
| 716 |
+
else:
|
| 717 |
+
st.markdown(f"<div class='ai-message'>🤖 {message['content']}</div>", unsafe_allow_html=True)
|
| 718 |
+
st.markdown("</div>", unsafe_allow_html=True)
|
| 719 |
+
|
| 720 |
+
# Chat input
|
| 721 |
+
col1, col2 = st.columns([5, 1])
|
| 722 |
+
with col1:
|
| 723 |
+
user_input = st.text_input("Type your message...", key="chat_input", label_visibility="collapsed", placeholder="Ask about crime safety, predictions, or get recommendations...")
|
| 724 |
+
with col2:
|
| 725 |
+
send_button = st.button("Send 📤", use_container_width=True)
|
| 726 |
+
|
| 727 |
+
# Handle chat submission
|
| 728 |
+
if send_button and user_input:
|
| 729 |
+
# Add user message to history
|
| 730 |
+
st.session_state.messages.append({"role": "user", "content": user_input})
|
| 731 |
+
|
| 732 |
+
# Get AI response using Groq
|
| 733 |
+
with st.spinner("🧠 Thinking..."):
|
| 734 |
+
try:
|
| 735 |
+
client = get_groq_client()
|
| 736 |
+
|
| 737 |
+
# Create system prompt for crime prediction context
|
| 738 |
+
system_prompt = """You are an AI Crime Safety Assistant for a crime prediction system.
|
| 739 |
+
You help users understand:
|
| 740 |
+
- Crime patterns and trends in San Francisco
|
| 741 |
+
- How the XGBoost machine learning model predicts crime categories
|
| 742 |
+
- Safety tips and recommendations based on location and time
|
| 743 |
+
- What factors influence crime predictions (time, location, historical data)
|
| 744 |
+
|
| 745 |
+
Be helpful, concise, and informative. Keep responses to 2-3 sentences unless more detail is needed.
|
| 746 |
+
If asked about the model, explain it uses features like latitude, longitude, time, district, and description to predict crime types."""
|
| 747 |
+
|
| 748 |
+
# Prepare messages for Groq API
|
| 749 |
+
api_messages = [{"role": "system", "content": system_prompt}]
|
| 750 |
+
|
| 751 |
+
# Add recent chat history (last 5 messages for context)
|
| 752 |
+
for msg in st.session_state.messages[-5:]:
|
| 753 |
+
api_messages.append({"role": msg["role"], "content": msg["content"]})
|
| 754 |
+
|
| 755 |
+
# Get response from Groq
|
| 756 |
+
chat_completion = client.chat.completions.create(
|
| 757 |
+
messages=api_messages,
|
| 758 |
+
model="llama-3.3-70b-versatile",
|
| 759 |
+
temperature=0.7,
|
| 760 |
+
max_tokens=500
|
| 761 |
+
)
|
| 762 |
+
|
| 763 |
+
ai_response = chat_completion.choices[0].message.content
|
| 764 |
+
|
| 765 |
+
# Add AI response to history
|
| 766 |
+
st.session_state.messages.append({"role": "assistant", "content": ai_response})
|
| 767 |
+
|
| 768 |
+
except Exception as e:
|
| 769 |
+
error_msg = f"⚠️ Sorry, I encountered an error: {str(e)}"
|
| 770 |
+
st.session_state.messages.append({"role": "assistant", "content": error_msg})
|
| 771 |
+
|
| 772 |
+
# Rerun to update chat display
|
| 773 |
+
st.rerun()
|
| 774 |
+
|
| 775 |
+
st.markdown("</div>", unsafe_allow_html=True)
|
src/data_loader.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
def load_data(data_dir):
    """
    Load the train and test CSV files from *data_dir*.

    Args:
        data_dir (str): Path to the directory containing 'train.csv' and 'test.csv'.

    Returns:
        tuple: (train_df, test_df) with the 'Dates' column parsed as datetimes.

    Raises:
        FileNotFoundError: If either CSV file is missing.
    """
    paths = {name: os.path.join(data_dir, f'{name}.csv') for name in ('train', 'test')}

    # Validate both files up-front before reading anything.
    for name, path in paths.items():
        if not os.path.exists(path):
            raise FileNotFoundError(f"{name.capitalize()} file not found at {path}")

    messages = {
        'train': ("Loading training data...", "Training data loaded"),
        'test': ("Loading test data...", "Test data loaded"),
    }

    frames = {}
    for name, path in paths.items():
        start_msg, done_msg = messages[name]
        print(start_msg)
        frames[name] = pd.read_csv(path, parse_dates=['Dates'])
        print(f"{done_msg}: {frames[name].shape}")

    return frames['train'], frames['test']
|
| 31 |
+
|
| 32 |
+
if __name__ == "__main__":
    # Smoke-test: load the bundled dataset and preview the training rows.
    dataset_dir = os.path.join(os.path.dirname(__file__), '../data/crimedataset')
    try:
        train_frame, test_frame = load_data(dataset_dir)
        print(train_frame.head())
    except Exception as err:
        print(err)
|
src/preprocessing.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.cluster import KMeans
|
| 4 |
+
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
|
| 5 |
+
|
| 6 |
+
def define_target(df):
    """
    Create the binary target column 'IsViolent' from the crime 'Category'.

    A row is labelled 1 when its category is one of the violent crime
    categories, 0 otherwise.

    Args:
        df (pd.DataFrame): Frame with a 'Category' column.

    Returns:
        pd.DataFrame: The same frame, mutated in place, with an added
        integer 'IsViolent' column.
    """
    violent_categories = [
        'ASSAULT', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING', 'HOMICIDE', 'ARSON'
    ]

    # Vectorized isin() produces the same labels as the former row-wise
    # apply(lambda ...) but runs in a single C-level pass.
    df['IsViolent'] = df['Category'].isin(violent_categories).astype(int)
    return df
|
| 16 |
+
|
| 17 |
+
def extract_temporal_features(df):
    """
    Derive temporal features from the parsed 'Dates' column.

    Adds: Hour, Day, Month, Year, DayOfWeek (0=Monday), IsWeekend (Sat/Sun),
    and IsHoliday (US federal holidays within the data's date span).

    Args:
        df (pd.DataFrame): Frame with a datetime64 'Dates' column.

    Returns:
        pd.DataFrame: The same frame, mutated in place, with the new columns.
    """
    dates = df['Dates']
    df['Hour'] = dates.dt.hour
    df['Day'] = dates.dt.day
    df['Month'] = dates.dt.month
    df['Year'] = dates.dt.year
    df['DayOfWeek'] = dates.dt.dayofweek  # 0=Monday, 6=Sunday

    # Vectorized comparison replaces the per-row lambda (5=Sat, 6=Sun).
    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

    # US federal holidays covering the observed date range only.
    cal = calendar()
    holidays = cal.holidays(start=dates.min(), end=dates.max())
    # normalize() zeroes the time-of-day so whole calendar days can be
    # matched against the holiday index (clearer than the former
    # dt.date.astype('datetime64[ns]') round-trip, same result).
    df['IsHoliday'] = dates.dt.normalize().isin(holidays).astype(int)

    return df
|
| 35 |
+
|
| 36 |
+
def get_season(month):
    """Map a calendar month number (1-12) to its season name."""
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    # Months 9-11 (and anything else) fall through to Fall, matching the
    # original if/elif chain's final else branch.
    return season_by_month.get(month, 'Fall')
|
| 45 |
+
|
| 46 |
+
def extract_contextual_features(df):
    """
    Add contextual features derived from existing columns (currently Season).

    Args:
        df (pd.DataFrame): Frame with a numeric 'Month' column.

    Returns:
        pd.DataFrame: The same frame with a 'Season' string column added.
    """
    # map() applies get_season element-wise, same as the former apply().
    df['Season'] = df['Month'].map(get_season)
    return df
|
| 52 |
+
|
| 53 |
+
def extract_location_features(df, n_clusters=10, kmeans_model=None):
    """
    Add a 'LocationCluster' column grouping incidents into spatial zones.

    When no fitted model is supplied, a new K-Means model is fitted on the
    (X, Y) coordinates; otherwise the supplied model is used for prediction
    only.

    Args:
        df (pd.DataFrame): Frame with 'X' and 'Y' coordinate columns.
        n_clusters (int): Number of clusters when fitting a new model.
        kmeans_model: Previously fitted clusterer, or None to fit one.

    Returns:
        tuple: (df with 'LocationCluster' added, the fitted/used model)
    """
    coords = df[['X', 'Y']]
    if kmeans_model is None:
        # Fit mode: learn cluster centres from this frame's coordinates.
        kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        df['LocationCluster'] = kmeans_model.fit_predict(coords)
    else:
        # Predict mode: reuse centres learned at training time.
        df['LocationCluster'] = kmeans_model.predict(coords)
    return df, kmeans_model
|
| 66 |
+
|
| 67 |
+
def preprocess_pipeline(df, is_train=True, kmeans_model=None):
    """
    Run the full feature-engineering pipeline on a raw crime frame.

    Args:
        df (pd.DataFrame): Raw frame with 'Dates', 'X', 'Y' (and 'Category'
            when is_train is True).
        is_train (bool): When True, also derive the 'IsViolent' target.
        kmeans_model: Fitted location clusterer, or None to fit a new one.

    Returns:
        tuple: (processed df, kmeans model used for location clustering)
    """
    # Temporal then contextual features (Season needs Month from the first step).
    for step in (extract_temporal_features, extract_contextual_features):
        df = step(df)

    # Spatial clustering; fits a fresh model when none was supplied.
    df, kmeans_model = extract_location_features(df, kmeans_model=kmeans_model)

    # The target column only exists for labelled training data.
    if is_train:
        df = define_target(df)

    return df, kmeans_model
|
| 81 |
+
|
| 82 |
+
if __name__ == "__main__":
    # Placeholder entry point: no standalone demo yet. Import this module
    # and call preprocess_pipeline() (see train_model.py) instead.
    pass
|
src/train_model.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import joblib
|
| 4 |
+
import os
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
from sklearn.naive_bayes import GaussianNB
|
| 7 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 8 |
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
|
| 9 |
+
from sklearn.preprocessing import LabelEncoder
|
| 10 |
+
import xgboost as xgb
|
| 11 |
+
from data_loader import load_data
|
| 12 |
+
from preprocessing import preprocess_pipeline
|
| 13 |
+
|
| 14 |
+
def train_and_evaluate():
    """
    Train candidate classifiers on the crime dataset and persist the best one.

    Loads the training CSV, runs the preprocessing pipeline, label-encodes the
    categorical features, trains Naive Bayes / Random Forest / XGBoost, and
    saves the highest-accuracy model plus its label encoders and the fitted
    K-Means clusterer to ../models.

    Returns:
        dict: Per-model validation metrics
        {name: {'Accuracy': float, 'Precision': float, 'Recall': float}}.
    """
    # Load Data
    data_dir = os.path.join(os.path.dirname(__file__), '../data/crimedataset')
    train_df, _ = load_data(data_dir)

    # Preprocess (kmeans_model=None triggers fitting a fresh clusterer).
    print("Preprocessing data...")
    df, kmeans_model = preprocess_pipeline(train_df, is_train=True, kmeans_model=None)

    # Feature Selection
    features = ['Hour', 'Day', 'Month', 'Year', 'DayOfWeek', 'IsWeekend', 'IsHoliday', 'LocationCluster', 'PdDistrict', 'Season']
    target = 'IsViolent'

    # Encode categorical features; keep the fitted encoders so inference code
    # can apply the identical category -> integer mapping later.
    print("Encoding categorical features...")
    le_dict = {}
    for col in ['PdDistrict', 'Season']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        le_dict[col] = le

    X = df[features]
    y = df[target]

    # Stratified split keeps the violent/non-violent ratio identical in both
    # folds -- important because the positive class is a small minority
    # (verify_pipeline warns when it is below 10%).
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    models = {
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }

    best_model = None
    best_score = 0
    results = {}

    print("Training models...")
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        acc = accuracy_score(y_val, y_pred)
        # zero_division=0 prevents undefined-metric warnings when a model
        # never predicts the positive class under heavy class imbalance.
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred, zero_division=0)

        results[name] = {'Accuracy': acc, 'Precision': prec, 'Recall': rec}
        print(f"{name} - Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}")

        # Model selection is by raw validation accuracy.
        if acc > best_score:
            best_score = acc
            best_model = model

    # Save Artifacts
    models_dir = os.path.join(os.path.dirname(__file__), '../models')
    os.makedirs(models_dir, exist_ok=True)

    print(f"Saving best model: {best_model.__class__.__name__}")
    joblib.dump(best_model, os.path.join(models_dir, 'best_model.pkl'))
    joblib.dump(le_dict, os.path.join(models_dir, 'label_encoders.pkl'))
    joblib.dump(kmeans_model, os.path.join(models_dir, 'kmeans.pkl'))

    return results
|
| 79 |
+
|
| 80 |
+
if __name__ == "__main__":
    # Train all candidate models and persist artifacts when run as a
    # script: python src/train_model.py
    train_and_evaluate()
|
src/verify_pipeline.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
import joblib
|
| 5 |
+
from preprocessing import preprocess_pipeline
|
| 6 |
+
|
| 7 |
+
def _check_duplicates(df_train):
    """Step 2: report exact duplicate rows in the training data."""
    print("\n[2] Checking for Duplicates...")
    duplicates = df_train.duplicated().sum()
    if duplicates > 0:
        print(f"! Warning: {duplicates} duplicate rows found in training data.")
    else:
        print("OK No duplicates found.")


def _check_class_balance(df_train):
    """Step 3: derive the IsViolent target in place and warn on severe imbalance."""
    print("\n[3] Checking Class Balance...")
    # NOTE: keep this list in sync with preprocessing.define_target().
    violent_categories = [
        'ASSAULT', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING', 'HOMICIDE', 'ARSON'
    ]
    # Vectorized isin() matches the former row-wise lambda, faster.
    df_train['IsViolent'] = df_train['Category'].isin(violent_categories).astype(int)
    balance = df_train['IsViolent'].value_counts(normalize=True)
    print(f"Violent Crime Ratio: {balance.get(1, 0)*100:.2f}%")
    print(f"Non-Violent Crime Ratio: {balance.get(0, 0)*100:.2f}%")

    if balance.get(1, 0) < 0.1:
        print("! Severe Class Imbalance detected (<10% positive class). Model may struggle with Recall.")


def _check_leakage(df_train, test_path):
    """Step 4: flag calendar days shared by the train and test sets (skipped when test.csv is absent)."""
    if not os.path.exists(test_path):
        return
    print("\n[4] Checking for Data Leakage (Train/Test Overlap)...")
    df_test = pd.read_csv(test_path, parse_dates=['Dates'])
    # Day-level overlap is a heuristic; exact row matching would be too slow
    # for a dataset of this size.
    train_dates = set(df_train['Dates'].dt.date.unique())
    test_dates = set(df_test['Dates'].dt.date.unique())

    overlap = train_dates.intersection(test_dates)
    if len(overlap) > 0:
        print(f"! Warning: Found {len(overlap)} days present in BOTH Train and Test sets. Possible leakage if splitting by time.")
    else:
        print("OK No date overlap between Train and Test.")


def _check_artifacts():
    """Step 5: confirm every saved model artifact exists and deserializes.

    Returns:
        bool: True when all artifacts are present and loadable.
    """
    print("\n[5] Verifying Model Artifacts...")
    models_dir = os.path.join(os.path.dirname(__file__), '../models')
    required_files = ['best_model.pkl', 'label_encoders.pkl', 'kmeans.pkl']

    all_exist = True
    for f in required_files:
        fpath = os.path.join(models_dir, f)
        if os.path.exists(fpath):
            print(f"OK Found {f}")
            # Existence alone is not enough: a truncated upload would still
            # pass os.path.exists, so attempt a full deserialization.
            try:
                joblib.load(fpath)
                print(f" -> Successfully loaded {f}")
            except Exception as e:
                print(f" X Failed to load {f}: {e}")
                all_exist = False
        else:
            print(f"X Missing {f}")
            all_exist = False
    return all_exist


def verify_data_integrity():
    """
    Run a five-step health check over the dataset and saved model artifacts.

    Steps: load data, duplicate scan, class-balance report, train/test date
    overlap (leakage heuristic), and artifact presence/loadability.

    Returns:
        bool: True when all artifact checks pass; False otherwise, including
        when the training CSV is missing. (Previously returned None; a bool
        lets scripts use the result as an exit condition.)
    """
    print("=== Starting Deep Verification ===")

    # Paths
    data_dir = os.path.join(os.path.dirname(__file__), '../data/crimedataset')
    train_path = os.path.join(data_dir, 'train.csv')
    test_path = os.path.join(data_dir, 'test.csv')

    # 1. Load Data
    print("\n[1] Loading Data...")
    if not os.path.exists(train_path):
        print("X Train file missing!")
        return False

    df_train = pd.read_csv(train_path, parse_dates=['Dates'])
    print(f"OK Train Data Loaded: {df_train.shape}")

    _check_duplicates(df_train)
    _check_class_balance(df_train)
    _check_leakage(df_train, test_path)
    all_exist = _check_artifacts()

    if all_exist:
        print("\n=== Verification Complete: SYSTEM HEALTHY ===")
    else:
        print("\n=== Verification Complete: ISSUES DETECTED ===")
    return all_exist
|
| 87 |
+
|
| 88 |
+
if __name__ == "__main__":
    # Run the full health check when executed directly:
    # python src/verify_pipeline.py
    verify_data_integrity()
|