Spaces:
Sleeping
Sleeping
Enhance README.md with detailed application overview, features, installation instructions, and usage guidelines for synthetic data generation and ML model training.
Browse files- .gitignore +4 -0
- App.py +1316 -0
- README.md +102 -13
- pages/02_Algorithm_Education.py +1250 -0
- pages/03_Model_implementation.py +227 -0
- requirements.txt +8 -0
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
models/
|
| 2 |
+
temp_uploads/
|
| 3 |
+
__pycache__/
|
| 4 |
+
|
App.py
ADDED
|
@@ -0,0 +1,1316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sklearn.model_selection import train_test_split
|
| 5 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 6 |
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
| 7 |
+
import plotly.express as px
|
| 8 |
+
from sklearn.linear_model import LogisticRegression, RidgeClassifier
|
| 9 |
+
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
|
| 10 |
+
from sklearn.svm import SVC, LinearSVC
|
| 11 |
+
from sklearn.naive_bayes import GaussianNB, MultinomialNB
|
| 12 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 13 |
+
from sklearn.neural_network import MLPClassifier
|
| 14 |
+
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler
|
| 15 |
+
import time
|
| 16 |
+
import warnings
|
| 17 |
+
import joblib
|
| 18 |
+
import os
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
import seaborn as sns
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
from matplotlib.colors import LinearSegmentedColormap
|
| 23 |
+
from sklearn.model_selection import learning_curve
|
| 24 |
+
import pickle
|
| 25 |
+
warnings.filterwarnings('ignore')
|
| 26 |
+
|
| 27 |
+
class DataGenerator:
    """Generates synthetic, normally-distributed feature data for a set of classes."""

    def __init__(self):
        # Kept for API compatibility; populated by callers if needed.
        self.features = None
        self.feature_configs = None
        self.classes = None
        self.class_configs = None

    def generate_synthetic_data(self, n_samples, feature_configs, classes, class_configs=None):
        """Generate synthetic data based on configurations.

        Parameters
        ----------
        n_samples : int
            Total number of rows to generate, distributed across classes.
        feature_configs : dict[str, dict]
            Per-feature settings. Each value carries 'std' and either
            'type' == 'random' (random center drawn per class) or an
            explicit 'center'.
        classes : list[str]
            Class labels; one block of rows is generated per class.
        class_configs : dict[str, dict] | None
            Optional per-class {'mean': [...], 'std': [...]} overrides,
            indexed in the same order as feature_configs.

        Returns
        -------
        tuple[np.ndarray, np.ndarray]
            Feature matrix of shape (n_samples, n_features), values rounded
            to 2 decimals, and the matching label vector.
        """
        n_classes = len(classes)

        X = []
        y = []
        # Distribute samples as evenly as possible. The first
        # (n_samples % n_classes) classes absorb one extra row each so the
        # total is exactly n_samples; the previous integer division silently
        # dropped up to n_classes - 1 requested samples.
        base_count = n_samples // n_classes
        remainder = n_samples % n_classes

        for i in range(n_classes):
            class_name = classes[i]
            samples_per_class = base_count + (1 if i < remainder else 0)
            if samples_per_class == 0:
                # More classes than samples: nothing to emit for this class.
                continue
            class_samples = []

            for j, (feature_name, config) in enumerate(feature_configs.items()):
                if class_configs and class_name in class_configs:
                    # Explicit per-class override wins over feature defaults.
                    center = class_configs[class_name]['mean'][j]
                    std = class_configs[class_name]['std'][j]
                else:
                    if config['type'] == 'random':
                        center = np.random.randn() * 5
                    else:
                        center = config['center']
                    std = config['std']

                feature_samples = np.round(np.random.normal(
                    loc=center,
                    scale=std,
                    size=samples_per_class
                ), decimals=2)
                class_samples.append(feature_samples)

            X.append(np.column_stack(class_samples))
            y.extend([class_name] * samples_per_class)

        X = np.vstack(X)
        return X, np.array(y)
|
| 71 |
+
|
| 72 |
+
class ModelManager:
    """Builds, trains, evaluates and persists a fixed suite of scikit-learn classifiers."""

    @staticmethod
    def get_classifiers():
        """Return dictionary of classifiers with appropriate preprocessing.

        Each entry maps a display name to {'model': estimator, 'scaler': scaler}.
        MultinomialNB is the one exception to StandardScaler: it requires
        non-negative inputs, so it is paired with MaxAbsScaler instead.
        """
        return {
            'LogisticRegression': {
                'model': LogisticRegression(max_iter=1000),
                'scaler': StandardScaler()
            },
            'RidgeClassifier': {
                'model': RidgeClassifier(),
                'scaler': StandardScaler()
            },
            'RandomForestClassifier': {
                'model': RandomForestClassifier(random_state=42),
                'scaler': StandardScaler()
            },
            'AdaBoostClassifier': {
                'model': AdaBoostClassifier(),
                'scaler': StandardScaler()
            },
            'ExtraTreesClassifier': {
                'model': ExtraTreesClassifier(),
                'scaler': StandardScaler()
            },
            'SVC': {
                'model': SVC(),
                'scaler': StandardScaler()
            },
            'LinearSVC': {
                'model': LinearSVC(max_iter=2000),
                'scaler': StandardScaler()
            },
            'GaussianNB': {
                'model': GaussianNB(),
                'scaler': StandardScaler()
            },
            'KNeighborsClassifier': {
                'model': KNeighborsClassifier(),
                'scaler': StandardScaler()
            },
            'MLPClassifier': {
                'model': MLPClassifier(max_iter=1000),
                'scaler': StandardScaler()
            },
            'MultinomialNB': {
                'model': MultinomialNB(),
                'scaler': MaxAbsScaler()
            }
        }

    @staticmethod
    def ensure_non_negative(X):
        """Ensure data is non-negative by shifting.

        Used before MaxAbsScaler + MultinomialNB, which rejects negative
        values. Shifts the whole matrix up by |global min| when any value
        is negative; otherwise returns the input unchanged.
        """
        if isinstance(X, pd.DataFrame):
            min_val = X.values.min()
            if min_val < 0:
                return X + abs(min_val)
            return X
        else:
            min_val = X.min()
            if min_val < 0:
                # min_val is negative here, so subtracting it shifts upward.
                return X - min_val
            return X

    def save_model(self, model_dict, model_name):
        """Save model and its scaler to files.

        Creates a local 'models/' directory on first use and writes
        '<model_name>_model.joblib' and '<model_name>_scaler.joblib'.

        Returns
        -------
        tuple[str, str]
            (model_path, scaler_path) of the written files.
        """
        if not os.path.exists('models'):
            os.makedirs('models')

        base_filename = f"{model_name}"

        # NOTE(review): feature_names_in_ is stamped onto the scaler by hand
        # so downstream pages can recover column names after loading. This is
        # not a public sklearn API contract — confirm it survives sklearn
        # upgrades.
        if hasattr(model_dict['model'], 'feature_names_in_'):
            model_dict['scaler'].feature_names_in_ = model_dict['model'].feature_names_in_
        elif hasattr(st.session_state, 'features'):
            model_dict['scaler'].feature_names_in_ = np.array(st.session_state.features)

        model_path = os.path.join('models', f"{base_filename}_model.joblib")
        scaler_path = os.path.join('models', f"{base_filename}_scaler.joblib")

        joblib.dump(model_dict['model'], model_path)
        joblib.dump(model_dict['scaler'], scaler_path)

        return model_path, scaler_path

    def train_and_evaluate_model(self, clf_dict, X_train, X_test, y_train, y_test, model_name):
        """Train and evaluate a single model.

        Fits clf_dict['scaler'] on the training data, transforms both splits,
        fits the model, persists both artifacts via save_model(), and returns
        a result record.

        Returns
        -------
        dict
            Keys: model_name, accuracy, training_time, model, predictions,
            status ('success' or 'failed: <reason>'), scaler (path),
            model_path, confusion_matrix. On any exception a zeroed record
            with the failure reason is returned instead of raising, so one
            bad model cannot abort the whole training loop.
        """
        start_time = time.time()

        try:
            scaler = clf_dict['scaler']
            feature_names = st.session_state.features if hasattr(st.session_state, 'features') else None

            if model_name == 'MultinomialNB':
                # MultinomialNB cannot accept negatives: shift first, then
                # scale, then double-check the scaled output.
                X_train_positive = self.ensure_non_negative(X_train)
                X_test_positive = self.ensure_non_negative(X_test)
                X_train_scaled = scaler.fit_transform(X_train_positive)
                X_test_scaled = scaler.transform(X_test_positive)

                if np.any(X_train_scaled < 0) or np.any(X_test_scaled < 0):
                    raise ValueError("Negative values in scaled data")
            else:
                # Scaler is fit on the training split only to avoid leakage.
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

            # NOTE(review): manually overwriting feature_names_in_ on fitted
            # estimators — not a supported sklearn workflow; verify.
            if feature_names is not None:
                if hasattr(clf_dict['model'], 'feature_names_in_'):
                    clf_dict['model'].feature_names_in_ = np.array(feature_names)
                scaler.feature_names_in_ = np.array(feature_names)

            clf_dict['model'].fit(X_train_scaled, y_train)
            y_pred = clf_dict['model'].predict(X_test_scaled)

            accuracy = accuracy_score(y_test, y_pred)
            training_time = time.time() - start_time

            model_path, scaler_path = self.save_model(clf_dict, model_name)
            conf_matrix = confusion_matrix(y_test, y_pred)

            return {
                'model_name': model_name,
                'accuracy': accuracy,
                'training_time': training_time,
                'model': clf_dict['model'],
                'predictions': y_pred,
                'status': 'success',
                'scaler': scaler_path,
                'model_path': model_path,
                'confusion_matrix': conf_matrix
            }
        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            return {
                'model_name': model_name,
                'accuracy': 0,
                'training_time': 0,
                'model': None,
                'predictions': None,
                'status': f'failed: {str(e)}',
                'scaler': None,
                'model_path': None,
                'confusion_matrix': None
            }
|
| 214 |
+
|
| 215 |
+
class Visualizer:
    """Builds matplotlib/seaborn and plotly figures that summarize model results."""

    @staticmethod
    def plot_learning_curve(estimator, X, y, title, ax):
        """Plot learning curves for a model.

        Computes 5-fold cross-validated accuracy at 10 training-set sizes
        (10%..100%) and draws mean curves with +/- 1 std bands on the
        supplied matplotlib axis.
        """
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y,
            train_sizes=np.linspace(0.1, 1.0, 10),
            cv=5,
            n_jobs=-1,
            scoring='accuracy'
        )

        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)

        ax.plot(train_sizes, train_mean, label='Training score')
        ax.plot(train_sizes, test_mean, label='Cross-validation score')

        # Shaded +/- 1 std bands around each mean curve.
        ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
        ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)

        ax.set_xlabel('Training Examples')
        ax.set_ylabel('Score')
        ax.set_title(title)
        ax.legend(loc='lower right')
        ax.grid(True)

    def create_confusion_matrices_plot(self, successful_results, y_test):
        """Create and display confusion matrices for successful models.

        Lays out one seaborn heatmap per result in a 2-column grid and
        returns the assembled matplotlib figure.
        """
        n_models = len(successful_results)
        n_cols = 2
        # Ceiling division: enough rows for all models at 2 per row.
        n_rows = (n_models + n_cols - 1) // n_cols

        fig = plt.figure(figsize=(15, 5 * n_rows))
        colors = ['white', '#4a90e2']
        n_bins = 100
        # cmap = LinearSegmentedColormap.from_list("custom_blues", colors, N=n_bins)

        for idx, result in enumerate(successful_results):
            ax = plt.subplot(n_rows, n_cols, idx + 1)

            sns.heatmap(
                result['confusion_matrix'],
                annot=True,
                fmt='d',
                # cmap=cmap,
                cmap='viridis',
                ax=ax,
                # Sorted unique labels keep axis order consistent with
                # sklearn's confusion_matrix row/column order.
                xticklabels=sorted(set(y_test)),
                yticklabels=sorted(set(y_test))
            )

            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            ax.set_title(f"{result['model_name']}\nAccuracy: {result['accuracy']:.4f}")

        plt.tight_layout()
        return fig

    def create_performance_summary_plot(self, successful_df, selected_models):
        """Create performance metrics summary plot.

        Returns a grouped plotly bar chart comparing the selected models on
        Accuracy / Precision / Recall / F1-Score, ordered by mean score.

        NOTE(review): assumes successful_df already carries 'Model',
        'Accuracy', 'Precision', 'Recall' and 'F1-Score' columns — produced
        elsewhere (not visible in this block); confirm against the caller.
        """
        metrics_to_compare = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
        # Long-form reshape so plotly can group bars by metric.
        summary_df = successful_df[successful_df['Model'].isin(selected_models)].melt(
            id_vars=['Model'],
            value_vars=metrics_to_compare,
            var_name='Metric',
            value_name='Score'
        )

        fig_summary = px.bar(
            summary_df,
            x='Model',
            y='Score',
            color='Metric',
            barmode='group',
            title="Model Performance Metrics Comparison",
            text='Score'
        )

        fig_summary.update_layout(
            xaxis_tickangle=-45,
            showlegend=True,
            height=600,
            yaxis=dict(
                range=[0, 1],
                title='Score'
            ),
            legend=dict(
                title='Metric',
                orientation='h',
                yanchor='bottom',
                y=1.02,
                xanchor='right',
                x=1
            )
        )

        fig_summary.update_traces(
            texttemplate='%{text:.4f}',
            textposition='outside',
            textangle=0
        )

        # Order models left-to-right by their average metric score (best first).
        summary_df['Avg_Score'] = summary_df.groupby('Model')['Score'].transform('mean')
        models_order = summary_df.drop_duplicates('Model').sort_values('Avg_Score', ascending=False)['Model']
        fig_summary.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': models_order})

        return fig_summary
|
| 325 |
+
|
| 326 |
+
class StreamlitUI:
|
| 327 |
+
def __init__(self):
    """Wire up collaborators, default vegetable configs, and session state."""
    self.data_generator = DataGenerator()
    self.model_manager = ModelManager()
    self.visualizer = Visualizer()

    # Add default configurations as class attribute
    self.default_configs = {
        # Features: [length (mm), width (mm), density (g/cm³), pH]

        # AMPALAYA: Medium length (150-180mm), thin width (40-50mm)
        # Medium density (95 g/cm³) due to hollow interior, slightly basic pH (6.8-7.0)
        "Ampalaya": {'mean': [165, 45, 95, 6.9], 'std': [15, 5, 10, 0.1]},

        # BANANA: Long length (180-220mm), medium width (30-40mm)
        # Low density (85 g/cm³), acidic pH (4.5-5.2)
        "Banana": {'mean': [200, 35, 85, 4.8], 'std': [20, 5, 8, 0.3]},

        # CABBAGE: Round shape - similar length/width (150-200mm x 150-200mm)
        # Very low density (65 g/cm³) due to layered leaves, neutral pH (6.5-7.0)
        "Cabbage": {'mean': [175, 175, 65, 6.8], 'std': [25, 25, 5, 0.2]},

        # CARROT: Medium length (140-180mm), narrow width (25-35mm)
        # High density (115 g/cm³) due to dense flesh, slightly acidic pH (6.0-6.5)
        "Carrot": {'mean': [160, 30, 115, 6.3], 'std': [20, 5, 10, 0.2]},

        # CASSAVA: Long length (200-300mm), thick width (50-80mm)
        # High density (125 g/cm³) due to starchy flesh, slightly acidic pH (6.0-6.5)
        "Cassava": {'mean': [250, 65, 125, 6.2], 'std': [50, 15, 12, 0.2]}
    }

    # Default feature names that match the measurements in default_configs
    self.default_features = [
        'length (mm)',
        'width (mm)',
        'density (g/cm³)',
        'pH'
    ]

    # Add new session state variables for static visualizations
    self.initialize_static_visualizations()

    # Add new session state variable for data source
    # ('synthetic' by default; other values presumably set elsewhere — verify)
    if 'data_source' not in st.session_state:
        st.session_state.data_source = 'synthetic'
|
| 371 |
+
|
| 372 |
+
def initialize_static_visualizations(self):
    """Ensure the cached static-figure slots exist in session state."""
    for fig_key in ('confusion_matrices_fig', 'learning_curves_fig'):
        if fig_key not in st.session_state:
            st.session_state[fig_key] = None
|
| 378 |
+
|
| 379 |
+
def initialize_session_state(self):
    """Seed every app-level session-state slot that has not been set yet."""
    defaults = {
        'data_generated': False,
        'df': None,
        'features': None,
        'feature_configs': None,
        'X_train': None,
        'X_test': None,
        'y_train': None,
        'y_test': None,
        'y_pred': None,
        'model_results': None,
        'best_model': None,
        'accuracy': None,
        'feature_importance': None,
        'split_info': None,
    }

    # Only fill slots streamlit has not already populated on a rerun.
    for name, default in defaults.items():
        if name not in st.session_state:
            st.session_state[name] = default
|
| 401 |
+
|
| 402 |
+
def setup_page_config(self):
    """Configure the Streamlit page (must be the first st.* call on the page)."""
    st.set_page_config(
        page_title="ML Model Generator & Implementation",
        page_icon="🤖",
        layout="wide",
        menu_items={
            'About': """
            ## Final project in Modeling and Simulation \n
            ### Juan Dela Cruz - BSCS 4A"""
        }
    )
|
| 414 |
+
|
| 415 |
+
def get_sidebar_inputs(self):
    """Read feature and class names from the sidebar and build default feature configs.

    Returns
    -------
    tuple[list[str], dict, list[str]]
        (features, feature_configs, classes) where feature_configs maps
        each feature to its default random-center generation settings.
    """
    st.sidebar.header("Data Generation Parameters")

    # Feature configuration
    st.sidebar.subheader("Feature Configuration")

    # Seed the text box once; afterwards streamlit owns the widget state.
    if 'features_input' not in st.session_state:
        st.session_state.features_input = ", ".join(self.default_features)

    raw_features = st.sidebar.text_input(
        "Enter feature names (comma-separated)",
        key='features_input'
    )
    features = [name.strip() for name in raw_features.split(",")]

    # Same one-time seeding for the class-name text box.
    if 'classes_input' not in st.session_state:
        st.session_state.classes_input = ", ".join(self.default_configs.keys())

    raw_classes = st.sidebar.text_input(
        "Enter class names (comma-separated)",
        key='classes_input'
    )
    classes = [name.strip() for name in raw_classes.split(",")]

    # Every feature starts from the same random-center config; per-class
    # overrides are collected separately by get_class_configs().
    feature_configs = {
        name: {'type': 'random', 'std': 20.0, 'center': None}
        for name in features
    }

    return features, feature_configs, classes
|
| 452 |
+
|
| 453 |
+
def get_class_configs(self, classes, features):
    """Get class-specific configurations from the sidebar.

    For each class, either read per-feature mean/std from number inputs
    (when the class's "specific values" checkbox is ticked) or fall back
    to defaults (the built-in table, or random well-separated means for
    unknown classes).

    Parameters
    ----------
    classes : list[str]
        Class names to configure.
    features : list[str]
        Feature names; determines how many mean/std pairs each class needs.

    Returns
    -------
    dict[str, dict]
        {class_name: {'mean': [...], 'std': [...]}} with one value per feature.
    """
    class_configs = {}
    st.sidebar.subheader("Class-Specific Settings")

    for class_name in classes:
        with st.sidebar.expander(f"{class_name} Settings", expanded=False):
            checkbox_key = f"use_specific_{class_name}"

            # Initialize checkbox state if not in session state
            if checkbox_key not in st.session_state:
                st.session_state[checkbox_key] = True

            use_specific = st.checkbox(
                f"Set specific values for {class_name}",
                key=checkbox_key
            )

            means = []
            stds = []

            if class_name not in self.default_configs:
                # Unknown class: draw random means in [0, 100), re-drawing
                # until each mean is at least 10 away from every
                # already-chosen class mean for the same feature so the
                # generated classes stay separable.
                random_means = []
                for feat_idx in range(len(features)):
                    mean = np.random.uniform(0, 100)
                    while any(
                        abs(mean - cfg['mean'][feat_idx]) < 10
                        for cfg in class_configs.values() if 'mean' in cfg
                    ):
                        mean = np.random.uniform(0, 100)
                    random_means.append(mean)
                default_values = {'mean': random_means, 'std': [20.0] * len(features)}
            else:
                # Copy the stored defaults before adjusting their length.
                # BUGFIX: the original aliased the lists inside
                # self.default_configs and then extend()ed them in place,
                # permanently corrupting the defaults whenever the user
                # added extra features.
                default_means = list(self.default_configs[class_name]['mean'])
                default_stds = list(self.default_configs[class_name]['std'])

                # If we have more features than default values, extend with random values
                if len(features) > len(default_means):
                    default_means.extend(
                        np.random.uniform(0, 100)
                        for _ in range(len(features) - len(default_means))
                    )
                    default_stds.extend(
                        20.0 for _ in range(len(features) - len(default_stds))
                    )
                # If we have fewer features than default values, truncate
                elif len(features) < len(default_means):
                    default_means = default_means[:len(features)]
                    default_stds = default_stds[:len(features)]

                default_values = {'mean': default_means, 'std': default_stds}

            if use_specific:
                for idx, feature in enumerate(features):
                    mean_key = f"mean_{class_name}_{feature}"
                    std_key = f"std_{class_name}_{feature}"

                    # Seed each widget once; streamlit keeps it afterwards.
                    if mean_key not in st.session_state:
                        st.session_state[mean_key] = float(default_values['mean'][idx])
                    if std_key not in st.session_state:
                        st.session_state[std_key] = float(default_values['std'][idx])

                    col1, col2 = st.columns(2)
                    with col1:
                        mean = st.number_input(
                            f"Mean for {feature}",
                            key=mean_key
                        )
                        means.append(mean)
                    with col2:
                        std = st.number_input(
                            f"Std Dev for {feature}",
                            min_value=0.1,
                            key=std_key
                        )
                        stds.append(std)
            else:
                # Use default values if specific values not requested
                means = default_values['mean']
                stds = default_values['std']

            class_configs[class_name] = {
                'mean': means,
                'std': stds
            }

    return class_configs
|
| 538 |
+
|
| 539 |
+
def get_training_params(self):
    """Get training parameters from the sidebar.

    Returns
    -------
    tuple[int, int]
        (n_samples, test_size) where test_size is a whole-number
        percentage (10-50) of the data reserved for testing.
    """
    st.sidebar.subheader("Sample Size & Train/Test Split Configuration")

    # Initialize default values if not in session state
    # (seeding via session state instead of a value= argument avoids the
    # streamlit warning about widgets with both a default and keyed state).
    if 'n_samples' not in st.session_state:
        st.session_state.n_samples = 10000

    col1, col2 = st.sidebar.columns(2)

    with col1:
        # Range 500-50000 in steps of 500; initial position comes from
        # the session-state seed above.
        n_samples = st.slider(
            "Number of samples",
            500,
            50000,
            step=500,
            key='n_samples'
        )

    with col2:
        test_size = st.slider(
            "Test Size",
            min_value=10,
            max_value=50,
            value=30,  # Default value directly in the widget
            step=5,
            key='test_size',
            format="%d%%",
            help="Percentage of data to use for testing"
        )
        st.write(f"Test: {test_size}% / Train: {100 - test_size}%")

    return n_samples, test_size
|
| 572 |
+
|
| 573 |
+
def generate_and_train(self, n_samples, feature_configs, classes, class_configs, test_size):
    """Generate synthetic data, split it, train all classifiers, and cache everything.

    Side effects: stores the dataframe, train/test split, per-model results,
    best model, split metadata, and pre-rendered figures in st.session_state.

    Fix: the list of successful results was computed twice (once from `results`,
    once from `st.session_state.model_results`); it is now built a single time
    and reused for best-model selection and figure generation.

    Args:
        n_samples: total number of rows to synthesize.
        feature_configs: per-feature distribution settings.
        classes: list of class labels.
        class_configs: per-class mean/std settings.
        test_size: test-set size as a percentage (e.g. 30 -> 30%).
    """
    X, y = self.data_generator.generate_synthetic_data(
        n_samples,
        feature_configs,
        classes,
        class_configs
    )

    st.session_state.df = pd.DataFrame(X, columns=st.session_state.features)
    st.session_state.df['target'] = y

    # Train/test split (percentage -> fraction); fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size / 100,
        random_state=42
    )
    st.session_state.X_train = X_train
    st.session_state.X_test = X_test
    st.session_state.y_train = y_train
    st.session_state.y_test = y_test

    # Train every registered classifier, updating a progress bar as we go.
    classifiers = self.model_manager.get_classifiers()
    results = []
    with st.spinner('Training models... Please wait.'):
        progress_bar = st.progress(0)
        for idx, (name, clf_dict) in enumerate(classifiers.items()):
            results.append(self.model_manager.train_and_evaluate_model(
                clf_dict,
                X_train,
                X_test,
                y_train,
                y_test,
                name
            ))
            progress_bar.progress((idx + 1) / len(classifiers))

    st.session_state.model_results = results
    st.session_state.data_generated = True

    # Successful subset, computed once and reused below.
    successful_results = [r for r in results if r['status'] == 'success']
    if successful_results:
        st.session_state.best_model = max(successful_results, key=lambda x: x['accuracy'])

    st.session_state.split_info = {
        'total_samples': len(X),
        'train_samples': len(X_train),
        'test_samples': len(X_test),
        'test_percentage': test_size
    }
    st.session_state.feature_configs = feature_configs

    if successful_results:
        # Pre-render static figures so later reruns don't retrain or redraw.
        st.session_state.confusion_matrices_fig = self.visualizer.create_confusion_matrices_plot(
            successful_results,
            st.session_state.y_test
        )
        st.session_state.learning_curves_fig = self.generate_learning_curves_figure(successful_results)
|
| 645 |
+
|
| 646 |
+
def generate_learning_curves_figure(self, successful_results):
    """Build a matplotlib figure with one learning-curve subplot per model.

    Fix: the original called `successful_results.sort(...)`, silently
    reordering the caller's list; we now sort a copy via `sorted()`.

    Args:
        successful_results: result dicts for models that trained successfully.

    Returns:
        matplotlib.figure.Figure: a grid (2 columns) of learning curves,
        ordered by descending final accuracy.
    """
    ordered = sorted(successful_results, key=lambda r: r['accuracy'], reverse=True)
    n_models = len(ordered)
    n_cols = 2
    n_rows = (n_models + n_cols - 1) // n_cols  # ceil division

    fig_learning = plt.figure(figsize=(15, 5 * n_rows))

    for idx, result in enumerate(ordered):
        ax = plt.subplot(n_rows, n_cols, idx + 1)

        model_name = result['model_name']
        model = result['model']
        scaler = joblib.load(result['scaler'])

        features = st.session_state.df.drop('target', axis=1)
        # MultinomialNB was fit on shifted non-negative inputs, so the same
        # transform must be applied before scaling.
        if model_name == 'MultinomialNB':
            X_scaled = scaler.transform(self.model_manager.ensure_non_negative(features))
        else:
            X_scaled = scaler.transform(features)

        y = st.session_state.df['target']

        self.visualizer.plot_learning_curve(
            model,
            X_scaled,
            y,
            f'Learning Curve - {model_name}\nFinal Accuracy: {result["accuracy"]:.4f}',
            ax
        )

    plt.tight_layout()
    return fig_learning
|
| 682 |
+
|
| 683 |
+
def display_model_comparison(self):
    """Render a table comparing every trained model and return it.

    Returns:
        pd.DataFrame: one row per model (successful or failed), sorted by
        descending accuracy; failed models carry zeroed metrics.
    """
    st.subheader("Model Comparison")

    rows = []
    for result in st.session_state.model_results:
        if result['status'] == 'success':
            report = classification_report(
                st.session_state.y_test,
                result['predictions'],
                output_dict=True
            )
            macro = report['macro avg']
            rows.append({
                'Model': result['model_name'],
                'Accuracy': float(f"{result['accuracy']:.4f}"),
                'Precision': float(f"{macro['precision']:.4f}"),
                'Recall': float(f"{macro['recall']:.4f}"),
                'F1-Score': float(f"{macro['f1-score']:.4f}"),
                'Training Time (s)': float(f"{result['training_time']:.3f}"),
                'Status': 'Success',
            })
        else:
            # Failed models still appear so the user can see what broke.
            rows.append({
                'Model': result['model_name'],
                'Accuracy': 0,
                'Precision': 0,
                'Recall': 0,
                'F1-Score': 0,
                'Training Time (s)': 0,
                'Status': result['status'],
            })

    comparison_df = pd.DataFrame(rows).sort_values('Accuracy', ascending=False)

    st.dataframe(comparison_df.style.format({
        'Accuracy': '{:.4f}',
        'Precision': '{:.4f}',
        'Recall': '{:.4f}',
        'F1-Score': '{:.4f}',
        'Training Time (s)': '{:.3f}'
    }))

    return comparison_df
|
| 730 |
+
|
| 731 |
+
def display_metric_visualization(self, comparison_df):
    """Bar-chart one user-selected metric across successfully trained models.

    Args:
        comparison_df: the table produced by display_model_comparison().

    Returns:
        pd.DataFrame: the successful-models subset, sorted for display.
    """
    metric = st.selectbox(
        "Select metric to visualize",
        ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Training Time (s)']
    )

    ok_df = comparison_df[comparison_df['Status'] == 'Success']

    # Faster training is better, so training time sorts ascending;
    # every other metric sorts descending.
    ok_df = ok_df.sort_values(metric, ascending=(metric == 'Training Time (s)'))

    fig = px.bar(
        ok_df,
        x='Model',
        y=metric,
        title=f"Model {metric} Comparison",
        color=metric,
        text=metric
    )

    fig.update_layout(
        xaxis_tickangle=-45,
        showlegend=True,
        height=500,
        # Scores live in [0, 1]; training time needs an auto-scaled axis.
        yaxis=dict(range=None if metric == 'Training Time (s)' else [0, 1])
    )

    fig.update_traces(
        texttemplate='%{text:.4f}',
        textposition='outside',
        textangle=0
    )

    st.plotly_chart(fig)
    return ok_df
|
| 771 |
+
|
| 772 |
+
def display_best_model_performance(self):
    """Show headline metrics and the classification report for the best model.

    No-op when no best model has been recorded in session state.
    """
    if not hasattr(st.session_state, 'best_model'):
        return

    best = st.session_state.best_model
    st.subheader("Best Model Performance")
    st.write(f"Best Model: **{best['model_name']}**")
    st.write(f"Accuracy: {best['accuracy']:.4f}")

    st.write("Classification Report (Best Model):")
    report = classification_report(
        st.session_state.y_test,
        best['predictions'],
        output_dict=True
    )
    st.dataframe(pd.DataFrame(report).transpose().style.format('{:.4f}'))
|
| 788 |
+
|
| 789 |
+
def display_dataset_info(self):
    """Show total / train / test sample counts from the stored split info.

    No-op when no split information has been recorded yet.
    """
    info = st.session_state.split_info
    if not info:
        return

    st.subheader("Dataset Split Information")
    total_col, train_col, test_col = st.columns(3)

    with total_col:
        st.metric(
            "Total Samples",
            info['total_samples']
        )

    with train_col:
        st.metric(
            "Training Samples",
            f"{info['train_samples']} ({100 - info['test_percentage']}%)"
        )

    with test_col:
        st.metric(
            "Testing Samples",
            f"{info['test_samples']} ({info['test_percentage']}%)"
        )
|
| 814 |
+
|
| 815 |
+
def display_feature_configs(self):
    """Tabulate how each synthetic feature was configured."""
    st.subheader("Feature Configurations")
    rows = [
        {
            'Feature': feature,
            'Type': cfg['type'],
            'Std Dev': cfg['std'],
            # Only user-defined features carry an explicit center.
            'Center': cfg['center'] if cfg['type'] == 'user-defined' else 'Random',
        }
        for feature, cfg in st.session_state.feature_configs.items()
    ]
    st.table(pd.DataFrame(rows))
|
| 827 |
+
|
| 828 |
+
def display_data_samples(self):
    """Show a few random rows per class, raw and scaled with the best model's scaler."""
    st.subheader("Generated Data Sample")

    samples_per_class = 2  # rows shown per class
    per_class = []
    for cls in st.session_state.df['target'].unique():
        subset = st.session_state.df[st.session_state.df['target'] == cls]
        per_class.append(subset.sample(n=min(samples_per_class, len(subset))))

    # Shuffle the combined sample so classes are interleaved.
    sampled_df = pd.concat(per_class).sample(frac=1).reset_index(drop=True)

    raw_col, scaled_col = st.columns(2)

    with raw_col:
        st.write("Original Data (Random samples from each class):")
        st.write(sampled_df)

    with scaled_col:
        st.write("Scaled Data (using best model's scaler):")
        best = st.session_state.best_model
        if best and best['status'] == 'success':
            scaler = joblib.load(best['scaler'])
            features_df = sampled_df.drop('target', axis=1)

            # MultinomialNB was trained on shifted non-negative inputs.
            if best['model_name'] == 'MultinomialNB':
                scaled = scaler.transform(self.model_manager.ensure_non_negative(features_df))
            else:
                scaled = scaler.transform(features_df)

            scaled_df = pd.DataFrame(
                scaled,
                columns=features_df.columns,
                index=features_df.index
            )
            scaled_df['target'] = sampled_df['target']
            st.write(scaled_df)
        else:
            st.write("No scaled data available (best model not found)")
|
| 873 |
+
|
| 874 |
+
def display_confusion_matrices(self):
    """Explain and render the cached confusion-matrix figure, if one exists."""
    st.subheader("Confusion Matrices")
    st.write("""
    Confusion matrices show the model's prediction performance across different classes.
    - Each row represents the actual class
    - Each column represents the predicted class
    - Diagonal elements represent correct predictions (True Positives for each class)
    - Off-diagonal elements represent incorrect predictions
    - Numbers show how many samples were classified for each combination
    - Colors range from yellow (high values) to green-blue (low values) using the viridis colormap
    """)
    fig = st.session_state.confusion_matrices_fig
    if fig is not None:
        st.pyplot(fig)
        plt.close()  # free the matplotlib figure after rendering
|
| 889 |
+
|
| 890 |
+
|
| 891 |
+
|
| 892 |
+
def display_performance_summary(self, successful_df):
    """Summary plot of metrics for a user-selected subset of models.

    Fix: the caller (run) passes the *unfiltered* comparison table here, so
    failed models could previously appear in the multiselect. We now filter
    to Status == 'Success' before listing choices.

    Args:
        successful_df: model-comparison DataFrame (may contain failed rows).
    """
    st.subheader("Performance Metrics Summary")

    # Defensive filter: only offer models that actually trained.
    successful_df = successful_df[successful_df['Status'] == 'Success']

    all_models = successful_df['Model'].unique().tolist()

    col1, _ = st.columns([3, 1])
    with col1:
        selected_models = st.multiselect(
            "Select models to compare",
            all_models,
            default=all_models
        )

    if not selected_models:
        st.warning("Please select at least one model to display the comparison.")
        return

    fig_summary = self.visualizer.create_performance_summary_plot(
        successful_df,
        selected_models
    )
    st.plotly_chart(fig_summary, use_container_width=True)
|
| 916 |
+
|
| 917 |
+
def display_saved_models(self):
    """List successfully saved models and offer pickle downloads for each."""
    st.subheader("Saved Models")

    saved = []
    for result in st.session_state.model_results:
        if result['status'] == 'success' and result['model_path']:
            # Re-load the persisted artifacts and serialize them for download.
            model = joblib.load(result['model_path'])
            scaler = joblib.load(result['scaler'])
            saved.append({
                'Model': result['model_name'],
                'Accuracy': result['accuracy'],
                'Model_Binary': pickle.dumps(model),
                'Scaler_Binary': pickle.dumps(scaler),
            })

    if not saved:
        st.info("No models were saved. Models are saved automatically when accuracy exceeds 0.5")
        return

    # Summary table of what is downloadable.
    summary = pd.DataFrame(
        [{'Model': m['Model'], 'Accuracy': m['Accuracy']} for m in saved]
    )
    st.dataframe(summary.style.format({'Accuracy': '{:.4f}'}))

    st.write("Download Models:")
    for entry in saved:
        model_col, scaler_col = st.columns(2)

        with model_col:
            st.download_button(
                label=f"Download {entry['Model']} Model",
                data=entry['Model_Binary'],
                file_name=f"{entry['Model']}_model.pkl",
                mime="application/octet-stream"
            )

        with scaler_col:
            st.download_button(
                label=f"Download {entry['Model']} Scaler",
                data=entry['Scaler_Binary'],
                file_name=f"{entry['Model']}_scaler.pkl",
                mime="application/octet-stream"
            )
|
| 972 |
+
|
| 973 |
+
def display_download_section(self):
    """Offer CSV downloads of the raw dataset and the best-scaler-transformed one."""
    st.subheader("Download Dataset")
    raw_col, scaled_col = st.columns(2)

    with raw_col:
        if st.session_state.df is not None:
            csv = st.session_state.df.to_csv(index=False)
            st.download_button(
                label="Download Original Dataset (CSV)",
                data=csv,
                file_name=f"synthetic_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime='text/csv',
                help="Download the original unscaled dataset"
            )

    with scaled_col:
        best = st.session_state.best_model
        if best and best['status'] == 'success':
            scaler = joblib.load(best['scaler'])
            features_df = st.session_state.df.drop('target', axis=1)

            # MultinomialNB was trained on shifted non-negative inputs.
            if best['model_name'] == 'MultinomialNB':
                features_scaled = scaler.transform(
                    self.model_manager.ensure_non_negative(features_df)
                )
            else:
                features_scaled = scaler.transform(features_df)

            scaled_df = pd.DataFrame(
                features_scaled,
                columns=features_df.columns,
                index=features_df.index
            )
            scaled_df['target'] = st.session_state.df['target']

            st.download_button(
                label="Download Scaled Dataset (CSV)",
                data=scaled_df.to_csv(index=False),
                file_name=f"synthetic_data_scaled_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime='text/csv',
                help="Download the scaled dataset (using best model's scaler)"
            )
|
| 1016 |
+
|
| 1017 |
+
def display_dataset_statistics(self):
    """Show describe() tables for the raw and (when available) scaled dataset."""
    with st.expander("Dataset Statistics"):
        raw_col, scaled_col = st.columns(2)

        with raw_col:
            st.write("Original Dataset Statistics:")
            st.write(st.session_state.df.describe())

        with scaled_col:
            best = st.session_state.best_model
            if best and best['status'] == 'success':
                st.write("Scaled Dataset Statistics:")
                scaler = joblib.load(best['scaler'])
                features_df = st.session_state.df.drop('target', axis=1)

                # MultinomialNB was trained on shifted non-negative inputs.
                if best['model_name'] == 'MultinomialNB':
                    features_scaled = scaler.transform(
                        self.model_manager.ensure_non_negative(features_df)
                    )
                else:
                    features_scaled = scaler.transform(features_df)

                scaled_df = pd.DataFrame(
                    features_scaled,
                    columns=features_df.columns,
                    index=features_df.index
                )
                scaled_df['target'] = st.session_state.df['target']
                st.write(scaled_df.describe())
|
| 1046 |
+
|
| 1047 |
+
def display_learning_curves(self):
    """Explain and render the cached learning-curves figure, if one exists."""
    st.subheader("Learning Curves")
    st.write("""
    Learning curves show how model performance changes with increasing training data.
    - Blue line: Training score
    - Orange line: Cross-validation score
    - Shaded areas represent standard deviation
    """)
    fig = st.session_state.learning_curves_fig
    if fig is not None:
        st.pyplot(fig)
        plt.close()  # free the matplotlib figure after rendering
|
| 1060 |
+
|
| 1061 |
+
def display_feature_visualization(self):
    """Interactive 2D/3D scatter of user-chosen features, colored by class.

    Fix: guard against datasets with too few features — previously a 2-feature
    dataset in 3D mode produced an empty selectbox (returning None) and a
    Plotly crash. Behavior is unchanged for datasets with enough features.
    """
    st.subheader("Feature Visualization")
    features = st.session_state.features
    plot_type = st.radio("Select plot type", ["2D Plot", "3D Plot"], index=1)

    if plot_type == "2D Plot":
        if len(features) < 2:
            st.warning("At least two features are required for a 2D plot.")
            return

        col1, col2 = st.columns(2)
        with col1:
            x_feature = st.selectbox(
                "Select X-axis feature",
                features,
                index=0,
                key='x_2d'
            )
        with col2:
            # Exclude the X choice so the axes are always distinct.
            y_options = [f for f in features if f != x_feature]
            y_feature = st.selectbox(
                "Select Y-axis feature",
                y_options,
                index=0,
                key='y_2d'
            )

        fig = px.scatter(
            st.session_state.df,
            x=x_feature,
            y=y_feature,
            color='target',
            title=f"2D Visualization of {x_feature} vs {y_feature}",
            labels={'target': 'Class'}
        )
        st.plotly_chart(fig, use_container_width=True)

    else:  # 3D Plot
        if len(features) < 3:
            st.warning("At least three features are required for a 3D plot.")
            return

        col1, col2, col3 = st.columns(3)
        with col1:
            x_feature = st.selectbox(
                "Select X-axis feature",
                features,
                index=0,
                key='x_3d'
            )
        with col2:
            y_options = [f for f in features if f != x_feature]
            y_feature = st.selectbox(
                "Select Y-axis feature",
                y_options,
                index=0,
                key='y_3d'
            )
        with col3:
            z_options = [f for f in features if f not in (x_feature, y_feature)]
            z_feature = st.selectbox(
                "Select Z-axis feature",
                z_options,
                index=0,
                key='z_3d'
            )

        fig = px.scatter_3d(
            st.session_state.df,
            x=x_feature,
            y=y_feature,
            z=z_feature,
            color='target',
            title=f"3D Visualization of {x_feature} vs {y_feature} vs {z_feature}",
            labels={'target': 'Class'}
        )
        fig.update_layout(
            scene=dict(
                xaxis_title=x_feature,
                yaxis_title=y_feature,
                zaxis_title=z_feature
            ),
            scene_camera=dict(
                up=dict(x=0, y=0, z=1),
                center=dict(x=0, y=0, z=0),
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        )
        st.plotly_chart(fig, use_container_width=True)
|
| 1150 |
+
|
| 1151 |
+
def get_data_source(self):
    """Ask whether to synthesize data or upload a CSV; cache and return the choice.

    Returns:
        str: 'synthetic' or 'upload'.
    """
    st.sidebar.header("Data Source")
    choice = st.sidebar.radio(
        "Choose data source",
        ['Generate Synthetic Data', 'Upload Dataset'],
        key='data_source_radio'
    )
    st.session_state.data_source = (
        'synthetic' if choice == 'Generate Synthetic Data' else 'upload'
    )
    return st.session_state.data_source
|
| 1161 |
+
|
| 1162 |
+
def upload_dataset(self):
    """Load a user-supplied CSV, pick a target column, and build a train/test split.

    Fix: reject empty files and files with fewer than two columns up front,
    with a clear sidebar message, instead of failing later inside
    train_test_split with an opaque error.

    Returns:
        bool: True when a dataset was successfully loaded and split.
    """
    st.sidebar.header("Upload Dataset")
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload a CSV file with features and target column"
    )

    if uploaded_file is None:
        return False

    try:
        df = pd.read_csv(uploaded_file)

        # Basic validation before offering any configuration widgets.
        if df.empty:
            st.sidebar.error("The uploaded file contains no rows.")
            return False
        if len(df.columns) < 2:
            st.sidebar.error("The dataset needs at least one feature column and one target column.")
            return False

        # Let user select target column
        target_col = st.sidebar.selectbox(
            "Select target column",
            df.columns.tolist()
        )

        # Everything except the target is treated as a feature.
        features = [col for col in df.columns if col != target_col]
        X = df[features]
        y = df[target_col]

        st.session_state.df = df
        st.session_state.features = features

        test_size = st.sidebar.slider(
            "Test Size",
            min_value=10,
            max_value=50,
            value=30,
            step=5,
            format="%d%%",
            help="Percentage of data to use for testing"
        )

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size / 100,
            random_state=42
        )

        st.session_state.X_train = X_train
        st.session_state.X_test = X_test
        st.session_state.y_train = y_train
        st.session_state.y_test = y_test

        st.session_state.split_info = {
            'total_samples': len(X),
            'train_samples': len(X_train),
            'test_samples': len(X_test),
            'test_percentage': test_size
        }

        return True
    except Exception as e:
        # Surface parse/split failures to the user rather than crashing the app.
        st.sidebar.error(f"Error loading dataset: {str(e)}")
        return False
|
| 1226 |
+
|
| 1227 |
+
def run(self):
    """Main application logic"""
    # Page config and session-state defaults must be set before any widgets render.
    self.setup_page_config()
    self.initialize_session_state()

    st.title("ML Model Generator")

    # Get data source choice ('synthetic' or 'upload') from the sidebar.
    data_source = self.get_data_source()

    if data_source == 'synthetic':
        st.sidebar.header("Synthetic Data Generation")
        # Get inputs from sidebar for synthetic data
        features, feature_configs, classes = self.get_sidebar_inputs()
        class_configs = self.get_class_configs(classes, features)
        n_samples, test_size = self.get_training_params()

        # Store features in session state so display helpers can find them.
        st.session_state.features = features

        # Generate Data button: generation + training + caching happens in one call.
        if st.sidebar.button("Generate Data and Train Models"):
            self.generate_and_train(n_samples, feature_configs, classes, class_configs, test_size)

    else:  # upload
        # Handle dataset upload; upload_dataset() stores the split in session state.
        if self.upload_dataset():
            if st.sidebar.button("Train Models"):
                # Get classifiers and train models on the uploaded split.
                classifiers = self.model_manager.get_classifiers()
                results = []

                with st.spinner('Training models... Please wait.'):
                    progress_bar = st.progress(0)
                    for idx, (name, clf_dict) in enumerate(classifiers.items()):
                        result = self.model_manager.train_and_evaluate_model(
                            clf_dict,
                            st.session_state.X_train,
                            st.session_state.X_test,
                            st.session_state.y_train,
                            st.session_state.y_test,
                            name
                        )
                        results.append(result)
                        progress_bar.progress((idx + 1) / len(classifiers))

                st.session_state.model_results = results
                st.session_state.data_generated = True

                # Find best model among those that trained successfully.
                successful_results = [r for r in results if r['status'] == 'success']
                if successful_results:
                    best_model = max(successful_results, key=lambda x: x['accuracy'])
                    st.session_state.best_model = best_model

                    # Generate static visualizations once, cached for reruns.
                    st.session_state.confusion_matrices_fig = self.visualizer.create_confusion_matrices_plot(
                        successful_results,
                        st.session_state.y_test
                    )
                    st.session_state.learning_curves_fig = self.generate_learning_curves_figure(successful_results)

    # Display results if data has been generated/uploaded and trained
    if st.session_state.data_generated:
        self.display_dataset_info()
        self.display_data_samples()
        self.display_feature_visualization()
        self.display_download_section()
        self.display_dataset_statistics()
        self.display_best_model_performance()
        successful_df = self.display_model_comparison()

        # NOTE(review): successful_df is the full comparison table (including
        # failed rows) — confirm display_performance_summary tolerates that.
        if successful_df is not None and not successful_df.empty:
            self.display_performance_summary(successful_df)
            self.display_saved_models()
            self.display_learning_curves()
            self.display_confusion_matrices()
    else:
        if data_source == 'synthetic':
            st.info("Please generate data using the sidebar button to view visualizations and results.")
        else:
            st.info("Please upload a dataset and click 'Train Models' to view visualizations and results.")
|
| 1309 |
+
|
| 1310 |
+
def main():
    """Entry point: construct the Streamlit UI and hand control to it."""
    StreamlitUI().run()

if __name__ == "__main__":
    main()
|
| 1316 |
+
|
README.md
CHANGED
|
@@ -1,13 +1,102 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Synthetic Data Generation and ML Model Training
|
| 2 |
+
|
| 3 |
+
A comprehensive Streamlit application for generating synthetic data, training machine learning models, and educational visualization of algorithm performance.
|
| 4 |
+
|
| 5 |
+
## Live Demo
|
| 6 |
+
|
| 7 |
+
**[Try the application online!](https://projectsyntheticdatageneration.streamlit.app/)**
|
| 8 |
+
|
| 9 |
+
## Overview
|
| 10 |
+
|
| 11 |
+
This application provides an end-to-end platform for:
|
| 12 |
+
1. Generating customizable synthetic datasets
|
| 13 |
+
2. Training and evaluating multiple machine learning classifiers
|
| 14 |
+
3. Visualizing model performance and data characteristics
|
| 15 |
+
4. Learning about different ML algorithms through interactive education
|
| 16 |
+
5. Implementing and testing trained models
|
| 17 |
+
|
| 18 |
+
## Features
|
| 19 |
+
|
| 20 |
+
### Main App (`App.py`)
|
| 21 |
+
- Synthetic data generation with customizable feature distributions
|
| 22 |
+
- Support for multiple classifier algorithms with automatic preprocessing
|
| 23 |
+
- Real-time visualization of model performance metrics
|
| 24 |
+
- Model comparison and selection
|
| 25 |
+
- Dataset exploration and visualization tools
|
| 26 |
+
- Model saving and exporting functionality
|
| 27 |
+
|
| 28 |
+
### Algorithm Education (`pages/02_Algorithm_Education.py`)
|
| 29 |
+
- Detailed explanations of various ML classification algorithms
|
| 30 |
+
- Interactive demonstrations with customizable parameters
|
| 31 |
+
- Mathematical foundations and implementation details
|
| 32 |
+
- Algorithm strengths, limitations, and use cases
|
| 33 |
+
- Performance visualization across different data distributions
|
| 34 |
+
|
| 35 |
+
### Model Implementation (`pages/03_Model_implementation.py`)
|
| 36 |
+
- Upload and use previously trained models
|
| 37 |
+
- Real-time prediction with custom input values
|
| 38 |
+
- Model and scaler integration
|
| 39 |
+
|
| 40 |
+
## Installation
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
# Clone the repository
|
| 44 |
+
git clone https://github.com/yourusername/synthetic_data_generation.git
|
| 45 |
+
cd synthetic_data_generation
|
| 46 |
+
|
| 47 |
+
# Install dependencies
|
| 48 |
+
pip install -r requirements.txt
|
| 49 |
+
|
| 50 |
+
# Run the application
|
| 51 |
+
streamlit run App.py
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Requirements
|
| 55 |
+
|
| 56 |
+
- Python 3.7+
|
| 57 |
+
- streamlit>=1.28.0
|
| 58 |
+
- numpy>=1.24.0
|
| 59 |
+
- pandas>=2.0.0
|
| 60 |
+
- scikit-learn>=1.2.0
|
| 61 |
+
- plotly>=5.13.0
|
| 62 |
+
- seaborn>=0.12.0
|
| 63 |
+
- matplotlib>=3.7.0
|
| 64 |
+
- joblib>=1.2.0
|
| 65 |
+
|
| 66 |
+
## Usage
|
| 67 |
+
|
| 68 |
+
### Generating Synthetic Data
|
| 69 |
+
1. Define features and their distributions
|
| 70 |
+
2. Configure class characteristics
|
| 71 |
+
3. Set sample size and other generation parameters
|
| 72 |
+
4. Generate and explore your synthetic dataset
|
| 73 |
+
|
| 74 |
+
### Training Models
|
| 75 |
+
1. Select classifier algorithms to evaluate
|
| 76 |
+
2. Configure training parameters (test split, etc.)
|
| 77 |
+
3. Train models and view performance metrics
|
| 78 |
+
4. Compare model results through interactive visualizations
|
| 79 |
+
|
| 80 |
+
### Educational Resources
|
| 81 |
+
1. Navigate to the Algorithm Education page
|
| 82 |
+
2. Select an algorithm to learn about
|
| 83 |
+
3. Interact with the demo to see how parameters affect performance
|
| 84 |
+
4. Examine mathematical foundations and implementation details
|
| 85 |
+
|
| 86 |
+
### Model Implementation
|
| 87 |
+
1. Upload previously saved model and scaler files
|
| 88 |
+
2. Input feature values or generate random test values
|
| 89 |
+
3. Make predictions and view results
|
| 90 |
+
|
| 91 |
+
## Project Structure
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
synthetic_data_generation/
|
| 95 |
+
├── App.py # Main application
|
| 96 |
+
├── models/ # Directory for saved models
|
| 97 |
+
├── pages/ # Additional application pages
|
| 98 |
+
│ ├── 02_Algorithm_Education.py # Educational content about ML algorithms
|
| 99 |
+
│ └── 03_Model_implementation.py # Model deployment and usage interface
|
| 100 |
+
├── temp_uploads/ # Temporary directory for file uploads
|
| 101 |
+
└── requirements.txt # Project dependencies
|
| 102 |
+
```
|
pages/02_Algorithm_Education.py
ADDED
|
@@ -0,0 +1,1250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.naive_bayes import GaussianNB
|
| 4 |
+
from sklearn.svm import LinearSVC, SVC
|
| 5 |
+
from sklearn.neural_network import MLPClassifier
|
| 6 |
+
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
|
| 7 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 8 |
+
from sklearn.linear_model import RidgeClassifier
|
| 9 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 10 |
+
from sklearn.ensemble import AdaBoostClassifier
|
| 11 |
+
from sklearn.metrics import accuracy_score, confusion_matrix
|
| 12 |
+
from sklearn.model_selection import train_test_split, learning_curve
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import seaborn as sns
|
| 15 |
+
|
| 16 |
+
def setup_page_config():
    """Apply the Streamlit page metadata: title, icon, and wide layout.

    Must run before any other Streamlit call on the page, per the
    st.set_page_config contract.
    """
    # Gather the options in one mapping so the configuration reads as data.
    page_options = {
        "page_title": "Algorithm Education",
        "page_icon": "🤖",
        "layout": "wide",
    }
    st.set_page_config(**page_options)
|
| 23 |
+
|
| 24 |
+
def page_introduction():
    """Render the page header and the introductory guide text.

    Emits the page title followed by a single markdown body describing
    what the guide contains, how to use it, which algorithms are covered,
    and why algorithm selection matters.
    """
    st.title("Machine Learning Algorithm Education 🎓")

    # Keep the whole introduction in one literal so the rendered markdown
    # is a single block; st.markdown receives it unchanged.
    intro_text = """
    Welcome to the Algorithm Education page! This interactive guide helps you understand various machine learning
    algorithms used in classification tasks. Each algorithm is explained in detail with:

    - 📝 Clear descriptions and explanations
    - ✅ Advantages and limitations
    - 🎯 Practical use cases
    - 📊 Mathematical foundations
    - 💻 Implementation examples
    - 🔬 Interactive demonstrations
    - 📚 Academic references

    ### How to Use This Guide
    1. Select an algorithm from the dropdown menu below
    2. Explore its characteristics and implementation details
    3. Try the interactive demo with different datasets
    4. Compare performance metrics and visualizations

    ### Available Algorithms
    This guide covers popular classification algorithms including:
    - Naive Bayes variants
    - Support Vector Machines
    - Neural Networks
    - Tree-based methods
    - Nearest Neighbors
    - Linear Classifiers
    - Ensemble Methods

    ### Why Understanding Algorithms Matters
    Choosing the right algorithm for your machine learning task is crucial for:
    - Achieving optimal performance
    - Efficient resource utilization
    - Meeting specific problem constraints
    - Understanding model behavior and limitations
    """
    st.markdown(intro_text)
|
| 63 |
+
|
| 64 |
+
def algorithm_info():
|
| 65 |
+
"""Display detailed algorithm information"""
|
| 66 |
+
# First show the introduction
|
| 67 |
+
page_introduction()
|
| 68 |
+
|
| 69 |
+
algorithms = {
|
| 70 |
+
"Gaussian Naive Bayes (GaussianNB)": {
|
| 71 |
+
"description": """
|
| 72 |
+
A probabilistic classifier based on Bayes' theorem with strong independence assumptions between features.
|
| 73 |
+
Assumes features follow a Gaussian (normal) distribution.
|
| 74 |
+
""",
|
| 75 |
+
"pros": [
|
| 76 |
+
"Simple and fast",
|
| 77 |
+
"Works well with small datasets",
|
| 78 |
+
"Good for high-dimensional data",
|
| 79 |
+
"Performs well when features are normally distributed"
|
| 80 |
+
],
|
| 81 |
+
"cons": [
|
| 82 |
+
"Assumes feature independence (often unrealistic)",
|
| 83 |
+
"Limited by Gaussian distribution assumption",
|
| 84 |
+
"May underperform when features are highly correlated"
|
| 85 |
+
],
|
| 86 |
+
"use_cases": [
|
| 87 |
+
"Text classification",
|
| 88 |
+
"Spam detection",
|
| 89 |
+
"Medical diagnosis",
|
| 90 |
+
"Real-time prediction scenarios"
|
| 91 |
+
],
|
| 92 |
+
"math_details": {
|
| 93 |
+
"main_formula": r"""
|
| 94 |
+
P(y|x_1,...,x_n) = \frac{P(y)\prod_{i=1}^{n}P(x_i|y)}{P(x_1,...,x_n)}
|
| 95 |
+
""",
|
| 96 |
+
"component_formulas": [
|
| 97 |
+
{
|
| 98 |
+
"name": "Gaussian Probability Density",
|
| 99 |
+
"formula": r"""
|
| 100 |
+
P(x_i|y) = \frac{1}{\sqrt{2\pi\sigma^2_y}} \exp\left(-\frac{(x_i-\mu_y)^2}{2\sigma^2_y}\right)
|
| 101 |
+
"""
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"name": "Class Prior Probability",
|
| 105 |
+
"formula": r"""
|
| 106 |
+
P(y) = \frac{\text{number of samples in class y}}{\text{total number of samples}}
|
| 107 |
+
"""
|
| 108 |
+
}
|
| 109 |
+
],
|
| 110 |
+
"explanation": """
|
| 111 |
+
- P(y|x₁,...,xₙ) is the posterior probability of class y given features
|
| 112 |
+
- P(y) is the prior probability of class y
|
| 113 |
+
- P(xᵢ|y) is the likelihood of feature xᵢ given class y
|
| 114 |
+
- μy and σ²y are the mean and variance of features in class y
|
| 115 |
+
"""
|
| 116 |
+
},
|
| 117 |
+
"references": [
|
| 118 |
+
{
|
| 119 |
+
"title": "Naive Bayes and Text Classification",
|
| 120 |
+
"authors": "Sebastian Raschka",
|
| 121 |
+
"publication": "arXiv preprint",
|
| 122 |
+
"year": "2014",
|
| 123 |
+
"url": "https://arxiv.org/abs/1410.5329"
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"title": "scikit-learn: Machine Learning in Python",
|
| 127 |
+
"authors": "Pedregosa et al.",
|
| 128 |
+
"publication": "Journal of Machine Learning Research",
|
| 129 |
+
"year": "2011",
|
| 130 |
+
"url": "https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html"
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 134 |
+
"authors": "Showmik Setta",
|
| 135 |
+
"publication": "Medium",
|
| 136 |
+
"year": "2023",
|
| 137 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 138 |
+
}
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
"Linear Support Vector Classification (LinearSVC)": {
|
| 142 |
+
"description": """
|
| 143 |
+
A linear classifier that finds the hyperplane that best separates classes by maximizing the margin between them.
|
| 144 |
+
Optimized implementation of Support Vector Classification for linear classification.
|
| 145 |
+
""",
|
| 146 |
+
"pros": [
|
| 147 |
+
"Effective for high-dimensional spaces",
|
| 148 |
+
"Memory efficient",
|
| 149 |
+
"Faster than standard SVC with linear kernel",
|
| 150 |
+
"Works well when classes are linearly separable"
|
| 151 |
+
],
|
| 152 |
+
"cons": [
|
| 153 |
+
"Only suitable for linear classification",
|
| 154 |
+
"Sensitive to feature scaling",
|
| 155 |
+
"May struggle with overlapping classes",
|
| 156 |
+
"No probability estimates by default"
|
| 157 |
+
],
|
| 158 |
+
"use_cases": [
|
| 159 |
+
"Text classification",
|
| 160 |
+
"Image classification",
|
| 161 |
+
"Bioinformatics",
|
| 162 |
+
"High-dimensional data analysis"
|
| 163 |
+
],
|
| 164 |
+
"math_details": {
|
| 165 |
+
"main_formula": r"""
|
| 166 |
+
\min_{w,b} \frac{1}{2}||w||^2 + C\sum_{i=1}^{n} \max(0, 1-y_i(w^Tx_i+b))
|
| 167 |
+
""",
|
| 168 |
+
"component_formulas": [
|
| 169 |
+
{
|
| 170 |
+
"name": "Decision Function",
|
| 171 |
+
"formula": r"""
|
| 172 |
+
f(x) = w^Tx + b
|
| 173 |
+
"""
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"name": "Margin Width",
|
| 177 |
+
"formula": r"""
|
| 178 |
+
\text{margin} = \frac{2}{||w||}
|
| 179 |
+
"""
|
| 180 |
+
}
|
| 181 |
+
],
|
| 182 |
+
"explanation": """
|
| 183 |
+
- w is the weight vector
|
| 184 |
+
- b is the bias term
|
| 185 |
+
- C is the regularization parameter
|
| 186 |
+
- yᵢ are the true labels (±1)
|
| 187 |
+
- xᵢ are the input features
|
| 188 |
+
"""
|
| 189 |
+
},
|
| 190 |
+
"references": [
|
| 191 |
+
{
|
| 192 |
+
"title": "A Tutorial on Support Vector Machines for Pattern Recognition",
|
| 193 |
+
"authors": "Christopher J.C. Burges",
|
| 194 |
+
"publication": "Data Mining and Knowledge Discovery",
|
| 195 |
+
"year": "1998",
|
| 196 |
+
"url": "https://link.springer.com/article/10.1023/A:1009715923555"
|
| 197 |
+
},
|
| 198 |
+
{
|
| 199 |
+
"title": "Support Vector Machines",
|
| 200 |
+
"authors": "Andrew Ng",
|
| 201 |
+
"publication": "CS229 Lecture Notes, Stanford University",
|
| 202 |
+
"year": "2018",
|
| 203 |
+
"url": "http://cs229.stanford.edu/notes/cs229-notes3.pdf"
|
| 204 |
+
},
|
| 205 |
+
{
|
| 206 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 207 |
+
"authors": "Vidushi Meel",
|
| 208 |
+
"publication": "viso.ai",
|
| 209 |
+
"year": "2021",
|
| 210 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 211 |
+
}
|
| 212 |
+
]
|
| 213 |
+
},
|
| 214 |
+
"Support Vector Classification (SVC)": {
|
| 215 |
+
"description": """
|
| 216 |
+
A powerful classifier that can perform non-linear classification using different kernel functions to transform
|
| 217 |
+
the feature space. Creates an optimal hyperplane in a transformed feature space.
|
| 218 |
+
""",
|
| 219 |
+
"pros": [
|
| 220 |
+
"Effective for non-linear classification",
|
| 221 |
+
"Works well with high-dimensional data",
|
| 222 |
+
"Robust against overfitting",
|
| 223 |
+
"Versatile through different kernel functions"
|
| 224 |
+
],
|
| 225 |
+
"cons": [
|
| 226 |
+
"Computationally intensive for large datasets",
|
| 227 |
+
"Sensitive to feature scaling",
|
| 228 |
+
"Kernel selection can be challenging",
|
| 229 |
+
"Memory intensive for large datasets"
|
| 230 |
+
],
|
| 231 |
+
"use_cases": [
|
| 232 |
+
"Image classification",
|
| 233 |
+
"Handwriting recognition",
|
| 234 |
+
"Bioinformatics",
|
| 235 |
+
"Pattern recognition"
|
| 236 |
+
],
|
| 237 |
+
"math_details": {
|
| 238 |
+
"main_formula": r"""
|
| 239 |
+
\min_{w,b} \frac{1}{2}||w||^2 + C\sum_{i=1}^{n} \xi_i
|
| 240 |
+
""",
|
| 241 |
+
"component_formulas": [
|
| 242 |
+
{
|
| 243 |
+
"name": "Kernel Function (RBF)",
|
| 244 |
+
"formula": r"""
|
| 245 |
+
K(x,x') = \exp\left(-\gamma ||x-x'||^2\right)
|
| 246 |
+
"""
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"name": "Decision Function",
|
| 250 |
+
"formula": r"""
|
| 251 |
+
f(x) = \sum_{i=1}^{n} \alpha_i y_i K(x_i,x) + b
|
| 252 |
+
"""
|
| 253 |
+
}
|
| 254 |
+
],
|
| 255 |
+
"explanation": """
|
| 256 |
+
- K(x,x') is the kernel function
|
| 257 |
+
- γ is the kernel coefficient
|
| 258 |
+
- αᵢ are the dual coefficients
|
| 259 |
+
- ξᵢ are the slack variables
|
| 260 |
+
"""
|
| 261 |
+
},
|
| 262 |
+
"references": [
|
| 263 |
+
{
|
| 264 |
+
"title": "Support Vector Networks",
|
| 265 |
+
"authors": "Cortes C., Vapnik V.",
|
| 266 |
+
"publication": "Machine Learning",
|
| 267 |
+
"year": "1995",
|
| 268 |
+
"url": "https://link.springer.com/article/10.1007/BF00994018"
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"title": "A Practical Guide to Support Vector Classification",
|
| 272 |
+
"authors": "Hsu, Chang, and Lin",
|
| 273 |
+
"publication": "BJU International",
|
| 274 |
+
"year": "2003",
|
| 275 |
+
"url": "https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf"
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 279 |
+
"authors": "Vidushi Meel",
|
| 280 |
+
"publication": "viso.ai",
|
| 281 |
+
"year": "2021",
|
| 282 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 283 |
+
}
|
| 284 |
+
]
|
| 285 |
+
},
|
| 286 |
+
"Multi-layer Perceptron (MLPClassifier)": {
|
| 287 |
+
"description": """
|
| 288 |
+
A neural network classifier that learns non-linear models by training multiple layers of nodes.
|
| 289 |
+
Each node uses a non-linear activation function to transform inputs.
|
| 290 |
+
""",
|
| 291 |
+
"pros": [
|
| 292 |
+
"Can learn highly non-linear patterns",
|
| 293 |
+
"Capable of learning complex relationships",
|
| 294 |
+
"Good generalization with proper regularization",
|
| 295 |
+
"Can handle multiple classes naturally"
|
| 296 |
+
],
|
| 297 |
+
"cons": [
|
| 298 |
+
"Requires careful hyperparameter tuning",
|
| 299 |
+
"Computationally intensive",
|
| 300 |
+
"Sensitive to feature scaling",
|
| 301 |
+
"May get stuck in local minima"
|
| 302 |
+
],
|
| 303 |
+
"use_cases": [
|
| 304 |
+
"Image recognition",
|
| 305 |
+
"Speech recognition",
|
| 306 |
+
"Complex pattern recognition",
|
| 307 |
+
"Financial prediction"
|
| 308 |
+
],
|
| 309 |
+
"math_details": {
|
| 310 |
+
"main_formula": r"""
|
| 311 |
+
h_l = \sigma(W_l h_{l-1} + b_l)
|
| 312 |
+
""",
|
| 313 |
+
"component_formulas": [
|
| 314 |
+
{
|
| 315 |
+
"name": "ReLU Activation",
|
| 316 |
+
"formula": r"""
|
| 317 |
+
\sigma(x) = \max(0,x)
|
| 318 |
+
"""
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"name": "Softmax Output",
|
| 322 |
+
"formula": r"""
|
| 323 |
+
P(y=j|x) = \frac{e^{z_j}}{\sum_{k=1}^K e^{z_k}}
|
| 324 |
+
"""
|
| 325 |
+
}
|
| 326 |
+
],
|
| 327 |
+
"explanation": """
|
| 328 |
+
- hₗ is the output of layer l
|
| 329 |
+
- Wₗ is the weight matrix for layer l
|
| 330 |
+
- bₗ is the bias vector for layer l
|
| 331 |
+
- σ is the activation function
|
| 332 |
+
"""
|
| 333 |
+
},
|
| 334 |
+
"references": [
|
| 335 |
+
{
|
| 336 |
+
"title": "Learning representations by back-propagating errors",
|
| 337 |
+
"authors": "Rumelhart, D. E., Hinton, G. E., & Williams, R. J.",
|
| 338 |
+
"publication": "Nature",
|
| 339 |
+
"year": "1986",
|
| 340 |
+
"url": "https://www.nature.com/articles/323533a0"
|
| 341 |
+
},
|
| 342 |
+
{
|
| 343 |
+
"title": "Gradient-based learning applied to document recognition",
|
| 344 |
+
"authors": "LeCun Y., Bottou L., Bengio Y., & Haffner P.",
|
| 345 |
+
"publication": "Proceedings of the IEEE",
|
| 346 |
+
"year": "1998",
|
| 347 |
+
"url": "https://ieeexplore.ieee.org/document/726791"
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 351 |
+
"authors": "Showmik Setta",
|
| 352 |
+
"publication": "Medium",
|
| 353 |
+
"year": "2023",
|
| 354 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 355 |
+
}
|
| 356 |
+
]
|
| 357 |
+
},
|
| 358 |
+
"Extra Trees Classifier": {
|
| 359 |
+
"description": """
|
| 360 |
+
An ensemble method that builds multiple randomized decision trees and averages their predictions.
|
| 361 |
+
Similar to Random Forest but with additional randomization in the tree-building process.
|
| 362 |
+
""",
|
| 363 |
+
"pros": [
|
| 364 |
+
"Lower variance than Random Forest",
|
| 365 |
+
"Faster training than Random Forest",
|
| 366 |
+
"Good at handling high-dimensional data",
|
| 367 |
+
"Less prone to overfitting"
|
| 368 |
+
],
|
| 369 |
+
"cons": [
|
| 370 |
+
"May have slightly lower accuracy than Random Forest",
|
| 371 |
+
"Can be memory intensive",
|
| 372 |
+
"Less interpretable than single decision trees",
|
| 373 |
+
"May require more trees than Random Forest"
|
| 374 |
+
],
|
| 375 |
+
"use_cases": [
|
| 376 |
+
"Feature selection",
|
| 377 |
+
"Large dataset classification",
|
| 378 |
+
"Remote sensing",
|
| 379 |
+
"Biomedical classification"
|
| 380 |
+
],
|
| 381 |
+
"math_details": {
|
| 382 |
+
"main_formula": r"""
|
| 383 |
+
\hat{f}_{et}(x) = \frac{1}{B}\sum_{b=1}^B \hat{f}_b(x)
|
| 384 |
+
""",
|
| 385 |
+
"component_formulas": [
|
| 386 |
+
{
|
| 387 |
+
"name": "Random Split Selection",
|
| 388 |
+
"formula": r"""
|
| 389 |
+
\text{gain}(s,D) = \frac{|D_l|}{|D|}H(D_l) + \frac{|D_r|}{|D|}H(D_r)
|
| 390 |
+
"""
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"name": "Entropy",
|
| 394 |
+
"formula": r"""
|
| 395 |
+
H(D) = -\sum_{k=1}^K p_k\log(p_k)
|
| 396 |
+
"""
|
| 397 |
+
}
|
| 398 |
+
],
|
| 399 |
+
"explanation": """
|
| 400 |
+
- B is the number of trees
|
| 401 |
+
- fᵦ is the prediction of the b-th tree
|
| 402 |
+
- Dₗ and Dᵣ are left and right splits
|
| 403 |
+
- pₖ is the proportion of class k in the node
|
| 404 |
+
"""
|
| 405 |
+
},
|
| 406 |
+
"references": [
|
| 407 |
+
{
|
| 408 |
+
"title": "Extremely randomized trees",
|
| 409 |
+
"authors": "Geurts P., Ernst D., & Wehenkel L.",
|
| 410 |
+
"publication": "Machine Learning",
|
| 411 |
+
"year": "2006",
|
| 412 |
+
"url": "https://link.springer.com/article/10.1007/s10994-006-6226-1"
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"title": "scikit-learn: Machine Learning in Python",
|
| 416 |
+
"authors": "Pedregosa et al.",
|
| 417 |
+
"publication": "Journal of Machine Learning Research",
|
| 418 |
+
"year": "2011",
|
| 419 |
+
"url": "https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html"
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 423 |
+
"authors": "Showmik Setta",
|
| 424 |
+
"publication": "Medium",
|
| 425 |
+
"year": "2023",
|
| 426 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 427 |
+
}
|
| 428 |
+
]
|
| 429 |
+
},
|
| 430 |
+
"Random Forest Classifier": {
|
| 431 |
+
"description": """
|
| 432 |
+
An ensemble learning method that constructs multiple decision trees and combines their predictions.
|
| 433 |
+
Each tree is built using a random subset of features and bootstrap samples of the data.
|
| 434 |
+
""",
|
| 435 |
+
"pros": [
|
| 436 |
+
"Robust against overfitting",
|
| 437 |
+
"Handles non-linear relationships well",
|
| 438 |
+
"Provides feature importance",
|
| 439 |
+
"Works well with high-dimensional data"
|
| 440 |
+
],
|
| 441 |
+
"cons": [
|
| 442 |
+
"Can be computationally intensive",
|
| 443 |
+
"Less interpretable than single decision trees",
|
| 444 |
+
"Memory intensive for large datasets",
|
| 445 |
+
"May overfit on noisy datasets"
|
| 446 |
+
],
|
| 447 |
+
"use_cases": [
|
| 448 |
+
"Credit risk assessment",
|
| 449 |
+
"Medical diagnosis",
|
| 450 |
+
"Market prediction",
|
| 451 |
+
"Image classification"
|
| 452 |
+
],
|
| 453 |
+
"math_details": {
|
| 454 |
+
"main_formula": r"""
|
| 455 |
+
\hat{f}_{rf}(x) = \frac{1}{B}\sum_{b=1}^B \hat{f}_b(x)
|
| 456 |
+
""",
|
| 457 |
+
"component_formulas": [
|
| 458 |
+
{
|
| 459 |
+
"name": "Random Split Selection",
|
| 460 |
+
"formula": r"""
|
| 461 |
+
\text{gain}(s,D) = \frac{|D_l|}{|D|}H(D_l) + \frac{|D_r|}{|D|}H(D_r)
|
| 462 |
+
"""
|
| 463 |
+
},
|
| 464 |
+
{
|
| 465 |
+
"name": "Entropy",
|
| 466 |
+
"formula": r"""
|
| 467 |
+
H(D) = -\sum_{k=1}^K p_k\log(p_k)
|
| 468 |
+
"""
|
| 469 |
+
}
|
| 470 |
+
],
|
| 471 |
+
"explanation": """
|
| 472 |
+
- B is the number of trees
|
| 473 |
+
- fᵦ is the prediction of the b-th tree
|
| 474 |
+
- Dₗ and Dᵣ are left and right splits
|
| 475 |
+
- pₖ is the proportion of class k in the node
|
| 476 |
+
"""
|
| 477 |
+
},
|
| 478 |
+
"references": [
|
| 479 |
+
{
|
| 480 |
+
"title": "Random Forests",
|
| 481 |
+
"authors": "Breiman L.",
|
| 482 |
+
"publication": "Machine Learning",
|
| 483 |
+
"year": "2001",
|
| 484 |
+
"url": "https://link.springer.com/article/10.1023/A:1010933404324"
|
| 485 |
+
},
|
| 486 |
+
{
|
| 487 |
+
"title": "An Introduction to Statistical Learning",
|
| 488 |
+
"authors": "James G., Witten D., Hastie T., & Tibshirani R.",
|
| 489 |
+
"publication": "Springer",
|
| 490 |
+
"year": "2013",
|
| 491 |
+
"url": "https://www.statlearning.com/"
|
| 492 |
+
},
|
| 493 |
+
{
|
| 494 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 495 |
+
"authors": "Vidushi Meel",
|
| 496 |
+
"publication": "viso.ai",
|
| 497 |
+
"year": "2021",
|
| 498 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 499 |
+
}
|
| 500 |
+
]
|
| 501 |
+
},
|
| 502 |
+
"K-Nearest Neighbors (KNeighborsClassifier)": {
|
| 503 |
+
"description": """
|
| 504 |
+
A non-parametric method that classifies a data point based on the majority class of its k nearest neighbors
|
| 505 |
+
in the feature space. Simple but effective algorithm.
|
| 506 |
+
""",
|
| 507 |
+
"pros": [
|
| 508 |
+
"Simple to understand and implement",
|
| 509 |
+
"No training phase",
|
| 510 |
+
"Naturally handles multi-class cases",
|
| 511 |
+
"Non-parametric (no assumptions about data)"
|
| 512 |
+
],
|
| 513 |
+
"cons": [
|
| 514 |
+
"Computationally intensive for large datasets",
|
| 515 |
+
"Sensitive to irrelevant features",
|
| 516 |
+
"Requires feature scaling",
|
| 517 |
+
"Memory intensive (stores all training data)"
|
| 518 |
+
],
|
| 519 |
+
"use_cases": [
|
| 520 |
+
"Recommendation systems",
|
| 521 |
+
"Pattern recognition",
|
| 522 |
+
"Data imputation",
|
| 523 |
+
"Anomaly detection"
|
| 524 |
+
],
|
| 525 |
+
"math_details": {
|
| 526 |
+
"main_formula": r"""
|
| 527 |
+
\hat{f}_{knn}(x) = \frac{1}{k}\sum_{i=1}^k y_i
|
| 528 |
+
""",
|
| 529 |
+
"component_formulas": [
|
| 530 |
+
{
|
| 531 |
+
"name": "Distance Function",
|
| 532 |
+
"formula": r"""
|
| 533 |
+
d(x,x') = \sum_{i=1}^p |x_i - x'_i|^2
|
| 534 |
+
"""
|
| 535 |
+
},
|
| 536 |
+
{
|
| 537 |
+
"name": "Decision Function",
|
| 538 |
+
"formula": r"""
|
| 539 |
+
f(x) = \text{sign}\left(\sum_{i=1}^k y_i \cdot \text{weight}(d(x,x_i))\right)
|
| 540 |
+
"""
|
| 541 |
+
}
|
| 542 |
+
],
|
| 543 |
+
"explanation": """
|
| 544 |
+
- d(x,x') is the distance function
|
| 545 |
+
- xᵢ are the k nearest neighbors
|
| 546 |
+
- yᵢ are the labels of the k nearest neighbors
|
| 547 |
+
- weight(d(x,x')) is the weight function based on distance
|
| 548 |
+
"""
|
| 549 |
+
},
|
| 550 |
+
"references": [
|
| 551 |
+
{
|
| 552 |
+
"title": "Nearest Neighbor Pattern Classification",
|
| 553 |
+
"authors": "Cover T. & Hart P.",
|
| 554 |
+
"publication": "IEEE Transactions on Information Theory",
|
| 555 |
+
"year": "1967",
|
| 556 |
+
"url": "https://ieeexplore.ieee.org/document/1053964"
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"title": "A Survey of Nearest Neighbor Techniques",
|
| 560 |
+
"authors": "Bhatia N. & Vandana",
|
| 561 |
+
"publication": "International Journal of Computer Science and Information Security",
|
| 562 |
+
"year": "2010",
|
| 563 |
+
"url": "https://arxiv.org/abs/1007.0085"
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 567 |
+
"authors": "Vidushi Meel",
|
| 568 |
+
"publication": "viso.ai",
|
| 569 |
+
"year": "2021",
|
| 570 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 571 |
+
}
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
"Ridge Classifier": {
|
| 575 |
+
"description": """
|
| 576 |
+
A linear classifier that uses L2 regularization to prevent overfitting. Similar to logistic regression
|
| 577 |
+
but with different loss function and regularization.
|
| 578 |
+
""",
|
| 579 |
+
"pros": [
|
| 580 |
+
"Good for multicollinear data",
|
| 581 |
+
"Less prone to overfitting",
|
| 582 |
+
"Computationally efficient",
|
| 583 |
+
"Works well with many features"
|
| 584 |
+
],
|
| 585 |
+
"cons": [
|
| 586 |
+
"Only for linear classification",
|
| 587 |
+
"May underfit complex patterns",
|
| 588 |
+
"Sensitive to feature scaling",
|
| 589 |
+
"No probability estimates"
|
| 590 |
+
],
|
| 591 |
+
"use_cases": [
|
| 592 |
+
"High-dimensional data classification",
|
| 593 |
+
"Text classification",
|
| 594 |
+
"Gene expression analysis",
|
| 595 |
+
"Simple binary classification"
|
| 596 |
+
],
|
| 597 |
+
"math_details": {
|
| 598 |
+
"main_formula": r"""
|
| 599 |
+
\min_{w} ||Xw - y||^2_2 + \alpha ||w||^2_2
|
| 600 |
+
""",
|
| 601 |
+
"component_formulas": [
|
| 602 |
+
{
|
| 603 |
+
"name": "Decision Function",
|
| 604 |
+
"formula": r"""
|
| 605 |
+
f(x) = w^Tx
|
| 606 |
+
"""
|
| 607 |
+
},
|
| 608 |
+
{
|
| 609 |
+
"name": "L2 Penalty",
|
| 610 |
+
"formula": r"""
|
| 611 |
+
\text{penalty} = \alpha ||w||^2_2 = \alpha \sum_{j=1}^p w_j^2
|
| 612 |
+
"""
|
| 613 |
+
}
|
| 614 |
+
],
|
| 615 |
+
"explanation": """
|
| 616 |
+
- w is the weight vector
|
| 617 |
+
- α is the regularization strength
|
| 618 |
+
- X is the feature matrix
|
| 619 |
+
- y is the target vector
|
| 620 |
+
- p is the number of features
|
| 621 |
+
"""
|
| 622 |
+
},
|
| 623 |
+
"references": [
|
| 624 |
+
{
|
| 625 |
+
"title": "Ridge Regression: Biased Estimation for Nonorthogonal Problems",
|
| 626 |
+
"authors": "Hoerl A.E. & Kennard R.W.",
|
| 627 |
+
"publication": "Technometrics",
|
| 628 |
+
"year": "1970",
|
| 629 |
+
"url": "https://www.tandfonline.com/doi/abs/10.1080/00401706.1970.10488634"
|
| 630 |
+
},
|
| 631 |
+
{
|
| 632 |
+
"title": "The Elements of Statistical Learning",
|
| 633 |
+
"authors": "Hastie T., Tibshirani R., & Friedman J.",
|
| 634 |
+
"publication": "Springer",
|
| 635 |
+
"year": "2009",
|
| 636 |
+
"url": "https://web.stanford.edu/~hastie/ElemStatLearn/"
|
| 637 |
+
},
|
| 638 |
+
{
|
| 639 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 640 |
+
"authors": "Showmik Setta",
|
| 641 |
+
"publication": "Medium",
|
| 642 |
+
"year": "2023",
|
| 643 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 644 |
+
}
|
| 645 |
+
]
|
| 646 |
+
},
|
| 647 |
+
"Multinomial Naive Bayes": {
|
| 648 |
+
"description": """
|
| 649 |
+
A specialized version of Naive Bayes for multinomially distributed data. Commonly used for text
|
| 650 |
+
classification with word counts.
|
| 651 |
+
""",
|
| 652 |
+
"pros": [
|
| 653 |
+
"Fast training and prediction",
|
| 654 |
+
"Works well with high-dimensional data",
|
| 655 |
+
"Good for text classification",
|
| 656 |
+
"Handles multiple classes well"
|
| 657 |
+
],
|
| 658 |
+
"cons": [
|
| 659 |
+
"Assumes feature independence",
|
| 660 |
+
"Requires non-negative features",
|
| 661 |
+
"Sensitive to feature distribution",
|
| 662 |
+
"May underperform with continuous data"
|
| 663 |
+
],
|
| 664 |
+
"use_cases": [
|
| 665 |
+
"Document classification",
|
| 666 |
+
"Spam detection",
|
| 667 |
+
"Language detection",
|
| 668 |
+
"Topic modeling"
|
| 669 |
+
],
|
| 670 |
+
"math_details": {
|
| 671 |
+
"main_formula": r"""
|
| 672 |
+
P(y|x) = \frac{P(y)\prod_{i=1}^n P(x_i|y)}{\sum_{k} P(y_k)\prod_{i=1}^n P(x_i|y_k)}
|
| 673 |
+
""",
|
| 674 |
+
"component_formulas": [
|
| 675 |
+
{
|
| 676 |
+
"name": "Feature Probability",
|
| 677 |
+
"formula": r"""
|
| 678 |
+
P(x_i|y) = \frac{N_{yi} + \alpha}{N_y + \alpha n}
|
| 679 |
+
"""
|
| 680 |
+
},
|
| 681 |
+
{
|
| 682 |
+
"name": "Log Probability",
|
| 683 |
+
"formula": r"""
|
| 684 |
+
\log P(y|x) = \log P(y) + \sum_{i=1}^n \log P(x_i|y)
|
| 685 |
+
"""
|
| 686 |
+
}
|
| 687 |
+
],
|
| 688 |
+
"explanation": """
|
| 689 |
+
- Nyᵢ is the count of feature i in class y
|
| 690 |
+
- Ny is the total count of all features in class y
|
| 691 |
+
- α is the smoothing parameter
|
| 692 |
+
- n is the number of features
|
| 693 |
+
"""
|
| 694 |
+
},
|
| 695 |
+
"references": [
|
| 696 |
+
{
|
| 697 |
+
"title": "A comparison of event models for naive Bayes text classification",
|
| 698 |
+
"authors": "McCallum A. & Nigam K.",
|
| 699 |
+
"publication": "AAAI-98 Workshop on Learning for Text Categorization",
|
| 700 |
+
"year": "1998",
|
| 701 |
+
"url": "https://www.cs.cmu.edu/~knigam/papers/multinomial-aaaiws98.pdf"
|
| 702 |
+
},
|
| 703 |
+
{
|
| 704 |
+
"title": "An empirical study of the naive Bayes classifier",
|
| 705 |
+
"authors": "Rish I.",
|
| 706 |
+
"publication": "IJCAI 2001 Workshop on Empirical Methods in Artificial Intelligence",
|
| 707 |
+
"year": "2001",
|
| 708 |
+
"url": "https://www.researchgate.net/publication/228845263_An_Empirical_Study_of_the_Naive_Bayes_Classifier"
|
| 709 |
+
},
|
| 710 |
+
{
|
| 711 |
+
"title": "Fundamental Mathematical Formulas Used in Machine Learning",
|
| 712 |
+
"authors": "Showmik Setta",
|
| 713 |
+
"publication": "Medium",
|
| 714 |
+
"year": "2023",
|
| 715 |
+
"url": "https://medium.com/@showmiklovesport/fundamental-mathematical-formulas-used-in-machine-learning-beginner-21c0843e61e0"
|
| 716 |
+
}
|
| 717 |
+
]
|
| 718 |
+
},
|
| 719 |
+
"AdaBoost Classifier": {
|
| 720 |
+
"description": """
|
| 721 |
+
An ensemble method that builds a strong classifier by iteratively adding weak learners, focusing on
|
| 722 |
+
previously misclassified examples.
|
| 723 |
+
""",
|
| 724 |
+
"pros": [
|
| 725 |
+
"Good generalization",
|
| 726 |
+
"Less prone to overfitting",
|
| 727 |
+
"Can identify hard-to-classify instances",
|
| 728 |
+
"Works well with weak learners"
|
| 729 |
+
],
|
| 730 |
+
"cons": [
|
| 731 |
+
"Sensitive to noisy data and outliers",
|
| 732 |
+
"Sequential nature (can't parallelize)",
|
| 733 |
+
"Can be computationally intensive",
|
| 734 |
+
"May require careful tuning"
|
| 735 |
+
],
|
| 736 |
+
"use_cases": [
|
| 737 |
+
"Face detection",
|
| 738 |
+
"Object recognition",
|
| 739 |
+
"Medical diagnosis",
|
| 740 |
+
"Fraud detection"
|
| 741 |
+
],
|
| 742 |
+
"math_details": {
|
| 743 |
+
"main_formula": r"""
|
| 744 |
+
F(x) = \text{sign}\left(\sum_{t=1}^T \alpha_t h_t(x)\right)
|
| 745 |
+
""",
|
| 746 |
+
"component_formulas": [
|
| 747 |
+
{
|
| 748 |
+
"name": "Weak Learner Weight",
|
| 749 |
+
"formula": r"""
|
| 750 |
+
\alpha_t = \frac{1}{2}\ln\left(\frac{1-\epsilon_t}{\epsilon_t}\right)
|
| 751 |
+
"""
|
| 752 |
+
},
|
| 753 |
+
{
|
| 754 |
+
"name": "Sample Weight Update",
|
| 755 |
+
"formula": r"""
|
| 756 |
+
w_{i,t+1} = w_{i,t}\exp(-y_i\alpha_th_t(x_i))
|
| 757 |
+
"""
|
| 758 |
+
}
|
| 759 |
+
],
|
| 760 |
+
"explanation": """
|
| 761 |
+
- hₜ(x) is the weak learner prediction
|
| 762 |
+
- αₜ is the weight of weak learner t
|
| 763 |
+
- εₜ is the weighted error rate
|
| 764 |
+
- wᵢ,ₜ is the weight of sample i at iteration t
|
| 765 |
+
"""
|
| 766 |
+
},
|
| 767 |
+
"references": [
|
| 768 |
+
{
|
| 769 |
+
"title": "A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting",
|
| 770 |
+
"authors": "Freund Y. & Schapire R.E.",
|
| 771 |
+
"publication": "Journal of Computer and System Sciences",
|
| 772 |
+
"year": "1997",
|
| 773 |
+
"url": "https://www.sciencedirect.com/science/article/pii/S002200009791504X"
|
| 774 |
+
},
|
| 775 |
+
{
|
| 776 |
+
"title": "Experiments with a New Boosting Algorithm",
|
| 777 |
+
"authors": "Freund Y. & Schapire R.E.",
|
| 778 |
+
"publication": "International Conference on Machine Learning",
|
| 779 |
+
"year": "1996",
|
| 780 |
+
"url": "https://icml.cc/Conferences/1996/papers/boosting.pdf"
|
| 781 |
+
},
|
| 782 |
+
{
|
| 783 |
+
"title": "Machine Learning Algorithms: Mathematical Deep Dive",
|
| 784 |
+
"authors": "Vidushi Meel",
|
| 785 |
+
"publication": "viso.ai",
|
| 786 |
+
"year": "2021",
|
| 787 |
+
"url": "https://viso.ai/deep-learning/machine-learning-algorithms-mathematical-guide/"
|
| 788 |
+
}
|
| 789 |
+
]
|
| 790 |
+
}
|
| 791 |
+
}
|
| 792 |
+
|
| 793 |
+
# Add implementation details to each algorithm
|
| 794 |
+
for algo_name in algorithms:
|
| 795 |
+
algorithms[algo_name]["implementation"] = {
|
| 796 |
+
"Gaussian Naive Bayes (GaussianNB)": {
|
| 797 |
+
"code": """
|
| 798 |
+
from sklearn.naive_bayes import GaussianNB
|
| 799 |
+
from sklearn.datasets import make_classification
|
| 800 |
+
|
| 801 |
+
# Create sample dataset
|
| 802 |
+
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2)
|
| 803 |
+
|
| 804 |
+
# Initialize and train the model
|
| 805 |
+
gnb = GaussianNB()
|
| 806 |
+
gnb.fit(X, y)
|
| 807 |
+
|
| 808 |
+
# Make predictions
|
| 809 |
+
y_pred = gnb.predict(X)
|
| 810 |
+
""",
|
| 811 |
+
"key_parameters": {
|
| 812 |
+
"var_smoothing": "Portion of the largest variance of all features that is added to variances for calculation stability",
|
| 813 |
+
"priors": "Prior probabilities of the classes"
|
| 814 |
+
},
|
| 815 |
+
"tips": [
|
| 816 |
+
"Normalize features if they have very different scales",
|
| 817 |
+
"Good as a baseline model for comparison",
|
| 818 |
+
"Check feature distributions - should be roughly Gaussian"
|
| 819 |
+
]
|
| 820 |
+
},
|
| 821 |
+
"Linear Support Vector Classification (LinearSVC)": {
|
| 822 |
+
"code": """
|
| 823 |
+
from sklearn.svm import LinearSVC
|
| 824 |
+
from sklearn.preprocessing import StandardScaler
|
| 825 |
+
|
| 826 |
+
# Scale the features
|
| 827 |
+
scaler = StandardScaler()
|
| 828 |
+
X_scaled = scaler.fit_transform(X)
|
| 829 |
+
|
| 830 |
+
# Initialize and train the model
|
| 831 |
+
svc = LinearSVC(random_state=42, max_iter=1000)
|
| 832 |
+
svc.fit(X_scaled, y)
|
| 833 |
+
""",
|
| 834 |
+
"key_parameters": {
|
| 835 |
+
"C": "Regularization parameter (default=1.0)",
|
| 836 |
+
"max_iter": "Maximum iterations for convergence",
|
| 837 |
+
"dual": "Dual or primal formulation"
|
| 838 |
+
},
|
| 839 |
+
"tips": [
|
| 840 |
+
"Always scale your features",
|
| 841 |
+
"Increase max_iter if model doesn't converge",
|
| 842 |
+
"Try different C values using cross-validation"
|
| 843 |
+
]
|
| 844 |
+
},
|
| 845 |
+
"Support Vector Classification (SVC)": {
|
| 846 |
+
"code": """
|
| 847 |
+
from sklearn.svm import SVC
|
| 848 |
+
from sklearn.preprocessing import StandardScaler
|
| 849 |
+
|
| 850 |
+
# Scale the features
|
| 851 |
+
scaler = StandardScaler()
|
| 852 |
+
X_scaled = scaler.fit_transform(X)
|
| 853 |
+
|
| 854 |
+
# Initialize and train the model
|
| 855 |
+
svc = SVC(random_state=42)
|
| 856 |
+
svc.fit(X_scaled, y)
|
| 857 |
+
""",
|
| 858 |
+
"key_parameters": {
|
| 859 |
+
"C": "Regularization parameter (default=1.0)",
|
| 860 |
+
"kernel": "Kernel function used to transform the data",
|
| 861 |
+
"gamma": "Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels"
|
| 862 |
+
},
|
| 863 |
+
"tips": [
|
| 864 |
+
"Always scale your features",
|
| 865 |
+
"Try different kernels and gamma values",
|
| 866 |
+
"Increase C if model underfits",
|
| 867 |
+
"Decrease C if model overfits"
|
| 868 |
+
]
|
| 869 |
+
},
|
| 870 |
+
"Multi-layer Perceptron (MLPClassifier)": {
|
| 871 |
+
"code": """
|
| 872 |
+
from sklearn.neural_network import MLPClassifier
|
| 873 |
+
from sklearn.preprocessing import StandardScaler
|
| 874 |
+
|
| 875 |
+
# Scale the features
|
| 876 |
+
scaler = StandardScaler()
|
| 877 |
+
X_scaled = scaler.fit_transform(X)
|
| 878 |
+
|
| 879 |
+
# Initialize and train the model
|
| 880 |
+
mlp = MLPClassifier(random_state=42)
|
| 881 |
+
mlp.fit(X_scaled, y)
|
| 882 |
+
""",
|
| 883 |
+
"key_parameters": {
|
| 884 |
+
"hidden_layer_sizes": "Number of neurons in each layer",
|
| 885 |
+
"activation": "Activation function used in the hidden layers",
|
| 886 |
+
"solver": "Optimization algorithm used to train the model",
|
| 887 |
+
"alpha": "L2 regularization parameter"
|
| 888 |
+
},
|
| 889 |
+
"tips": [
|
| 890 |
+
"Always scale your features",
|
| 891 |
+
"Try different activation functions",
|
| 892 |
+
"Increase hidden_layer_sizes if model underfits",
|
| 893 |
+
"Decrease hidden_layer_sizes if model overfits"
|
| 894 |
+
]
|
| 895 |
+
},
|
| 896 |
+
"Extra Trees Classifier": {
|
| 897 |
+
"code": """
|
| 898 |
+
from sklearn.ensemble import ExtraTreesClassifier
|
| 899 |
+
from sklearn.preprocessing import StandardScaler
|
| 900 |
+
|
| 901 |
+
# Scale the features
|
| 902 |
+
scaler = StandardScaler()
|
| 903 |
+
X_scaled = scaler.fit_transform(X)
|
| 904 |
+
|
| 905 |
+
# Initialize and train the model
|
| 906 |
+
et = ExtraTreesClassifier(random_state=42)
|
| 907 |
+
et.fit(X_scaled, y)
|
| 908 |
+
""",
|
| 909 |
+
"key_parameters": {
|
| 910 |
+
"n_estimators": "Number of trees in the forest",
|
| 911 |
+
"max_depth": "Maximum depth of the trees",
|
| 912 |
+
"min_samples_split": "Minimum number of samples required to split an internal node",
|
| 913 |
+
"min_samples_leaf": "Minimum number of samples required to be at a leaf node"
|
| 914 |
+
},
|
| 915 |
+
"tips": [
|
| 916 |
+
"Always scale your features",
|
| 917 |
+
"Try different max_depth values",
|
| 918 |
+
"Increase n_estimators if model underfits",
|
| 919 |
+
"Decrease n_estimators if model overfits"
|
| 920 |
+
]
|
| 921 |
+
},
|
| 922 |
+
"Random Forest Classifier": {
|
| 923 |
+
"code": """
|
| 924 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 925 |
+
from sklearn.preprocessing import StandardScaler
|
| 926 |
+
|
| 927 |
+
# Scale the features
|
| 928 |
+
scaler = StandardScaler()
|
| 929 |
+
X_scaled = scaler.fit_transform(X)
|
| 930 |
+
|
| 931 |
+
# Initialize and train the model
|
| 932 |
+
rf = RandomForestClassifier(random_state=42)
|
| 933 |
+
rf.fit(X_scaled, y)
|
| 934 |
+
""",
|
| 935 |
+
"key_parameters": {
|
| 936 |
+
"n_estimators": "Number of trees in the forest",
|
| 937 |
+
"max_depth": "Maximum depth of the trees",
|
| 938 |
+
"min_samples_split": "Minimum number of samples required to split an internal node",
|
| 939 |
+
"min_samples_leaf": "Minimum number of samples required to be at a leaf node"
|
| 940 |
+
},
|
| 941 |
+
"tips": [
|
| 942 |
+
"Always scale your features",
|
| 943 |
+
"Try different max_depth values",
|
| 944 |
+
"Increase n_estimators if model underfits",
|
| 945 |
+
"Decrease n_estimators if model overfits"
|
| 946 |
+
]
|
| 947 |
+
},
|
| 948 |
+
"K-Nearest Neighbors (KNeighborsClassifier)": {
|
| 949 |
+
"code": """
|
| 950 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 951 |
+
from sklearn.preprocessing import StandardScaler
|
| 952 |
+
|
| 953 |
+
# Scale the features
|
| 954 |
+
scaler = StandardScaler()
|
| 955 |
+
X_scaled = scaler.fit_transform(X)
|
| 956 |
+
|
| 957 |
+
# Initialize and train the model
|
| 958 |
+
knn = KNeighborsClassifier()
|
| 959 |
+
knn.fit(X_scaled, y)
|
| 960 |
+
""",
|
| 961 |
+
"key_parameters": {
|
| 962 |
+
"n_neighbors": "Number of neighbors to use",
|
| 963 |
+
"weights": "Weight function used in prediction",
|
| 964 |
+
"algorithm": "Algorithm used to compute the nearest neighbors",
|
| 965 |
+
"leaf_size": "Maximum number of samples in each leaf"
|
| 966 |
+
},
|
| 967 |
+
"tips": [
|
| 968 |
+
"Always scale your features",
|
| 969 |
+
"Try different n_neighbors values",
|
| 970 |
+
"Increase leaf_size if model underfits",
|
| 971 |
+
"Decrease leaf_size if model overfits"
|
| 972 |
+
]
|
| 973 |
+
},
|
| 974 |
+
"Ridge Classifier": {
|
| 975 |
+
"code": """
|
| 976 |
+
from sklearn.linear_model import RidgeClassifier
|
| 977 |
+
from sklearn.preprocessing import StandardScaler
|
| 978 |
+
|
| 979 |
+
# Scale the features
|
| 980 |
+
scaler = StandardScaler()
|
| 981 |
+
X_scaled = scaler.fit_transform(X)
|
| 982 |
+
|
| 983 |
+
# Initialize and train the model
|
| 984 |
+
ridge = RidgeClassifier(random_state=42)
|
| 985 |
+
ridge.fit(X_scaled, y)
|
| 986 |
+
""",
|
| 987 |
+
"key_parameters": {
|
| 988 |
+
"alpha": "Regularization parameter (default=1.0)",
|
| 989 |
+
"solver": "Optimization algorithm used to train the model",
|
| 990 |
+
"max_iter": "Maximum number of iterations for the solver to converge"
|
| 991 |
+
},
|
| 992 |
+
"tips": [
|
| 993 |
+
"Always scale your features",
|
| 994 |
+
"Try different alpha values",
|
| 995 |
+
"Increase max_iter if model doesn't converge",
|
| 996 |
+
"Decrease max_iter if model overfits"
|
| 997 |
+
]
|
| 998 |
+
},
|
| 999 |
+
"Multinomial Naive Bayes": {
|
| 1000 |
+
"code": """
|
| 1001 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 1002 |
+
from sklearn.preprocessing import StandardScaler
|
| 1003 |
+
|
| 1004 |
+
# Scale the features
|
| 1005 |
+
scaler = StandardScaler()
|
| 1006 |
+
X_scaled = scaler.fit_transform(X)
|
| 1007 |
+
|
| 1008 |
+
# Initialize and train the model
|
| 1009 |
+
nb = MultinomialNB()
|
| 1010 |
+
nb.fit(X_scaled, y)
|
| 1011 |
+
""",
|
| 1012 |
+
"key_parameters": {
|
| 1013 |
+
"alpha": "Regularization parameter (default=1.0)",
|
| 1014 |
+
"fit_prior": "Whether to learn class prior probabilities or not",
|
| 1015 |
+
"class_prior": "Prior probabilities of the classes"
|
| 1016 |
+
},
|
| 1017 |
+
"tips": [
|
| 1018 |
+
"Always scale your features",
|
| 1019 |
+
"Try different alpha values",
|
| 1020 |
+
"Increase alpha if model underfits",
|
| 1021 |
+
"Decrease alpha if model overfits"
|
| 1022 |
+
]
|
| 1023 |
+
},
|
| 1024 |
+
"AdaBoost Classifier": {
|
| 1025 |
+
"code": """
|
| 1026 |
+
from sklearn.ensemble import AdaBoostClassifier
|
| 1027 |
+
from sklearn.preprocessing import StandardScaler
|
| 1028 |
+
|
| 1029 |
+
# Scale the features
|
| 1030 |
+
scaler = StandardScaler()
|
| 1031 |
+
X_scaled = scaler.fit_transform(X)
|
| 1032 |
+
|
| 1033 |
+
# Initialize and train the model
|
| 1034 |
+
ada = AdaBoostClassifier(random_state=42)
|
| 1035 |
+
ada.fit(X_scaled, y)
|
| 1036 |
+
""",
|
| 1037 |
+
"key_parameters": {
|
| 1038 |
+
"n_estimators": "Number of trees in the forest",
|
| 1039 |
+
"learning_rate": "Learning rate used to update the weights of the weak classifiers",
|
| 1040 |
+
"algorithm": "Optimization algorithm used to train the model"
|
| 1041 |
+
},
|
| 1042 |
+
"tips": [
|
| 1043 |
+
"Always scale your features",
|
| 1044 |
+
"Try different learning_rate values",
|
| 1045 |
+
"Increase n_estimators if model underfits",
|
| 1046 |
+
"Decrease n_estimators if model overfits"
|
| 1047 |
+
]
|
| 1048 |
+
}
|
| 1049 |
+
}.get(algo_name, {})
|
| 1050 |
+
|
| 1051 |
+
# Algorithm selector.
# NOTE(review): this block appears to be the interior of an enclosing page
# function (the `def` is above this chunk) — it reads the `algorithms` dict
# built earlier and calls `display_math_details` / `run_algorithm_demo`,
# which are defined later in this module; that is fine at runtime because
# this code only executes after the whole module has been evaluated.
selected_algo = st.selectbox(
    "Select an algorithm to learn more:",
    list(algorithms.keys())
)

# Display algorithm information for the chosen entry.
if selected_algo:
    st.header(selected_algo)

    # Description
    st.subheader("Description")
    st.write(algorithms[selected_algo]["description"])

    # Two-column layout for pros and cons
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Advantages")
        for pro in algorithms[selected_algo]["pros"]:
            st.markdown(f"✅ {pro}")

    with col2:
        st.subheader("Disadvantages")
        for con in algorithms[selected_algo]["cons"]:
            st.markdown(f"⚠️ {con}")

    # Use cases
    st.subheader("Common Use Cases")
    for use_case in algorithms[selected_algo]["use_cases"]:
        st.markdown(f"🎯 {use_case}")

    # Add mathematical details section (LaTeX formulas + explanations)
    st.markdown("---")
    display_math_details(algorithms[selected_algo])

    # Add visual separator
    st.markdown("---")

    # Implementation section — only shown when the lookup above attached a
    # non-empty "implementation" payload for this algorithm.
    if "implementation" in algorithms[selected_algo]:
        st.subheader("Implementation Example")

        # Code example
        st.code(algorithms[selected_algo]["implementation"]["code"], language="python")

        # Key Parameters
        st.subheader("Key Parameters")
        for param, desc in algorithms[selected_algo]["implementation"]["key_parameters"].items():
            st.markdown(f"**`{param}`**: {desc}")

        # Implementation Tips
        st.subheader("Implementation Tips")
        for tip in algorithms[selected_algo]["implementation"]["tips"]:
            st.markdown(f"💡 {tip}")

    # Add interactive demo section: trains the selected model on a small
    # sklearn sample dataset and plots diagnostics.
    st.subheader("Interactive Demo")
    if st.checkbox("Show Interactive Demo"):
        st.write("Select dataset:")
        dataset_choice = st.selectbox(
            "Choose a sample dataset",
            ["Iris", "Breast Cancer", "Wine", "Digits"]
        )

        if st.button("Run Demo"):
            try:
                with st.spinner("Running demo..."):
                    demo_results = run_algorithm_demo(selected_algo, dataset_choice)

                    # Display results
                    st.write("Model Performance:")
                    st.write(f"Accuracy: {demo_results['accuracy']:.4f}")

                    # Show confusion matrix
                    st.write("Confusion Matrix:")
                    st.pyplot(demo_results['confusion_matrix_plot'])

                    # Show learning curve
                    st.write("Learning Curve:")
                    st.pyplot(demo_results['learning_curve_plot'])
            except Exception as e:
                # Surface any training/plotting failure in the UI instead of
                # crashing the Streamlit page.
                st.error(f"Error running demo: {str(e)}")

    # Add a references section to display in the UI
    if st.checkbox("Show References"):
        st.subheader("Academic References")
        if "references" in algorithms[selected_algo]:
            for ref in algorithms[selected_algo]["references"]:
                st.markdown(f"**{ref['title']}**")
                st.markdown(f"*{ref['authors']}* ({ref['year']})")
                st.markdown(f"Published in: {ref['publication']}")
                st.markdown(f"[Link to Publication]({ref['url']})")
                st.markdown("---")
        else:
            st.write("No references available for this algorithm.")
|
| 1147 |
+
|
| 1148 |
+
def run_algorithm_demo(algorithm_name, dataset_name):
    """Run a demo of the selected algorithm on the chosen dataset.

    Trains the model on a 70/30 split of a small sklearn sample dataset and
    returns test accuracy plus two matplotlib figures (confusion matrix and
    learning curve).

    Args:
        algorithm_name: Key understood by ``get_model_instance``.
        dataset_name: One of "Iris", "Breast Cancer", "Wine", "Digits".

    Returns:
        dict with keys 'accuracy', 'confusion_matrix_plot',
        'learning_curve_plot'.

    Raises:
        KeyError: if ``dataset_name`` or ``algorithm_name`` is unknown.
    """
    from sklearn.datasets import load_iris, load_breast_cancer, load_wine, load_digits
    from sklearn.model_selection import train_test_split, learning_curve
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    # Import these locally for consistency with the other local imports in
    # this function (the original relied on module-level names here).
    from sklearn.metrics import accuracy_score, confusion_matrix
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Load dataset
    dataset_loaders = {
        "Iris": load_iris,
        "Breast Cancer": load_breast_cancer,
        "Wine": load_wine,
        "Digits": load_digits
    }

    data = dataset_loaders[dataset_name]()
    X, y = data.data, data.target

    # Split and scale data.
    # BUG FIX: MultinomialNB requires non-negative input; StandardScaler
    # produces negative values and made that demo raise ValueError. Use a
    # [0, 1] MinMaxScaler for it and keep StandardScaler for everything else.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    if algorithm_name == "Multinomial Naive Bayes":
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize and train model
    model = get_model_instance(algorithm_name)
    model.fit(X_train_scaled, y_train)

    # Get predictions and accuracy
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    # Create confusion matrix plot
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='viridis')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    cm_plot = plt.gcf()
    plt.close()

    # Create learning curve plot (5-fold CV over 5 training-set sizes)
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train_scaled, y_train, cv=5,
        train_sizes=np.linspace(0.1, 1.0, 5)
    )

    plt.figure(figsize=(8, 6))
    plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training score')
    plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Cross-validation score')
    plt.xlabel('Training Examples')
    plt.ylabel('Score')
    plt.title('Learning Curve')
    plt.legend(loc='best')
    lc_plot = plt.gcf()
    plt.close()

    return {
        'accuracy': accuracy,
        'confusion_matrix_plot': cm_plot,
        'learning_curve_plot': lc_plot
    }
|
| 1212 |
+
|
| 1213 |
+
def get_model_instance(algorithm_name):
    """Return a fresh estimator instance for the given algorithm name.

    Args:
        algorithm_name: Display name used throughout the education page.

    Returns:
        An unfitted scikit-learn estimator.

    Raises:
        KeyError: if ``algorithm_name`` is not a known algorithm (same
            behavior as the original dict lookup).
    """
    # Map names to zero-argument factories. The original built a dict of
    # ten already-constructed estimators on every call; factories defer
    # construction so only the requested model is instantiated.
    factories = {
        "Gaussian Naive Bayes (GaussianNB)": GaussianNB,
        "Linear Support Vector Classification (LinearSVC)": lambda: LinearSVC(random_state=42),
        "Support Vector Classification (SVC)": lambda: SVC(random_state=42),
        "Multi-layer Perceptron (MLPClassifier)": lambda: MLPClassifier(random_state=42),
        "Extra Trees Classifier": lambda: ExtraTreesClassifier(random_state=42),
        "Random Forest Classifier": lambda: RandomForestClassifier(random_state=42),
        "K-Nearest Neighbors (KNeighborsClassifier)": KNeighborsClassifier,
        "Ridge Classifier": lambda: RidgeClassifier(random_state=42),
        "Multinomial Naive Bayes": MultinomialNB,
        "AdaBoost Classifier": lambda: AdaBoostClassifier(random_state=42),
    }
    return factories[algorithm_name]()
|
| 1228 |
+
|
| 1229 |
+
def display_math_details(algorithm):
    """Render the algorithm's mathematical formulation in the Streamlit UI.

    Shows the main formula, each named component formula, and the variable
    explanations. Does nothing when the algorithm dict carries no
    "math_details" entry.
    """
    # Guard clause: nothing to render without a math_details payload.
    if "math_details" not in algorithm:
        return

    details = algorithm["math_details"]
    st.subheader("Mathematical Details")

    # Headline formula for the algorithm.
    st.write("Main Formula:")
    st.latex(details["main_formula"])

    # Each supporting formula gets its own labelled LaTeX block.
    st.write("Component Formulas:")
    for part in details["component_formulas"]:
        st.write(f"**{part['name']}:**")
        st.latex(part["formula"])

    # Plain-text legend for the symbols used above.
    st.write("**Variable Explanations:**")
    st.markdown(details["explanation"])
|
| 1247 |
+
|
| 1248 |
+
# Entry point when this page module is executed directly by Streamlit:
# configure the page first, then render the algorithm education UI.
if __name__ == "__main__":
    setup_page_config()
    algorithm_info()
|
pages/03_Model_implementation.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pickle
|
| 5 |
+
import os
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
from App import StreamlitUI
|
| 9 |
+
|
| 10 |
+
def setup_page_config():
    """Configure Streamlit page metadata for the Model Implementation page."""
    # Collect the page settings in one place, then apply them in a single
    # call (keyword arguments, so ordering is irrelevant).
    page_settings = {
        "page_title": "Model Implementation",
        "page_icon": "🤖",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)
|
| 17 |
+
|
| 18 |
+
def load_model_and_scaler(model_file, scaler_file):
    """Deserialize an uploaded model and scaler from pickle files.

    Args:
        model_file: Uploaded file object for the model (supports .getbuffer()).
        scaler_file: Uploaded file object for the scaler (supports .getbuffer()).

    Returns:
        (model, scaler) on success, or (None, None) after reporting the
        error via st.error.

    SECURITY NOTE: pickle deserialization executes arbitrary code from the
    uploaded file — only load model files from trusted sources.
    """
    try:
        # Deserialize straight from the in-memory upload buffers.
        # BUG FIX: the original round-tripped through temp files on disk and
        # only removed them after both pickle.load calls succeeded, so any
        # load failure leaked files in temp_uploads/ (no try/finally).
        model = pickle.loads(model_file.getbuffer())
        scaler = pickle.loads(scaler_file.getbuffer())
        return model, scaler
    except Exception as e:
        st.error(f"Error loading model or scaler: {str(e)}")
        return None, None
|
| 50 |
+
|
| 51 |
+
def predict(model, scaler, features):
    """Scale a single feature vector and run it through the model.

    Args:
        model: Fitted estimator with .predict (and optionally .predict_proba).
        scaler: Fitted scaler with .transform.
        features: Sequence of raw feature values for one sample.

    Returns:
        (prediction, probabilities) where probabilities is None when the
        model does not expose predict_proba; (None, None) on error, after
        reporting it via st.error.
    """
    try:
        # Convert features to a 2-D array of shape (1, n_features)
        features_array = np.array(features).reshape(1, -1)

        # Scale features
        features_scaled = scaler.transform(features_array)

        # Make prediction
        prediction = model.predict(features_scaled)

        # BUG FIX: the original wrapped predict_proba in a bare `except:`,
        # which silently swallowed ANY error raised inside predict_proba
        # (not just its absence). Check for the capability explicitly; real
        # failures now reach the outer handler and are reported.
        if hasattr(model, "predict_proba"):
            probabilities = model.predict_proba(features_scaled)
            return prediction[0], probabilities[0]
        return prediction[0], None

    except Exception as e:
        st.error(f"Error making prediction: {str(e)}")
        return None, None
|
| 72 |
+
|
| 73 |
+
def generate_random_features(feature_names):
    """Generate random but realistic values for features.

    Ranges are derived from the default class configs in App.StreamlitUI
    (mean ± 3σ per class, i.e. ~99.7% coverage); features not found there
    fall back to [0, 100].

    Args:
        feature_names: Iterable of feature-name strings.

    Returns:
        dict mapping each feature name to a random value rounded to 2 dp.
    """
    random_values = {}

    # PERF FIX: construct StreamlitUI ONCE. The original instantiated a new
    # StreamlitUI() twice per class config per feature inside the nested
    # loops, which is expensive and can trigger Streamlit side effects.
    ui = StreamlitUI()

    # Get ranges from default configs in App.py
    feature_ranges = {}
    for feature_name in feature_names:
        min_val = float('inf')
        max_val = float('-inf')

        # Calculate min/max across all classes in default configs
        for class_config in ui.default_configs.values():
            mean = class_config['mean']
            std = class_config['std']

            # Get index of matching feature
            try:
                idx = ui.default_features.index(feature_name)
                feature_min = mean[idx] - 3 * std[idx]  # 3 std deviations for 99.7% coverage
                feature_max = mean[idx] + 3 * std[idx]

                min_val = min(min_val, feature_min)
                max_val = max(max_val, feature_max)
            except ValueError:
                continue

        # If feature not found in defaults, use reasonable fallback range
        if min_val == float('inf'):
            min_val, max_val = 0, 100

        feature_ranges[feature_name] = (min_val, max_val)

    for feature in feature_names:
        # Default range if feature not in predefined ranges
        min_val, max_val = 0, 100

        # Substring match against known feature names — kept as-is because
        # its first-match semantics (e.g. key "a" matching feature "ab")
        # are observable and may be relied upon.
        for key, (min_range, max_range) in feature_ranges.items():
            if key.lower() in feature.lower():
                min_val, max_val = min_range, max_range
                break

        random_values[feature] = round(np.random.uniform(min_val, max_val), 2)

    return random_values
|
| 118 |
+
|
| 119 |
+
def show():
    """Render the Model Implementation page.

    Workflow: the user uploads a pickled model and scaler in the sidebar;
    feature names are read from the scaler/model when available (or entered
    manually); the user fills in (or randomizes) feature values; a prediction
    plus optional class-probability bar chart is displayed.
    """
    st.title("Model Implementation")

    # Initialize session state for random values if not exists
    if 'random_values' not in st.session_state:
        st.session_state.random_values = {}

    # Keep file uploaders in sidebar
    st.sidebar.subheader("Upload Model Files")
    model_file = st.sidebar.file_uploader("Upload Model (.pkl)", type=['pkl'])
    scaler_file = st.sidebar.file_uploader("Upload Scaler (.pkl)", type=['pkl'])

    # Only proceed if both files are uploaded
    if model_file and scaler_file:
        model, scaler = load_model_and_scaler(model_file, scaler_file)

        if model and scaler:
            st.sidebar.success("Model and scaler loaded successfully!")

            # Get feature names from scaler (preferred) or the model itself.
            # feature_names_in_ is set by scikit-learn estimators fitted on
            # a DataFrame — NOTE(review): assumed sklearn objects; confirm.
            feature_names = None
            if hasattr(scaler, 'feature_names_in_'):
                feature_names = scaler.feature_names_in_
            elif hasattr(model, 'feature_names_in_'):
                feature_names = model.feature_names_in_

            # Fall back to manual entry when neither object carries names.
            if feature_names is None:
                feature_names_input = st.sidebar.text_input(
                    "Enter feature names (comma-separated)",
                    "feature1, feature2, feature3"
                )
                feature_names = [f.strip() for f in feature_names_input.split(",")]
                st.sidebar.info("Feature names were not found in the model/scaler. Using manually entered names.")

            # Create two main columns for the page layout
            input_col, result_col = st.columns(2)

            # Left column for feature inputs
            with input_col:
                st.subheader("Enter Feature Values")

                # Add randomization button. The session-state keys are
                # assigned BEFORE the number_input widgets below are created
                # in this rerun, which is why the new values take effect.
                col1, col2 = st.columns([1, 2])
                with col1:
                    if st.button("🎲 Randomize"):
                        # Generate new random values
                        st.session_state.random_values = generate_random_features(feature_names)
                        # Update session state for each feature
                        for feature in feature_names:
                            st.session_state[f"input_{feature}"] = st.session_state.random_values[feature]
                with col2:
                    st.markdown("<div style='margin-top: 8px;'>Generate realistic random values</div>",
                              unsafe_allow_html=True)

                # Create feature inputs in a grid layout
                feature_values = {}
                input_cols = st.columns(2)  # 2 columns for feature inputs
                for idx, feature in enumerate(feature_names):
                    with input_cols[idx % 2]:
                        # Initialize session state for this input if not exists
                        if f"input_{feature}" not in st.session_state:
                            st.session_state[f"input_{feature}"] = 0.0

                        # key=f"input_{feature}" binds the widget to the same
                        # session-state slot the Randomize button writes.
                        feature_values[feature] = st.number_input(
                            f"{feature}",
                            key=f"input_{feature}",
                            step=1.0,
                            format="%.2f"
                        )

                # Make prediction button
                predict_clicked = st.button("Make Prediction")

            # Right column for prediction results
            with result_col:
                st.subheader("Prediction Results")

                # Make prediction when values are available or button is
                # clicked. NOTE(review): once random_values is non-empty this
                # condition stays truthy on every rerun, so the prediction is
                # recomputed from the current inputs without another click.
                if predict_clicked or st.session_state.random_values:
                    # Prepare features in correct order
                    features = [feature_values[feature] for feature in feature_names]

                    # Get prediction
                    prediction, probabilities = predict(model, scaler, features)

                    if prediction is not None:
                        st.write(f"Predicted Class: **{prediction}**")

                        # Display probabilities if available
                        if probabilities is not None:
                            st.write("Class Probabilities:")
                            prob_df = pd.DataFrame({
                                'Class': model.classes_,
                                'Probability': probabilities
                            })

                            # Display as bar chart
                            st.bar_chart(
                                prob_df.set_index('Class')
                            )
                else:
                    st.info("Enter feature values and click 'Make Prediction' to see results.")
    else:
        st.sidebar.info("Please upload both model and scaler files to proceed.")
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
if __name__ == "__main__":
    # Entry point when this page is executed directly: configure the
    # Streamlit page settings first, then render the page UI.
    setup_page_config()
    show()
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28.0
|
| 2 |
+
numpy>=1.24.0
|
| 3 |
+
pandas>=2.0.0
|
| 4 |
+
scikit-learn>=1.2.0
|
| 5 |
+
plotly>=5.13.0
|
| 6 |
+
seaborn>=0.12.0
|
| 7 |
+
matplotlib>=3.7.0
|
| 8 |
+
joblib>=1.2.0
|