Commit ·
b8863d9
1
Parent(s): 689e49e
Add unsupervised clustering models.
Browse files- .dockerignore +1 -0
- app.py +115 -24
.dockerignore
CHANGED
|
@@ -2,3 +2,4 @@
|
|
| 2 |
./dataset/diabetes_012_health_indicators_BRFSS2015.csv
|
| 3 |
./dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv
|
| 4 |
./dataset/final_phone_preferences_india.csv
|
|
|
|
|
|
| 2 |
./dataset/diabetes_012_health_indicators_BRFSS2015.csv
|
| 3 |
./dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv
|
| 4 |
./dataset/final_phone_preferences_india.csv
|
| 5 |
+
./__pycache__/
|
app.py
CHANGED
|
@@ -4,6 +4,12 @@ __generated_with = "0.11.17"
|
|
| 4 |
app = marimo.App(width="medium")
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
@app.cell
|
| 8 |
def _():
|
| 9 |
import marimo as mo
|
|
@@ -14,54 +20,45 @@ def _():
|
|
| 14 |
@app.cell
|
| 15 |
def _(pl):
|
| 16 |
dataset = pl.read_csv('./dataset/colorectal_cancer_dataset.csv')
|
| 17 |
-
dataset
|
| 18 |
return (dataset,)
|
| 19 |
|
| 20 |
|
| 21 |
-
@app.cell
|
| 22 |
def _(dataset, pl):
|
| 23 |
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
| 24 |
|
| 25 |
-
encoder = OneHotEncoder(sparse_output=False)
|
| 26 |
ord_encoder = OrdinalEncoder()
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
encoded_features = encoder.get_feature_names_out(['Obesity_BMI', 'Cancer_Stage'])
|
| 30 |
-
ord_encoded_features = ord_encoder.get_feature_names_out(['Survival_5_years'])
|
| 31 |
encoded_schema = {name: pl.Int8 for name in encoded_features}
|
| 32 |
-
|
| 33 |
-
dataset_encoded_parts = pl.DataFrame(
|
| 34 |
-
|
| 35 |
-
|
| 36 |
return (
|
| 37 |
OneHotEncoder,
|
| 38 |
OrdinalEncoder,
|
| 39 |
dataset_encoded,
|
| 40 |
dataset_encoded_parts,
|
| 41 |
-
dataset_ord_encoded_parts,
|
| 42 |
-
encoded,
|
| 43 |
encoded_features,
|
| 44 |
encoded_schema,
|
| 45 |
-
encoder,
|
| 46 |
ord_encoded,
|
| 47 |
-
ord_encoded_features,
|
| 48 |
-
ord_encoded_schema,
|
| 49 |
ord_encoder,
|
| 50 |
)
|
| 51 |
|
| 52 |
|
| 53 |
@app.cell
|
| 54 |
-
def _(dataset_encoded,
|
| 55 |
from sklearn.linear_model import LogisticRegression
|
| 56 |
from sklearn.naive_bayes import BernoulliNB
|
| 57 |
from sklearn.tree import DecisionTreeClassifier
|
| 58 |
-
from sklearn.svm import SVC
|
| 59 |
from sklearn.model_selection import train_test_split
|
| 60 |
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
|
| 61 |
|
| 62 |
-
X = dataset_encoded.select(['
|
| 63 |
y = dataset_encoded.select(['Survival_5_years'])
|
| 64 |
-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=
|
| 65 |
logreg = LogisticRegression()
|
| 66 |
y_pred_logreg = logreg.fit(X_train, y_train).predict(X_test)
|
| 67 |
bnb = BernoulliNB()
|
|
@@ -69,8 +66,9 @@ def _(dataset_encoded, encoded_features, mo):
|
|
| 69 |
dectree = DecisionTreeClassifier()
|
| 70 |
y_pred_dectree = dectree.fit(X_train, y_train).predict(X_test)
|
| 71 |
|
|
|
|
| 72 |
mo.md(f"""
|
| 73 |
-
# Logistic Regression
|
| 74 |
|
| 75 |
Accuracy score: {accuracy_score(y_test, y_pred_logreg)}
|
| 76 |
|
|
@@ -86,7 +84,7 @@ def _(dataset_encoded, encoded_features, mo):
|
|
| 86 |
{classification_report(y_test, y_pred_logreg)}
|
| 87 |
```
|
| 88 |
|
| 89 |
-
# Bernoulli Naive Bayes
|
| 90 |
|
| 91 |
Accuracy score: {accuracy_score(y_test, y_pred_bnb)}
|
| 92 |
|
|
@@ -102,7 +100,7 @@ def _(dataset_encoded, encoded_features, mo):
|
|
| 102 |
{classification_report(y_test, y_pred_bnb)}
|
| 103 |
```
|
| 104 |
|
| 105 |
-
# Decision Tree Classifier
|
| 106 |
|
| 107 |
Accuracy score: {accuracy_score(y_test, y_pred_dectree)}
|
| 108 |
|
|
@@ -117,12 +115,15 @@ def _(dataset_encoded, encoded_features, mo):
|
|
| 117 |
```
|
| 118 |
{classification_report(y_test, y_pred_dectree)}
|
| 119 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
""")
|
| 121 |
return (
|
| 122 |
BernoulliNB,
|
| 123 |
DecisionTreeClassifier,
|
| 124 |
LogisticRegression,
|
| 125 |
-
SVC,
|
| 126 |
X,
|
| 127 |
X_test,
|
| 128 |
X_train,
|
|
@@ -143,6 +144,96 @@ def _(dataset_encoded, encoded_features, mo):
|
|
| 143 |
)
|
| 144 |
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
if __name__ == "__main__":
|
| 148 |
app.run()
|
|
|
|
| 4 |
app = marimo.App(width="medium")
|
| 5 |
|
| 6 |
|
| 7 |
+
@app.cell
def _(mo):
    # Notebook heading. The bare expression on the last line before `return`
    # is what the cell displays.
    _heading = mo.md(r"""# Analyzing Colorectal Cancer Dataset""")
    _heading
    return
|
| 11 |
+
|
| 12 |
+
|
| 13 |
@app.cell
|
| 14 |
def _():
|
| 15 |
import marimo as mo
|
|
|
|
| 20 |
@app.cell
def _(pl):
    # Read the colorectal cancer CSV into a polars DataFrame. The path is
    # relative, so it is resolved against the process working directory.
    _csv_path = './dataset/colorectal_cancer_dataset.csv'
    dataset = pl.read_csv(_csv_path)
    return (dataset,)
|
| 25 |
|
| 26 |
|
| 27 |
+
@app.cell(hide_code=True)
def _(dataset, pl):
    # Ordinal-encode three categorical columns and write the integer codes
    # back over the original columns, producing `dataset_encoded`.
    from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

    # Cell-private (underscore-prefixed) so the cell's exported names stay the same.
    _encoded_columns = ['Early_Detection', 'Cancer_Stage', 'Survival_5_years']

    # NOTE(review): with default settings OrdinalEncoder numbers categories in
    # sorted order of the observed values; confirm that this matches the
    # intended ordering of Cancer_Stage.
    ord_encoder = OrdinalEncoder()
    ord_encoded = ord_encoder.fit_transform(dataset.select(*_encoded_columns))
    encoded_features = ord_encoder.get_feature_names_out(_encoded_columns)

    # Every encoded column is stored as Int8.
    encoded_schema = {}
    for _feature in encoded_features:
        encoded_schema[_feature] = pl.Int8

    dataset_encoded_parts = pl.DataFrame(ord_encoded, encoded_schema)
    # Same column names as the source columns, so with_columns replaces them in place.
    dataset_encoded = dataset.with_columns(dataset_encoded_parts)
    return (
        OneHotEncoder,
        OrdinalEncoder,
        dataset_encoded,
        dataset_encoded_parts,
        encoded_features,
        encoded_schema,
        ord_encoded,
        ord_encoder,
    )
|
| 49 |
|
| 50 |
|
| 51 |
@app.cell
|
| 52 |
+
def _(dataset_encoded, mo):
|
| 53 |
from sklearn.linear_model import LogisticRegression
|
| 54 |
from sklearn.naive_bayes import BernoulliNB
|
| 55 |
from sklearn.tree import DecisionTreeClassifier
|
|
|
|
| 56 |
from sklearn.model_selection import train_test_split
|
| 57 |
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
|
| 58 |
|
| 59 |
+
X = dataset_encoded.select(['Tumor_Size_mm', 'Early_Detection', 'Cancer_Stage'])
|
| 60 |
y = dataset_encoded.select(['Survival_5_years'])
|
| 61 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
|
| 62 |
logreg = LogisticRegression()
|
| 63 |
y_pred_logreg = logreg.fit(X_train, y_train).predict(X_test)
|
| 64 |
bnb = BernoulliNB()
|
|
|
|
| 66 |
dectree = DecisionTreeClassifier()
|
| 67 |
y_pred_dectree = dectree.fit(X_train, y_train).predict(X_test)
|
| 68 |
|
| 69 |
+
|
| 70 |
mo.md(f"""
|
| 71 |
+
## Logistic Regression
|
| 72 |
|
| 73 |
Accuracy score: {accuracy_score(y_test, y_pred_logreg)}
|
| 74 |
|
|
|
|
| 84 |
{classification_report(y_test, y_pred_logreg)}
|
| 85 |
```
|
| 86 |
|
| 87 |
+
## Bernoulli Naive Bayes
|
| 88 |
|
| 89 |
Accuracy score: {accuracy_score(y_test, y_pred_bnb)}
|
| 90 |
|
|
|
|
| 100 |
{classification_report(y_test, y_pred_bnb)}
|
| 101 |
```
|
| 102 |
|
| 103 |
+
## Decision Tree Classifier
|
| 104 |
|
| 105 |
Accuracy score: {accuracy_score(y_test, y_pred_dectree)}
|
| 106 |
|
|
|
|
| 115 |
```
|
| 116 |
{classification_report(y_test, y_pred_dectree)}
|
| 117 |
```
|
| 118 |
+
|
| 119 |
+
## Conclusion
|
| 120 |
+
|
| 121 |
+
{mo.callout("Classifiers don't work well with this dataset, let's try something else.", kind='info')}
|
| 122 |
""")
|
| 123 |
return (
|
| 124 |
BernoulliNB,
|
| 125 |
DecisionTreeClassifier,
|
| 126 |
LogisticRegression,
|
|
|
|
| 127 |
X,
|
| 128 |
X_test,
|
| 129 |
X_train,
|
|
|
|
| 144 |
)
|
| 145 |
|
| 146 |
|
| 147 |
+
@app.cell
def _(OrdinalEncoder, dataset, mo, pl):
    def _():
        """Cluster a sample of the dataset and report clustering quality.

        Fits K-Means and Spectral Clustering (3 clusters each) on
        Tumor_Size_mm and ordinal-encoded Genetic_Mutation, scores both
        against Cancer_Stage (external metrics) and against the feature
        matrix itself (internal metrics), and returns a vertical stack of the
        markdown report and a scatter chart coloured by K-Means cluster.

        Everything lives in this nested function so that names such as
        `X`, `y` and `dataset_encoded` stay local to it.
        """
        from sklearn.cluster import KMeans, SpectralClustering
        from sklearn.metrics import (
            adjusted_rand_score,
            calinski_harabasz_score,
            completeness_score,
            davies_bouldin_score,
            homogeneity_score,
            normalized_mutual_info_score,
            silhouette_score,
            v_measure_score,
        )
        import altair as alt

        sample_size = 3000
        seed = 11

        genmut_encoder = OrdinalEncoder()
        genmut_encoded = genmut_encoder.fit_transform(dataset.select('Genetic_Mutation'))
        genmut_features = genmut_encoder.get_feature_names_out(['Genetic_Mutation'])
        encoded_schema = {name: pl.Int8 for name in genmut_features}
        dataset_encoded_parts = pl.DataFrame(genmut_encoded, encoded_schema)
        dataset_encoded = dataset.with_columns(dataset_encoded_parts)
        # Use samples since dataset is way too big to run locally.
        # Clamp the size: sampling without replacement raises when asked for
        # more rows than the frame holds.
        dataset_encoded = dataset_encoded.sample(
            min(sample_size, dataset_encoded.height), seed=seed
        )

        # NOTE(review): the features are clustered unscaled, so the column with
        # the larger numeric range will dominate the distances - presumably
        # Tumor_Size_mm; confirm whether standardising is wanted.
        X = dataset_encoded.select(['Tumor_Size_mm', 'Genetic_Mutation'])
        y = dataset_encoded.select(['Cancer_Stage']).to_series()

        labels_kmeans = KMeans(n_clusters=3, random_state=seed).fit_predict(X)
        labels_spec = SpectralClustering(n_clusters=3, random_state=seed).fit_predict(X)

        # Build the label column as an explicit Series and cast it to String
        # (so the chart treats clusters as categories) instead of relying on
        # how pl.lit interprets a NumPy array.
        cluster_column = pl.Series('kmeans_cluster', labels_kmeans).cast(pl.String)
        df_kmeans = X.with_columns(cluster_column)

        # External metrics compare cluster labels with the reference labels y.
        external_metrics = (
            ("Adjusted Rand Index (ARI)", adjusted_rand_score),
            ("Normalized Mutual Information (NMI)", normalized_mutual_info_score),
            ("Homogeneity", homogeneity_score),
            ("Completeness", completeness_score),
            ("V-measure", v_measure_score),
        )
        # Internal metrics score the clustering against the features only.
        internal_metrics = (
            ("Silhouette Score", silhouette_score),
            ("Davies-Bouldin Index", davies_bouldin_score),
            ("Calinski-Harabasz Index", calinski_harabasz_score),
        )

        def metrics_section(title, labels):
            """Return the markdown report section for one clustering result."""
            paragraphs = [f"## {title}", "### External Metrics"]
            paragraphs += [
                f"{name}: {score(y, labels)}" for name, score in external_metrics
            ]
            paragraphs.append("### Internal Metrics")
            paragraphs += [
                f"{name}: {score(X, labels)}" for name, score in internal_metrics
            ]
            return "\n\n".join(paragraphs)

        conclusion = mo.callout(
            "Unsupervised clustering techniques do perform reasonably well, but does not correlate to other labels.",
            kind='info',
        )
        report = "\n\n".join([
            metrics_section("K-Means Clustering", labels_kmeans),
            metrics_section("Spectral Clustering", labels_spec),
            f"{conclusion}",
        ])

        return mo.vstack([
            mo.md(report),
            alt.Chart(df_kmeans, autosize='pad').mark_circle().encode(
                x='Genetic_Mutation',
                y='Tumor_Size_mm',
                color='kmeans_cluster'
            )
        ])


    _()
    return
|
| 236 |
+
|
| 237 |
|
| 238 |
if __name__ == "__main__":
|
| 239 |
app.run()
|