Upload 2 files
Browse files- .gitattributes +1 -0
- audit_Diabetes_clustering.md +192 -0
- diabetic_data.csv +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
diabetic_data.csv filter=lfs diff=lfs merge=lfs -text
|
audit_Diabetes_clustering.md
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
jupytext:
|
| 3 |
+
formats: ipynb,md:myst
|
| 4 |
+
text_representation:
|
| 5 |
+
extension: .md
|
| 6 |
+
format_name: myst
|
| 7 |
+
format_version: 0.13
|
| 8 |
+
jupytext_version: 1.17.3
|
| 9 |
+
kernelspec:
|
| 10 |
+
display_name: Python 3 (ipykernel)
|
| 11 |
+
language: python
|
| 12 |
+
name: python3
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Diabetes Hospital from 1999-2008
|
| 16 |
+
|
| 17 |
+
Info:
|
| 18 |
+
|
| 19 |
+
This data represents 10 years of clinical care at 130 hospitals; each row is the hospital record of a patient diagnosed with diabetes. Despite strong improvements in clinical care for diabetic patients, not every patient receives the same outcome.
|
| 20 |
+
|
| 21 |
+
```{code-cell} ipython3
|
| 22 |
+
import pandas as pd
|
| 23 |
+
import seaborn as sns
|
| 24 |
+
import numpy as np
|
| 25 |
+
from sklearn import datasets
|
| 26 |
+
from sklearn.cluster import KMeans
|
| 27 |
+
from sklearn.preprocessing import StandardScaler
|
| 28 |
+
from sklearn import metrics
|
| 29 |
+
from sklearn import tree
|
| 30 |
+
import matplotlib.pyplot as plt
|
| 31 |
+
from sklearn.preprocessing import PolynomialFeatures
|
| 32 |
+
from sklearn.model_selection import train_test_split
|
| 33 |
+
from sklearn.naive_bayes import GaussianNB
|
| 34 |
+
|
| 35 |
+
sns.set_theme(palette = 'colorblind')
|
| 36 |
+
|
| 37 |
+
from sklearn.metrics import confusion_matrix, classification_report
|
| 38 |
+
from IPython.display import display
|
| 39 |
+
|
| 40 |
+
np.random.seed(1103)
|
| 41 |
+
np.random.seed(113)
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
```{code-cell} ipython3
|
| 45 |
+
url_base = 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008'
|
| 46 |
+
|
| 47 |
+
# Could not find a direct link to the raw data; url_base is the dataset page the source was downloaded from.
|
| 48 |
+
|
| 49 |
+
diabetic_df = pd.read_csv('diabetic_data.csv', index_col = 0)
|
| 50 |
+
|
| 51 |
+
# Turn the '?' placeholders into real missing values (NaN) so that the later dropna() call actually removes those rows; a literal 'N/A' string is not treated as missing by pandas.
diabetic_df.replace('?', np.nan, inplace=True)
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
```{code-cell} ipython3
|
| 55 |
+
diabetic_df.shape
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
```{code-cell} ipython3
|
| 59 |
+
diabetic_df.columns
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
```{code-cell} ipython3
|
| 63 |
+
sns.pairplot(diabetic_df)
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
```{code-cell} ipython3
|
| 67 |
+
important_data = ['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'num_medications', 'number_emergency']
|
| 68 |
+
|
| 69 |
+
# dropping any data with nothing
|
| 70 |
+
|
| 71 |
+
data_df = diabetic_df[important_data].dropna()
|
| 72 |
+
|
| 73 |
+
data_df.head()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
```{code-cell} ipython3
|
| 79 |
+
data_df.shape # far fewer columns now, since only the columns listed in important_data are kept
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
```{code-cell} ipython3
|
| 83 |
+
data_info = data_df['race']
|
| 84 |
+
data_info.iloc[0:25]
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
```{code-cell} ipython3
|
| 88 |
+
num_data = pd.get_dummies(data_df, drop_first= True)
|
| 89 |
+
num_data.head() # get_dummies turns the categorical columns (presumably age, race, and gender) into numeric indicator columns so they can be used for clustering
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
```{code-cell} ipython3
|
| 93 |
+
num_data.columns # more columns
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
```{code-cell} ipython3
|
| 97 |
+
num_data['number_emergency'].value_counts()
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
```{code-cell} ipython3
|
| 101 |
+
km = KMeans(n_clusters=3)
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
```{code-cell} ipython3
|
| 105 |
+
km.__dict__
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
```{code-cell} ipython3
|
| 109 |
+
km.fit(num_data)
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
```{code-cell} ipython3
|
| 113 |
+
km.__dict__
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
```{code-cell} ipython3
|
| 117 |
+
labels = km.predict(num_data)
|
| 118 |
+
#readable
|
| 119 |
+
num_data['km3'] = labels.astype(str)
|
| 120 |
+
|
| 121 |
+
sns.pairplot(num_data.sample(100), hue='km3')
|
| 122 |
+
|
| 123 |
+
# This works on the full data, but plotting every row is too much graphing, so the pairplot uses a random sample of 100 rows.
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
```{code-cell} ipython3
|
| 127 |
+
silo_values = metrics.silhouette_samples(num_data.drop(columns=['km3']),num_data['km3'].astype(int))
|
| 128 |
+
|
| 129 |
+
num_data['km3_silo'] = silo_values
|
| 130 |
+
|
| 131 |
+
num_data.groupby('km3')['km3_silo'].mean()
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
```{code-cell} ipython3
|
| 137 |
+
target = diabetic_df['readmitted']
|
| 138 |
+
|
| 139 |
+
X = num_data.drop(columns=['km3', 'km3_silo'])
|
| 140 |
+
y = target.loc[X.index]
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
# Conclusion:
|
| 144 |
+
|
| 145 |
+
Describe what question you would be asking in applying clustering to this dataset. What does it mean if clustering does not work well?
|
| 146 |
+
|
| 147 |
+
- Can we find patterns among the patients that show whether they are related in some way?
|
| 148 |
+
- If clustering does not work well, it could simply mean that diabetes is something for which it is hard to find a clear pattern.
|
| 149 |
+
|
| 150 |
+
How does this task compare to the classification task on this dataset?
|
| 151 |
+
|
| 152 |
+
- I believe that, in this data, most of the information comes down to `readmitted`: whether the patient came back within 30 days, after 30 days, or did not come back at all. With clustering, we instead wanted to know what else we could do with the data if we did not know about the 30-day readmission outcome.
|
| 153 |
+
-
|
| 154 |
+
Apply Kmeans using the known, correct number of clusters, K.
|
| 155 |
+
Evaluate how well clustering worked on the data:
|
| 156 |
+
using a true clustering metric and
|
| 157 |
+
using visualization and
|
| 158 |
+
using a clustering metric that uses the ground truth labels
|
| 159 |
+
|
| 160 |
+
- The results that came out were close but not quite there, so this is not good data to use for clustering.
|
| 161 |
+
- Using the pairplot, you can see from the colors on the graphs that the clusters aren't really organized.
|
| 162 |
+
|
| 163 |
+
Include a discussion of your results that addresses the following:
|
| 164 |
+
describes what the clustering means
|
| 165 |
+
what the metrics show
|
| 166 |
+
Does this clustering work better or worse than expected based on the classification performance (if you didn’t complete assignment 7, also apply a classifier)
|
| 167 |
+
|
| 168 |
+
- I think what this clustering means is that it's really hard to see a pattern when patients have a lot going on at the same time; for example, some stay in the hospital longer while others don't. It's just a mess.
|
| 169 |
+
- The metrics show that there is something there, a small percentage of pattern, but the clusters aren't perfect.
|
| 170 |
+
- I mostly just did clustering. As you can see below, I was trying to make classification work, but honestly I feel like clustering makes things worse, since we were mixing and grouping things together to see something! So I have a feeling classification works best here.
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
```{code-cell} ipython3
|
| 174 |
+
df6 = diabetic_df[['race','gender','age']]
|
| 175 |
+
df6.head()
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
```{code-cell} ipython3
|
| 179 |
+
# NOTE(review): df6 is built from only 'race', 'gender', and 'age'; no 'char' column or 'A'/'B' values are visible here — confirm the intended hue column before running this cell.
sns.pairplot(data=df6, hue = 'char', hue_order=['A','B'])
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
```{code-cell} ipython3
|
| 183 |
+
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
```{code-cell} ipython3
|
| 187 |
+
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
```{code-cell} ipython3
|
| 191 |
+
|
| 192 |
+
```
|
diabetic_data.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0689e7ec031237dc63031b938805c48377748761a3b26acab621567afa24df97
|
| 3 |
+
size 19159383
|