Angelo901 committed on
Commit
639e1ee
·
verified ·
1 Parent(s): 1642e5a

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ diabetic_data.csv filter=lfs diff=lfs merge=lfs -text
audit_Diabetes_clustering.md ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ jupytext:
3
+ formats: ipynb,md:myst
4
+ text_representation:
5
+ extension: .md
6
+ format_name: myst
7
+ format_version: 0.13
8
+ jupytext_version: 1.17.3
9
+ kernelspec:
10
+ display_name: Python 3 (ipykernel)
11
+ language: python
12
+ name: python3
13
+ ---
14
+
15
+ # Diabetes Hospital from 1999-2008
16
+
17
+ Info:
18
+
19
+ This data represents 10 years of clinical care at 130 hospitals; each row is the hospital record of a patient diagnosed with diabetes. Despite strong improvements in clinical care for diabetic patients, not every patient receives the same outcome.
20
+
21
+ ```{code-cell} ipython3
22
+ import pandas as pd
23
+ import seaborn as sns
24
+ import numpy as np
25
+ from sklearn import datasets
26
+ from sklearn.cluster import KMeans
27
+ from sklearn.preprocessing import StandardScaler
28
+ from sklearn import metrics
29
+ from sklearn import tree
30
+ import matplotlib.pyplot as plt
31
+ from sklearn.preprocessing import PolynomialFeatures
32
+ from sklearn.model_selection import train_test_split
33
+ from sklearn.naive_bayes import GaussianNB
34
+
35
+ sns.set_theme(palette = 'colorblind')
36
+
37
+ from sklearn.metrics import confusion_matrix, classification_report
38
+ from IPython.display import display
39
+
40
+ np.random.seed(1103)
41
+ np.random.seed(113)
42
+ ```
43
+
44
+ ```{code-cell} ipython3
45
+ url_base = 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008'
46
+
47
+ # can't find the raw data but here is the link where I got the source from.
48
+
49
+ diabetic_df = pd.read_csv('diabetic_data.csv', index_col = 0)
50
+
51
+ diabetic_df.replace('?', 'N/A', inplace = True)
52
+ ```
53
+
54
+ ```{code-cell} ipython3
55
+ diabetic_df.shape
56
+ ```
57
+
58
+ ```{code-cell} ipython3
59
+ diabetic_df.columns
60
+ ```
61
+
62
+ ```{code-cell} ipython3
63
+ sns.pairplot(diabetic_df)
64
+ ```
65
+
66
+ ```{code-cell} ipython3
67
+ important_data = ['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'num_medications', 'number_emergency']
68
+
69
+ # dropping any data with nothing
70
+
71
+ data_df = diabetic_df[important_data].dropna()
72
+
73
+ data_df.head()
74
+
75
+
76
+ ```
77
+
78
+ ```{code-cell} ipython3
79
+ data_df.shape # we can see that a lot of the columns reduce as we are getting the important parts
80
+ ```
81
+
82
+ ```{code-cell} ipython3
83
+ data_info = data_df['race']
84
+ data_info.iloc[0:25]
85
+ ```
86
+
87
+ ```{code-cell} ipython3
88
+ num_data = pd.get_dummies(data_df, drop_first= True)
89
+ num_data.head() # make age, race, and gender be a number to match pairs
90
+ ```
91
+
92
+ ```{code-cell} ipython3
93
+ num_data.columns # more columns
94
+ ```
95
+
96
+ ```{code-cell} ipython3
97
+ num_data['number_emergency'].value_counts()
98
+ ```
99
+
100
+ ```{code-cell} ipython3
101
+ km = KMeans(n_clusters=3)
102
+ ```
103
+
104
+ ```{code-cell} ipython3
105
+ km.__dict__
106
+ ```
107
+
108
+ ```{code-cell} ipython3
109
+ km.fit(num_data)
110
+ ```
111
+
112
+ ```{code-cell} ipython3
113
+ km.__dict__
114
+ ```
115
+
116
+ ```{code-cell} ipython3
117
+ labels = km.predict(num_data)
118
+ #readable
119
+ num_data['km3'] = labels.astype(str)
120
+
121
+ sns.pairplot(num_data.sample(100), hue='km3')
122
+
123
+ # This works on the full data, but there is too much plotting happening in the background, so I reduced it to a sample of 100 rows.
124
+ ```
125
+
126
+ ```{code-cell} ipython3
127
+ silo_values = metrics.silhouette_samples(num_data.drop(columns=['km3']),num_data['km3'].astype(int))
128
+
129
+ num_data['km3_silo'] = silo_values
130
+
131
+ num_data.groupby('km3')['km3_silo'].mean()
132
+
133
+
134
+ ```
135
+
136
+ ```{code-cell} ipython3
137
+ target = diabetic_df['readmitted']
138
+
139
+ X = num_data.drop(columns=['km3', 'km3_silo'])
140
+ y = target.loc[X.index]
141
+ ```
142
+
143
+ # Conclusion:
144
+
145
+ Describe what question you would be asking in applying clustering to this dataset. What does it mean if clustering does not work well?
146
+
147
+ - Can we find patterns among patients that show whether they are related to one another?
148
+ - If clustering does not work well, it could simply mean that diabetes is a condition for which it is hard to find a clear pattern among patients.
149
+
150
+ How does this task compare to the classification task on this dataset?
151
+
152
+ - I believe that, in this data, most of the information comes down to the `readmitted` column: whether patients came back within 30 days, came back after 30 days, or did not come back at all. What we wanted to know here was what else we could do with the data if we didn't know about the 30-day readmission outcome.
153
+ -
154
+ Apply Kmeans using the known, correct number of clusters, K.
155
+ Evaluate how well clustering worked on the data:
156
+ using a true clustering metric and
157
+ using visualization and
158
+ using a clustering metric that uses the ground truth labels
159
+
160
+ - The values that came out were close, but not really close enough, so this is not good data to use.
161
+ - Using the pairplot, you can see that the colors on the graphs aren't really organized into separate groups.
162
+
163
+ Include a discussion of your results that addresses the following:
164
+ describes what the clustering means
165
+ what the metrics show
166
+ Does this clustering work better or worse than expected based on the classification performance (if you didn’t complete assignment 7, also apply a classifier)
167
+
168
+ - I think what this means for clustering is that it's really hard to see a pattern when patients tend to have a lot going on at the same time; for example, some stay in the hospital longer while others don't. It's just a mess.
169
+ - The metrics show that there is something there, a small degree of pattern, but the clusters aren't perfect.
170
+ - I mostly just did clustering; as you can see below, I was trying to make classification work. To be honest, I feel like clustering makes things worse, since we were mixing and grouping things together to try to see something, so I have a feeling classification works best here.
171
+
172
+
173
+ ```{code-cell} ipython3
174
+ df6 = diabetic_df[['race','gender','age']]
175
+ df6.head()
176
+ ```
177
+
178
+ ```{code-cell} ipython3
179
+ sns.pairplot(data=df6, hue = 'char', hue_order=['A','B'])
180
+ ```
181
+
182
+ ```{code-cell} ipython3
183
+
184
+ ```
185
+
186
+ ```{code-cell} ipython3
187
+
188
+ ```
189
+
190
+ ```{code-cell} ipython3
191
+
192
+ ```
clustering.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c16656fed7ce53e9a6546b37c6b14681bd5665bd88936b84fd4b8080fec57254
3
+ size 547
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config = {
2
+ "sklearn": {
3
+ "columns": [
4
+ "x0",
5
+ "x1",
6
+ "char"
7
+ ],
8
+ "environment": [
9
+ "scikit-learn=1.0.2"
10
+ ],
11
+ "example_input": {
12
+ "x0": [
13
+ 6.14,
14
+ 2.22,
15
+ 2.27
16
+ ],
17
+ "x1": [
18
+ 2.2,
19
+ 2.0,
20
+ 5.9
21
+ ],
22
+ "char": [
23
+ 'A',
24
+ 'B',
25
+ 'A'
26
+ ]
27
+ },
28
+ "model": {
29
+ "file": "model.pkl"
30
+ },
31
+ "task": "tabular-classification"
32
+ }
33
+ }
diabetic_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0689e7ec031237dc63031b938805c48377748761a3b26acab621567afa24df97
3
+ size 19159383
gitattributes.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ diabetic_data.csv filter=lfs diff=lfs merge=lfs -text