Tryfonas commited on
Commit
0e2c35a
·
verified ·
1 Parent(s): 1c03ffd

Upload folder using huggingface_hub

Browse files
BDS24_Weekly_Assignments/KIVA___BDS24_Assignment_Karmiris_Tryfonas.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
BDS24_Weekly_Assignments/__pycache__/dual_axis_and_boxplot.cpython-312.pyc ADDED
Binary file (4.82 kB). View file
 
BDS24_Weekly_Assignments/__pycache__/repayment_interval_chart.cpython-312.pyc ADDED
Binary file (2.97 kB). View file
 
BDS24_Weekly_Assignments/app.py ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import altair as alt
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ from scipy.stats import zscore
8
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
9
+ from sklearn.decomposition import PCA
10
+ from sklearn.cluster import KMeans, AgglomerativeClustering
11
+ from scipy.spatial.distance import cdist
12
+ from scipy.cluster.hierarchy import dendrogram, linkage
13
+
14
# Helper that loads the Kiva loans CSV and strips unused columns, incomplete rows and outliers.
@st.cache_data
def load_and_clean_data(file_path):
    """Read the Kiva loans CSV at *file_path* and return a cleaned DataFrame.

    Cleaning steps:
    - drop free-text / timestamp columns the dashboard never uses
    - drop rows missing ``partner_id`` or ``borrower_genders``
    - flag funded_amount outliers (|z-score| > 3) and keep only the inliers
      (the boolean ``outlier_funded_amount`` column remains on the result)
    """
    loans = pd.read_csv(file_path)

    # Columns not used by any page of the app.
    loans = loans.drop(['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'], axis=1)
    loans.dropna(subset=['partner_id', 'borrower_genders'], inplace=True)

    # Mark funded_amount outliers via z-score, then drop them.
    z = zscore(loans['funded_amount'])
    loans['outlier_funded_amount'] = (z > 3) | (z < -3)
    return loans[~loans['outlier_funded_amount']]
30
+
31
# Load the cleaned data once for the whole app (cached by load_and_clean_data).
file_path = 'kiva_loans.csv'
df_kiva_loans_cleaned = load_and_clean_data(file_path)

# Streamlit App Title
st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')

# Sidebar navigation: one radio entry per analysis page below.
st.sidebar.title("Navigation")
page = st.sidebar.radio(
    "Select a page:",
    [
        "Introduction",
        "Data Overview",
        "Top Values by Selected Variable",
        "Repayment Interval by Selected Variable",
        "Country Comparison Deepdive",
        "Sector Comparison Deepdive",
        "KMeans Clustering & Recommendations",
        "Hierarchical Clustering & Dendrogram",
    ],
)
41
+
42
# Introduction Page: short orientation text for first-time visitors.
if page == "Introduction":
    st.subheader("Introduction")
    st.write("""
    This application provides insights into Kiva loans data.
    You can explore the distribution of funded amounts,
    analyze top values by selected variables, and visualize
    relationships between funded amounts and various factors such as Countries and Sectors that the loans were funded.
    """)
51
+
52
+ # Data Overview Page
53
+ elif page == "Data Overview":
54
+ st.subheader("Data Overview")
55
+ st.write("Here is a preview of the cleaned Kiva loans data:")
56
+
57
+ # Display the cleaned data table
58
+ st.table(df_kiva_loans_cleaned.head())
59
+
60
+ # Distribution of Funded Amounts
61
+ st.subheader('Distribution of Funded Amounts')
62
+ chart = alt.Chart(df_kiva_loans_cleaned).mark_bar().encode(
63
+ alt.X('funded_amount', bin=alt.Bin(maxbins=50)), # Use funded_amount for distribution
64
+ y='count()',
65
+ ).properties(
66
+ title='Distribution of Funded Amounts'
67
+ )
68
+ st.altair_chart(chart, use_container_width=True)
69
+ st.write("This chart shows the distribution of funded amounts for Kiva loans. The x-axis represents the funded amount, while the y-axis shows the count of loans that fall within each bin. As you can see most of the loans are low valued with most of them being in the range of 100 and 500")
70
+
71
+ # Page 3: Top Values by Selected Variable
72
+ elif page == "Top Values by Selected Variable":
73
+ st.subheader('Top Values by Selected Variable')
74
+
75
+ # Dropdown for plot type
76
+ plot_type = st.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])
77
+
78
+ # Slider to select the number of top values to display
79
+ num_columns = st.slider(
80
+ "Select Number of Columns to Display on the Chart",
81
+ min_value=5,
82
+ max_value=50,
83
+ value=10, # default value
84
+ step=1
85
+ )
86
+
87
+ # Select the top values based on the selected variable and number of columns
88
+ if plot_type == 'country':
89
+ top_values = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
90
+ x_column = 'country'
91
+ count_column = 'count'
92
+ description = f"This chart displays the top {num_columns} countries by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans. In general Phillipines is the country with the most loans followed by Kenya and El Salvador."
93
+ elif plot_type == 'repayment_interval':
94
+ top_values = df_kiva_loans_cleaned.groupby('repayment_interval')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
95
+ x_column = 'repayment_interval'
96
+ count_column = 'count'
97
+ description = f"This chart shows the top {num_columns} repayment intervals by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans. Most of the loans are funded with a monthly repayment interval, where the bullet repayment is an unsusal choice"
98
+ else: # sector
99
+ top_values = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
100
+ x_column = 'sector'
101
+ count_column = 'count'
102
+ description = f"This chart illustrates the top {num_columns} sectors by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans. Most loans are funded to the Agriculture Sector with Food and Retail completing the first three. Looks like that if the sector of the business is close to Primary production or its Basic Necessities(food) "
103
+
104
+ # Display description
105
+ st.write(description)
106
+
107
+ # Create a dual-axis bar plot using Matplotlib
108
+ fig, ax1 = plt.subplots(figsize=(12, 9))
109
+ plt.xticks(rotation=90)
110
+
111
+ # Bar plot for funded_amount
112
+ color = 'tab:blue'
113
+ ax1.set_xlabel(x_column.replace("_", " ").title())
114
+ ax1.set_ylabel('Funded Amount', color=color)
115
+ ax1.bar(top_values[x_column], top_values['sum'], color=color, alpha=0.6, label='Funded Amount')
116
+ ax1.tick_params(axis='y', labelcolor=color)
117
+
118
+ # Create a second y-axis for count
119
+ ax2 = ax1.twinx()
120
+ color = 'tab:red'
121
+ ax2.set_ylabel('Count', color=color)
122
+ ax2.plot(top_values[x_column], top_values[count_column], color=color, marker='o', linestyle='-', linewidth=2, label='Count')
123
+ ax2.tick_params(axis='y', labelcolor=color)
124
+
125
+ # Add titles and labels
126
+ plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
127
+ fig.tight_layout()
128
+ st.pyplot(fig)
129
+
130
+ # Boxplot after the dual-axis plot
131
+ st.subheader('Funded Amount vs. Selected Variable')
132
+
133
+ # Filter the data based on the selected variable and number of top values
134
+ if plot_type == 'sector':
135
+ top_values_boxplot = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg('sum').nlargest(num_columns).index
136
+ filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_boxplot)]
137
+ elif plot_type == 'country':
138
+ top_values_boxplot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('sum').nlargest(num_columns).index
139
+ filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_boxplot)]
140
+ else: # repayment_interval
141
+ filtered_df_boxplot = df_kiva_loans_cleaned
142
+
143
+ # Create a boxplot
144
+ fig, ax = plt.subplots(figsize=(12, 6))
145
+ if plot_type != 'repayment_interval':
146
+ top_values_sorted = df_kiva_loans_cleaned.groupby(plot_type)['funded_amount'].agg('sum').nlargest(num_columns).index
147
+ sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, order=top_values_sorted, ax=ax)
148
+ st.write(f"This boxplot shows the distribution of funded amounts for the top {num_columns} {plot_type.replace('_', ' ')}. It provides insights into the spread and outliers of funded amounts.")
149
+ else:
150
+ sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, ax=ax)
151
+ st.write(f"This boxplot shows the distribution of funded amounts for the top {num_columns} {plot_type.replace('_', ' ')}. It provides insights into the spread and outliers of funded amounts.")
152
+
153
+ plt.title('Funded Amount by Selected Variable')
154
+ plt.xlabel(plot_type)
155
+ plt.ylabel('Funded Amount')
156
+ plt.xticks(rotation=90)
157
+ st.pyplot(fig)
158
+
159
+ # Remaining pages (Repayment Interval by Selected Variable, Country Comparison Deepdive, Sector Comparison Deepdive)
160
+ elif page == "Repayment Interval by Selected Variable":
161
+ st.subheader('Repayment Interval by Selected Variable')
162
+
163
+ # Dropdown for selecting variable for Seaborn countplot
164
+ plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country'])
165
+
166
+ # Slider to select the number of top values to display for Seaborn countplot
167
+ num_top_values = st.slider(
168
+ "Select Number of Top Values to Display",
169
+ min_value=5,
170
+ max_value=50,
171
+ value=10, # default value
172
+ step=1
173
+ )
174
+
175
+ # Filter the data based on the selected variable and number of top values
176
+ if plot_var == 'sector':
177
+ top_values_plot = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg('count').nlargest(num_top_values).index
178
+ filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_plot)]
179
+ description = f"This countplot shows the distribution of repayment intervals for the top {num_top_values} sectors based on the number of loans. In terms of sectors Agriculture got the most monthly repayment loans followed by food. Also a lot of irregulars were in the Food, Retail and Agriculture sectors, which again confirms that loans for first necessities are given more easily. "
180
+ elif plot_var == 'country':
181
+ top_values_plot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
182
+ filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
183
+ description = f"This countplot illustrates the distribution of repayment intervals for the top {num_top_values} countries based on the number of loans. In terms of countries the Philippines had a great number of Irregular loans."
184
+
185
+ # Display description
186
+ st.write(description)
187
+
188
+ # Create a count plot
189
+ fig, ax = plt.subplots(figsize=(10, 6))
190
+
191
+ # Count the occurrences of repayment intervals for the filtered data
192
+ count_data = filtered_df_plot.groupby('repayment_interval')[plot_var].value_counts().unstack(fill_value=0)
193
+
194
+ # Calculate total counts for sorting
195
+ total_counts = count_data.sum(axis=1)
196
+
197
+ # Sort the repayment intervals based on the total count of loans in descending order
198
+ sorted_index = total_counts.sort_values(ascending=False).index
199
+ count_data = count_data.loc[sorted_index]
200
+
201
+ # Create a grouped bar plot
202
+ count_data.plot(kind='bar', ax=ax, position=0, width=0.8)
203
+ plt.title(f'Repayment Interval by {plot_var.replace("_", " ").title()}')
204
+ plt.xlabel('Repayment Interval')
205
+ plt.ylabel('Count of Loans')
206
+ plt.xticks(rotation=45)
207
+ plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
208
+ st.pyplot(fig)
209
+
210
+ # Page 5: Country Comparison Deepdive
211
+ elif page == "Country Comparison Deepdive":
212
+ st.subheader("Country Comparison Deepdive")
213
+
214
+ # Multi-select for countries
215
+ selected_countries = st.multiselect("Select Countries to Compare (Please select one or more)", options=df_kiva_loans_cleaned['country'].unique())
216
+
217
+ # Option to choose between count or sum of funded amounts
218
+ aggregation_option = st.radio("Select Aggregation Type:", ("Count of Loans", "Summary of Funded Amount"))
219
+
220
+ if selected_countries:
221
+ # Filter the data based on selected countries
222
+ filtered_data = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(selected_countries)]
223
+
224
+ # Create a combined bar plot for sector summary
225
+ st.subheader("Total Funded Amounts by Sector for Selected Countries")
226
+ if aggregation_option == "Sum":
227
+ sector_summary = filtered_data.groupby(['country', 'sector']).agg(
228
+ total_funded_amount=('funded_amount', 'sum')
229
+ ).reset_index()
230
+ st.write("This graph shows the total funded amount in each Sector for the selected Countries by the user.")
231
+ else: # Count
232
+ sector_summary = filtered_data.groupby(['country', 'sector']).agg(
233
+ total_funded_amount=('funded_amount', 'count')
234
+ ).reset_index()
235
+ st.write("This graph shows the number of loans in each Sector for the selected Countries by the user.")
236
+
237
+ fig, ax = plt.subplots(figsize=(12, 6))
238
+ sns.barplot(x='sector', y='total_funded_amount', hue='country', data=sector_summary, ax=ax)
239
+ plt.title(f'Total Funded Amount by Sector for Selected Countries ({aggregation_option})')
240
+ plt.xlabel('Sector')
241
+ plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans')
242
+ plt.xticks(rotation=45)
243
+ st.pyplot(fig)
244
+
245
+ # Create a combined bar plot for repayment summary
246
+ st.subheader("Total Funded Amounts by Repayment Interval for Selected Countries")
247
+ if aggregation_option == "Summary of Funded Amount":
248
+ repayment_summary = filtered_data.groupby(['country', 'repayment_interval']).agg(
249
+ total_funded_amount=('funded_amount', 'sum')
250
+ ).reset_index()
251
+ st.write("This graph shows the total funded amount in each Repayment interval for the selected Countries by the user.")
252
+ else: # Count
253
+ repayment_summary = filtered_data.groupby(['country', 'repayment_interval']).agg(
254
+ total_funded_amount=('funded_amount', 'count')
255
+ ).reset_index()
256
+ st.write("This graph shows the number of loans in each Repayment interval for the selected Countries by the user.")
257
+
258
+ fig, ax = plt.subplots(figsize=(12, 6))
259
+ sns.barplot(x='repayment_interval', y='total_funded_amount', hue='country', data=repayment_summary, ax=ax)
260
+ plt.title(f'Total Funded Amount by Repayment Interval for Selected Countries ({aggregation_option})')
261
+ plt.xlabel('Repayment Interval')
262
+ plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans')
263
+ plt.xticks(rotation=45)
264
+ st.pyplot(fig)
265
+ else:
266
+ st.write("Please select one or more countries to compare from the dropdown above.")
267
+
268
+ # Page 6: Sector Comparison Deepdive
269
+ elif page == "Sector Comparison Deepdive":
270
+ st.subheader("Sector Comparison Deepdive")
271
+
272
+ # Multi-select for sectors
273
+ selected_sectors = st.multiselect("Select Sectors to Compare (Please select one or more)", options=df_kiva_loans_cleaned['sector'].unique())
274
+
275
+ # Option to choose between count or sum of funded amounts
276
+ aggregation_option = st.radio("Select Aggregation Type:", ("Count of Loans", "Summary of Funded Amount"))
277
+
278
+ if selected_sectors:
279
+ # Filter the data based on selected sectors
280
+ filtered_data = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(selected_sectors)]
281
+
282
+ # Create a combined bar plot for sector summary by country
283
+ st.subheader("Total Funded Amounts by Country for Selected Sectors")
284
+ if aggregation_option == "Summary of Funded Amount":
285
+ country_summary = filtered_data.groupby(['country', 'sector']).agg(
286
+ total_funded_amount=('funded_amount', 'sum')
287
+ ).reset_index()
288
+ st.write("This graph shows the total funded amount in each Country, for the selected Sectors by the user.")
289
+ else: # Count
290
+ country_summary = filtered_data.groupby(['country', 'sector']).agg(
291
+ total_funded_amount=('funded_amount', 'count')
292
+ ).reset_index()
293
+ st.write("This graph shows the number of loans in each Country, for the selected Sectors by the user.")
294
+
295
+ fig, ax = plt.subplots(figsize=(12, 6))
296
+ sns.barplot(x='country', y='total_funded_amount', hue='sector', data=country_summary, ax=ax)
297
+ plt.title(f'Total Funded Amount by Country for Selected Sectors ({aggregation_option})')
298
+ plt.xlabel('Country')
299
+ plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans')
300
+ plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')
301
+ plt.xticks(rotation=90)
302
+ st.pyplot(fig)
303
+
304
+ # Create a combined bar plot for repayment summary
305
+ st.subheader("Total Funded Amounts by Repayment Interval for Selected Sectors")
306
+ if aggregation_option == "Sum":
307
+ repayment_summary = filtered_data.groupby(['repayment_interval', 'sector']).agg(
308
+ total_funded_amount=('funded_amount', 'sum')
309
+ ).reset_index()
310
+ st.write("This graph shows the funded amount in each Repayment interval for the selected Sectors by the user.")
311
+ else: # Count
312
+ repayment_summary = filtered_data.groupby(['repayment_interval', 'sector']).agg(
313
+ total_funded_amount=('funded_amount', 'count')
314
+ ).reset_index()
315
+ st.write("This graph shows the number of loans in each Repayment interval for the selected Sectors by the user.")
316
+
317
+ fig, ax = plt.subplots(figsize=(12, 6))
318
+ sns.barplot(x='repayment_interval', y='total_funded_amount', hue='sector', data=repayment_summary, ax=ax)
319
+ plt.title(f'Total Funded Amount by Repayment Interval for Selected Sectors ({aggregation_option})')
320
+ plt.xlabel('Repayment Interval')
321
+ plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans')
322
+ plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')
323
+ plt.xticks(rotation=90)
324
+ st.pyplot(fig)
325
+ else:
326
+ st.write("Please select one or more countries to compare from the dropdown above.")
327
+
328
+ # Page 7: KMeans Clustering & Recommendations
329
+ elif page == "KMeans Clustering & Recommendations":
330
+ st.subheader("KMeans Clustering & Recommendations")
331
+
332
+ # User input to choose the number of sample rows
333
+ sample_size = st.slider("Select the number of sample rows for clustering:", min_value=1000, max_value=100000, value=20000, step=1000)
334
+
335
+ # Sample the selected number of rows from the DataFrame
336
+ df_sample = df_kiva_loans_cleaned.sample(n=sample_size, random_state=42).copy()
337
+
338
+ # Keeping only the relevant columns and storing original indices
339
+ df_original = df_sample[['country','funded_amount', 'sector','repayment_interval']].copy()
340
+ df_original['original_index'] = df_sample.index # Keep track of original indices
341
+
342
+ # Label Encoding for categorical variables and adding encoded columns with "_id" suffix
343
+ label_encoders = {}
344
+ for column in df_original.select_dtypes(include=['object']).columns:
345
+ le = LabelEncoder()
346
+ df_original[column + '_id'] = le.fit_transform(df_original[column])
347
+ label_encoders[column] = le
348
+
349
+ # Standardizing the data using the encoded columns
350
+ encoded_columns = [col + '_id' for col in df_original.select_dtypes(include=['object']).columns]
351
+ scaler = StandardScaler()
352
+ df_scaled = scaler.fit_transform(df_original[encoded_columns + ['funded_amount']])
353
+
354
+ # Applying PCA
355
+ pca = PCA(n_components=2) # Reduce to 2 dimensions for visualization
356
+ df_pca = pca.fit_transform(df_scaled)
357
+
358
+ # Elbow Method to find the optimal number of clusters
359
+ inertia = []
360
+ for n in range(1, 11):
361
+ kmeans = KMeans(n_clusters=n, random_state=42)
362
+ kmeans.fit(df_pca)
363
+ inertia.append(kmeans.inertia_)
364
+
365
+ # Plotting the Elbow Method
366
+ plt.figure(figsize=(8, 6))
367
+ plt.plot(range(1, 11), inertia, marker='o', linestyle='--')
368
+ plt.title('Elbow Method for Optimal Number of Clusters')
369
+ plt.xlabel('Number of Clusters')
370
+ plt.ylabel('Inertia')
371
+ st.pyplot(plt.gcf())
372
+
373
+ # User input to choose the optimal number of clusters
374
+ optimal_clusters = st.slider("Select the number of optimal clusters:", min_value=1, max_value=10, value=4, step=1)
375
+
376
+ # Apply KMeans with optimal clusters
377
+ kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
378
+ df_original['cluster'] = kmeans.fit_predict(df_pca)
379
+
380
+ # Visualize the clustering results at different iterations
381
+ max_iters = [1, 2, 5, 6, 8, 10] # Different iterations you want to visualize
382
+
383
+ # Increase the figure size for better visibility
384
+ plt.figure(figsize=(15, 55)) # Adjusted the figsize to make plots larger
385
+ for i, max_iter in enumerate(max_iters, start=1):
386
+ kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, max_iter=max_iter)
387
+ df_original['cluster'] = kmeans.fit_predict(df_pca)
388
+
389
+ # Plotting the clusters
390
+ plt.subplot(6, 1, i) # Changed the layout to 3 rows x 2 columns for larger plots
391
+ sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_original['cluster'], palette='viridis', s=100)
392
+
393
+ # Plotting the centroids
394
+ centroids = kmeans.cluster_centers_
395
+ plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=300, marker='X', label='Centroids') # Increased centroid size
396
+
397
+ plt.title(f'K-means Clustering - Iteration {max_iter}', fontsize=16)
398
+ plt.xlabel('Principal Component 1', fontsize=14)
399
+ plt.ylabel('Principal Component 2', fontsize=14)
400
+ plt.xticks(fontsize=12)
401
+ plt.yticks(fontsize=12)
402
+ if i == 1:
403
+ plt.legend()
404
+
405
+ plt.tight_layout()
406
+ st.pyplot(plt.gcf())
407
+
408
+ # New Input: Select a cluster and display top 10 data points
409
+ st.subheader("Explore a Cluster")
410
+ selected_cluster = st.selectbox("Select a Cluster", options=sorted(df_original['cluster'].unique()))
411
+
412
+ # Filter data based on selected cluster
413
+ cluster_data = df_original[df_original['cluster'] == selected_cluster]
414
+
415
+ st.write(f"Top 10 items in Cluster {selected_cluster}:")
416
+ st.write(cluster_data.head(10))
417
+
418
+ # Dynamic input for the new data point
419
+ st.subheader("Input New Data Point for Recommendations")
420
+
421
+ # Allow the user to select the country, sector, and repayment interval
422
+ country = st.selectbox("Select Country", options=df_kiva_loans_cleaned['country'].unique())
423
+ sector = st.selectbox("Select Sector", options=df_kiva_loans_cleaned['sector'].unique())
424
+ repayment_interval = st.selectbox("Select Repayment Interval", options=df_kiva_loans_cleaned['repayment_interval'].unique())
425
+
426
+ # Allow the user to select the funded amount using a slider
427
+ funded_amount = st.slider("Select Funded Amount", min_value=int(df_kiva_loans_cleaned['funded_amount'].min()), max_value=int(df_kiva_loans_cleaned['funded_amount'].max()), value=1500)
428
+
429
+ new_data = {
430
+ 'country': country,
431
+ 'funded_amount': funded_amount,
432
+ 'sector': sector,
433
+ 'repayment_interval': repayment_interval
434
+ }
435
+
436
+ # Convert new data to DataFrame
437
+ new_data_df = pd.DataFrame([new_data])
438
+
439
+ # Encode the new data point and add encoded columns with "_id" suffix
440
+ for column in new_data_df.select_dtypes(include=['object']).columns:
441
+ new_data_df[column + '_id'] = label_encoders[column].transform(new_data_df[column])
442
+
443
+ # Standardize the new data using the encoded columns
444
+ new_data_scaled = scaler.transform(new_data_df[[col + '_id' for col in new_data_df.select_dtypes(include=['object']).columns] + ['funded_amount']])
445
+
446
+ # Apply PCA to the new data
447
+ new_data_pca = pca.transform(new_data_scaled)
448
+
449
+ # Predict the cluster for the new data point
450
+ new_cluster = kmeans.predict(new_data_pca)[0]
451
+
452
+ st.subheader("Top 5 Similar Items to the Input")
453
+ st.write(f"The new data point belongs to cluster: {new_cluster}")
454
+
455
+ # Get all data points in the same cluster
456
+ cluster_data = df_original[df_original['cluster'] == new_cluster]
457
+
458
+ # Apply the same PCA transformation to the scaled data of the entire cluster
459
+ cluster_data_pca = pca.transform(scaler.transform(cluster_data[encoded_columns + ['funded_amount']]))
460
+
461
+ # Calculate the Euclidean distance between the new data point and all points in the same cluster
462
+ distances = cdist(new_data_pca, cluster_data_pca, 'euclidean')[0]
463
+
464
+ # Add distances to the cluster data DataFrame
465
+ cluster_data = cluster_data.copy()
466
+ cluster_data['distance'] = distances
467
+
468
+ # Sort by distance and select the top 5 closest items
469
+ top_5_recommendations = cluster_data.sort_values('distance').head(5)
470
+
471
+ # Retrieve the original rows from the original DataFrame before encoding
472
+ recommended_indices = top_5_recommendations['original_index']
473
+ recommendations = df_kiva_loans_cleaned.loc[recommended_indices]
474
+
475
+ # Display the original rows as the top 5 recommendations
476
+ st.write(recommendations)
477
+
478
+
479
+
480
+ # Page 8: Hierarchical Clustering & Dendrogram
481
+ elif page == "Hierarchical Clustering & Dendrogram":
482
+ st.subheader("Hierarchical Clustering & Dendrogram")
483
+
484
+ # User input to choose the number of sample rows
485
+ sample_size = st.slider("Select the number of sample rows for clustering:", min_value=100, max_value=5000, value=150, step=50)
486
+
487
+ # User input to choose the number of clusters
488
+ n_clusters = st.slider("Select the number of clusters:", min_value=2, max_value=10, value=4, step=1)
489
+
490
+ # Sample the selected number of rows from the DataFrame
491
+ df_sample = df_kiva_loans_cleaned.sample(n=sample_size, random_state=42).copy()
492
+
493
+ # Keeping only the relevant columns and storing original indices
494
+ df_original = df_sample[['funded_amount', 'loan_amount']].copy()
495
+ df_original['original_index'] = df_sample.index # Keep track of original indices
496
+
497
+ # Standardizing the data
498
+ scaler = StandardScaler()
499
+ df_scaled = scaler.fit_transform(df_original[['funded_amount', 'loan_amount']])
500
+
501
+ # Perform Agglomerative Clustering with dynamic n_clusters
502
+ agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
503
+ df_original['cluster'] = agg_clustering.fit_predict(df_scaled)
504
+
505
+ # Plot the resulting clusters
506
+ plt.figure(figsize=(10, 7))
507
+ sns.scatterplot(x=df_original['funded_amount'], y=df_original['loan_amount'], hue=df_original['cluster'], palette='viridis', s=50)
508
+ plt.title(f'Agglomerative Clustering (Hierarchical) Results - {n_clusters} Clusters')
509
+ plt.xlabel('Funded Amount')
510
+ plt.ylabel('Loan Amount')
511
+ st.pyplot(plt.gcf())
512
+
513
+ # Dendrogram Visualization
514
+ linked = linkage(df_scaled, method='ward')
515
+
516
+ plt.figure(figsize=(10, 7))
517
+ dendrogram(linked,
518
+ orientation='top',
519
+ labels=df_original['original_index'].values, # Loan IDs as labels
520
+ distance_sort='descending',
521
+ show_leaf_counts=True)
522
+ plt.title('Hierarchical Clustering Dendrogram with Loan IDs')
523
+ plt.xlabel('Loan ID')
524
+ plt.xticks(rotation=90)
525
+ plt.ylabel('Distance')
526
+ st.pyplot(plt.gcf())
527
+
BDS24_Weekly_Assignments/kiva_loans.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b20efc20de600b27608d69fe07e728b00a075c3db29849e146b717098f778d92
3
+ size 195852823
BDS24_Weekly_Assignments/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ seaborn
3
+ pandas
4
+ matplotlib
5
+ altair
6
+ scipy
7
+ scikit-learn
Streamlit_Lecture/app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import altair as alt
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+
8
# Function to load the dataset
@st.cache_data  # Cache the result so reruns don't re-read the CSV
def load_data(file_path='https://raw.githubusercontent.com/aaubs/ds-master/main/apps/M1-attrition-streamlit/HR-Employee-Attrition-synth.csv'):
    """Load the HR attrition data set and add an ``AgeGroup`` bucket column.

    Parameters
    ----------
    file_path : str, optional
        CSV location (URL or local path).  Defaults to the course data set,
        so existing ``load_data()`` callers are unaffected; the parameter
        generalizes the previously hard-coded URL.

    Returns
    -------
    pandas.DataFrame
        The raw columns plus a categorical ``AgeGroup`` column.
    """
    # Load the CSV file into a pandas dataframe
    df = pd.read_csv(file_path)

    # Bucket ages into labelled groups; right=False makes the bins
    # [18, 25), [25, 35), [35, 45), [45, 60).
    bin_edges = [18, 25, 35, 45, 60]
    bin_labels = ['18-24', '25-34', '35-44', '45-60']
    df['AgeGroup'] = pd.cut(df['Age'], bins=bin_edges, labels=bin_labels, right=False)

    return df
23
+
24
# Load the data using the defined function
df = load_data()

# Page title and the sidebar section that will hold the filters.
st.title("Employee Attrition Dashboard 😊📈")
st.sidebar.header("Filters 📊")
30
+
31
# Introduction

# HR Attrition Dashboard

# Welcome text shown at the top of every page.
st.markdown("""
Welcome to the HR Attrition Dashboard. In the backdrop of rising employee turnovers, HR departments are stressing the significance of predicting and understanding employee departures. Through the lens of data analytics, this dashboard unveils the deeper causes of employee churn and proposes strategies to boost employee retention.
""")

# Collapsible statement of the dashboard's goals.
with st.expander("📊 **Objective**"):
    st.markdown("""
At the heart of this dashboard is the mission to visually decode data, equipping HR experts with insights to tackle these queries:
- Which company factions face a greater likelihood of employee exits?
- What might be pushing these individuals to part ways?
- Observing the discerned trends, what incentives might hold the key to decreasing the attrition rate?
""")

# Tutorial Expander
with st.expander("How to Use the Dashboard 📚"):
    st.markdown("""
1. **Filter Data** - Use the sidebar filters to narrow down specific data sets.
2. **Visualize Data** - From the dropdown, select a visualization type to view patterns.
3. **Insights & Recommendations** - Scroll down to see insights derived from the visualizations and actionable recommendations.
""")
54
+
55
+
56
# Sidebar filter: Age Group — default to all groups; stop rendering if none chosen.
selected_age_group = st.sidebar.multiselect("Select Age Groups 🕰️", df['AgeGroup'].unique().tolist(), default=df['AgeGroup'].unique().tolist())
if not selected_age_group:
    st.warning("Please select an age group from the sidebar ⚠️")
    st.stop()
filtered_df = df[df['AgeGroup'].isin(selected_age_group)]

# Sidebar filter: Department — applied on top of the age-group filter.
departments = df['Department'].unique().tolist()
selected_department = st.sidebar.multiselect("Select Departments 🏢", departments, default=departments)
if not selected_department:
    st.warning("Please select a department from the sidebar ⚠️")
    st.stop()
filtered_df = filtered_df[filtered_df['Department'].isin(selected_department)]

# Sidebar filter: Monthly Income Range — inclusive on both ends.
min_income = int(df['MonthlyIncome'].min())
max_income = int(df['MonthlyIncome'].max())
income_range = st.sidebar.slider("Select Monthly Income Range 💰", min_income, max_income, (min_income, max_income))
filtered_df = filtered_df[filtered_df['MonthlyIncome'].between(income_range[0], income_range[1])]

# Sidebar filter: Job Satisfaction Level
satisfaction_levels = sorted(df['JobSatisfaction'].unique().tolist())
selected_satisfaction = st.sidebar.multiselect("Select Job Satisfaction Levels 😊", satisfaction_levels, default=satisfaction_levels)
if not selected_satisfaction:
    st.warning("Please select a job satisfaction level from the sidebar ⚠️")
    st.stop()
filtered_df = filtered_df[filtered_df['JobSatisfaction'].isin(selected_satisfaction)]
84
+
85
# Section header for the interactive chart area.
st.header("Attrition Analysis 📊")

# The available chart names; the selectbox value drives the branch below.
visualization_choices = [
    "Attrition by Age Group",
    "KDE Plot: Distance from Home by Attrition",
    "Attrition by Job Role",
    "Attrition Distribution by Gender",
    "MonthlyRate and DailyRate by JobLevel",
]
visualization_option = st.selectbox("Select Visualization 🎨", visualization_choices)
97
+
98
# Render whichever chart the user picked. Matplotlib branches build an
# explicit Figure and pass it to st.pyplot — handing over the global `plt`
# module is deprecated in Streamlit and leaks figure state across reruns —
# and close the figure afterwards to release memory.
if visualization_option == "Attrition by Age Group":
    # Stacked bar chart: head-count per age group, coloured by attrition.
    chart = alt.Chart(filtered_df).mark_bar().encode(
        x='AgeGroup',
        y='count()',
        color='Attrition'
    ).properties(
        title='Attrition Rate by Age Group'
    )
    st.altair_chart(chart, use_container_width=True)

elif visualization_option == "KDE Plot: Distance from Home by Attrition":
    # Density of commute distance, split by attrition status.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.kdeplot(data=filtered_df, x='DistanceFromHome', hue='Attrition', fill=True, palette='Set2', ax=ax)
    ax.set_xlabel('Distance From Home')
    ax.set_ylabel('Density')
    ax.set_title('KDE Plot of Distance From Home by Attrition')
    st.pyplot(fig)
    plt.close(fig)

elif visualization_option == "Attrition by Job Role":
    # Horizontal bar chart: head-count per job role, coloured by attrition.
    chart = alt.Chart(filtered_df).mark_bar().encode(
        y='JobRole',
        x='count()',
        color='Attrition'
    ).properties(
        title='Attrition by Job Role'
    )
    st.altair_chart(chart, use_container_width=True)

elif visualization_option == "Attrition Distribution by Gender":
    # Pie chart over leavers only ('Attrition' == 'Yes'), split by gender.
    pie_chart_data = filtered_df[filtered_df['Attrition'] == 'Yes']['Gender'].value_counts().reset_index()
    pie_chart_data.columns = ['Gender', 'count']

    chart = alt.Chart(pie_chart_data).mark_arc().encode(
        theta='count:Q',
        color='Gender:N',
        tooltip=['Gender', 'count']
    ).properties(
        title='Attrition Distribution by Gender',
        width=300,
        height=300
    ).project('identity')
    st.altair_chart(chart, use_container_width=True)

elif visualization_option == "MonthlyRate and DailyRate by JobLevel":
    # Side-by-side boxplots of the two pay-rate columns across job levels.
    fig, ax = plt.subplots(1, 2, figsize=(15, 7))

    # MonthlyRate by JobLevel
    sns.boxplot(x="JobLevel", y="MonthlyRate", data=filtered_df, ax=ax[0], hue="JobLevel", palette='Set2', legend=False)
    ax[0].set_title('MonthlyRate by JobLevel')
    ax[0].set_xlabel('Job Level')
    ax[0].set_ylabel('Monthly Rate')

    # DailyRate by JobLevel
    sns.boxplot(x="JobLevel", y="DailyRate", data=filtered_df, ax=ax[1], hue="JobLevel", palette='Set2', legend=False)
    ax[1].set_title('DailyRate by JobLevel')
    ax[1].set_xlabel('Job Level')
    ax[1].set_ylabel('Daily Rate')

    plt.tight_layout()
    st.pyplot(fig)
    plt.close(fig)
164
+
165
# Summary statistics of the full (unfiltered) dataset.
st.header("Dataset Overview")
st.dataframe(df.describe())

# Collapsible write-up tying each chart to the question it answers.
with st.expander("Insights from Visualization 🧠"):
    insights_text = """
    1. **Age Groups & Attrition** - The 'Attrition by Age Group' plot showcases which age brackets face higher attrition.
    2. **Home Distance's Impact** - The 'KDE Plot: Distance from Home by Attrition' visualizes if being farther away influences leaving tendencies.
    3. **Roles & Attrition** - 'Attrition by Job Role' reveals which roles might be more attrition-prone.
    4. **Gender & Attrition** - The pie chart for 'Attrition Distribution by Gender' provides insights into any gender-based patterns.
    5. **Earnings Patterns** - 'MonthlyRate and DailyRate by JobLevel' boxplots display the compensation distribution across job levels.
    """
    st.markdown(insights_text)

# Collapsible list of suggested HR actions derived from the insights.
with st.expander("Recommendations for Action 🌟"):
    recommendations_text = """
    - 🎁 **Incentive Programs:** Introduce incentives tailored for groups showing higher attrition tendencies.
    - 🏡 **Remote Work Options:** Providing flexibility, especially for those living farther from the workplace, could reduce attrition.
    - 🚀 **Training & Growth:** Invest in employee development, especially in roles with higher attrition rates.
    - 👫 **Gender Equality:** Foster an environment that supports equal opportunities regardless of gender.
    - 💸 **Compensation Review:** Regularly review and adjust compensation structures to stay competitive and retain talent.
    """
    st.markdown(recommendations_text)
Streamlit_Lecture/app_inclass_task.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Import necessary libraries
import streamlit as st
import pandas as pd

# Page title (replaces the unprofessional placeholder string).
st.title('Attrition Income Explorer')

# Public copy of the synthetic HR-attrition dataset.
file_path = 'https://raw.githubusercontent.com/aaubs/ds-master/main/apps/M1-attrition-streamlit/HR-Employee-Attrition-synth.csv'

data = pd.read_csv(file_path)

# Let the user choose which categorical variable to break attrition down by.
selected = st.sidebar.selectbox("Select Variable of Interest", ['Department', 'Gender'])

# Mean monthly income grouped by attrition status and the chosen variable.
st.table(data.groupby(['Attrition', selected])['MonthlyIncome'].mean())
Streamlit_Lecture/data.csv ADDED
The diff for this file is too large to render. See raw diff
 
Streamlit_Lecture/exploring_stuff.ipynb ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "\n",
11
+ "file_path='https://raw.githubusercontent.com/aaubs/ds-master/main/apps/M1-attrition-streamlit/HR-Employee-Attrition-synth.csv'\n",
12
+ "\n",
13
+ "data=pd.read_csv(file_path)\n"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 12,
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "name": "stdout",
23
+ "output_type": "stream",
24
+ "text": [
25
+ "<class 'pandas.core.frame.DataFrame'>\n",
26
+ "RangeIndex: 2000 entries, 0 to 1999\n",
27
+ "Data columns (total 36 columns):\n",
28
+ " # Column Non-Null Count Dtype \n",
29
+ "--- ------ -------------- ----- \n",
30
+ " 0 Unnamed: 0 2000 non-null int64 \n",
31
+ " 1 Age 2000 non-null int64 \n",
32
+ " 2 Attrition 2000 non-null object\n",
33
+ " 3 BusinessTravel 2000 non-null object\n",
34
+ " 4 DailyRate 2000 non-null int64 \n",
35
+ " 5 Department 2000 non-null object\n",
36
+ " 6 DistanceFromHome 2000 non-null int64 \n",
37
+ " 7 Education 2000 non-null int64 \n",
38
+ " 8 EducationField 2000 non-null object\n",
39
+ " 9 EmployeeCount 2000 non-null int64 \n",
40
+ " 10 EmployeeNumber 2000 non-null int64 \n",
41
+ " 11 EnvironmentSatisfaction 2000 non-null int64 \n",
42
+ " 12 Gender 2000 non-null object\n",
43
+ " 13 HourlyRate 2000 non-null int64 \n",
44
+ " 14 JobInvolvement 2000 non-null int64 \n",
45
+ " 15 JobLevel 2000 non-null int64 \n",
46
+ " 16 JobRole 2000 non-null object\n",
47
+ " 17 JobSatisfaction 2000 non-null int64 \n",
48
+ " 18 MaritalStatus 2000 non-null object\n",
49
+ " 19 MonthlyIncome 2000 non-null int64 \n",
50
+ " 20 MonthlyRate 2000 non-null int64 \n",
51
+ " 21 NumCompaniesWorked 2000 non-null int64 \n",
52
+ " 22 Over18 2000 non-null object\n",
53
+ " 23 OverTime 2000 non-null object\n",
54
+ " 24 PercentSalaryHike 2000 non-null int64 \n",
55
+ " 25 PerformanceRating 2000 non-null int64 \n",
56
+ " 26 RelationshipSatisfaction 2000 non-null int64 \n",
57
+ " 27 StandardHours 2000 non-null int64 \n",
58
+ " 28 StockOptionLevel 2000 non-null int64 \n",
59
+ " 29 TotalWorkingYears 2000 non-null int64 \n",
60
+ " 30 TrainingTimesLastYear 2000 non-null int64 \n",
61
+ " 31 WorkLifeBalance 2000 non-null int64 \n",
62
+ " 32 YearsAtCompany 2000 non-null int64 \n",
63
+ " 33 YearsInCurrentRole 2000 non-null int64 \n",
64
+ " 34 YearsSinceLastPromotion 2000 non-null int64 \n",
65
+ " 35 YearsWithCurrManager 2000 non-null int64 \n",
66
+ "dtypes: int64(27), object(9)\n",
67
+ "memory usage: 562.6+ KB\n"
68
+ ]
69
+ }
70
+ ],
71
+ "source": [
72
+ "data.info()"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 8,
78
+ "metadata": {},
79
+ "outputs": [
80
+ {
81
+ "data": {
82
+ "text/plain": [
83
+ "Attrition\n",
84
+ "No 8033.23208\n",
85
+ "Yes 8676.02349\n",
86
+ "Name: MonthlyIncome, dtype: float64"
87
+ ]
88
+ },
89
+ "execution_count": 8,
90
+ "metadata": {},
91
+ "output_type": "execute_result"
92
+ }
93
+ ],
94
+ "source": [
95
+ "data.groupby('Attrition')['MonthlyIncome'].mean()"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 13,
101
+ "metadata": {},
102
+ "outputs": [
103
+ {
104
+ "data": {
105
+ "text/plain": [
106
+ "Attrition Department \n",
107
+ "No Human Resources 8212.311828\n",
108
+ " Research & Development 8018.842391\n",
109
+ " Sales 8028.274311\n",
110
+ "Yes Human Resources 7557.600000\n",
111
+ " Research & Development 8908.348315\n",
112
+ " Sales 8401.754545\n",
113
+ "Name: MonthlyIncome, dtype: float64"
114
+ ]
115
+ },
116
+ "execution_count": 13,
117
+ "metadata": {},
118
+ "output_type": "execute_result"
119
+ }
120
+ ],
121
+ "source": [
122
+ "data.groupby(['Attrition','Department'])['MonthlyIncome'].mean()"
123
+ ]
124
+ }
125
+ ],
126
+ "metadata": {
127
+ "kernelspec": {
128
+ "display_name": "base",
129
+ "language": "python",
130
+ "name": "python3"
131
+ },
132
+ "language_info": {
133
+ "codemirror_mode": {
134
+ "name": "ipython",
135
+ "version": 3
136
+ },
137
+ "file_extension": ".py",
138
+ "mimetype": "text/x-python",
139
+ "name": "python",
140
+ "nbconvert_exporter": "python",
141
+ "pygments_lexer": "ipython3",
142
+ "version": "3.12.4"
143
+ }
144
+ },
145
+ "nbformat": 4,
146
+ "nbformat_minor": 2
147
+ }
Streamlit_Lecture/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ seaborn
3
+ pandas
4
+ matplotlib
5
+ altair