Tom committed on
Commit
c04ece2
·
1 Parent(s): a0d8c9f
Files changed (1) hide show
  1. app.py +260 -84
app.py CHANGED
@@ -3,15 +3,17 @@ import numpy as np
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import warnings
6
- import gradio as gr
 
 
 
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.ensemble import RandomForestClassifier
9
  from sklearn.metrics import classification_report
10
  from sklearn.preprocessing import StandardScaler
11
  from sklearn.cluster import KMeans
12
  from sklearn.decomposition import PCA
13
- import io
14
- from PIL import Image
15
 
16
  # Suppress specific FutureWarnings
17
  warnings.filterwarnings("ignore", category=FutureWarning)
@@ -39,6 +41,19 @@ def clean_data(df):
39
 
40
  df = df.drop(columns=['nsn'], errors='ignore')
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  if 'ncea_results' in df.columns:
43
  ncea_results = []
44
  for idx, row in df.iterrows():
@@ -55,11 +70,22 @@ def clean_data(df):
55
  else:
56
  df['Total Credits'] = 0
57
 
 
 
 
 
 
58
  df['credit_threshold'] = df['year_level'].apply(lambda x: 80 if x == 'Year 11' else 60)
59
  df['credit_achievement_rate'] = df['Total Credits'] / df['credit_threshold']
60
 
61
  return df
62
 
 
 
 
 
 
 
63
  def identify_at_risk_students(df):
64
  def prepare_data_for_modeling(df):
65
  df_model = df.drop(columns=[
@@ -88,43 +114,43 @@ def identify_at_risk_students(df):
88
  model.fit(X_train, y_train)
89
  y_pred = model.predict(X_test)
90
 
91
- classification_rep = classification_report(y_test, y_pred)
92
 
93
  importances = model.feature_importances_
94
  feature_names = features.columns
95
  feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
96
  feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
97
 
98
- return classification_rep, feature_importance_df
99
-
100
- def plot_credit_achievement_rate(df):
101
- plt.figure(figsize=(10, 6))
102
- sns.histplot(df['credit_achievement_rate'], kde=True, bins=30)
103
- plt.title('Distribution of Credit Achievement Rate')
104
- plt.xlabel('Credit Achievement Rate')
105
- plt.ylabel('Frequency')
106
- buf = io.BytesIO()
107
- plt.savefig(buf, format='png')
108
- buf.seek(0)
109
- plt.close()
110
- return Image.open(buf)
111
-
112
- def plot_feature_importance(feature_importance_df):
113
- plt.figure(figsize=(12, 6))
114
- sns.barplot(data=feature_importance_df.head(10), x='Importance', y='Feature', palette='viridis')
115
- plt.title('Top 10 Important Features for Predicting At-Risk Students', fontsize=14)
116
- plt.xlabel('Importance', fontsize=12)
117
- plt.ylabel('Feature', fontsize=12)
118
- plt.tight_layout()
119
- buf = io.BytesIO()
120
- plt.savefig(buf, format='png')
121
- buf.seek(0)
122
- plt.close()
123
- return Image.open(buf)
124
 
125
  def analyze_extra_curricular_impact(df):
 
126
  activity_cols = [col for col in df.columns if col in ['Cricket', 'Debating', 'Football', 'Art Club', 'Drama Club', 'Rugby']]
127
- images = []
128
  for activity in activity_cols:
129
  if activity in df.columns:
130
  data = df.copy()
@@ -136,19 +162,86 @@ def analyze_extra_curricular_impact(df):
136
  plt.xlabel('Participation Status', fontsize=12)
137
  plt.ylabel('Average Credit Achievement Rate', fontsize=12)
138
  plt.tight_layout()
139
- buf = io.BytesIO()
140
- plt.savefig(buf, format='png')
141
- buf.seek(0)
142
- plt.close()
143
- images.append(Image.open(buf))
144
- return images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  def plot_correlation_with_credit_achievement(df):
 
 
147
  numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
148
  corr_matrix = df[numeric_cols].corr()
149
 
150
  if 'credit_achievement_rate' not in corr_matrix.columns:
151
- return None
 
152
 
153
  corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=1, errors='ignore')
154
  corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=0, errors='ignore')
@@ -160,52 +253,135 @@ def plot_correlation_with_credit_achievement(df):
160
  plt.title('Correlation with Credit Achievement Rate', fontsize=16)
161
  plt.xticks(rotation=45, ha='right', fontsize=10)
162
  plt.tight_layout()
163
- buf = io.BytesIO()
164
- plt.savefig(buf, format='png')
165
- buf.seek(0)
166
- plt.close()
167
- return Image.open(buf)
168
-
169
- def analyze_data(file):
170
- try:
171
- df = pd.read_csv(file)
172
- df = clean_data(df)
173
- # Generate classification report and feature importance
174
- classification_rep, feature_importance_df = identify_at_risk_students(df)
175
- # Generate plots
176
- plot_img = plot_credit_achievement_rate(df)
177
- feature_importance_img = plot_feature_importance(feature_importance_df)
178
- extra_curricular_imgs = analyze_extra_curricular_impact(df)
179
- correlation_img = plot_correlation_with_credit_achievement(df)
180
- result_text = f"Data analysis complete.\n\nClassification Report:\n{classification_rep}"
181
- return result_text, plot_img, feature_importance_img, extra_curricular_imgs, correlation_img
182
- except Exception as e:
183
- return f"An error occurred: {str(e)}", None, None, [], None
184
-
185
- # Gradio Interface
186
- def analyze_uploaded_file(file):
187
- text, plot, feature_importance_plot, extra_curricular_plots, correlation_plot = analyze_data(file)
188
- outputs = [text, plot if plot else None, feature_importance_plot if feature_importance_plot else None]
189
- outputs.extend(extra_curricular_plots if extra_curricular_plots else [None] * 6)
190
- outputs.append(correlation_plot if correlation_plot else None)
191
- return outputs
192
-
193
- with gr.Blocks() as demo:
194
- gr.Markdown("""
195
- # Student Data Analysis Tool
196
- Upload your CSV file to analyze student data and generate insights.
197
- """)
198
 
199
- with gr.Row():
200
- file_input = gr.File(label="Upload CSV File")
201
- text_output = gr.Textbox(label="Analysis Summary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
- with gr.Row():
204
- image_output = gr.Image(label="Generated Plot", type="pil")
205
- feature_importance_output = gr.Image(label="Feature Importance Plot", type="pil")
206
- extra_curricular_outputs = [gr.Image(label=f"Extra Curricular Impact Plot {i+1}", type="pil") for i in range(6)]
207
- correlation_output = gr.Image(label="Correlation with Credit Achievement Rate", type="pil")
208
 
209
- file_input.change(analyze_uploaded_file, inputs=file_input, outputs=[text_output, image_output, feature_importance_output] + extra_curricular_outputs + [correlation_output])
210
-
211
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import warnings
6
+ import io
7
+ import base64
8
+ import os
9
+ import tempfile
10
  from sklearn.model_selection import train_test_split
11
  from sklearn.ensemble import RandomForestClassifier
12
  from sklearn.metrics import classification_report
13
  from sklearn.preprocessing import StandardScaler
14
  from sklearn.cluster import KMeans
15
  from sklearn.decomposition import PCA
16
+ import gradio as gr
 
17
 
18
  # Suppress specific FutureWarnings
19
  warnings.filterwarnings("ignore", category=FutureWarning)
 
41
 
42
  df = df.drop(columns=['nsn'], errors='ignore')
43
 
44
+ category_cols = [
45
+ 'gender', 'ethnicity', 'year_level', 'contributing_primary_school',
46
+ 'year_11_english_teacher', 'year_11_maths_teacher', 'year_12_english_teacher',
47
+ 'year_12_maths_teacher', 'form_teacher', 'leaving_date', 'primary_language',
48
+ 'first_language', 'secondary_language', 'term_1_intervention',
49
+ 'term_2_intervention', 'term_3_intervention', 'term_4_intervention',
50
+ 'major_life_event', 'learning_difficulty', 'pastoral_care_incident',
51
+ 'pastoral_care_action_taken', 'pastoral_care_follow_up'
52
+ ]
53
+ for col in category_cols:
54
+ if col in df.columns:
55
+ df[col] = df[col].astype('category')
56
+
57
  if 'ncea_results' in df.columns:
58
  ncea_results = []
59
  for idx, row in df.iterrows():
 
70
  else:
71
  df['Total Credits'] = 0
72
 
73
+ if 'pastoral_care_follow_up' in df.columns:
74
+ df['action_effective'] = df['pastoral_care_follow_up'].apply(
75
+ lambda x: 'Effective' if 'resolved' in str(x).lower() else 'Not Effective'
76
+ )
77
+
78
  df['credit_threshold'] = df['year_level'].apply(lambda x: 80 if x == 'Year 11' else 60)
79
  df['credit_achievement_rate'] = df['Total Credits'] / df['credit_threshold']
80
 
81
  return df
82
 
83
def plt_to_file():
    """Save the current matplotlib figure to a temporary PNG and return its path.

    The file is created with ``delete=False`` so it outlives this call;
    removing it later is the caller's (or a cleanup hook's) responsibility.

    Returns:
        str: Path of the PNG file that was written.
    """
    # Reserve a unique file name, then release the handle *before* matplotlib
    # writes to that path: a NamedTemporaryFile that is still open cannot be
    # re-opened by name on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmpfile:
        path = tmpfile.name
    try:
        plt.savefig(path)
    finally:
        # Close the figure even when saving fails so that figures do not
        # accumulate in pyplot's global state across requests.
        plt.close()
    return path
88
+
89
  def identify_at_risk_students(df):
90
  def prepare_data_for_modeling(df):
91
  df_model = df.drop(columns=[
 
114
  model.fit(X_train, y_train)
115
  y_pred = model.predict(X_test)
116
 
117
+ report = classification_report(y_test, y_pred)
118
 
119
  importances = model.feature_importances_
120
  feature_names = features.columns
121
  feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
122
  feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
123
 
124
+ graphs = []
125
+ tables = {}
126
+
127
+ tables['classification_report'] = report
128
+ tables['feature_importance'] = feature_importance_df.head(10).to_string()
129
+
130
+ if feature_importance_df['Importance'].sum() > 0:
131
+ plt.figure(figsize=(12, 6))
132
+ sns.barplot(data=feature_importance_df.head(10), x='Importance', y='Feature', palette='viridis')
133
+ plt.title('Top 10 Important Features for Predicting At-Risk Students', fontsize=14)
134
+ plt.xlabel('Importance', fontsize=12)
135
+ plt.ylabel('Feature', fontsize=12)
136
+ plt.tight_layout()
137
+ graphs.append(plt_to_file())
138
+
139
+ return graphs, tables
140
+
141
def process_extra_curricular(df):
    """Expand 'extra_curricular_activities' into one 0/1 column per activity.

    Each cell of 'extra_curricular_activities' is expected to hold the text of
    a Python list literal (e.g. ``"['Cricket', 'Rugby']"``); anything that is
    not a string is treated as "no activities". The parsed lists replace the
    original column, and one indicator column named after each distinct
    activity is added.

    Args:
        df: DataFrame to process. It is modified in place.

    Returns:
        The same DataFrame. If the 'extra_curricular_activities' column is
        absent, it is returned unchanged.

    Raises:
        ValueError, SyntaxError: If a cell contains text that is not a valid
            Python literal.
    """
    import ast

    if 'extra_curricular_activities' not in df.columns:
        return df

    def _parse(cell):
        if not isinstance(cell, str):
            return []
        # SECURITY: the cell comes from an uploaded CSV. literal_eval only
        # accepts literals, whereas eval() would execute arbitrary code.
        parsed = ast.literal_eval(cell)
        if isinstance(parsed, (list, tuple, set)):
            return list(parsed)
        # A lone scalar literal is treated as a single activity.
        return [parsed]

    df['extra_curricular_activities'] = df['extra_curricular_activities'].apply(_parse)

    # explode() turns an empty list into NaN; drop those so that no indicator
    # column labelled NaN is created.
    activities = df['extra_curricular_activities'].explode().dropna().unique()
    for activity in activities:
        df[activity] = df['extra_curricular_activities'].apply(
            lambda acts, name=activity: int(name in acts)
        )
    return df
150
 
151
  def analyze_extra_curricular_impact(df):
152
+ graphs = []
153
  activity_cols = [col for col in df.columns if col in ['Cricket', 'Debating', 'Football', 'Art Club', 'Drama Club', 'Rugby']]
 
154
  for activity in activity_cols:
155
  if activity in df.columns:
156
  data = df.copy()
 
162
  plt.xlabel('Participation Status', fontsize=12)
163
  plt.ylabel('Average Credit Achievement Rate', fontsize=12)
164
  plt.tight_layout()
165
+ graphs.append(plt_to_file())
166
+ return graphs
167
+
168
def analyze_teacher_performance(df):
    """Chart the mean credit achievement rate per teacher, per teacher column.

    For each of the four teacher columns, rows are restricted to students of
    the matching year level whose teacher is present and not the literal
    'Unknown'; the remaining rows are averaged per teacher and drawn as a bar
    chart.

    Args:
        df: DataFrame with 'year_level' and 'credit_achievement_rate' columns.
            Teacher columns that are absent are reported instead of raising.

    Returns:
        tuple: ``(graphs, tables)``. ``graphs`` is a list of PNG paths from
        ``plt_to_file``; ``tables`` maps ``"<column>_<year level>"`` to a
        message for every teacher column that produced no chart.
    """
    graphs = []
    tables = {}
    teacher_year_levels = {
        'year_11_english_teacher': 'Year 11',
        'year_11_maths_teacher': 'Year 11',
        'year_12_english_teacher': 'Year 12',
        'year_12_maths_teacher': 'Year 12',
    }
    for col, year_level in teacher_year_levels.items():
        no_data_message = f"No data available for {col} in {year_level}."
        if col not in df.columns:
            tables[f"{col}_{year_level}"] = no_data_message
            continue

        teachers = df[col]
        data = df[
            teachers.notna()
            & (teachers != 'Unknown')
            & (df['year_level'] == year_level)
        ]
        if data.empty:
            tables[f"{col}_{year_level}"] = no_data_message
            continue

        # observed=True: when the column has category dtype, only teachers
        # that actually occur in the filtered rows get a group; otherwise
        # every category would appear with a NaN mean.
        group = (
            data.groupby(col, observed=True)['credit_achievement_rate']
            .mean()
            .reset_index()
        )
        # Plain strings keep the x axis limited to the teachers in `group`
        # rather than the full category list of the original column.
        group[col] = group[col].astype(str)

        plt.figure(figsize=(10, 6))
        sns.barplot(data=group, x=col, y='credit_achievement_rate', palette='Set3', edgecolor='w', errorbar=None)
        plt.title(f'Average Credit Achievement Rate by {col.replace("_", " ").title()} ({year_level})', fontsize=14)
        plt.xlabel('Teacher', fontsize=12)
        plt.ylabel('Average Credit Achievement Rate', fontsize=12)
        plt.xticks(rotation=45)
        plt.tight_layout()
        graphs.append(plt_to_file())
    return graphs, tables
192
+
193
def analyze_language_impact(df):
    """Chart the mean credit achievement rate per primary language.

    Rows whose 'primary_language' is missing or the literal 'Unknown' are
    excluded before averaging.

    Args:
        df: DataFrame with a 'credit_achievement_rate' column and, ideally, a
            'primary_language' column.

    Returns:
        tuple: ``(graphs, tables)``. ``graphs`` holds the PNG path of the bar
        chart (from ``plt_to_file``) when there is data; otherwise ``tables``
        carries a 'language_impact' message and ``graphs`` is empty.
    """
    graphs = []
    tables = {}
    no_data_message = "No data available for primary languages."

    if 'primary_language' not in df.columns:
        tables['language_impact'] = no_data_message
        return graphs, tables

    languages = df['primary_language']
    data = df[languages.notna() & (languages != 'Unknown')]
    if data.empty:
        tables['language_impact'] = no_data_message
        return graphs, tables

    # observed=True: with a category-dtype column, only languages present in
    # the filtered rows form a group (no NaN rows for unused categories).
    group = (
        data.groupby('primary_language', observed=True)['credit_achievement_rate']
        .mean()
        .reset_index()
    )
    # Plain strings keep the x axis limited to the languages in `group`.
    group['primary_language'] = group['primary_language'].astype(str)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=group, x='primary_language', y='credit_achievement_rate', palette='Pastel1', edgecolor='w', errorbar=None)
    plt.title('Average Credit Achievement Rate by Primary Language', fontsize=14)
    plt.xlabel('Primary Language', fontsize=12)
    plt.ylabel('Average Credit Achievement Rate', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    graphs.append(plt_to_file())
    return graphs, tables
210
+
211
def perform_clustering(df):
    """Cluster students with K-Means on a 2-D PCA projection of scaled features.

    The features are 'credit_achievement_rate', 'age', and every numeric (or
    boolean) column whose name contains 'attendance'. Missing values are
    replaced with 0 before scaling.

    Side effect: on success a 'Cluster' column holding each row's cluster
    label is added to ``df``.

    Args:
        df: DataFrame with 'credit_achievement_rate' and 'age' columns.

    Returns:
        tuple: ``(graphs, tables)``. On success ``graphs`` holds the PNG path
        of the scatter plot and ``tables['cluster_analysis']`` the per-cluster
        feature means as text. If clustering cannot run (required column
        missing, or fewer rows than clusters), ``graphs`` is empty and
        ``tables['cluster_analysis']`` explains why.
    """
    graphs = []
    tables = {}
    n_clusters = 3
    base_cols = ['credit_achievement_rate', 'age']

    missing = [col for col in base_cols if col not in df.columns]
    if missing:
        tables['cluster_analysis'] = (
            f"Clustering skipped: missing required column(s): {', '.join(missing)}."
        )
        return graphs, tables
    if len(df) < n_clusters:
        # KMeans needs at least as many samples as clusters.
        tables['cluster_analysis'] = (
            f"Clustering skipped: at least {n_clusters} students are required, got {len(df)}."
        )
        return graphs, tables

    # Only string-named numeric columns qualify: a non-string label would make
    # the substring test raise, and text columns cannot be scaled.
    numeric_cols = set(df.select_dtypes(include=['number', 'bool']).columns)
    attendance_cols = [
        col for col in df.columns
        if isinstance(col, str) and 'attendance' in col and col in numeric_cols
    ]
    feature_cols = base_cols + attendance_cols

    features = df[feature_cols].fillna(0)
    scaled_features = StandardScaler().fit_transform(features)
    principal_components = PCA(n_components=2).fit_transform(scaled_features)
    clusters = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(principal_components)

    df['Cluster'] = clusters
    cluster_analysis = df.groupby('Cluster')[feature_cols].mean()
    tables['cluster_analysis'] = cluster_analysis.to_string()

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=principal_components[:, 0], y=principal_components[:, 1], hue=clusters, palette='Set1', s=100, alpha=0.7)
    plt.title('Student Clusters', fontsize=14)
    plt.xlabel('Principal Component 1', fontsize=12)
    plt.ylabel('Principal Component 2', fontsize=12)
    plt.legend(title='Cluster')
    plt.tight_layout()
    graphs.append(plt_to_file())
    return graphs, tables
235
 
236
  def plot_correlation_with_credit_achievement(df):
237
+ graphs = []
238
+ tables = {}
239
  numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
240
  corr_matrix = df[numeric_cols].corr()
241
 
242
  if 'credit_achievement_rate' not in corr_matrix.columns:
243
+ tables['correlation_error'] = "Error: 'credit_achievement_rate' column not found in the dataset."
244
+ return graphs, tables
245
 
246
  corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=1, errors='ignore')
247
  corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=0, errors='ignore')
 
253
  plt.title('Correlation with Credit Achievement Rate', fontsize=16)
254
  plt.xticks(rotation=45, ha='right', fontsize=10)
255
  plt.tight_layout()
256
+ graphs.append(plt_to_file())
257
+
258
+ tables['correlation_with_credit'] = correlation_with_credit.to_string()
259
+
260
+ corr_matrix_clean = corr_matrix.replace([np.inf, -np.inf], np.nan).fillna(0)
261
+
262
+ plt.figure(figsize=(12, 12))
263
+ sns.clustermap(corr_matrix_clean, annot=False, cmap='coolwarm', figsize=(12, 12), method='average')
264
+ plt.title('Cluster Map of Feature Correlations (excluding credit_threshold, Total Credits)', fontsize=16)
265
+ graphs.append(plt_to_file())
266
+
267
+ return graphs, tables
268
+
269
def plot_top_features_vs_credit(df):
    """Plot the five numeric features most correlated with credit achievement.

    Correlations are computed over the float64/int64 columns, ignoring
    'credit_threshold' and 'Total Credits'. Features are ranked by absolute
    correlation with 'credit_achievement_rate' and each of the top five is
    drawn as a line plot against it.

    Args:
        df: DataFrame containing a numeric 'credit_achievement_rate' column.

    Returns:
        tuple: ``(graphs, tables)``. ``graphs`` is a list of PNG paths from
        ``plt_to_file``; ``tables['top_corr_features']`` lists the selected
        features, or holds an error message when the target column is not
        among the numeric columns.
    """
    graphs = []
    tables = {}
    target = 'credit_achievement_rate'

    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    corr_matrix = df[numeric_cols].corr()

    corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=0, errors='ignore')
    corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=1, errors='ignore')

    if target not in corr_matrix.columns:
        tables['top_corr_features'] = "Error: 'credit_achievement_rate' column not found in the dataset."
        return graphs, tables

    # Remove the target by label instead of assuming it sorts first (another
    # feature can tie at |r| == 1), and drop undefined correlations such as
    # those of constant columns.
    target_corr = corr_matrix[target].drop(labels=[target], errors='ignore').dropna()
    top_corr_features = target_corr.abs().sort_values(ascending=False).index[:5]

    tables['top_corr_features'] = (
        "Top features most correlated with Credit Achievement Rate:\n"
        f"{', '.join(map(str, top_corr_features))}"
    )

    # Every candidate comes from the numeric selection above, so a line plot
    # is the only chart type that can apply.
    for feature in top_corr_features:
        df_sorted = df[[feature, target]].sort_values(by=feature)

        plt.figure(figsize=(10, 6))
        sns.lineplot(x=df_sorted[feature], y=df_sorted[target], marker='o')
        plt.title(f'Line Graph: {feature} vs Credit Achievement Rate', fontsize=14)
        plt.xlabel(str(feature).replace('_', ' ').title(), fontsize=12)
        plt.ylabel('Credit Achievement Rate', fontsize=12)
        plt.tight_layout()
        graphs.append(plt_to_file())

    return graphs, tables
306
+
307
def perform_comprehensive_analysis(df):
    """Run every analysis step on ``df`` and merge their outputs.

    Steps run in a fixed order; graphs are concatenated in that order and
    tables are merged into one dict (later steps overwrite equal keys).

    Returns:
        tuple: ``(all_graphs, all_tables)`` - a list of graph outputs and a
        dict of table outputs collected from all steps.
    """
    collected_graphs = []
    collected_tables = {}

    # 1. Identifying At-Risk Students
    risk_graphs, risk_tables = identify_at_risk_students(df)
    collected_graphs += risk_graphs
    collected_tables.update(risk_tables)

    # 2. Analyzing Impact of Extra-Curricular Activities
    #    (this step yields graphs only, and later steps use the processed df)
    df = process_extra_curricular(df)
    collected_graphs += analyze_extra_curricular_impact(df)

    # 3-7. Teacher performance, language impact, clustering, correlation
    #      analysis, and top features vs. credit achievement rate. All of
    #      them take the DataFrame and return (graphs, tables).
    remaining_steps = (
        analyze_teacher_performance,
        analyze_language_impact,
        perform_clustering,
        plot_correlation_with_credit_achievement,
        plot_top_features_vs_credit,
    )
    for step in remaining_steps:
        step_graphs, step_tables = step(df)
        collected_graphs += step_graphs
        collected_tables.update(step_tables)

    return collected_graphs, collected_tables
347
+
348
def gradio_wrapper(file):
    """Run the full analysis on an uploaded CSV and shape the result for the UI.

    The interface declares one gallery followed by seven Markdown outputs, so
    this function always returns exactly eight values, with each table routed
    to the Markdown slot whose label matches it.

    Args:
        file: The uploaded file as delivered by the file input: either a path
            string or an object exposing the path as ``.name``. ``None`` when
            nothing was uploaded.

    Returns:
        list: ``[graphs, report, feature_importance, teacher_performance,
        language_impact, cluster_analysis, correlation, top_features]`` where
        ``graphs`` is a list of image paths and the rest are Markdown strings.
    """
    markdown_slots = 7
    placeholder = "_No table output for this section._"

    if file is None:
        return [[]] + ["_No file uploaded._"] * markdown_slots

    # Accept both a plain path and a tempfile-like wrapper around it.
    path = file if isinstance(file, (str, os.PathLike)) else file.name

    df = pd.read_csv(path)
    df = clean_data(df)
    graphs, tables = perform_comprehensive_analysis(df)

    # Table keys per Markdown slot, in the order the interface lists its
    # outputs. Teacher tables are keyed "<teacher column>_<year level>".
    teacher_keys = [key for key in tables if '_teacher_' in str(key)]
    sections = [
        ['classification_report'],
        ['feature_importance'],
        teacher_keys,
        ['language_impact'],
        ['cluster_analysis'],
        ['correlation_with_credit', 'correlation_error'],
        ['top_corr_features'],
    ]
    # Anything not routed above goes to the last slot so no table is dropped.
    routed = {key for keys in sections for key in keys}
    sections[-1].extend(key for key in tables if key not in routed)

    def _render(keys):
        parts = [f"### {key}\n```\n{tables[key]}\n```" for key in keys if key in tables]
        return "\n\n".join(parts) if parts else placeholder

    return [graphs] + [_render(keys) for keys in sections]
357
+
358
# Clean up temporary files
def cleanup_temp_files():
    """Best-effort removal of temporary PNG files from the system temp dir.

    Only files that carry the ``tempfile`` default name prefix and end in
    ``.png`` are removed, so unrelated PNGs that other programs keep in the
    shared temp directory are left alone. Files that cannot be removed are
    skipped.
    """
    temp_dir = tempfile.gettempdir()
    prefix = tempfile.gettempprefix()
    for filename in os.listdir(temp_dir):
        if not (filename.startswith(prefix) and filename.endswith(".png")):
            continue
        try:
            os.remove(os.path.join(temp_dir, filename))
        except OSError:
            # Already deleted, in use, or not ours to delete: keep sweeping
            # instead of aborting the whole cleanup.
            continue


# Register the cleanup function to be called when the script exits.
# This is done before launching so the hook is in place even if launch() raises.
import atexit
atexit.register(cleanup_temp_files)

# Create Gradio interface
# NOTE: gradio_wrapper must return one value per entry in `outputs`, in order.
iface = gr.Interface(
    fn=gradio_wrapper,
    inputs=gr.File(label="Upload CSV"),
    outputs=[
        gr.Gallery(label="Graphs", columns=2, rows=3, height="auto"),
        gr.Markdown(label="Classification Report"),
        gr.Markdown(label="Feature Importance"),
        gr.Markdown(label="Teacher Performance"),
        gr.Markdown(label="Language Impact"),
        gr.Markdown(label="Cluster Analysis"),
        gr.Markdown(label="Correlation with Credit Achievement Rate"),
        gr.Markdown(label="Top Correlated Features")
    ],
    title="Comprehensive Student Data Analysis",
    description="Upload a CSV file to analyze student data. The analysis includes identifying at-risk students, impact of extra-curricular activities, teacher performance, language proficiency impact, cluster analysis, and correlation analysis."
)

# Launch the interface
iface.launch()