Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import warnings | |
| import tempfile | |
| import os | |
| import dash | |
| from dash import dcc | |
| from dash import html | |
| from dash import dash_table | |
| import gradio as gr | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.cluster import KMeans, DBSCAN | |
| from sklearn.metrics import classification_report, accuracy_score, silhouette_score | |
| from sklearn.decomposition import PCA | |
| from sklearn.manifold import TSNE | |
# Silence every FutureWarning: the filter matches the whole category, not
# individual messages.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Use seaborn's "whitegrid" theme for all plots created after this point.
sns.set(style="whitegrid")
def enhanced_preprocessing(df):
    """Convert a raw DataFrame into an encoded feature table.

    Steps, in order:

    1. Missing values: numeric columns are filled with the column median
       (0 when the whole column is missing); every other column is filled
       with the string ``'Unknown'``.  Filling numeric columns with a string
       would turn them into mixed-type object columns that the encoders
       below cannot handle sensibly.
    2. The free-text ``interventions`` column, when present, is vectorised
       with TF-IDF *before* the categorical encoding, because it is an
       object column and would otherwise be label-encoded or one-hot
       encoded (and dropped) before TF-IDF ever saw the text.
    3. Remaining object columns are label-encoded when they have fewer than
       20 distinct values and one-hot encoded otherwise.

    Args:
        df: Input frame.  It is not modified; a copy is processed.

    Returns:
        A new DataFrame: the original columns (label-encoded in place where
        applicable), followed by the one-hot columns, followed by one column
        per TF-IDF vocabulary term.
    """
    df = df.copy()

    # --- 1. Missing values ------------------------------------------------
    for col in df.columns[df.isna().any()]:
        if pd.api.types.is_numeric_dtype(df[col]):
            median = df[col].median()
            df[col] = df[col].fillna(0 if pd.isna(median) else median)
        else:
            df[col] = df[col].fillna('Unknown')

    # --- 2. Free-text column ----------------------------------------------
    tfidf_df = None
    if 'interventions' in df.columns:
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(df['interventions'].astype(str))
        # Reuse df's index so the later concat aligns row-for-row even when
        # df does not carry a default RangeIndex.
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=tfidf.get_feature_names_out(),
            index=df.index,
        )
        df = df.drop('interventions', axis=1)

    # --- 3. Categorical columns -------------------------------------------
    # NOTE(review): one-hot encoding is applied to the HIGH-cardinality
    # columns here, which can create very wide frames; kept as in the
    # original design -- confirm this is intended.
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df[col].nunique(dropna=False) < 20:
            # astype(str) keeps LabelEncoder from failing on mixed types.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
        else:
            one_hot = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, one_hot], axis=1).drop(col, axis=1)

    if tfidf_df is not None:
        df = pd.concat([df, tfidf_df], axis=1)
    return df
def calculate_correlations(df, threshold=0.3):
    """List every pair of distinct numeric columns that is strongly correlated.

    Args:
        df: Input frame.  Non-numeric columns are ignored.
        threshold: A pair is reported when the absolute value of its
            correlation coefficient is strictly greater than this value.

    Returns:
        A DataFrame with columns ``'Feature 1'``, ``'Feature 2'`` and
        ``'Correlation'``.  Each unordered pair appears twice, once as
        (A, B) and once as (B, A), in row-major order of the correlation
        matrix.  Undefined (NaN) correlations are never reported.
    """
    # numeric_only=True keeps string/object columns from raising.
    correlations = df.corr(numeric_only=True)
    features = correlations.columns.to_numpy()
    values = correlations.to_numpy()

    # NaN compares False against the threshold, so undefined coefficients
    # (e.g. from constant columns) drop out here without relying on
    # stack() discarding them.
    significant = np.abs(values) > threshold
    # A column is always perfectly correlated with itself; skip the diagonal.
    significant &= ~np.eye(len(features), dtype=bool)

    row_idx, col_idx = np.nonzero(significant)
    return pd.DataFrame({
        'Feature 1': features[row_idx],
        'Feature 2': features[col_idx],
        'Correlation': values[row_idx, col_idx],
    })
def _silhouette_or_floor(features, labels):
    """Return the silhouette score, or -1 when the labelling cannot be scored.

    The score is only computed when there are at least 2 distinct labels and
    fewer distinct labels than samples; otherwise -1 is returned so the
    labelling loses any comparison.
    """
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        return silhouette_score(features, labels)
    return -1


def perform_clustering(df, n_clusters=4, eps=0.5, min_samples=5):
    """Cluster the rows with K-Means and DBSCAN and keep the better labelling.

    The data is standardised, both algorithms are fitted, and the labelling
    with the higher silhouette score wins (DBSCAN wins ties).  A 2-component
    PCA projection of the standardised data is added for plotting.

    NOTE: ``df`` is modified in place -- the columns ``'Cluster'``,
    ``'PCA1'`` and ``'PCA2'`` are added to the frame that was passed in, and
    that same object is returned.

    NOTE(review): DBSCAN's noise label (-1) is scored as if it were an
    ordinary cluster; confirm this is the intended comparison.

    Args:
        df: All-numeric feature frame with at least 2 rows.
        n_clusters: Number of K-Means clusters; capped at the row count.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum neighbourhood size.

    Returns:
        Tuple ``(df, best_model)`` where ``best_model`` is ``'K-Means'`` or
        ``'DBSCAN'``.

    Raises:
        ValueError: If ``df`` has fewer than 2 rows.
    """
    n_samples = len(df)
    if n_samples < 2:
        raise ValueError("perform_clustering needs at least 2 rows")

    # Normalize the data so every feature contributes on the same scale.
    scaled = StandardScaler().fit_transform(df)

    # K-Means cannot build more clusters than there are samples.
    kmeans = KMeans(n_clusters=min(n_clusters, n_samples), random_state=42)
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    kmeans_labels = kmeans.fit_predict(scaled)
    dbscan_labels = dbscan.fit_predict(scaled)

    kmeans_score = _silhouette_or_floor(scaled, kmeans_labels)
    dbscan_score = _silhouette_or_floor(scaled, dbscan_labels)

    if kmeans_score > dbscan_score:
        df['Cluster'] = kmeans_labels
        best_model = 'K-Means'
    else:
        df['Cluster'] = dbscan_labels
        best_model = 'DBSCAN'

    # PCA for visualisation; it cannot produce more components than there
    # are samples or features, so a single-feature frame gets a flat PCA2.
    n_components = min(2, scaled.shape[0], scaled.shape[1])
    projected = PCA(n_components=n_components).fit_transform(scaled)
    df['PCA1'] = projected[:, 0]
    df['PCA2'] = projected[:, 1] if n_components > 1 else 0.0
    return df, best_model
def perform_predictions(df, targets=('skip_class', 'final_grade'),
                        test_size=0.2, random_state=42):
    """Fit two classifiers per target column and report the more accurate one.

    For every column of ``df`` that is listed in ``targets`` (visited in the
    frame's column order), all remaining columns are used as features, the
    data is split into train and test sets, and a random forest and a
    logistic regression are fitted.  The model with the higher test accuracy
    is reported; logistic regression wins ties.

    NOTE(review): only the current target is removed from the features, so
    when several target columns are present each one is used as a predictor
    for the others -- confirm this is intended.
    NOTE(review): both models are classifiers; a continuous target
    presumably needs to be binned first -- verify against the data.

    Args:
        df: All-numeric frame containing features and target column(s).
        targets: Column name, or iterable of column names, to predict.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the split and for the random forest.

    Returns:
        A list with one dict per target found, with the keys ``'Target'``,
        ``'Model'`` and ``'Accuracy'``.  Empty when no target is present.
    """
    if isinstance(targets, str):
        targets = (targets,)
    wanted = set(targets)
    present_targets = [col for col in df.columns if col in wanted]

    results = []
    for target in present_targets:
        features = df.drop(target, axis=1)
        labels = df[target]
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )

        candidates = (
            ('Random Forest', RandomForestClassifier(random_state=random_state)),
            ('Logistic Regression', LogisticRegression(max_iter=1000)),
        )
        scored = []
        for model_name, model in candidates:
            model.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, model.predict(X_test))
            scored.append((model_name, accuracy))

        forest, logistic = scored
        # Strict '>' so that a tie is reported as logistic regression.
        best_name, best_accuracy = forest if forest[1] > logistic[1] else logistic
        results.append(
            {'Target': target, 'Model': best_name, 'Accuracy': best_accuracy}
        )
    return results
def create_dashboard(df, correlation_data, clustering_data, prediction_results):
    """Build and serve a Dash page summarising the analysis results.

    The page has three sections: a table of correlated feature pairs, a
    scatter plot of the PCA projection coloured by cluster, and a table of
    the prediction results.  This call blocks while the server runs.

    Args:
        df: Frame containing the columns ``'PCA1'``, ``'PCA2'`` and
            ``'Cluster'``.
        correlation_data: DataFrame shown as the correlation table.
        clustering_data: Either the name of the winning clustering model as
            a string, or an object indexable with ``'best_model'``.
        prediction_results: DataFrame, or anything ``pd.DataFrame`` accepts
            (for example a list of dicts), shown as the prediction table.
    """
    # Accept the bare model name as well as a mapping holding it.
    if isinstance(clustering_data, str):
        best_model = clustering_data
    else:
        best_model = clustering_data['best_model']

    # Accept a list of record dicts as well as a ready-made DataFrame.
    if not isinstance(prediction_results, pd.DataFrame):
        prediction_results = pd.DataFrame(prediction_results)

    cluster_ids = df['Cluster'].tolist()
    scatter_trace = {
        'x': df['PCA1'].tolist(),
        'y': df['PCA2'].tolist(),
        'mode': 'markers',
        'marker': {'color': cluster_ids, 'colorscale': 'Viridis', 'size': 10},
        'text': cluster_ids,
        'type': 'scatter',
    }
    scatter_layout = {
        'title': 'Cluster Visualization using PCA',
        'xaxis': {'title': 'PCA Component 1'},
        'yaxis': {'title': 'PCA Component 2'},
    }

    app = dash.Dash(__name__)
    app.layout = html.Div([
        html.H1('Comprehensive Student Data Analysis'),
        html.Div([
            html.H2('Correlation Analysis'),
            dash_table.DataTable(
                id='correlation_table',
                columns=[{'name': i, 'id': i} for i in correlation_data.columns],
                data=correlation_data.to_dict('records'),
            ),
        ]),
        html.Div([
            html.H2('Clustering Analysis'),
            html.P(f"Best Clustering Algorithm: {best_model}"),
            dcc.Graph(
                id='clustering_scatter',
                figure={'data': [scatter_trace], 'layout': scatter_layout},
            ),
        ]),
        html.Div([
            html.H2('Prediction Models'),
            dash_table.DataTable(
                id='prediction_table',
                columns=[{'name': i, 'id': i} for i in prediction_results.columns],
                data=prediction_results.to_dict('records'),
            ),
        ]),
    ])

    # Newer Dash releases expose `run`; older ones only have `run_server`.
    run = getattr(app, 'run', None) or app.run_server
    # NOTE(review): debug=True enables the interactive debugger and hot
    # reload; it should not be left on for a publicly reachable deployment.
    run(debug=True)
def load_csv(file):
    """Read an uploaded CSV and return it after preprocessing.

    Args:
        file: The upload as handed over by the UI.  It may be an object with
            a ``name`` attribute holding the path, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        The preprocessed DataFrame, or an empty DataFrame when ``file`` is
        ``None``.
    """
    if file is None:
        return pd.DataFrame()
    # Fall back to the value itself when it carries no `.name`; some Gradio
    # versions may pass a plain path string -- TODO confirm for the pinned
    # version.
    path = getattr(file, 'name', file)
    return enhanced_preprocessing(pd.read_csv(path))
# Main execution: a Gradio front end that takes a CSV upload, runs it
# through load_csv and previews the resulting table.
iface = gr.Interface(
    load_csv,
    gr.File(label="Upload CSV File"),
    gr.Dataframe(label="Preview of Uploaded Data"),
    description="Upload a CSV file to perform comprehensive student data analysis.",
)

# NOTE(review): the UI is launched at import time, not under the __main__
# guard below, so importing this module also starts the server.
iface.launch()

# The data is loaded through the Gradio upload widget, so no separate file
# parameter is needed when the script is run directly.
if __name__ == "__main__":
    pass