import pandas as pd
import numpy as np
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt
import seaborn as sns
import io
from PIL import Image

# Constants
RANDOM_STATE = 42
MIN_ROWS = 10
MIN_COLS = 2
MAX_FEATURES_TO_SHOW = 10

# Shared state between analyze_file() (writer) and predict_interactive() (reader).
# 'X_columns' are the one-hot-encoded feature names the model was trained on;
# 'input_columns' are the original (pre-dummies) feature columns the UI collects.
global_data = {
    'model': None,
    'scaler': None,
    'X_columns': None,
    'input_columns': None,
    'y_type': None,
    'uniques': None,
}


def _read_table(file):
    """Read an uploaded CSV/XLSX file into a DataFrame.

    Returns None for unsupported extensions; lets read errors propagate so the
    caller can report them.
    """
    if file.name.endswith('.csv'):
        return pd.read_csv(file.name)
    if file.name.endswith('.xlsx'):
        return pd.read_excel(file.name)
    return None


def _fig_to_image():
    """Render the current matplotlib figure into a PIL Image and close it."""
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    plt.close()
    buf.seek(0)
    return Image.open(buf)


def update_dropdown(file):
    """Populate the label-column dropdown with the uploaded file's columns."""
    if file is None:
        return gr.update(choices=[], value=None)
    try:
        df = _read_table(file)
        if df is None:
            return gr.update(choices=[], value=None)
        return gr.update(choices=list(df.columns), value=None)
    except Exception as e:
        print(f"Error in update_dropdown: {e}")  # Debug logging
        return gr.update(choices=[], value=None)


def analyze_file(file, label_col, n_clusters):
    """Train a Random Forest on `label_col` and cluster the feature space.

    Returns a 6-tuple: (report text, prediction plot, feature-importance plot,
    KMeans plot, Agglomerative plot, cluster-differentiating-features plot).
    On validation failure the text slot carries the message and all images
    are None. Side effect: stores the trained model/metadata in `global_data`.
    """
    no_imgs = (None, None, None, None, None)
    if file is None:
        return ("Please upload a file.",) + no_imgs
    try:
        df = _read_table(file)
        if df is None:
            return ("Unsupported file type. Please upload a CSV or XLSX file.",) + no_imgs
    except Exception as e:
        print(f"Error reading file: {e}")  # Debug logging
        return (f"Error reading file: {e}",) + no_imgs

    if df.empty:
        return ("File is empty.",) + no_imgs
    if label_col not in df.columns:
        return (f"Label column '{label_col}' not found.",) + no_imgs
    df = df.dropna()
    if df.shape[0] < MIN_ROWS:
        return (f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.",) + no_imgs
    if df.shape[1] < MIN_COLS:
        return ("Need at least one feature and one label column.",) + no_imgs

    y = df[label_col]
    X = df.drop(columns=[label_col])
    X_processed = pd.get_dummies(X)
    if X_processed.shape[1] == 0:
        return ("No valid features after preprocessing.",) + no_imgs

    # The scaler is used for clustering/PCA only; the Random Forests are
    # trained on the raw one-hot features (trees are scale-invariant).
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_processed)
    # Gradio sliders may deliver floats even with step=1.
    n_clusters = int(n_clusters)

    results_text = ""
    model_img = fi_img = kmeans_img = agg_img = diff_img = None
    model = None

    # --- Supervised model: regression for numeric labels, else classification ---
    try:
        if pd.api.types.is_numeric_dtype(y):
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y, test_size=0.3, random_state=RANDOM_STATE)
            model = RandomForestRegressor(random_state=RANDOM_STATE)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results_text += (
                "Regression Results:\n"
                f"- MSE: {mse:.3f}\n"
                f"- R²: {r2:.3f}\n"
                "\nCheck the 'Feature Importances' tab to see the top features impacting predictions.\n"
            )

            # 2x2 grid: top-3 features vs predicted, plus true vs predicted.
            fi = pd.Series(model.feature_importances_,
                           index=X_processed.columns).sort_values(ascending=False)
            top_features = fi.head(3).index
            fig, axes = plt.subplots(2, 2, figsize=(12, 10))
            axes = axes.flatten()
            for i, feature in enumerate(top_features):
                ax = axes[i]
                ax.scatter(X_test[feature], y_pred, alpha=0.5)
                ax.set_xlabel(feature)
                ax.set_ylabel('Predicted Value')
                ax.set_title(f'{feature} vs Predicted')
            # Hide unused panels when there are fewer than 3 features.
            for i in range(len(top_features), 3):
                axes[i].set_visible(False)
            ax = axes[3]
            ax.scatter(y_test, y_pred, alpha=0.5)
            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
                    'r--', label='Perfect Prediction')
            ax.set_xlabel('True Value')
            ax.set_ylabel('Predicted Value')
            ax.set_title('True vs Predicted')
            min_val = min(y_test.min(), y_pred.min())
            max_val = max(y_test.max(), y_pred.max())
            ax.set_xlim(min_val, max_val)
            ax.set_ylim(min_val, max_val)
            ax.legend()
            plt.tight_layout()
            model_img = _fig_to_image()
            global_data.update({'model': model, 'scaler': scaler,
                                'X_columns': X_processed.columns,
                                'input_columns': list(X.columns),
                                'y_type': 'regression', 'uniques': None})
        else:
            if len(y.unique()) < 2:
                return ("Label must have at least 2 unique values.",) + no_imgs
            y_encoded, uniques = pd.factorize(y)
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
            model = RandomForestClassifier(random_state=RANDOM_STATE)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            class_names = [str(u) for u in uniques]
            cr = classification_report(y_test, y_pred, target_names=class_names)
            results_text += "Classification Results:\n" + cr + "\n"

            cm = confusion_matrix(y_test, y_pred)
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=class_names, yticklabels=class_names)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title('Confusion Matrix')
            model_img = _fig_to_image()
            global_data.update({'model': model, 'scaler': scaler,
                                'X_columns': X_processed.columns,
                                'input_columns': list(X.columns),
                                'y_type': 'classification', 'uniques': uniques})
    except Exception as e:
        results_text += f"\nError during model training: {e}"

    # --- Feature importances (only meaningful when training succeeded) ---
    try:
        if model is None:
            raise RuntimeError("model training failed")
        fi = pd.Series(model.feature_importances_, index=X_processed.columns)
        fi = fi.sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
        plt.figure(figsize=(10, 6))
        sns.barplot(x=fi.values, y=fi.index)
        plt.title("Top 10 Feature Importances")
        plt.xlabel("Importance")
        plt.ylabel("Feature")
        fi_img = _fig_to_image()
    except Exception as e:
        results_text += f"\nWarning: Could not compute feature importance: {e}"

    # --- Unsupervised views: KMeans and Agglomerative, shown in PCA 2D ---
    clusters_kmeans = None
    X_pca = None
    explained_var = 0.0
    try:
        kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
        clusters_kmeans = kmeans.fit_predict(X_scaled)
        pca = PCA(n_components=2, random_state=RANDOM_STATE)
        X_pca = pca.fit_transform(X_scaled)
        explained_var = sum(pca.explained_variance_ratio_)
        plt.figure(figsize=(8, 6))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans,
                              cmap="viridis", alpha=0.7)
        plt.xlabel("PCA 1")
        plt.ylabel("PCA 2")
        plt.title(f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)")
        plt.colorbar(scatter, ticks=range(n_clusters))
        kmeans_img = _fig_to_image()
    except Exception as e:
        results_text += f"\nWarning: KMeans clustering failed: {e}"

    try:
        if X_pca is None:
            # Original code reused the KMeans-block PCA; make the dependency explicit.
            raise RuntimeError("PCA projection unavailable (KMeans step failed)")
        agg = AgglomerativeClustering(n_clusters=n_clusters)
        clusters_agg = agg.fit_predict(X_scaled)
        plt.figure(figsize=(8, 6))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_agg,
                              cmap="plasma", alpha=0.7)
        plt.xlabel("PCA 1")
        plt.ylabel("PCA 2")
        plt.title(f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)")
        plt.colorbar(scatter, ticks=range(n_clusters))
        agg_img = _fig_to_image()
    except Exception as e:
        results_text += f"\nWarning: Agglomerative clustering failed: {e}"

    # --- Which features separate the KMeans clusters best (ANOVA F-scores) ---
    try:
        if clusters_kmeans is None:
            raise RuntimeError("KMeans labels unavailable")
        f_scores, _ = f_classif(X_processed, clusters_kmeans)
        # Constant features yield NaN/inf F-scores; treat them as non-informative.
        f_scores = np.nan_to_num(f_scores, nan=0.0, posinf=0.0)
        f_series = pd.Series(f_scores, index=X_processed.columns)
        f_series = f_series.sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
        plt.figure(figsize=(10, 6))
        # Horizontal bars (scores on x, names on y) so the axis labels are
        # correct — the original put feature names on x but labeled it "F-score".
        sns.barplot(x=f_series.values, y=f_series.index)
        plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
        plt.xlabel("F-score")
        plt.ylabel("Feature")
        diff_img = _fig_to_image()
    except Exception as e:
        results_text += f"\nWarning: Could not compute differentiating features: {e}"

    return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img


def predict_interactive(*args):
    """Predict from the per-feature values entered in the Interactive tab.

    `args` arrive in the order of the original feature columns (one UI
    component per column). Returns a human-readable result string.
    """
    if global_data['model'] is None:
        return "Please analyze a file first to train the model."
    try:
        # Map positional inputs onto the ORIGINAL feature columns. The previous
        # version keyed them by the one-hot-encoded names (X_columns), which
        # mismatched the UI components for any categorical feature.
        input_cols = global_data.get('input_columns') or global_data['X_columns']
        kwargs = {col: args[i] for i, col in enumerate(input_cols) if i < len(args)}

        input_data = pd.DataFrame([kwargs])
        # Re-apply one-hot encoding, then align to the training columns:
        # missing dummy columns become 0, extras are dropped by the reindex.
        X_processed = pd.get_dummies(input_data)
        for col in global_data['X_columns']:
            if col not in X_processed.columns:
                X_processed[col] = 0
        X_processed = X_processed[global_data['X_columns']]

        # BUG FIX: the model was trained on the UNSCALED one-hot features
        # (the StandardScaler is fit only for clustering/PCA), so the input
        # must not be scaler-transformed before prediction.
        prediction = global_data['model'].predict(X_processed)

        if global_data['y_type'] == 'classification':
            pred_value = global_data['uniques'][int(prediction[0])]
            return f"Predicted class: {pred_value}"
        return f"Predicted value: {prediction[0]:.3f}"
    except Exception as e:
        return f"Error in prediction: {str(e)}. Please ensure all inputs are valid numbers or categories."
def create_interactive_inputs(file, label_col):
    """Build one Gradio input component per feature column of the uploaded file.

    Numeric columns get a gr.Number, others a gr.Dropdown of observed values;
    labels include up to three example values. Returns [] on any problem.
    NOTE: components only appear in the UI when this is called inside a Gradio
    render context (see the @gr.render block below).
    """
    if file is None or label_col is None:
        print("No file or label column provided")  # Debug logging
        return []
    try:
        if file.name.endswith('.csv'):
            df = pd.read_csv(file.name)
        elif file.name.endswith('.xlsx'):
            df = pd.read_excel(file.name)
        else:
            print("Unsupported file type")  # Debug logging
            return []
        if df.empty or label_col not in df.columns:
            print(f"Empty DataFrame or invalid label column: {label_col}")  # Debug logging
            return []
        X = df.drop(columns=[label_col])
        if X.empty:
            print("No features available after dropping label column")  # Debug logging
            return []
        components = []
        for col in X.columns:
            non_null = X[col].dropna()  # computed once (original called dropna twice)
            examples = non_null.sample(min(3, len(non_null))).tolist()
            label = f"{col} (e.g., {', '.join(map(str, examples))})"
            if pd.api.types.is_numeric_dtype(X[col]):
                components.append(gr.Number(label=label, value=None))
            else:
                components.append(gr.Dropdown(label=label,
                                              choices=non_null.unique().tolist(),
                                              value=None))
        print(f"Generated {len(components)} input components")  # Debug logging
        return components
    except Exception as e:
        print(f"Error in create_interactive_inputs: {e}")  # Debug logging
        return []


with gr.Blocks() as demo:
    gr.Markdown("## Data Analysis Explorer")
    gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, "
                "and clustering. Select a column to predict and the number of clusters!")

    with gr.Row():
        file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
        label_dropdown = gr.Dropdown(label="Select Column to Predict",
                                     choices=[], interactive=True)
        clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3,
                                    label="Number of Clusters")

    file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
    analyze_btn = gr.Button("Analyze")

    with gr.Tabs():
        with gr.TabItem("Prediction Results"):
            gr.Markdown("### Classification or Regression")
            gr.Markdown("""
            - **Regression**: Predicts numbers (e.g., sales). Uses Random Forest.
            - **Classification**: Predicts categories (e.g., yes/no). Uses Random Forest.
            - Rows with missing values are removed. 70% of data trains the model; 30% tests it.
            """)
            results_textbox = gr.Textbox(label="Performance Metrics", lines=10)

        with gr.TabItem("Prediction Plot"):
            gr.Markdown("### Prediction Visualization")
            gr.Markdown("For regression: scatter plots of top 3 features vs. predicted "
                        "values and true vs. predicted. For classification: confusion matrix.")
            model_img_output = gr.Image(label="Prediction Output")

        with gr.TabItem("Feature Importances"):
            gr.Markdown("### Top 10 Key Features")
            gr.Markdown("Shows the most important features for predictions. "
                        "Higher bars mean bigger impact.")
            fi_output = gr.Image(label="Feature Importances")

        with gr.TabItem("KMeans Clustering"):
            gr.Markdown("### KMeans Clustering")
            gr.Markdown("Groups similar data points without using the selected column. "
                        "Colors show clusters in 2D (PCA projection).")
            kmeans_output = gr.Image(label="KMeans Clusters")

        with gr.TabItem("Agglomerative Clustering"):
            gr.Markdown("### Agglomerative Clustering")
            gr.Markdown("Another way to group data hierarchically. "
                        "Compare with KMeans to see differences!")
            agg_output = gr.Image(label="Agglomerative Clusters")

        with gr.TabItem("Cluster Differences"):
            gr.Markdown("### Top 10 Cluster-Differentiating Features")
            gr.Markdown("Shows features that vary most between clusters, "
                        "helping explain the groupings.")
            diff_output = gr.Image(label="Differentiating Features")

        with gr.TabItem("Interactive"):
            gr.Markdown("### Interactive Prediction")
            gr.Markdown("Enter values for each feature to get a prediction "
                        "based on the trained model.")
            prediction_output = gr.Textbox(label="Prediction Result")

            # BUG FIX: the original created components inside a .change() callback,
            # stored them in gr.State, and tried to .render() them in code placed
            # AFTER a `return` statement — so the feature inputs never appeared and
            # the Predict button had no working inputs. @gr.render (Gradio >= 4.40)
            # is the supported mechanism: it re-runs this function whenever the
            # watched inputs change and renders the components it creates.
            @gr.render(inputs=[file_input, label_dropdown])
            def render_feature_inputs(file, label_col):
                components = create_interactive_inputs(file, label_col)
                if not components:
                    gr.Markdown("Upload a file and select a column to predict "
                                "to enable interactive prediction.")
                    return
                predict_btn = gr.Button("Predict")
                # Events wired inside a render context may reference components
                # created here plus any component defined outside it.
                predict_btn.click(fn=predict_interactive,
                                  inputs=components,
                                  outputs=prediction_output)

    analyze_btn.click(fn=analyze_file,
                      inputs=[file_input, label_dropdown, clusters_slider],
                      outputs=[results_textbox, model_img_output, fi_output,
                               kmeans_output, agg_output, diff_output])

demo.launch(debug=True)  # Enable debug mode for more detailed error logging