Spaces:
Sleeping
Sleeping
File size: 17,444 Bytes
13bf251 6753c42 13bf251 6753c42 13bf251 3e930db 13bf251 3e930db e791e5b 2fe6c63 eaca5d0 5e1bd42 13bf251 eaca5d0 5e1bd42 e791e5b 5715ac3 5e1bd42 eaca5d0 e791e5b eaca5d0 13bf251 5715ac3 eaca5d0 e791e5b 13bf251 e791e5b 13bf251 e791e5b eaca5d0 13bf251 6753c42 3e930db e791e5b 6753c42 eaca5d0 13bf251 6753c42 864aed7 13bf251 e791e5b 755fb3a 3e930db e791e5b 3e930db 755fb3a 3e930db 2fe6c63 3e930db 2fe6c63 3e930db 2fe6c63 e791e5b 2fe6c63 e791e5b 755fb3a e791e5b 755fb3a 2fe6c63 e791e5b 864aed7 e791e5b 3e930db e791e5b eaca5d0 e791e5b 13bf251 6753c42 13bf251 3e930db e791e5b eaca5d0 e791e5b 3e930db e791e5b 5b239bf e791e5b 5b239bf e791e5b 5b239bf 13bf251 6753c42 13bf251 3e930db e791e5b 13bf251 6753c42 13bf251 5b239bf 2fe6c63 5b239bf 2fe6c63 5715ac3 2fe6c63 5715ac3 2fe6c63 5715ac3 3ee9608 2fe6c63 6f355e0 2fe6c63 6f355e0 5715ac3 3ee9608 5715ac3 2fe6c63 13bf251 eaca5d0 ee936fb e791e5b ee936fb 2fe6c63 ee936fb 2fe6c63 1ed57a6 2fe6c63 6f355e0 1ed57a6 6f355e0 1ed57a6 6f355e0 1ed57a6 2fe6c63 ee936fb 5715ac3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 |
import pandas as pd
import numpy as np
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt
import seaborn as sns
import io
from PIL import Image
# Constants
# Seed handed to train_test_split, both random forests, KMeans and PCA so
# repeated analyses of the same file give the same results.
RANDOM_STATE = 42
# Minimum number of rows that must remain after rows with missing values
# are dropped.
MIN_ROWS = 10
# Minimum number of columns: one feature plus the label.
MIN_COLS = 2
# How many entries the feature-importance and F-score bar charts keep.
MAX_FEATURES_TO_SHOW = 10
# Global variable to store trained model and data
# Written by analyze_file after a successful fit and read by
# predict_interactive; 'y_type' is 'regression' or 'classification' and
# 'uniques' holds the class labels returned by pd.factorize (None for
# regression).
global_data = {'model': None, 'scaler': None, 'X_columns': None, 'y_type': None, 'uniques': None}
def update_dropdown(file):
    """Fill the label dropdown with the column names of the uploaded file.

    Args:
        file: Uploaded file, either an object exposing the path as ``name``
            or a plain path string; ``None`` when the upload was cleared.

    Returns:
        A ``gr.update`` payload whose ``choices`` are the file's column
        names. The choices are empty when there is no file, the extension
        is neither ``.csv`` nor ``.xlsx``, or the file cannot be parsed.
    """
    if file is None:
        return gr.update(choices=[], value=None)
    try:
        path = getattr(file, "name", file)
        # Match the extension case-insensitively so "DATA.CSV" is accepted.
        suffix = str(path).lower()
        # nrows=0 parses the header only: the column names are all that is
        # needed here, so the data rows are never loaded.
        if suffix.endswith('.csv'):
            df = pd.read_csv(path, nrows=0)
        elif suffix.endswith('.xlsx'):
            df = pd.read_excel(path, nrows=0)
        else:
            return gr.update(choices=[], value=None)
        return gr.update(choices=list(df.columns), value=None)
    except Exception as e:
        print(f"Error in update_dropdown: {e}")  # Debug logging
        return gr.update(choices=[], value=None)
def _analysis_error(message):
    """Return the six-slot analyze_file result carrying only a message."""
    return (message, None, None, None, None, None)


def _current_figure_to_image():
    """Render the active matplotlib figure to a PIL image and close it."""
    buf = io.BytesIO()
    try:
        plt.savefig(buf, format="png", bbox_inches="tight")
    finally:
        # Close even when saving fails so figures do not accumulate in a
        # long-running server process.
        plt.close()
    buf.seek(0)
    return Image.open(buf)


def analyze_file(file, label_col, n_clusters):
    """Train a random forest on the uploaded table and cluster its features.

    Rows with missing values are dropped and the features are one-hot
    encoded. A numeric label column is treated as a regression target, any
    other dtype as a classification target. After a successful fit the
    model, scaler, encoded column names and task type are stored in
    ``global_data``. The standardised features (label excluded) are then
    clustered with KMeans and agglomerative clustering and drawn on a
    two-component PCA projection.

    Args:
        file: Uploaded file, either an object exposing the path as ``name``
            or a plain path string; ``None`` when nothing was uploaded.
        label_col: Name of the column to predict.
        n_clusters: Number of clusters for both clustering algorithms.

    Returns:
        A 6-tuple ``(results_text, model_img, fi_img, kmeans_img, agg_img,
        diff_img)``. Validation failures return the message in the first
        slot and ``None`` in the others. Failures in an individual stage
        are appended to ``results_text`` and leave that stage's image as
        ``None``.
    """
    if file is None:
        return _analysis_error("Please upload a file.")
    try:
        path = getattr(file, "name", file)
        # Match the extension case-insensitively so "DATA.CSV" is accepted.
        suffix = str(path).lower()
        if suffix.endswith('.csv'):
            df = pd.read_csv(path)
        elif suffix.endswith('.xlsx'):
            df = pd.read_excel(path)
        else:
            return _analysis_error("Unsupported file type. Please upload a CSV or XLSX file.")
    except Exception as e:
        print(f"Error reading file: {e}")  # Debug logging
        return _analysis_error(f"Error reading file: {e}")
    if df.empty:
        return _analysis_error("File is empty.")
    if label_col not in df.columns:
        return _analysis_error(f"Label column '{label_col}' not found.")
    df = df.dropna()
    if df.shape[0] < MIN_ROWS:
        return _analysis_error(f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.")
    if df.shape[1] < MIN_COLS:
        return _analysis_error("Need at least one feature and one label column.")
    y = df[label_col]
    X = df.drop(columns=[label_col])
    X_processed = pd.get_dummies(X)
    if X_processed.shape[1] == 0:
        return _analysis_error("No valid features after preprocessing.")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_processed)

    results_text = ""
    model_img = None
    fi_img = None
    kmeans_img = None
    agg_img = None
    diff_img = None
    # Stays None unless a fit succeeds, so later stages can report a clear
    # reason instead of tripping over an undefined name.
    model = None

    try:
        if pd.api.types.is_numeric_dtype(y):
            # Regression
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y, test_size=0.3, random_state=RANDOM_STATE)
            regressor = RandomForestRegressor(random_state=RANDOM_STATE)
            regressor.fit(X_train, y_train)
            model = regressor
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results_text += (
                "Regression Results:\n"
                f"- MSE: {mse:.3f}\n"
                f"- R²: {r2:.3f}\n"
                "\nCheck the 'Feature Importances' tab to see the top features impacting predictions.\n"
            )
            # 2D Plots: Top 3 features vs predicted and true vs predicted
            fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
            top_features = fi.head(3).index
            fig, axes = plt.subplots(2, 2, figsize=(12, 10))
            axes = axes.flatten()
            for ax, feature in zip(axes, top_features):
                ax.scatter(X_test[feature], y_pred, alpha=0.5)
                ax.set_xlabel(feature)
                ax.set_ylabel('Predicted Value')
                ax.set_title(f'{feature} vs Predicted')
            # With fewer than three features, hide the unused panels
            # rather than showing empty axes.
            for ax in axes[len(top_features):3]:
                ax.set_visible(False)
            ax = axes[3]
            ax.scatter(y_test, y_pred, alpha=0.5)
            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
            ax.set_xlabel('True Value')
            ax.set_ylabel('Predicted Value')
            ax.set_title('True vs Predicted')
            min_val = min(y_test.min(), y_pred.min())
            max_val = max(y_test.max(), y_pred.max())
            ax.set_xlim(min_val, max_val)
            ax.set_ylim(min_val, max_val)
            ax.legend()
            plt.tight_layout()
            model_img = _current_figure_to_image()
            global_data.update({'model': model, 'scaler': scaler, 'X_columns': X_processed.columns, 'y_type': 'regression', 'uniques': None})
        else:
            # Classification
            if len(y.unique()) < 2:
                return _analysis_error("Label must have at least 2 unique values.")
            y_encoded, uniques = pd.factorize(y)
            class_ids = list(range(len(uniques)))
            class_names = [str(u) for u in uniques]
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
            classifier = RandomForestClassifier(random_state=RANDOM_STATE)
            classifier.fit(X_train, y_train)
            model = classifier
            y_pred = model.predict(X_test)
            # Pass the full label set explicitly: a class can be absent
            # from the 30% test split, and without `labels` the report
            # raises on the target_names length mismatch while the
            # confusion matrix comes out smaller than its tick labels.
            cr = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)
            results_text += "Classification Results:\n" + cr + "\n"
            # 2D Confusion Matrix
            cm = confusion_matrix(y_test, y_pred, labels=class_ids)
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title('Confusion Matrix')
            model_img = _current_figure_to_image()
            global_data.update({'model': model, 'scaler': scaler, 'X_columns': X_processed.columns, 'y_type': 'classification', 'uniques': uniques})
    except Exception as e:
        plt.close('all')  # drop any half-drawn figure
        results_text += f"\nError during model training: {e}"

    try:
        if model is None:
            raise ValueError("no trained model is available")
        fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
        plt.figure(figsize=(10, 6))
        sns.barplot(x=fi.values, y=fi.index)
        plt.title("Top 10 Feature Importances")
        plt.xlabel("Importance")
        plt.ylabel("Feature")
        fi_img = _current_figure_to_image()
    except Exception as e:
        plt.close('all')
        results_text += f"\nWarning: Could not compute feature importance: {e}"

    # The 2D projection is shared by both cluster plots, so compute it on
    # its own: a failure in one clustering algorithm must not take the
    # other plot down with it.
    X_pca = None
    explained_var = None
    try:
        pca = PCA(n_components=2, random_state=RANDOM_STATE)
        X_pca = pca.fit_transform(X_scaled)
        explained_var = sum(pca.explained_variance_ratio_)
    except Exception as e:
        X_pca = None
        results_text += f"\nWarning: PCA projection failed: {e}"

    clusters_kmeans = None
    try:
        # The slider may deliver a float; the estimators require an int.
        kmeans = KMeans(n_clusters=int(n_clusters), random_state=RANDOM_STATE)
        clusters_kmeans = kmeans.fit_predict(X_scaled)
        if X_pca is not None:
            plt.figure(figsize=(8, 6))
            scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans, cmap="viridis", alpha=0.7)
            plt.xlabel("PCA 1")
            plt.ylabel("PCA 2")
            plt.title(f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)")
            plt.colorbar(scatter, ticks=range(int(n_clusters)))
            kmeans_img = _current_figure_to_image()
    except Exception as e:
        plt.close('all')
        results_text += f"\nWarning: KMeans clustering failed: {e}"

    try:
        agg = AgglomerativeClustering(n_clusters=int(n_clusters))
        clusters_agg = agg.fit_predict(X_scaled)
        if X_pca is not None:
            plt.figure(figsize=(8, 6))
            scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_agg, cmap="plasma", alpha=0.7)
            plt.xlabel("PCA 1")
            plt.ylabel("PCA 2")
            plt.title(f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)")
            plt.colorbar(scatter, ticks=range(int(n_clusters)))
            agg_img = _current_figure_to_image()
    except Exception as e:
        plt.close('all')
        results_text += f"\nWarning: Agglomerative clustering failed: {e}"

    try:
        if clusters_kmeans is None:
            raise ValueError("KMeans cluster labels are unavailable")
        f_scores, _ = f_classif(X_processed, clusters_kmeans)
        # Handle potential division by zero or NaN values
        f_scores = np.nan_to_num(f_scores, nan=0.0, posinf=0.0)
        f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
        plt.figure(figsize=(10, 6))
        sns.barplot(data=f_series.reset_index(), x="index", y=0, hue="index", legend=False)  # Fix palette warning
        plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
        # Features run along x and scores along y, so label them that way.
        plt.xlabel("Feature")
        plt.ylabel("F-score")
        plt.xticks(rotation=45)
        diff_img = _current_figure_to_image()
    except Exception as e:
        plt.close('all')
        results_text += f"\nWarning: Could not compute differentiating features: {e}"

    return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
def predict_interactive(*args):
    """Predict the label for one row of user-supplied feature values.

    Args:
        *args: Feature values, matched by position against the column
            names stored in ``global_data['X_columns']``.

    Returns:
        A message string: the predicted class or value, a prompt to run
        the analysis first when no model has been trained, or an error
        description when prediction fails.
    """
    model = global_data['model']
    if model is None:
        return "Please analyze a file first to train the model."
    try:
        columns = global_data['X_columns']
        # NOTE(review): 'X_columns' holds the names produced by
        # pd.get_dummies during training. Pairing positional inputs with
        # those names is only correct when no feature was one-hot encoded;
        # confirm what the caller passes for categorical features.
        row = dict(zip(columns, args)) if columns is not None else {}
        # Create DataFrame from user inputs
        input_data = pd.DataFrame([row])
        # Handle categorical variables with one-hot encoding
        encoded = pd.get_dummies(input_data)
        # Align with the training layout: add absent columns as 0, drop
        # unknown ones, and restore the training column order.
        features = encoded.reindex(columns=columns, fill_value=0)
        # The forests are fitted on the unscaled encoded features, so the
        # input must not be standardised. The stored scaler is only fitted
        # for the clustering step.
        prediction = model.predict(features)
        if global_data['y_type'] == 'classification':
            pred_value = global_data['uniques'][int(prediction[0])]
            return f"Predicted class: {pred_value}"
        return f"Predicted value: {prediction[0]:.3f}"
    except Exception as e:
        return f"Error in prediction: {str(e)}. Please ensure all inputs are valid numbers or categories."
def create_interactive_inputs(file, label_col):
    """Build one Gradio input component per feature column of the file.

    Numeric columns get a ``gr.Number``; every other column gets a
    ``gr.Dropdown`` listing the column's distinct non-missing values. Each
    label shows up to three example values taken from the column.

    Args:
        file: Uploaded file, either an object exposing the path as ``name``
            or a plain path string; ``None`` when nothing was uploaded.
        label_col: Name of the label column, which is excluded from the
            inputs; ``None`` when no label has been chosen yet.

    Returns:
        A list of Gradio components in feature-column order. The list is
        empty when an argument is missing, the file type is unsupported,
        the table is empty, the label column is absent, or reading fails.
    """
    if file is None or label_col is None:
        print("No file or label column provided")  # Debug logging
        return []
    try:
        path = getattr(file, "name", file)
        # Match the extension case-insensitively so "DATA.CSV" is accepted.
        suffix = str(path).lower()
        if suffix.endswith('.csv'):
            df = pd.read_csv(path)
        elif suffix.endswith('.xlsx'):
            df = pd.read_excel(path)
        else:
            print("Unsupported file type")  # Debug logging
            return []
        if df.empty or label_col not in df.columns:
            print(f"Empty DataFrame or invalid label column: {label_col}")  # Debug logging
            return []
        X = df.drop(columns=[label_col])
        if X.empty:
            print("No features available after dropping label column")  # Debug logging
            return []
        components = []
        for col in X.columns:
            present = X[col].dropna()
            # Seeded so the example values shown in the labels stay the
            # same each time the inputs are rebuilt for the same file.
            examples = present.sample(min(3, len(present)), random_state=RANDOM_STATE).tolist()
            label = f"{col} (e.g., {', '.join(map(str, examples))})"
            if pd.api.types.is_numeric_dtype(X[col]):
                components.append(gr.Number(label=label, value=None))
            else:
                components.append(gr.Dropdown(label=label, choices=present.unique().tolist(), value=None))
        print(f"Generated {len(components)} input components")  # Debug logging
        return components
    except Exception as e:
        print(f"Error in create_interactive_inputs: {e}")  # Debug logging
        return []
# UI definition: upload + label/cluster controls on top, one tab per result
# produced by analyze_file, and an "Interactive" tab for single predictions.
with gr.Blocks() as demo:
    gr.Markdown("## Data Analysis Explorer")
    gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
    with gr.Row():
        file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
        label_dropdown = gr.Dropdown(label="Select Column to Predict", choices=[], interactive=True)
        clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of Clusters")
    # Refresh the label choices whenever a new file is uploaded.
    file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
    analyze_btn = gr.Button("Analyze")
    with gr.Tabs():
        with gr.TabItem("Prediction Results"):
            gr.Markdown("### Classification or Regression")
            gr.Markdown("""
- **Regression**: Predicts numbers (e.g., sales). Uses Random Forest.
- **Classification**: Predicts categories (e.g., yes/no). Uses Random Forest.
- Rows with missing values are removed. 70% of data trains the model; 30% tests it.
""")
            results_textbox = gr.Textbox(label="Performance Metrics", lines=10)
        with gr.TabItem("Prediction Plot"):
            gr.Markdown("### Prediction Visualization")
            gr.Markdown("For regression: scatter plots of top 3 features vs. predicted values and true vs. predicted. For classification: confusion matrix.")
            model_img_output = gr.Image(label="Prediction Output")
        with gr.TabItem("Feature Importances"):
            gr.Markdown("### Top 10 Key Features")
            gr.Markdown("Shows the most important features for predictions. Higher bars mean bigger impact.")
            fi_output = gr.Image(label="Feature Importances")
        with gr.TabItem("KMeans Clustering"):
            gr.Markdown("### KMeans Clustering")
            gr.Markdown("Groups similar data points without using the selected column. Colors show clusters in 2D (PCA projection).")
            kmeans_output = gr.Image(label="KMeans Clusters")
        with gr.TabItem("Agglomerative Clustering"):
            gr.Markdown("### Agglomerative Clustering")
            gr.Markdown("Another way to group data hierarchically. Compare with KMeans to see differences!")
            agg_output = gr.Image(label="Agglomerative Clusters")
        with gr.TabItem("Cluster Differences"):
            gr.Markdown("### Top 10 Cluster-Differentiating Features")
            gr.Markdown("Shows features that vary most between clusters, helping explain the groupings.")
            diff_output = gr.Image(label="Differentiating Features")
        with gr.TabItem("Interactive"):
            gr.Markdown("### Interactive Prediction")
            gr.Markdown("Enter values for each feature to get a prediction based on the trained model.")
            with gr.Column():
                input_components = gr.State(value=[])
                dynamic_inputs = gr.Column(visible=True)
                predict_btn = gr.Button("Predict")
                prediction_output = gr.Textbox(label="Prediction Result")

            def update_inputs(file, label_col):
                """Rebuild the per-feature input list for the current file and label.

                Returns the component list (stored in the ``input_components``
                state) and an update that keeps ``dynamic_inputs`` visible.
                """
                print(f"Updating inputs with file: {file}, label_col: {label_col}")  # Debug logging
                components = create_interactive_inputs(file, label_col)
                # NOTE(review): the components are only stored in a State
                # here; nothing places them inside `dynamic_inputs`, and the
                # Predict button below passes that State as its sole input,
                # so predict_interactive likely receives the stored list
                # rather than one value per feature. Confirm against the
                # Gradio version in use before relying on this tab.
                return components, gr.update(visible=True)

            file_input.change(
                fn=update_inputs,
                inputs=[file_input, label_dropdown],
                outputs=[input_components, dynamic_inputs]
            )
            label_dropdown.change(
                fn=update_inputs,
                inputs=[file_input, label_dropdown],
                outputs=[input_components, dynamic_inputs]
            )
            predict_btn.click(
                fn=predict_interactive,
                inputs=input_components,
                outputs=prediction_output
            )
    analyze_btn.click(fn=analyze_file, inputs=[file_input, label_dropdown, clusters_slider],
                      outputs=[results_textbox, model_img_output, fi_output, kmeans_output, agg_output, diff_output])
demo.launch(debug=True)  # Enable debug mode for more detailed error logging