Update app.py
Browse files
app.py
CHANGED
|
@@ -23,12 +23,24 @@ from sklearn.feature_selection import SelectKBest, f_classif
|
|
| 23 |
|
| 24 |
# Authenticate Hugging Face
|
| 25 |
hf_token = os.getenv("HF_TOKEN")
|
| 26 |
-
|
|
|
|
| 27 |
|
| 28 |
# Initialize Model
|
| 29 |
model = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
@lru_cache(maxsize=10)
|
| 33 |
def cached_generate_lime_explanation(insight_text: str, class_names: tuple = ("Negative", "Positive")):
|
| 34 |
"""Generate and cache LIME explanations to improve performance"""
|
|
@@ -69,112 +81,96 @@ def cached_generate_lime_explanation(insight_text: str, class_names: tuple = ("N
|
|
| 69 |
|
| 70 |
def generate_shap_explanation(model, X_train, X_test):
|
| 71 |
"""Generate SHAP explanations for model predictions"""
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
# Save SHAP plots
|
| 76 |
-
shap_figures = []
|
| 77 |
-
for plot_type in ['summary', 'bar', 'waterfall']:
|
| 78 |
-
plt.figure()
|
| 79 |
-
if plot_type == 'summary':
|
| 80 |
-
shap.summary_plot(shap_values, X_test, plot_size=(10, 8), show=False)
|
| 81 |
-
elif plot_type == 'bar':
|
| 82 |
-
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
|
| 83 |
-
elif plot_type == 'waterfall':
|
| 84 |
-
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[0],
|
| 85 |
-
shap_values[0][0],
|
| 86 |
-
feature_names=X_test.columns, show=False)
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
def feature_engineering_experiments(X_train, X_test, y_train, y_test):
|
| 96 |
"""Run different feature engineering approaches and compare results"""
|
| 97 |
results = {}
|
| 98 |
|
| 99 |
-
# Original features baseline
|
| 100 |
-
base_model = RandomForestClassifier(random_state=42)
|
| 101 |
-
base_model.fit(X_train, y_train)
|
| 102 |
-
y_pred = base_model.predict(X_test)
|
| 103 |
-
results['baseline'] = {
|
| 104 |
-
'accuracy': accuracy_score(y_test, y_pred),
|
| 105 |
-
'precision': precision_score(y_test, y_pred, average='weighted'),
|
| 106 |
-
'recall': recall_score(y_test, y_pred, average='weighted'),
|
| 107 |
-
'f1': f1_score(y_test, y_pred, average='weighted')
|
| 108 |
-
}
|
| 109 |
-
|
| 110 |
-
# Standardized features
|
| 111 |
-
scaler = StandardScaler()
|
| 112 |
-
X_train_scaled = scaler.fit_transform(X_train)
|
| 113 |
-
X_test_scaled = scaler.transform(X_test)
|
| 114 |
-
|
| 115 |
-
scaled_model = RandomForestClassifier(random_state=42)
|
| 116 |
-
scaled_model.fit(X_train_scaled, y_train)
|
| 117 |
-
y_pred = scaled_model.predict(X_test_scaled)
|
| 118 |
-
results['scaled'] = {
|
| 119 |
-
'accuracy': accuracy_score(y_test, y_pred),
|
| 120 |
-
'precision': precision_score(y_test, y_pred, average='weighted'),
|
| 121 |
-
'recall': recall_score(y_test, y_pred, average='weighted'),
|
| 122 |
-
'f1': f1_score(y_test, y_pred, average='weighted')
|
| 123 |
-
}
|
| 124 |
-
|
| 125 |
-
# Polynomial features
|
| 126 |
-
poly = PolynomialFeatures(degree=2, interaction_only=True)
|
| 127 |
-
X_train_poly = poly.fit_transform(X_train)
|
| 128 |
-
X_test_poly = poly.transform(X_test)
|
| 129 |
-
|
| 130 |
-
poly_model = RandomForestClassifier(random_state=42)
|
| 131 |
-
poly_model.fit(X_train_poly, y_train)
|
| 132 |
-
y_pred = poly_model.predict(X_test_poly)
|
| 133 |
-
results['polynomial'] = {
|
| 134 |
-
'accuracy': accuracy_score(y_test, y_pred),
|
| 135 |
-
'precision': precision_score(y_test, y_pred, average='weighted'),
|
| 136 |
-
'recall': recall_score(y_test, y_pred, average='weighted'),
|
| 137 |
-
'f1': f1_score(y_test, y_pred, average='weighted')
|
| 138 |
-
}
|
| 139 |
-
|
| 140 |
-
# Feature selection
|
| 141 |
-
selector = SelectKBest(f_classif, k=5)
|
| 142 |
-
X_train_selected = selector.fit_transform(X_train, y_train)
|
| 143 |
-
X_test_selected = selector.transform(X_test)
|
| 144 |
-
|
| 145 |
-
selected_model = RandomForestClassifier(random_state=42)
|
| 146 |
-
selected_model.fit(X_train_selected, y_train)
|
| 147 |
-
y_pred = selected_model.predict(X_test_selected)
|
| 148 |
-
results['selected'] = {
|
| 149 |
-
'accuracy': accuracy_score(y_test, y_pred),
|
| 150 |
-
'precision': precision_score(y_test, y_pred, average='weighted'),
|
| 151 |
-
'recall': recall_score(y_test, y_pred, average='weighted'),
|
| 152 |
-
'f1': f1_score(y_test, y_pred, average='weighted')
|
| 153 |
-
}
|
| 154 |
-
|
| 155 |
-
return results
|
| 156 |
-
|
| 157 |
-
def format_analysis_report(raw_output, visuals):
|
| 158 |
try:
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
-
report = f"""
|
| 162 |
-
<div style="font-family: Arial, sans-serif; padding: 20px; color: #333;">
|
| 163 |
-
<h1 style="color: #2B547E; border-bottom: 2px solid #2B547E; padding-bottom: 10px;">📊 Data Analysis Report</h1>
|
| 164 |
-
<div style="margin-top: 25px; background: #f8f9fa; padding: 20px; border-radius: 8px;">
|
| 165 |
-
<h2 style="color: #2B547E;">🔍 Key Observations</h2>
|
| 166 |
-
{format_observations(analysis_dict.get('observations', {}))}
|
| 167 |
-
</div>
|
| 168 |
-
<div style="margin-top: 30px;">
|
| 169 |
-
<h2 style="color: #2B547E;">💡 Insights & Visualizations</h2>
|
| 170 |
-
{format_insights(analysis_dict.get('insights', {}), visuals)}
|
| 171 |
-
</div>
|
| 172 |
-
</div>
|
| 173 |
-
"""
|
| 174 |
-
return report, visuals, list(analysis_dict.get('insights', {})).values()
|
| 175 |
except Exception as e:
|
| 176 |
-
print(f"Error
|
| 177 |
-
|
|
|
|
| 178 |
|
| 179 |
def analyze_data(csv_file, additional_notes=""):
|
| 180 |
start_time = time.time()
|
|
@@ -185,102 +181,104 @@ def analyze_data(csv_file, additional_notes=""):
|
|
| 185 |
shutil.rmtree('./figures')
|
| 186 |
os.makedirs('./figures', exist_ok=True)
|
| 187 |
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
X = data.drop('target', axis=1) # Assuming 'target' is the label column
|
| 198 |
-
y = data['target']
|
| 199 |
-
|
| 200 |
-
# Split data
|
| 201 |
-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
| 202 |
-
|
| 203 |
-
# Feature engineering experiments
|
| 204 |
-
feat_eng_results = feature_engineering_experiments(X_train, X_test, y_train, y_test)
|
| 205 |
-
wandb.log({"feature_engineering": feat_eng_results})
|
| 206 |
-
|
| 207 |
-
# Train final model with best approach (using baseline here for demo)
|
| 208 |
-
final_model = RandomForestClassifier(random_state=42)
|
| 209 |
-
final_model.fit(X_train, y_train)
|
| 210 |
-
|
| 211 |
-
# Generate SHAP explanations
|
| 212 |
-
shap_figs = generate_shap_explanation(final_model, X_train, X_test)
|
| 213 |
-
|
| 214 |
-
agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn"])
|
| 215 |
-
analysis_result = agent.run("""
|
| 216 |
-
You are an expert data analyst. Perform comprehensive analysis including:
|
| 217 |
-
1. Basic statistics and data quality checks
|
| 218 |
-
2. Feature engineering experiment results
|
| 219 |
-
3. 3 insightful analytical questions about relationships in the data
|
| 220 |
-
4. Visualization of key patterns and correlations
|
| 221 |
-
5. Actionable real-world insights derived from findings
|
| 222 |
-
Generate publication-quality visualizations and save to './figures/'
|
| 223 |
-
""", additional_args={"additional_notes": additional_notes, "source_file": csv_file})
|
| 224 |
-
|
| 225 |
-
execution_time = time.time() - start_time
|
| 226 |
-
final_memory = process.memory_info().rss / 1024 ** 2
|
| 227 |
-
memory_usage = final_memory - initial_memory
|
| 228 |
-
wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
|
| 229 |
-
|
| 230 |
-
visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.endswith(('.png', '.jpg', '.jpeg'))]
|
| 231 |
-
visuals.extend(shap_figs) # Add SHAP visualizations
|
| 232 |
-
|
| 233 |
-
for viz in visuals:
|
| 234 |
-
wandb.log({os.path.basename(viz): wandb.Image(viz)})
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
|
| 255 |
def tune_hyperparameters(csv_file, n_trials: int):
|
| 256 |
"""Run hyperparameter optimization with Optuna"""
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 280 |
gr.Markdown("## 📊 AI Data Analysis Agent with Explainability")
|
| 281 |
|
| 282 |
insights_store = gr.State([])
|
| 283 |
-
data_store = gr.State(None) # Store loaded data
|
| 284 |
|
| 285 |
with gr.Row():
|
| 286 |
with gr.Column():
|
|
@@ -339,4 +337,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 339 |
outputs=[optuna_output]
|
| 340 |
)
|
| 341 |
|
| 342 |
-
demo.launch(debug=True)
|
|
|
|
| 23 |
|
| 24 |
# Authenticate Hugging Face
# Token is optional: if HF_TOKEN is unset, login() is skipped and the model
# call below runs unauthenticated (token=None) — may be rate-limited or fail
# for gated models.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    # add_to_git_credential also stores the token for git operations
    login(token=hf_token, add_to_git_credential=True)

# Initialize Model
# NOTE(review): HfApiModel appears to come from smolagents — confirm against
# the import block (not visible in this chunk).
model = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
|
| 31 |
|
| 32 |
+
def detect_target_column(df):
    """Try to automatically detect the target column of a DataFrame.

    Matches column names against a list of common target names,
    case-insensitively and ignoring surrounding whitespace (so 'Target',
    'LABEL' or ' class ' are detected too, which the exact-match version
    missed). Earlier names in the candidate list take priority.

    Args:
        df: pandas DataFrame whose columns are inspected.

    Returns:
        The original column label of the detected target, or the last
        column as a fallback when no candidate matches.
    """
    # Common target column names, in priority order.
    possible_targets = ['target', 'label', 'class', 'y', 'outcome', 'result']

    # Map normalized name -> original label; keep the first occurrence so
    # duplicate normalized names don't shadow earlier columns.
    normalized = {}
    for col in df.columns:
        normalized.setdefault(str(col).strip().lower(), col)

    for name in possible_targets:
        if name in normalized:
            return normalized[name]

    # If none found, return the last column by default
    return df.columns[-1]
|
| 43 |
+
|
| 44 |
@lru_cache(maxsize=10)
|
| 45 |
def cached_generate_lime_explanation(insight_text: str, class_names: tuple = ("Negative", "Positive")):
|
| 46 |
"""Generate and cache LIME explanations to improve performance"""
|
|
|
|
| 81 |
|
| 82 |
def generate_shap_explanation(model, X_train, X_test):
    """Generate SHAP explanations for model predictions.

    Builds a TreeExplainer over the fitted model, renders a beeswarm
    summary plot and a bar plot of mean |SHAP| values, and saves both as
    PNGs under ./figures/ (assumed to exist — the caller creates it).

    Returns:
        List of saved figure paths; empty list if anything fails
        (best-effort: errors are printed, never raised).
    """
    try:
        tree_explainer = shap.TreeExplainer(model)
        values = tree_explainer.shap_values(X_test)

        # Save SHAP plots — dict preserves insertion order, so files are
        # produced in the same order as before: summary, then bar.
        plot_builders = {
            'summary': lambda: shap.summary_plot(values, X_test, plot_size=(10, 8), show=False),
            'bar': lambda: shap.summary_plot(values, X_test, plot_type="bar", show=False),
        }

        saved_paths = []
        for kind, build in plot_builders.items():
            plt.figure()
            build()
            out_path = f'./figures/shap_{kind}.png'
            plt.savefig(out_path, bbox_inches='tight')
            plt.close()
            saved_paths.append(out_path)

        return saved_paths
    except Exception as e:
        print(f"Error generating SHAP explanation: {e}")
        return []
|
| 106 |
|
| 107 |
def feature_engineering_experiments(X_train, X_test, y_train, y_test):
    """Run different feature engineering approaches and compare results.

    Trains a fresh RandomForestClassifier (fixed seed for reproducibility)
    on four feature variants — raw baseline, standardized, degree-2
    interaction polynomials (only when the feature count is small enough
    not to explode), and top-k univariate selection — and scores each on
    the held-out split.

    Args:
        X_train, X_test: feature matrices (train / held-out).
        y_train, y_test: corresponding labels.

    Returns:
        Dict mapping approach name ('baseline', 'scaled', 'polynomial',
        'selected') to a metrics dict with weighted-average accuracy,
        precision, recall and f1. Best-effort: on error the approaches
        completed so far are returned and the error is printed.
    """
    results = {}

    def _fit_and_score(X_tr, X_te):
        # One helper replaces four copy-pasted train/predict/metrics blocks.
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_tr, y_train)
        y_pred = clf.predict(X_te)
        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='weighted'),
            'recall': recall_score(y_test, y_pred, average='weighted'),
            'f1': f1_score(y_test, y_pred, average='weighted')
        }

    try:
        # Original features baseline
        results['baseline'] = _fit_and_score(X_train, X_test)

        # Standardized features (fit the scaler on train only)
        scaler = StandardScaler()
        results['scaled'] = _fit_and_score(
            scaler.fit_transform(X_train), scaler.transform(X_test))

        # Polynomial features (only if few features — interaction terms grow
        # quadratically with the feature count)
        if X_train.shape[1] < 10:
            poly = PolynomialFeatures(degree=2, interaction_only=True)
            results['polynomial'] = _fit_and_score(
                poly.fit_transform(X_train), poly.transform(X_test))

        # Feature selection — cap k so SelectKBest never asks for more
        # columns than exist
        k = min(5, X_train.shape[1])
        selector = SelectKBest(f_classif, k=k)
        results['selected'] = _fit_and_score(
            selector.fit_transform(X_train, y_train),
            selector.transform(X_test))
    except Exception as e:
        print(f"Error in feature engineering experiments: {e}")

    return results
|
| 174 |
|
| 175 |
def analyze_data(csv_file, additional_notes=""):
|
| 176 |
start_time = time.time()
|
|
|
|
| 181 |
shutil.rmtree('./figures')
|
| 182 |
os.makedirs('./figures', exist_ok=True)
|
| 183 |
|
| 184 |
+
try:
|
| 185 |
+
wandb.login(key=os.environ.get('WANDB_API_KEY'))
|
| 186 |
+
run = wandb.init(project="huggingface-data-analysis", config={
|
| 187 |
+
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 188 |
+
"additional_notes": additional_notes,
|
| 189 |
+
"source_file": csv_file.name if csv_file else None
|
| 190 |
+
})
|
| 191 |
+
except:
|
| 192 |
+
run = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
+
try:
|
| 195 |
+
# Load and preprocess data
|
| 196 |
+
data = pd.read_csv(csv_file.name)
|
| 197 |
+
target_col = detect_target_column(data)
|
| 198 |
+
X = data.drop(target_col, axis=1)
|
| 199 |
+
y = data[target_col]
|
| 200 |
+
|
| 201 |
+
# Split data
|
| 202 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
| 203 |
+
|
| 204 |
+
# Feature engineering experiments
|
| 205 |
+
feat_eng_results = feature_engineering_experiments(X_train, X_test, y_train, y_test)
|
| 206 |
+
if run:
|
| 207 |
+
wandb.log({"feature_engineering": feat_eng_results})
|
| 208 |
+
|
| 209 |
+
# Train final model with best approach (using baseline here for demo)
|
| 210 |
+
final_model = RandomForestClassifier(random_state=42)
|
| 211 |
+
final_model.fit(X_train, y_train)
|
| 212 |
+
|
| 213 |
+
# Generate SHAP explanations
|
| 214 |
+
shap_figs = generate_shap_explanation(final_model, X_train, X_test)
|
| 215 |
+
|
| 216 |
+
agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn"])
|
| 217 |
+
analysis_result = agent.run(f"""
|
| 218 |
+
You are an expert data analyst. Perform comprehensive analysis including:
|
| 219 |
+
1. Basic statistics and data quality checks
|
| 220 |
+
2. Feature engineering experiment results: {feat_eng_results}
|
| 221 |
+
3. Target column used: {target_col}
|
| 222 |
+
4. 3 insightful analytical questions about relationships in the data
|
| 223 |
+
5. Visualization of key patterns and correlations
|
| 224 |
+
6. Actionable real-world insights derived from findings
|
| 225 |
+
Generate publication-quality visualizations and save to './figures/'
|
| 226 |
+
""", additional_args={"additional_notes": additional_notes, "source_file": csv_file})
|
| 227 |
+
|
| 228 |
+
execution_time = time.time() - start_time
|
| 229 |
+
final_memory = process.memory_info().rss / 1024 ** 2
|
| 230 |
+
memory_usage = final_memory - initial_memory
|
| 231 |
+
if run:
|
| 232 |
+
wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
|
| 233 |
+
|
| 234 |
+
visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.endswith(('.png', '.jpg', '.jpeg'))]
|
| 235 |
+
visuals.extend(shap_figs) # Add SHAP visualizations
|
| 236 |
+
|
| 237 |
+
if run:
|
| 238 |
+
for viz in visuals:
|
| 239 |
+
wandb.log({os.path.basename(viz): wandb.Image(viz)})
|
| 240 |
+
|
| 241 |
+
if run:
|
| 242 |
+
run.finish()
|
| 243 |
+
return format_analysis_report(analysis_result, visuals)
|
| 244 |
|
| 245 |
+
except Exception as e:
|
| 246 |
+
if run:
|
| 247 |
+
run.finish()
|
| 248 |
+
return f"Error analyzing data: {str(e)}", [], []
|
| 249 |
|
| 250 |
def tune_hyperparameters(csv_file, n_trials: int):
    """Run hyperparameter optimization with Optuna"""
    try:
        # Load the dataset and split off the auto-detected label column.
        frame = pd.read_csv(csv_file.name)
        label_col = detect_target_column(frame)
        features = frame.drop(label_col, axis=1)
        labels = frame[label_col]
        X_tr, X_te, y_tr, y_te = train_test_split(
            features, labels, test_size=0.2, random_state=42)

        # Let Optuna search for the parameter set maximizing the objective.
        study = optuna.create_study(direction="maximize")
        study.optimize(
            lambda trial: objective(trial, X_tr, y_tr, X_te, y_te),
            n_trials=n_trials)

        # Train final model with best params
        tuned = RandomForestClassifier(**study.best_params, random_state=42)
        tuned.fit(X_tr, y_tr)
        predictions = tuned.predict(X_te)

        # Key order is kept as-is: it is visible in the returned string.
        metrics = {
            'accuracy': accuracy_score(y_te, predictions),
            'precision': precision_score(y_te, predictions, average='weighted'),
            'recall': recall_score(y_te, predictions, average='weighted'),
            'f1': f1_score(y_te, predictions, average='weighted')
        }

        return f"Best Hyperparameters: {study.best_params}\n\nValidation Metrics:\n{metrics}"
    except Exception as e:
        return f"Error tuning hyperparameters: {str(e)}"
|
| 277 |
|
| 278 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 279 |
gr.Markdown("## 📊 AI Data Analysis Agent with Explainability")
|
| 280 |
|
| 281 |
insights_store = gr.State([])
|
|
|
|
| 282 |
|
| 283 |
with gr.Row():
|
| 284 |
with gr.Column():
|
|
|
|
| 337 |
outputs=[optuna_output]
|
| 338 |
)
|
| 339 |
|
| 340 |
+
demo.launch(debug=True, share=True)
|