AIDataAgentProjectFinal

Paused

App Files Files Community

pavanmutha committed on Apr 10, 2025

Commit

43f4c69

verified ·

1 Parent(s): ebf5f6d

Update app.py

Browse files

Files changed (1) hide show

app.py +203 -84

app.py CHANGED Viewed

@@ -16,8 +16,8 @@ import shap
 import lime
 import lime.lime_tabular
 import matplotlib.pyplot as plt
-import matplotlib
-matplotlib.use('Agg')  # For headless environments
 # Authenticate Hugging Face
 hf_token = os.getenv("HF_TOKEN")
@@ -35,17 +35,17 @@ def format_analysis_report(raw_output, visuals):
                 analysis_dict = ast.literal_eval(str(raw_output))
             except (SyntaxError, ValueError) as e:
                 print(f"Error parsing CodeAgent output: {e}")
-                return str(raw_output), visuals
         report = f"""
-        <div style='font-family: Arial, sans-serif; padding: 20px; color: #333;'>
-            <h1 style='color: #2B547E; border-bottom: 2px solid #2B547E; padding-bottom: 10px;'>📊 Data Analysis Report</h1>
-            <div style='margin-top: 25px; background: #f8f9fa; padding: 20px; border-radius: 8px;'>
-                <h2 style='color: #2B547E;'>🔍 Key Observations</h2>
                 {format_observations(analysis_dict.get('observations', {}))}
             </div>
-            <div style='margin-top: 30px;'>
-                <h2 style='color: #2B547E;'>💡 Insights & Visualizations</h2>
                 {format_insights(analysis_dict.get('insights', {}), visuals)}
             </div>
         </div>
@@ -58,9 +58,9 @@ def format_analysis_report(raw_output, visuals):
 def format_observations(observations):
     return '\n'.join([
         f"""
-        <div style='margin: 15px 0; padding: 15px; background: white; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);'>
-            <h3 style='margin: 0 0 10px 0; color: #4A708B;'>{key.replace('_', ' ').title()}</h3>
-            <pre style='margin: 0; padding: 10px; background: #f8f9fa; border-radius: 4px;'>{value}</pre>
         </div>
         """ for key, value in observations.items() if 'proportions' in key
     ])
@@ -68,12 +68,12 @@ def format_observations(observations):
 def format_insights(insights, visuals):
     return '\n'.join([
         f"""
-        <div style='margin: 20px 0; padding: 20px; background: white; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);'>
-            <div style='display: flex; align-items: center; gap: 10px;'>
-                <div style='background: #2B547E; color: white; width: 30px; height: 30px; border-radius: 50%; display: flex; align-items: center; justify-content: center;'>{idx+1}</div>
-                <p style='margin: 0; font-size: 16px;'>{insight}</p>
             </div>
-            {f"<img src='/file={visuals[idx]}' style='max-width: 100%; height: auto; margin-top: 10px; border-radius: 6px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>" if idx < len(visuals) else ''}
         </div>
         """ for idx, (key, insight) in enumerate(insights.items())
     ])
@@ -82,18 +82,18 @@ def analyze_data(csv_file, additional_notes=""):
     start_time = time.time()
     process = psutil.Process(os.getpid())
     initial_memory = process.memory_info().rss / 1024 ** 2
     if os.path.exists('./figures'):
         shutil.rmtree('./figures')
     os.makedirs('./figures', exist_ok=True)
     wandb.login(key=os.environ.get('WANDB_API_KEY'))
     run = wandb.init(project="huggingface-data-analysis", config={
         "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "additional_notes": additional_notes,
         "source_file": csv_file.name if csv_file else None
     })
     agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn"])
     analysis_result = agent.run("""
         You are an expert data analyst. Perform comprehensive analysis including:
@@ -105,114 +105,233 @@ def analyze_data(csv_file, additional_notes=""):
         Return the analysis results as a python dictionary that can be parsed by ast.literal_eval().
         The dictionary should have the following structure:
         {
-            'observations': {...},
-            'insights': {...}
         }
     """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})
     execution_time = time.time() - start_time
     final_memory = process.memory_info().rss / 1024 ** 2
     memory_usage = final_memory - initial_memory
     wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
     visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.endswith(('.png', '.jpg', '.jpeg'))]
     for viz in visuals:
         wandb.log({os.path.basename(viz): wandb.Image(viz)})
     run.finish()
     return format_analysis_report(analysis_result, visuals)
-def tune_hyperparameters(csv_file, n_trials: int):
-    df = pd.read_csv(csv_file)
-    y = df.iloc[:, -1]
-    X = df.iloc[:, :-1]
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-    study = optuna.create_study(direction="maximize")
-    objective_func = lambda trial: objective(trial, X_train, y_train, X_test, y_test)
-    study.optimize(objective_func, n_trials=n_trials)
-    best_params = study.best_params
-    best_value = study.best_value
-    model = RandomForestClassifier(**best_params, random_state=42)
     model.fit(X_train, y_train)
     predictions = model.predict(X_test)
     accuracy = accuracy_score(y_test, predictions)
     precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
     recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
     f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
     wandb.log({
-        "best_params": best_params,
-        "accuracy": accuracy,
-        "precision": precision,
-        "recall": recall,
-        "f1": f1
     })
-    shap_explainer = shap.TreeExplainer(model)
     shap_values = shap_explainer.shap_values(X_test)
     shap.summary_plot(shap_values, X_test, show=False)
     shap_fig_path = "./figures/shap_summary.png"
     plt.savefig(shap_fig_path)
-    wandb.log({"shap_summary": wandb.Image(shap_fig_path)})
     plt.clf()
     lime_explainer = lime.lime_tabular.LimeTabularExplainer(
-        X_train.values,
-        feature_names=X_train.columns,
-        class_names=['target'],
         mode='classification'
     )
-    lime_explanation = lime_explainer.explain_instance(X_test.iloc[0].values, model.predict_proba)
     lime_fig = lime_explanation.as_pyplot_figure()
     lime_fig_path = "./figures/lime_explanation.png"
     lime_fig.savefig(lime_fig_path)
-    wandb.log({"lime_explanation": wandb.Image(lime_fig_path)})
     plt.clf()
-    # Plot optimization history
-    fig = optuna.visualization.matplotlib.plot_optimization_history(study)
-    history_fig_path = "./figures/optuna_history.png"
-    fig.savefig(history_fig_path)
-    wandb.log({"optuna_history": wandb.Image(history_fig_path)})
-    plt.clf()
-    df_trials = study.trials_dataframe()
-    df_html = df_trials.sort_values("value", ascending=False).to_html(classes="table table-striped", index=False)
-    summary_html = f"""
-    <h3>🏆 Best Hyperparameters</h3>
-    <ul>
-        {''.join([f'<li><b>{k}</b>: {v}</li>' for k, v in best_params.items()])}
-    </ul>
-    <p><b>Accuracy:</b> {accuracy:.4f} &nbsp; <b>Precision:</b> {precision:.4f} &nbsp; <b>Recall:</b> {recall:.4f} &nbsp; <b>F1-score:</b> {f1:.4f}</p>
-    <hr>
-    <h3>📋 All Trials Summary</h3>
-    {df_html}
     """
-    return summary_html, history_fig_path
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 📊 AI Data Analysis Agent with Hyperparameter Optimization")
     with gr.Row():
         with gr.Column():
             file_input = gr.File(label="Upload CSV Dataset", type="filepath")
             notes_input = gr.Textbox(label="Dataset Notes (Optional)", lines=3)
             analyze_btn = gr.Button("Analyze", variant="primary")
-            optuna_trials = gr.Number(label="Number of Hyperparameter Tuning Trials", value=10)
             tune_btn = gr.Button("Optimize Hyperparameters", variant="secondary")
         with gr.Column():
             analysis_output = gr.Markdown("### Analysis results will appear here...")
             optuna_output = gr.HTML(label="Hyperparameter Tuning Results")
-            optuna_plot = gr.Image(label="Optimization History Plot")
-            gallery = gr.Gallery(label="Data Visualizations", columns=2)
     analyze_btn.click(fn=analyze_data, inputs=[file_input, notes_input], outputs=[analysis_output, gallery])
-    tune_btn.click(fn=tune_hyperparameters, inputs=[file_input, optuna_trials], outputs=[optuna_output, optuna_plot])
-if __name__ == "__main__":
-    demo.launch(debug=True)

 import lime
 import lime.lime_tabular
 import matplotlib.pyplot as plt
+import numpy as np
+from optuna.visualization import plot_optimization_history, plot_param_importances
 # Authenticate Hugging Face
 hf_token = os.getenv("HF_TOKEN")
                 analysis_dict = ast.literal_eval(str(raw_output))
             except (SyntaxError, ValueError) as e:
                 print(f"Error parsing CodeAgent output: {e}")
+                return str(raw_output), visuals  # Return raw output as string
         report = f"""
+        <div style="font-family: Arial, sans-serif; padding: 20px; color: #333;">
+            <h1 style="color: #2B547E; border-bottom: 2px solid #2B547E; padding-bottom: 10px;">📊 Data Analysis Report</h1>
+            <div style="margin-top: 25px; background: #f8f9fa; padding: 20px; border-radius: 8px;">
+                <h2 style="color: #2B547E;">🔍 Key Observations</h2>
                 {format_observations(analysis_dict.get('observations', {}))}
             </div>
+            <div style="margin-top: 30px;">
+                <h2 style="color: #2B547E;">💡 Insights & Visualizations</h2>
                 {format_insights(analysis_dict.get('insights', {}), visuals)}
             </div>
         </div>
 def format_observations(observations):
     """Render the agent's observations as HTML cards.

     Only entries whose key contains the substring ``'proportions'`` are
     rendered; all other observations are dropped.
     NOTE(review): that filter is carried over unchanged -- confirm it is
     intentional rather than a leftover from one specific dataset.

     Keys and values are HTML-escaped before interpolation because they come
     from parsed agent output, not from trusted markup.

     Args:
         observations: Mapping of observation name to value.  ``None`` or an
             empty mapping yields an empty string.

     Returns:
         str: The cards joined by newlines, or ``''`` when nothing matches.
     """
     from html import escape  # local import keeps the block self-contained

     if not observations:
         return ''

     cards = []
     for key, value in observations.items():
         label = str(key)  # tolerate non-string keys in the parsed dict
         if 'proportions' not in label:
             continue
         heading = escape(label.replace('_', ' ').title())
         body = escape(str(value))
         cards.append(f"""
         <div style="margin: 15px 0; padding: 15px; background: white; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
             <h3 style="margin: 0 0 10px 0; color: #4A708B;">{heading}</h3>
             <pre style="margin: 0; padding: 10px; background: #f8f9fa; border-radius: 4px;">{body}</pre>
         </div>
         """)
     return '\n'.join(cards)
 def format_insights(insights, visuals):
     """Render numbered insight cards, attaching one image per card when available.

     The i-th insight is paired with ``visuals[i]``; insights beyond the end
     of ``visuals`` are rendered without an image.  Insight text and image
     paths are HTML-escaped before being placed in the markup.

     Args:
         insights: Mapping of insight name to insight text.  Only the values
             are displayed; order follows the mapping's iteration order.
         visuals: Sequence of image paths, or ``None`` for no images.

     Returns:
         str: The cards joined by newlines, or ``''`` when there are no
         insights.
     """
     from html import escape  # local import keeps the block self-contained

     if not insights:
         return ''
     images = visuals or []

     cards = []
     for idx, insight in enumerate(insights.values()):
         image_tag = ''
         if idx < len(images):
             src = escape(str(images[idx]), quote=True)
             image_tag = f'<img src="/file={src}" style="max-width: 100%; height: auto; margin-top: 10px; border-radius: 6px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">'
         text = escape(str(insight))
         cards.append(f"""
         <div style="margin: 20px 0; padding: 20px; background: white; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
             <div style="display: flex; align-items: center; gap: 10px;">
                 <div style="background: #2B547E; color: white; width: 30px; height: 30px; border-radius: 50%; display: flex; align-items: center; justify-content: center;">{idx+1}</div>
                 <p style="margin: 0; font-size: 16px;">{text}</p>
             </div>
             {image_tag}
         </div>
         """)
     return '\n'.join(cards)
     start_time = time.time()
     process = psutil.Process(os.getpid())
     initial_memory = process.memory_info().rss / 1024 ** 2
     if os.path.exists('./figures'):
         shutil.rmtree('./figures')
     os.makedirs('./figures', exist_ok=True)
     wandb.login(key=os.environ.get('WANDB_API_KEY'))
     run = wandb.init(project="huggingface-data-analysis", config={
         "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "additional_notes": additional_notes,
         "source_file": csv_file.name if csv_file else None
     })
     agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn"])
     analysis_result = agent.run("""
         You are an expert data analyst. Perform comprehensive analysis including:
         Return the analysis results as a python dictionary that can be parsed by ast.literal_eval().
         The dictionary should have the following structure:
         {
+            'observations': {
+                'observation_1_key': 'observation_1_value',
+                'observation_2_key': 'observation_2_value',
+                ...
+            },
+            'insights': {
+                'insight_1_key': 'insight_1_value',
+                'insight_2_key': 'insight_2_value',
+                ...
+            }
         }
     """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})
     execution_time = time.time() - start_time
     final_memory = process.memory_info().rss / 1024 ** 2
     memory_usage = final_memory - initial_memory
     wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
     visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.endswith(('.png', '.jpg', '.jpeg'))]
     for viz in visuals:
         wandb.log({os.path.basename(viz): wandb.Image(viz)})
     run.finish()
     return format_analysis_report(analysis_result, visuals)
def objective(trial, X_train, y_train, X_test, y_test):
    """Optuna objective: fit a random forest for one trial and return its accuracy.

    Hyperparameters are sampled from ``trial``, the model is fitted on the
    training split and scored on the test split.  Accuracy, weighted
    precision/recall/F1 and the sampled hyperparameters are logged to W&B.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.

    Returns:
        Accuracy of the trial's model on the test split.
    """
    # Sampling order is kept fixed so the sampler sees the same sequence of
    # suggest calls on every trial.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
    }

    forest = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)

    weighted = {"average": 'weighted', "zero_division": 0}
    accuracy = accuracy_score(y_test, y_pred)
    scores = {
        "trial_accuracy": accuracy,
        "trial_precision": precision_score(y_test, y_pred, **weighted),
        "trial_recall": recall_score(y_test, y_pred, **weighted),
        "trial_f1": f1_score(y_test, y_pred, **weighted),
    }

    # max_features may be None, so it is logged as a string.
    wandb.log({**scores, **params, "max_features": str(params["max_features"])})
    return accuracy
def tune_hyperparameters(csv_file, n_trials: int):
    """Tune a RandomForestClassifier with Optuna and build an HTML report.

    The CSV's last column is treated as the target and every other column as
    a feature; the data is split 80/20 with a fixed seed.  After the search a
    final model is refit with the best parameters, Optuna/SHAP/LIME figures
    are written under ``./figures``, and metrics plus figures are logged to a
    W&B run in the "hyperparameter-optimization" project.

    Args:
        csv_file: Path or file object readable by ``pd.read_csv``.
        n_trials: Number of Optuna trials.  Coerced to ``int`` in case the UI
            widget supplies it as a float.

    Returns:
        tuple: ``(report, visuals)`` -- the HTML report string and the list
        of saved figure paths (history, importance, SHAP, LIME).
    """
    # The Matplotlib back end returns Axes, whose ``.figure`` supports
    # savefig(); the Plotly functions from ``optuna.visualization`` return
    # Plotly figures that have no such attribute.  Imported locally so the
    # fix does not depend on the module-level import.
    from optuna.visualization.matplotlib import (
        plot_optimization_history,
        plot_param_importances,
    )

    n_trials = int(n_trials)

    # Initialize W&B run
    wandb.login(key=os.environ.get('WANDB_API_KEY'))
    run = wandb.init(project="hyperparameter-optimization",
                     config={"n_trials": n_trials, "model_type": "RandomForest"})
    try:
        df = pd.read_csv(csv_file)
        y = df.iloc[:, -1]
        X = df.iloc[:, :-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # NOTE(review): objective() never reports intermediate values, so the
        # pruner appears to have no effect -- confirm before relying on it.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(),
            pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
        )
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test),
                       n_trials=n_trials,
                       callbacks=[wandb_callback])

        best_params = study.best_params
        best_value = study.best_value

        # Refit on the best parameters and score on the held-out split.
        final_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
        final_model.fit(X_train, y_train)
        final_predictions = final_model.predict(X_test)

        accuracy = accuracy_score(y_test, final_predictions)
        precision = precision_score(y_test, final_predictions, average='weighted', zero_division=0)
        recall = recall_score(y_test, final_predictions, average='weighted', zero_division=0)
        f1 = f1_score(y_test, final_predictions, average='weighted', zero_division=0)

        os.makedirs('./figures', exist_ok=True)
        history_path = "./figures/optimization_history.png"
        importance_path = "./figures/param_importance.png"

        # Save and close each Optuna figure so later pyplot-based plots do
        # not draw onto a leftover current figure.
        history_axes = plot_optimization_history(study)
        history_axes.figure.savefig(history_path)
        plt.close(history_axes.figure)

        importance_axes = plot_param_importances(study)
        importance_axes.figure.savefig(importance_path)
        plt.close(importance_axes.figure)

        # SHAP summary for the final model on the test split.
        shap_explainer = shap.TreeExplainer(final_model)
        shap_values = shap_explainer.shap_values(X_test)
        shap.summary_plot(shap_values, X_test, show=False)
        shap_fig_path = "./figures/shap_summary.png"
        plt.savefig(shap_fig_path)
        plt.close()

        # LIME explanation for the first test row.  class_names needs one
        # entry per class: the explanation title is looked up by label index,
        # so a single-entry list fails for the default label 1.
        lime_explainer = lime.lime_tabular.LimeTabularExplainer(
            X_train.values,
            feature_names=list(X_train.columns),
            class_names=[str(label) for label in final_model.classes_],
            mode='classification'
        )
        lime_explanation = lime_explainer.explain_instance(
            X_test.iloc[0].values,
            final_model.predict_proba
        )
        lime_fig = lime_explanation.as_pyplot_figure()
        lime_fig_path = "./figures/lime_explanation.png"
        lime_fig.savefig(lime_fig_path)
        plt.close(lime_fig)

        # Log everything to W&B
        wandb.log({
            "best_params": best_params,
            "best_accuracy": best_value,
            "final_accuracy": accuracy,
            "final_precision": precision,
            "final_recall": recall,
            "final_f1": f1,
            "optimization_history": wandb.Image(history_path),
            "parameter_importance": wandb.Image(importance_path),
            "shap_summary": wandb.Image(shap_fig_path),
            "lime_explanation": wandb.Image(lime_fig_path)
        })

        # Generate HTML report
        report = f"""
    <div style="font-family: Arial, sans-serif; padding: 20px; color: #333;">
        <h1 style="color: #2B547E; border-bottom: 2px solid #2B547E; padding-bottom: 10px;">🎯 Hyperparameter Optimization Results</h1>
        <div style="margin-top: 20px; background: #f8f9fa; padding: 15px; border-radius: 8px;">
            <h2 style="color: #2B547E;">📈 Performance Metrics</h2>
            <p><strong>Best Accuracy:</strong> {best_value:.4f}</p>
            <p><strong>Final Model Accuracy:</strong> {accuracy:.4f}</p>
            <p><strong>Precision:</strong> {precision:.4f}</p>
            <p><strong>Recall:</strong> {recall:.4f}</p>
            <p><strong>F1 Score:</strong> {f1:.4f}</p>
        </div>
        <div style="margin-top: 25px; background: #f8f9fa; padding: 15px; border-radius: 8px;">
            <h2 style="color: #2B547E;">⚙️ Best Parameters</h2>
            <pre style="background: white; padding: 10px; border-radius: 4px;">{best_params}</pre>
        </div>
        <div style="margin-top: 25px;">
            <h2 style="color: #2B547E;">📊 Optimization Process</h2>
            <img src="/file={history_path}" style="max-width: 100%; border-radius: 6px; margin-bottom: 15px;">
            <img src="/file={importance_path}" style="max-width: 100%; border-radius: 6px;">
        </div>
    </div>
    """

        # Figure paths for the gallery, in display order.
        visuals = [
            history_path,
            importance_path,
            shap_fig_path,
            lime_fig_path
        ]
        return report, visuals
    finally:
        # Close the W&B run even when reading, tuning or plotting fails.
        run.finish()
def wandb_callback(study, trial):
    """Log per-trial progress to W&B.

    Takes the ``(study, trial)`` pair and records the study's best value so
    far together with the number and value of the given trial.
    """
    progress = dict(
        best_accuracy=study.best_value,
        current_trial=trial.number,
        current_accuracy=trial.value,
    )
    wandb.log(progress)
 # Gradio UI: inputs and action buttons on the left, results on the right.
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## 📊 AI Data Analysis Agent with Enhanced Hyperparameter Optimization")
     with gr.Row():
         with gr.Column():
             file_input = gr.File(label="Upload CSV Dataset", type="filepath")
             notes_input = gr.Textbox(label="Dataset Notes (Optional)", lines=3)
             analyze_btn = gr.Button("Analyze", variant="primary")
             optuna_trials = gr.Number(
                 label="Number of Hyperparameter Tuning Trials",
                 value=50,
                 minimum=10,
                 maximum=200,
                 step=5,
                 precision=0,  # hand the trial count to the callback as an int
             )
             tune_btn = gr.Button("Optimize Hyperparameters", variant="secondary")
         with gr.Column():
             analysis_output = gr.Markdown("### Analysis results will appear here...")
             optuna_output = gr.HTML(label="Hyperparameter Tuning Results")
             gallery = gr.Gallery(label="Optimization Visualizations", columns=2)
     # Both handlers send their figures to the same gallery component.
     analyze_btn.click(fn=analyze_data, inputs=[file_input, notes_input], outputs=[analysis_output, gallery])
     tune_btn.click(fn=tune_hyperparameters, inputs=[file_input, optuna_trials], outputs=[optuna_output, gallery])
 demo.launch(debug=True)