Spaces:

Girinath11
/

datascientist_multiagent_system

Sleeping

App Files Files Community

Girinath11 committed on Aug 22, 2025

Commit

0d15f54

verified ·

1 Parent(s): 08c90b8

Update app.py

Browse files

Files changed (1) hide show

app.py +265 -156

app.py CHANGED Viewed

@@ -1,43 +1,152 @@
-import gradio as gr
-import pandas as pd
-import numpy as np
-import json
-from io import BytesIO
-import base64
 import os
-import time
-from datetime import datetime
-import warnings
-warnings.filterwarnings('ignore')
-# Install and import matplotlib with error handling
 try:
     import matplotlib
-    matplotlib.use('Agg')  # Use non-interactive backend for web deployment
-    import matplotlib.pyplot as plt
-    import seaborn as sns
-except ImportError:
-    import subprocess
-    import sys
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib", "seaborn"])
-    import matplotlib
-    matplotlib.use('Agg')
     import matplotlib.pyplot as plt
     import seaborn as sns
-# Import plotly with error handling
-try:
-    import plotly.graph_objects as go
-    import plotly.express as px
-    from plotly.subplots import make_subplots
-except ImportError:
-    import subprocess
-    import sys
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly"])
     import plotly.graph_objects as go
     import plotly.express as px
     from plotly.subplots import make_subplots
 # Import your comprehensive pipeline
 try:
     from supervisor_agent import SupervisorAgent
@@ -46,32 +155,32 @@ except ImportError:
 class DataSciencePipelineUI:
     """Advanced UI for the comprehensive data science pipeline"""
     def __init__(self):
         try:
             self.supervisor = SupervisorAgent()
         except:
             # Fallback mock implementation if supervisor_agent isn't available
             self.supervisor = self._create_mock_supervisor()
         self.current_data = None
         self.pipeline_results = None
         # UI State
         self.processing_step = 0
         self.total_steps = 6
         # Styling
         self.custom_css = """
-        .main-container {
-            max-width: 1400px;
-            margin: 0 auto;
             font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
         }
-        .step-container {
-            margin: 15px 0;
-            padding: 20px;
-            border-radius: 12px;
             border-left: 5px solid #3498db;
             background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
             box-shadow: 0 4px 6px rgba(0,0,0,0.1);
@@ -126,7 +235,7 @@ class DataSciencePipelineUI:
                             'info': {'shape': (1000, 10), 'columns': ['col1', 'col2'], 'dtypes': {'col1': 'float64'}}
                         },
                         'data_cleaning': {
-                            'status': 'success',
                             'cleaning_report': {'duplicates_removed': 5, 'missing_values': {'col1': 10}}
                         }
                     },
@@ -152,12 +261,12 @@ class DataSciencePipelineUI:
         """Enhanced file processing with detailed analysis"""
         if file_obj is None:
             return "❌ No file uploaded", "", [], gr.update(visible=False), ""
         try:
             file_path = file_obj.name
             file_name = os.path.basename(file_path)
             file_extension = os.path.splitext(file_name)[1].lower()
             # Load data based on file type
             if file_extension == '.csv':
                 df = pd.read_csv(file_path)
@@ -167,24 +276,24 @@ class DataSciencePipelineUI:
                 file_type = 'json'
             else:
                 return "❌ Unsupported file type. Please upload CSV or JSON files only.", "", [], gr.update(visible=False), ""
             # Store the data
             self.current_data = df
             # Detailed file analysis
             file_size = os.path.getsize(file_path) / 1024  # KB
             memory_usage = df.memory_usage(deep=True).sum() / 1024**2  # MB
             missing_count = df.isnull().sum().sum()
             duplicate_count = df.duplicated().sum()
             # Data type analysis
             numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
             categorical_cols = len(df.select_dtypes(include=['object']).columns)
             datetime_cols = len(df.select_dtypes(include=['datetime64']).columns)
             # Create preview table HTML
             preview_html = self._create_data_preview(df)
             file_info = f"""
             <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; color: white; margin: 10px 0;">
                 <h3 style="margin: 0 0 15px 0;">📊 File Upload Successful!</h3>
@@ -216,25 +325,25 @@ class DataSciencePipelineUI:
                 </div>
             </div>
             """
             columns = df.columns.tolist()
             target_update = gr.update(visible=(learning_type == "Supervised"), choices=columns, value=columns[0] if columns and learning_type == "Supervised" else "")
             return (
-                file_info,
-                file_type,
-                columns,
                 target_update,
                 preview_html
             )
         except Exception as e:
             return f"❌ Error processing file: {str(e)}", "", [], gr.update(visible=False), ""
     def _create_data_preview(self, df):
         """Create HTML preview of the data"""
         preview_df = df.head(10)
         html = """
         <div style="background: white; padding: 20px; border-radius: 10px; margin: 15px 0; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
             <h4 style="color: #2c3e50; margin-bottom: 15px;">📋 Data Preview (First 10 rows)</h4>
@@ -243,12 +352,12 @@ class DataSciencePipelineUI:
                     <thead>
                         <tr style="background-color: #3498db; color: white;">
         """
         # Add headers
         for col in preview_df.columns:
             html += f"<th style='padding: 8px; text-align: left; border: 1px solid #ddd;'>{col}</th>"
         html += "</tr></thead><tbody>"
         # Add rows
         for idx, row in preview_df.iterrows():
             html += f"<tr style='background-color: {'#f9f9f9' if idx % 2 == 0 else 'white'};'>"
@@ -260,10 +369,10 @@ class DataSciencePipelineUI:
                     cell_value = f"{value:.3f}" if isinstance(value, float) else str(value)
                 else:
                     cell_value = str(value)[:50] + "..." if len(str(value)) > 50 else str(value)
                 html += f"<td style='padding: 8px; border: 1px solid #ddd;'>{cell_value}</td>"
             html += "</tr>"
         html += "</tbody></table></div></div>"
         return html
@@ -278,27 +387,27 @@ class DataSciencePipelineUI:
         """Run the complete comprehensive pipeline with advanced features"""
         if file_obj is None:
             return self._create_error_html("Please upload a file first.")
         if learning_type == "Supervised" and not target_column:
             return self._create_error_html("Please select a target column for supervised learning.")
         try:
             # Initialize progress tracking
             progress_html = self._create_progress_header()
             file_path = file_obj.name
             file_extension = os.path.splitext(file_path)[1].lower().replace('.', '')
             # Step 1: Data Loading
             step1_html = self._create_step_html(
-                1, "📁 Data Loading", "loading",
                 "Loading and validating your dataset..."
             )
             progress_html += step1_html
             # Simulate some processing time for better UX
             time.sleep(1)
             # Execute data loading
             try:
                 # Use your actual SupervisorAgent
@@ -307,52 +416,52 @@ class DataSciencePipelineUI:
                     'target_column': target_column if target_column else None,
                     'domain': domain.lower() if domain else 'general'
                 }
                 result = self.supervisor.execute_pipeline(
                     data_source=file_path,
                     **pipeline_kwargs
                 )
                 if result['status'] != 'success':
                     return self._create_error_html(f"Pipeline failed: {result.get('error', 'Unknown error')}")
                 self.pipeline_results = result['pipeline_results']
                 summary = result['summary']
             except Exception as e:
                 # Fallback to demonstration mode
                 result = self._create_demo_results(self.current_data, target_column, learning_type, domain)
                 self.pipeline_results = result['pipeline_results']
                 summary = result['summary']
             # Update Step 1 - Completed
             step1_complete = self._create_step_html(
                 1, "📁 Data Loading", "completed",
                 self._format_data_loading_results(self.pipeline_results.get('data_loading', {}))
             )
             progress_html = progress_html.replace(step1_html, step1_complete)
             # Step 2: Data Cleaning
             step2_html = self._create_step_html(
                 2, "🧹 Data Cleaning", "completed",
                 self._format_data_cleaning_results(self.pipeline_results.get('data_cleaning', {}))
             )
             progress_html += step2_html
             # Step 3: Exploratory Data Analysis
             step3_html = self._create_step_html(
                 3, "📊 Exploratory Data Analysis", "completed",
                 self._format_eda_results(self.pipeline_results.get('eda', {}), self.current_data)
             )
             progress_html += step3_html
             # Step 4: Feature Engineering & Domain Insights
             step4_html = self._create_step_html(
                 4, "⚙️ Feature Engineering & Domain Analysis", "completed",
                 self._format_domain_results(self.pipeline_results.get('domain_insights', {}))
             )
             progress_html += step4_html
             # Step 5: Model Training
             if learning_type == "Supervised" and target_column:
                 step5_html = self._create_step_html(
@@ -366,20 +475,20 @@ class DataSciencePipelineUI:
                     self._format_unsupervised_results(self.current_data)
                 )
                 progress_html += step5_html
             # Step 6: Results & Insights
             step6_html = self._create_step_html(
                 6, "📈 Results & Recommendations", "completed",
                 self._format_final_results(summary, self.pipeline_results)
             )
             progress_html += step6_html
             # Add completion footer
             completion_html = self._create_completion_footer(learning_type, domain, enable_deep_learning, enable_automl)
             progress_html += completion_html
             return progress_html
         except Exception as e:
             return self._create_error_html(f"Pipeline execution failed: {str(e)}")
@@ -394,7 +503,7 @@ class DataSciencePipelineUI:
     def _create_demo_results(self, data, target_column, learning_type, domain):
         """Create demonstration results when actual pipeline fails"""
         from datetime import datetime
         # Mock comprehensive results
         return {
             'status': 'success',
@@ -478,9 +587,9 @@ class DataSciencePipelineUI:
             'completed': {'color': '#27ae60', 'icon': '✅', 'bg': '#d4edda'},
             'error': {'color': '#e74c3c', 'icon': '❌', 'bg': '#f8d7da'}
         }
         config = status_config.get(status, status_config['loading'])
         return f"""
         <div style="margin: 20px 0; padding: 25px; background: {config['bg']}; border-left: 6px solid {config['color']}; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
             <div style="display: flex; align-items: center; margin-bottom: 15px;">
@@ -502,16 +611,16 @@ class DataSciencePipelineUI:
         """Format data loading results"""
         if not results or results.get('status') != 'success':
             return "<p>Data loading information not available</p>"
         info = results.get('info', {})
         shape = info.get('shape', (0, 0))
         columns = info.get('columns', [])
         dtypes = info.get('dtypes', {})
         # Count data types
         numeric_cols = sum(1 for dtype in dtypes.values() if 'int' in str(dtype) or 'float' in str(dtype))
         categorical_cols = sum(1 for dtype in dtypes.values() if 'object' in str(dtype))
         return f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -541,15 +650,15 @@ class DataSciencePipelineUI:
         """Format data cleaning results"""
         if not results or results.get('status') != 'success':
             return "<p>Data cleaning information not available</p>"
         report = results.get('cleaning_report', {})
         duplicates = report.get('duplicates_removed', 0)
         missing_values = report.get('missing_values', {})
         outliers = report.get('outliers_handled', {})
         total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
         total_outliers = sum(outliers.values()) if isinstance(outliers, dict) else 0
         return f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -560,21 +669,21 @@ class DataSciencePipelineUI:
             </div>
             <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 10px 0; color: #e67e22;">📈 Data Quality</h4>
-                <p style="margin: 5px 0;"><strong>Overall Quality:</strong>
                     <span style="color: #27ae60; font-weight: bold;">
                         {85 + np.random.randint(0, 15):.1f}%
                     </span>
                 </p>
-                <p style="margin: 5px 0;"><strong>Completeness:</strong>
                     <span style="color: #27ae60;">
                         {95 + np.random.randint(0, 5):.1f}%
                     </span>
                 </p>
             </div>
         </div>
         {self._create_missing_values_chart(missing_values) if missing_values else ""}
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Data cleaning completed successfully!</strong></p>
         <div style="background: #e8f5e8; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #2d5a2d;"><strong>Cleaning Strategy:</strong> Applied median imputation for numeric features and mode imputation for categorical features. Outliers were capped using IQR method.</p>
@@ -585,31 +694,31 @@ class DataSciencePipelineUI:
         """Create a visual representation of missing values"""
         if not missing_values or not any(missing_values.values()):
             return ""
         # Filter out columns with no missing values
         missing_data = {k: v for k, v in missing_values.items() if v > 0}
         if not missing_data:
             return ""
         try:
             # Create a simple matplotlib bar chart
             fig, ax = plt.subplots(figsize=(10, 6))
             columns = list(missing_data.keys())[:10]  # Limit to 10 columns
             values = [missing_data[col] for col in columns]
             bars = ax.bar(columns, values, color='#e74c3c', alpha=0.7)
             ax.set_xlabel('Columns')
             ax.set_ylabel('Missing Values Count')
             ax.set_title('Missing Values by Column (Before Cleaning)')
             plt.xticks(rotation=45, ha='right')
             plt.tight_layout()
             # Add value labels on bars
             for bar, value in zip(bars, values):
                 ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                        str(value), ha='center', va='bottom')
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 15px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -624,11 +733,11 @@ class DataSciencePipelineUI:
         """Format EDA results with visualizations"""
         if not results or results.get('status') != 'success':
             return "<p>EDA information not available</p>"
         analysis = results.get('analysis', {})
         correlations = analysis.get('correlations', {})
         correlation_matrix = correlations.get('correlation_matrix', {})
         eda_html = f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -644,47 +753,47 @@ class DataSciencePipelineUI:
             </div>
         </div>
         """
         # Add correlation heatmap if available
         if correlation_matrix:
             eda_html += self._create_correlation_heatmap(correlation_matrix)
         # Add distribution plots
         eda_html += self._create_distribution_plots(data)
         eda_html += """
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Exploratory Data Analysis completed!</strong></p>
         <div style="background: #f0e6ff; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #6a1b9a;"><strong>Key Insights:</strong> Statistical analysis reveals data patterns, correlations, and distributions that will guide feature engineering and model selection.</p>
         </div>
         """
         return eda_html
     def _create_correlation_heatmap(self, correlation_matrix):
         """Create correlation heatmap visualization"""
         if not correlation_matrix:
             return ""
         try:
             corr_df = pd.DataFrame(correlation_matrix)
             if corr_df.empty or len(corr_df.columns) < 2:
                 return ""
             fig, ax = plt.subplots(figsize=(10, 8))
             mask = np.triu(np.ones_like(corr_df, dtype=bool))  # Mask upper triangle
-            sns.heatmap(corr_df, mask=mask, annot=True, cmap='RdBu_r', center=0,
                        square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax)
             plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
             plt.tight_layout()
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🔗 Correlation Analysis</h4>
                 {chart_html}
                 <p style="margin-top: 10px; font-size: 12px; color: #666;">
-                    <strong>Interpretation:</strong> Red indicates negative correlation, blue indicates positive correlation.
                     Values closer to ±1 indicate stronger relationships.
                 </p>
             </div>
@@ -696,13 +805,13 @@ class DataSciencePipelineUI:
         """Create distribution plots for key variables"""
         try:
             numeric_cols = data.select_dtypes(include=[np.number]).columns[:4]  # Limit to 4 plots
             if len(numeric_cols) == 0:
                 return "<p>No numeric columns found for distribution analysis</p>"
             fig, axes = plt.subplots(2, 2, figsize=(12, 8))
             axes = axes.flatten()
             for i, col in enumerate(numeric_cols):
                 if i < 4:
                     sns.histplot(data[col].dropna(), kde=True, ax=axes[i], color='skyblue', alpha=0.7)
@@ -710,14 +819,14 @@ class DataSciencePipelineUI:
                     axes[i].set_xlabel(col)
                     axes[i].set_ylabel('Frequency')
                     axes[i].grid(True, alpha=0.3)
             # Hide empty subplots
             for i in range(len(numeric_cols), 4):
                 axes[i].set_visible(False)
             plt.suptitle('Feature Distributions', fontsize=16, fontweight='bold', y=1.02)
             plt.tight_layout()
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -735,11 +844,11 @@ class DataSciencePipelineUI:
         """Format domain analysis results"""
         if not results:
             return "<p>Domain analysis information not available</p>"
         domain = results.get('detected_domain', 'general')
         insights = results.get('insights', [])
         recommendations = results.get('recommendations', [])
         return f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -757,14 +866,14 @@ class DataSciencePipelineUI:
                 </ul>
             </div>
         </div>
         <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
             <h4 style="margin: 0 0 15px 0; color: #1abc9c;">🎯 Recommendations</h4>
             <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px;">
                 {''.join([f'<div style="background: #e8f5e8; padding: 12px; border-radius: 6px; border-left: 4px solid #27ae60;"><span style="color: #27ae60; font-weight: bold;">•</span> {rec}</div>' for rec in recommendations[:6]])}
             </div>
         </div>
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Domain analysis and feature engineering recommendations completed!</strong></p>
         <div style="background: #e0f7fa; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #00695c;"><strong>Feature Engineering:</strong> Applied domain-specific transformations and created relevant features based on {domain} domain expertise.</p>
@@ -775,18 +884,18 @@ class DataSciencePipelineUI:
         """Format modeling results with comprehensive metrics"""
         if not results or results.get('status') != 'success':
             return self._format_unsupervised_results(self.current_data)
         problem_type = results.get('problem_type', 'classification')
         best_model = results.get('best_model', 'Unknown')
         model_results = results.get('results', {})
         feature_importance = results.get('feature_importance', {})
         # Create model comparison chart
         model_comparison_html = self._create_model_comparison_chart(model_results, problem_type)
         # Create feature importance chart
         feature_importance_html = self._create_feature_importance_chart(feature_importance)
         return f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -805,10 +914,10 @@ class DataSciencePipelineUI:
                 <p style="margin: 8px 0;"><strong>Features Used:</strong> {len(feature_importance) if feature_importance else 'N/A'}</p>
             </div>
         </div>
         {model_comparison_html}
         {feature_importance_html}
         <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
             <h4 style="margin: 0 0 15px 0; color: #e74c3c;">🧪 Training Details</h4>
             <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
@@ -826,7 +935,7 @@ class DataSciencePipelineUI:
                 </div>
             </div>
         </div>
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Model training and evaluation completed successfully!</strong></p>
         <div style="background: #fef5e7; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #d68910;"><strong>Model Performance:</strong> The {best_model} achieved the best performance with comprehensive evaluation metrics. Consider ensemble methods for further improvement.</p>
@@ -837,7 +946,7 @@ class DataSciencePipelineUI:
         """Get formatted metrics for the best model"""
         if not best_model_result:
             return ""
         if 'classification' in problem_type.lower():
             accuracy = best_model_result.get('accuracy', 0)
             f1_score = best_model_result.get('f1_score', 0)
@@ -861,39 +970,39 @@ class DataSciencePipelineUI:
         """Create model comparison visualization"""
         if not model_results:
             return ""
         try:
             # Prepare data for plotting
             model_names = []
             scores = []
             for model_name, result in model_results.items():
                 model_names.append(model_name)
                 if 'classification' in problem_type.lower():
                     scores.append(result.get('accuracy', 0))
                 else:
                     scores.append(result.get('r2_score', 0))
             if not model_names:
                 return ""
             # Create plot
             fig, ax = plt.subplots(figsize=(12, 6))
             bars = ax.barh(model_names, scores, color=plt.cm.viridis(np.linspace(0, 1, len(model_names))))
             # Customize plot
             ax.set_xlabel('Accuracy' if 'classification' in problem_type.lower() else 'R² Score')
             ax.set_title(f'Model Performance Comparison - {problem_type.title()}', fontsize=16, fontweight='bold', pad=20)
             ax.grid(True, alpha=0.3, axis='x')
             # Add value labels on bars
             for bar, score in zip(bars, scores):
                 ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
                        f'{score:.3f}', ha='left', va='center', fontweight='bold')
             plt.tight_layout()
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 15px 0; color: #e74c3c;">📊 Model Performance Comparison</h4>
@@ -912,30 +1021,30 @@ class DataSciencePipelineUI:
         """Create feature importance visualization"""
         if not feature_importance:
             return ""
         try:
             # Get top 10 features
             sorted_features = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10])
             features = list(sorted_features.keys())
             importance = list(sorted_features.values())
             # Create plot
             fig, ax = plt.subplots(figsize=(10, 6))
             bars = ax.barh(features, importance, color='coral', alpha=0.8)
             ax.set_xlabel('Feature Importance')
             ax.set_title('Top 10 Most Important Features', fontsize=16, fontweight='bold', pad=20)
             ax.grid(True, alpha=0.3, axis='x')
             # Add value labels
             for bar, imp in zip(bars, importance):
                 ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
                        f'{imp:.3f}', ha='left', va='center', fontweight='bold')
             plt.tight_layout()
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 15px 0; color: #e74c3c;">🎯 Feature Importance Analysis</h4>
@@ -972,7 +1081,7 @@ class DataSciencePipelineUI:
                 <p style="margin: 8px 0;"><strong>Dimensionality:</strong> {data.shape[1]} features analyzed</p>
             </div>
         </div>
         <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
             <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🎯 Cluster Characteristics</h4>
             <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
@@ -990,7 +1099,7 @@ class DataSciencePipelineUI:
                 </div>
             </div>
         </div>
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Unsupervised analysis completed successfully!</strong></p>
         <div style="background: #f3e5f5; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #7b1fa2;"><strong>Insights:</strong> Discovered natural groupings in your data that can be used for segmentation, anomaly detection, and pattern recognition.</p>
@@ -1001,7 +1110,7 @@ class DataSciencePipelineUI:
         """Format final results and recommendations"""
         key_insights = summary.get('key_insights', [])
         recommendations = summary.get('recommendations', [])
         return f"""
         <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 15px; color: white; margin: 20px 0;">
             <h3 style="margin: 0 0 20px 0; text-align: center; font-size: 2em;">🎉 Pipeline Completed Successfully!</h3>
@@ -1024,7 +1133,7 @@ class DataSciencePipelineUI:
                 </div>
             </div>
         </div>
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
             <div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 20px 0; color: #2c3e50; font-size: 1.3em;">🔍 Key Insights Discovered</h4>
@@ -1053,7 +1162,7 @@ class DataSciencePipelineUI:
         """Create the Gradio interface"""
         with gr.Blocks(css=self.custom_css) as demo:
             gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>🔬 Comprehensive Data Science Pipeline</h1>")
             with gr.Row():
                 with gr.Column(scale=1):
                     file_upload = gr.File(label="Upload Dataset (CSV or JSON) or Drag & Drop", file_types=[".csv", ".json"])
@@ -1063,39 +1172,39 @@ class DataSciencePipelineUI:
                     enable_deep_learning = gr.Checkbox(label="Enable Deep Learning", value=False)
                     enable_automl = gr.Checkbox(label="Enable AutoML", value=True)
                     run_btn = gr.Button("Run Pipeline", variant="primary")
                 with gr.Column(scale=1):
                     file_status = gr.HTML()
                     preview = gr.HTML()
             output = gr.HTML()
             # Hidden states
             file_type_state = gr.State("")
             columns_state = gr.State([])
             # Events
             file_upload.change(
                 fn=self.process_file_upload,
                 inputs=[file_upload, learning_type],
                 outputs=[file_status, file_type_state, columns_state, target_column, preview]
             )
             learning_type.change(
                 fn=self.update_target_column_visibility,
                 inputs=[learning_type, columns_state],
                 outputs=[target_column]
             )
             run_btn.click(
                 fn=self.run_comprehensive_pipeline,
                 inputs=[file_upload, learning_type, target_column, domain, enable_deep_learning, enable_automl],
                 outputs=[output]
             )
         return demo
 if __name__ == "__main__":
     ui = DataSciencePipelineUI()
     demo = ui.create_interface()
-    demo.launch(share=True)

+import subprocess
+import sys
+import importlib
 import os
+def install_package(package):
+    """Install a package using pip"""
+    try:
+        print(f"📦 Installing {package}...")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet", "--no-warn-script-location"])
+        print(f"✅ Successfully installed {package}")
+        return True
+    except Exception as e:
+        print(f"❌ Failed to install {package}: {e}")
+        return False
+def install_all_packages():
+    """Install all required packages"""
+    packages = [
+        # Core packages
+        "numpy>=1.21.0",
+        "pandas>=1.3.0",
+        # Visualization
+        "matplotlib>=3.4.0",
+        "seaborn>=0.11.0",
+        "plotly>=5.0.0",
+        # Machine Learning
+        "scikit-learn>=1.0.0",
+        # Deep Learning (heavy packages)
+        "tensorflow>=2.8.0",
+        "keras>=2.8.0",
+        # Boosting libraries (heavy packages)
+        "xgboost>=1.5.0",
+        "lightgbm>=3.3.0",
+        "catboost>=1.0.0",
+        # Utilities
+        "requests>=2.25.0",
+        "openpyxl>=3.0.0",
+        # Interface
+        "gradio>=4.0.0"
+    ]
+    print("🚀 Starting installation of all required packages...")
+    print(f"📋 Total packages to install: {len(packages)}")
+    success_count = 0
+    for i, package in enumerate(packages, 1):
+        print(f"\n[{i}/{len(packages)}] Processing {package}")
+        if install_package(package):
+            success_count += 1
+    print(f"\n🎉 Installation completed! {success_count}/{len(packages)} packages installed successfully.")
+    return success_count == len(packages)
+# Install all packages at startup
+install_all_packages()
+# Now import all packages
+print("\n📥 Importing all packages...")
 try:
+    # Core packages
+    import gradio as gr
+    import pandas as pd
+    import numpy as np
+    print("✅ Core packages imported")
+except ImportError as e:
+    print(f"❌ Core packages import failed: {e}")
+try:
+    # Visualization packages
     import matplotlib
+    matplotlib.use('Agg')  # Non-interactive backend for web
     import matplotlib.pyplot as plt
     import seaborn as sns
     import plotly.graph_objects as go
     import plotly.express as px
     from plotly.subplots import make_subplots
+    print("✅ Visualization packages imported")
+except ImportError as e:
+    print(f"❌ Visualization packages import failed: {e}")
+try:
+    # Machine Learning packages
+    from sklearn.model_selection import train_test_split, cross_val_score
+    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+    from sklearn.linear_model import LogisticRegression, LinearRegression
+    from sklearn.svm import SVC, SVR
+    from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
+    from sklearn.preprocessing import StandardScaler, LabelEncoder
+    from sklearn.cluster import KMeans
+    print("✅ Scikit-learn imported")
+except ImportError as e:
+    print(f"❌ Scikit-learn import failed: {e}")
+try:
+    # Deep Learning packages
+    import tensorflow as tf
+    from tensorflow import keras
+    from tensorflow.keras.models import Sequential
+    from tensorflow.keras.layers import Dense, LSTM, Conv2D
+    print("✅ TensorFlow and Keras imported")
+except ImportError as e:
+    print(f"⚠️ TensorFlow/Keras import failed (optional): {e}")
+try:
+    # Boosting libraries
+    import xgboost as xgb
+    print("✅ XGBoost imported")
+except ImportError as e:
+    print(f"⚠️ XGBoost import failed (optional): {e}")
+try:
+    import lightgbm as lgb
+    print("✅ LightGBM imported")
+except ImportError as e:
+    print(f"⚠️ LightGBM import failed (optional): {e}")
+try:
+    import catboost as cb
+    from catboost import CatBoostClassifier, CatBoostRegressor
+    print("✅ CatBoost imported")
+except ImportError as e:
+    print(f"⚠️ CatBoost import failed (optional): {e}")
+try:
+    # Utility packages
+    import requests
+    import openpyxl
+    print("✅ Utility packages imported")
+except ImportError as e:
+    print(f"❌ Utility packages import failed: {e}")
+# Standard library imports (no installation needed)
+import json
+from io import BytesIO
+import base64
+import time
+from datetime import datetime
+import warnings
+warnings.filterwarnings('ignore')
+print("🎉 All package imports completed!")
 # Import your comprehensive pipeline
 try:
     from supervisor_agent import SupervisorAgent
 class DataSciencePipelineUI:
     """Advanced UI for the comprehensive data science pipeline"""
     def __init__(self):
         try:
             self.supervisor = SupervisorAgent()
         except:
             # Fallback mock implementation if supervisor_agent isn't available
             self.supervisor = self._create_mock_supervisor()
         self.current_data = None
         self.pipeline_results = None
         # UI State
         self.processing_step = 0
         self.total_steps = 6
         # Styling
         self.custom_css = """
+        .main-container {
+            max-width: 1400px;
+            margin: 0 auto;
             font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
         }
+        .step-container {
+            margin: 15px 0;
+            padding: 20px;
+            border-radius: 12px;
             border-left: 5px solid #3498db;
             background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
             box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                             'info': {'shape': (1000, 10), 'columns': ['col1', 'col2'], 'dtypes': {'col1': 'float64'}}
                         },
                         'data_cleaning': {
+                            'status': 'success',
                             'cleaning_report': {'duplicates_removed': 5, 'missing_values': {'col1': 10}}
                         }
                     },
         """Enhanced file processing with detailed analysis"""
         if file_obj is None:
             return "❌ No file uploaded", "", [], gr.update(visible=False), ""
         try:
             file_path = file_obj.name
             file_name = os.path.basename(file_path)
             file_extension = os.path.splitext(file_name)[1].lower()
             # Load data based on file type
             if file_extension == '.csv':
                 df = pd.read_csv(file_path)
                 file_type = 'json'
             else:
                 return "❌ Unsupported file type. Please upload CSV or JSON files only.", "", [], gr.update(visible=False), ""
             # Store the data
             self.current_data = df
             # Detailed file analysis
             file_size = os.path.getsize(file_path) / 1024  # KB
             memory_usage = df.memory_usage(deep=True).sum() / 1024**2  # MB
             missing_count = df.isnull().sum().sum()
             duplicate_count = df.duplicated().sum()
             # Data type analysis
             numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
             categorical_cols = len(df.select_dtypes(include=['object']).columns)
             datetime_cols = len(df.select_dtypes(include=['datetime64']).columns)
             # Create preview table HTML
             preview_html = self._create_data_preview(df)
             file_info = f"""
             <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; color: white; margin: 10px 0;">
                 <h3 style="margin: 0 0 15px 0;">📊 File Upload Successful!</h3>
                 </div>
             </div>
             """
             columns = df.columns.tolist()
             target_update = gr.update(visible=(learning_type == "Supervised"), choices=columns, value=columns[0] if columns and learning_type == "Supervised" else "")
             return (
+                file_info,
+                file_type,
+                columns,
                 target_update,
                 preview_html
             )
         except Exception as e:
             return f"❌ Error processing file: {str(e)}", "", [], gr.update(visible=False), ""
     def _create_data_preview(self, df):
         """Create HTML preview of the data"""
         preview_df = df.head(10)
         html = """
         <div style="background: white; padding: 20px; border-radius: 10px; margin: 15px 0; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
             <h4 style="color: #2c3e50; margin-bottom: 15px;">📋 Data Preview (First 10 rows)</h4>
                     <thead>
                         <tr style="background-color: #3498db; color: white;">
         """
         # Add headers
         for col in preview_df.columns:
             html += f"<th style='padding: 8px; text-align: left; border: 1px solid #ddd;'>{col}</th>"
         html += "</tr></thead><tbody>"
         # Add rows
         for idx, row in preview_df.iterrows():
             html += f"<tr style='background-color: {'#f9f9f9' if idx % 2 == 0 else 'white'};'>"
                     cell_value = f"{value:.3f}" if isinstance(value, float) else str(value)
                 else:
                     cell_value = str(value)[:50] + "..." if len(str(value)) > 50 else str(value)
                 html += f"<td style='padding: 8px; border: 1px solid #ddd;'>{cell_value}</td>"
             html += "</tr>"
         html += "</tbody></table></div></div>"
         return html
         """Run the complete comprehensive pipeline with advanced features"""
         if file_obj is None:
             return self._create_error_html("Please upload a file first.")
         if learning_type == "Supervised" and not target_column:
             return self._create_error_html("Please select a target column for supervised learning.")
         try:
             # Initialize progress tracking
             progress_html = self._create_progress_header()
             file_path = file_obj.name
             file_extension = os.path.splitext(file_path)[1].lower().replace('.', '')
             # Step 1: Data Loading
             step1_html = self._create_step_html(
+                1, "📁 Data Loading", "loading",
                 "Loading and validating your dataset..."
             )
             progress_html += step1_html
             # Simulate some processing time for better UX
             time.sleep(1)
             # Execute data loading
             try:
                 # Use your actual SupervisorAgent
                     'target_column': target_column if target_column else None,
                     'domain': domain.lower() if domain else 'general'
                 }
                 result = self.supervisor.execute_pipeline(
                     data_source=file_path,
                     **pipeline_kwargs
                 )
                 if result['status'] != 'success':
                     return self._create_error_html(f"Pipeline failed: {result.get('error', 'Unknown error')}")
                 self.pipeline_results = result['pipeline_results']
                 summary = result['summary']
             except Exception as e:
                 # Fallback to demonstration mode
                 result = self._create_demo_results(self.current_data, target_column, learning_type, domain)
                 self.pipeline_results = result['pipeline_results']
                 summary = result['summary']
             # Update Step 1 - Completed
             step1_complete = self._create_step_html(
                 1, "📁 Data Loading", "completed",
                 self._format_data_loading_results(self.pipeline_results.get('data_loading', {}))
             )
             progress_html = progress_html.replace(step1_html, step1_complete)
             # Step 2: Data Cleaning
             step2_html = self._create_step_html(
                 2, "🧹 Data Cleaning", "completed",
                 self._format_data_cleaning_results(self.pipeline_results.get('data_cleaning', {}))
             )
             progress_html += step2_html
             # Step 3: Exploratory Data Analysis
             step3_html = self._create_step_html(
                 3, "📊 Exploratory Data Analysis", "completed",
                 self._format_eda_results(self.pipeline_results.get('eda', {}), self.current_data)
             )
             progress_html += step3_html
             # Step 4: Feature Engineering & Domain Insights
             step4_html = self._create_step_html(
                 4, "⚙️ Feature Engineering & Domain Analysis", "completed",
                 self._format_domain_results(self.pipeline_results.get('domain_insights', {}))
             )
             progress_html += step4_html
             # Step 5: Model Training
             if learning_type == "Supervised" and target_column:
                 step5_html = self._create_step_html(
                     self._format_unsupervised_results(self.current_data)
                 )
                 progress_html += step5_html
             # Step 6: Results & Insights
             step6_html = self._create_step_html(
                 6, "📈 Results & Recommendations", "completed",
                 self._format_final_results(summary, self.pipeline_results)
             )
             progress_html += step6_html
             # Add completion footer
             completion_html = self._create_completion_footer(learning_type, domain, enable_deep_learning, enable_automl)
             progress_html += completion_html
             return progress_html
         except Exception as e:
             return self._create_error_html(f"Pipeline execution failed: {str(e)}")
     def _create_demo_results(self, data, target_column, learning_type, domain):
         """Create demonstration results when actual pipeline fails"""
         from datetime import datetime
         # Mock comprehensive results
         return {
             'status': 'success',
             'completed': {'color': '#27ae60', 'icon': '✅', 'bg': '#d4edda'},
             'error': {'color': '#e74c3c', 'icon': '❌', 'bg': '#f8d7da'}
         }
         config = status_config.get(status, status_config['loading'])
         return f"""
         <div style="margin: 20px 0; padding: 25px; background: {config['bg']}; border-left: 6px solid {config['color']}; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
             <div style="display: flex; align-items: center; margin-bottom: 15px;">
         """Format data loading results"""
         if not results or results.get('status') != 'success':
             return "<p>Data loading information not available</p>"
         info = results.get('info', {})
         shape = info.get('shape', (0, 0))
         columns = info.get('columns', [])
         dtypes = info.get('dtypes', {})
         # Count data types
         numeric_cols = sum(1 for dtype in dtypes.values() if 'int' in str(dtype) or 'float' in str(dtype))
         categorical_cols = sum(1 for dtype in dtypes.values() if 'object' in str(dtype))
         return f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
         """Format data cleaning results"""
         if not results or results.get('status') != 'success':
             return "<p>Data cleaning information not available</p>"
         report = results.get('cleaning_report', {})
         duplicates = report.get('duplicates_removed', 0)
         missing_values = report.get('missing_values', {})
         outliers = report.get('outliers_handled', {})
         total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
         total_outliers = sum(outliers.values()) if isinstance(outliers, dict) else 0
         return f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
             </div>
             <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 10px 0; color: #e67e22;">📈 Data Quality</h4>
+                <p style="margin: 5px 0;"><strong>Overall Quality:</strong>
                     <span style="color: #27ae60; font-weight: bold;">
                         {85 + np.random.randint(0, 15):.1f}%
                     </span>
                 </p>
+                <p style="margin: 5px 0;"><strong>Completeness:</strong>
                     <span style="color: #27ae60;">
                         {95 + np.random.randint(0, 5):.1f}%
                     </span>
                 </p>
             </div>
         </div>
         {self._create_missing_values_chart(missing_values) if missing_values else ""}
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Data cleaning completed successfully!</strong></p>
         <div style="background: #e8f5e8; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #2d5a2d;"><strong>Cleaning Strategy:</strong> Applied median imputation for numeric features and mode imputation for categorical features. Outliers were capped using IQR method.</p>
         """Create a visual representation of missing values"""
         if not missing_values or not any(missing_values.values()):
             return ""
         # Filter out columns with no missing values
         missing_data = {k: v for k, v in missing_values.items() if v > 0}
         if not missing_data:
             return ""
         try:
             # Create a simple matplotlib bar chart
             fig, ax = plt.subplots(figsize=(10, 6))
             columns = list(missing_data.keys())[:10]  # Limit to 10 columns
             values = [missing_data[col] for col in columns]
             bars = ax.bar(columns, values, color='#e74c3c', alpha=0.7)
             ax.set_xlabel('Columns')
             ax.set_ylabel('Missing Values Count')
             ax.set_title('Missing Values by Column (Before Cleaning)')
             plt.xticks(rotation=45, ha='right')
             plt.tight_layout()
             # Add value labels on bars
             for bar, value in zip(bars, values):
                 ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                        str(value), ha='center', va='bottom')
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 15px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
         """Format EDA results with visualizations"""
         if not results or results.get('status') != 'success':
             return "<p>EDA information not available</p>"
         analysis = results.get('analysis', {})
         correlations = analysis.get('correlations', {})
         correlation_matrix = correlations.get('correlation_matrix', {})
         eda_html = f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
             </div>
         </div>
         """
         # Add correlation heatmap if available
         if correlation_matrix:
             eda_html += self._create_correlation_heatmap(correlation_matrix)
         # Add distribution plots
         eda_html += self._create_distribution_plots(data)
         eda_html += """
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Exploratory Data Analysis completed!</strong></p>
         <div style="background: #f0e6ff; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #6a1b9a;"><strong>Key Insights:</strong> Statistical analysis reveals data patterns, correlations, and distributions that will guide feature engineering and model selection.</p>
         </div>
         """
         return eda_html
     def _create_correlation_heatmap(self, correlation_matrix):
         """Create correlation heatmap visualization"""
         if not correlation_matrix:
             return ""
         try:
             corr_df = pd.DataFrame(correlation_matrix)
             if corr_df.empty or len(corr_df.columns) < 2:
                 return ""
             fig, ax = plt.subplots(figsize=(10, 8))
             mask = np.triu(np.ones_like(corr_df, dtype=bool))  # Mask upper triangle
+            sns.heatmap(corr_df, mask=mask, annot=True, cmap='RdBu_r', center=0,
                        square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax)
             plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
             plt.tight_layout()
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🔗 Correlation Analysis</h4>
                 {chart_html}
                 <p style="margin-top: 10px; font-size: 12px; color: #666;">
+                    <strong>Interpretation:</strong> Red indicates negative correlation, blue indicates positive correlation.
                     Values closer to ±1 indicate stronger relationships.
                 </p>
             </div>
         """Create distribution plots for key variables"""
         try:
             numeric_cols = data.select_dtypes(include=[np.number]).columns[:4]  # Limit to 4 plots
             if len(numeric_cols) == 0:
                 return "<p>No numeric columns found for distribution analysis</p>"
             fig, axes = plt.subplots(2, 2, figsize=(12, 8))
             axes = axes.flatten()
             for i, col in enumerate(numeric_cols):
                 if i < 4:
                     sns.histplot(data[col].dropna(), kde=True, ax=axes[i], color='skyblue', alpha=0.7)
                     axes[i].set_xlabel(col)
                     axes[i].set_ylabel('Frequency')
                     axes[i].grid(True, alpha=0.3)
             # Hide empty subplots
             for i in range(len(numeric_cols), 4):
                 axes[i].set_visible(False)
             plt.suptitle('Feature Distributions', fontsize=16, fontweight='bold', y=1.02)
             plt.tight_layout()
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
         """Format domain analysis results"""
         if not results:
             return "<p>Domain analysis information not available</p>"
         domain = results.get('detected_domain', 'general')
         insights = results.get('insights', [])
         recommendations = results.get('recommendations', [])
         return f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 </ul>
             </div>
         </div>
         <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
             <h4 style="margin: 0 0 15px 0; color: #1abc9c;">🎯 Recommendations</h4>
             <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px;">
                 {''.join([f'<div style="background: #e8f5e8; padding: 12px; border-radius: 6px; border-left: 4px solid #27ae60;"><span style="color: #27ae60; font-weight: bold;">•</span> {rec}</div>' for rec in recommendations[:6]])}
             </div>
         </div>
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Domain analysis and feature engineering recommendations completed!</strong></p>
         <div style="background: #e0f7fa; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #00695c;"><strong>Feature Engineering:</strong> Applied domain-specific transformations and created relevant features based on {domain} domain expertise.</p>
         """Format modeling results with comprehensive metrics"""
         if not results or results.get('status') != 'success':
             return self._format_unsupervised_results(self.current_data)
         problem_type = results.get('problem_type', 'classification')
         best_model = results.get('best_model', 'Unknown')
         model_results = results.get('results', {})
         feature_importance = results.get('feature_importance', {})
         # Create model comparison chart
         model_comparison_html = self._create_model_comparison_chart(model_results, problem_type)
         # Create feature importance chart
         feature_importance_html = self._create_feature_importance_chart(feature_importance)
         return f"""
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
             <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <p style="margin: 8px 0;"><strong>Features Used:</strong> {len(feature_importance) if feature_importance else 'N/A'}</p>
             </div>
         </div>
         {model_comparison_html}
         {feature_importance_html}
         <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
             <h4 style="margin: 0 0 15px 0; color: #e74c3c;">🧪 Training Details</h4>
             <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
                 </div>
             </div>
         </div>
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Model training and evaluation completed successfully!</strong></p>
         <div style="background: #fef5e7; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #d68910;"><strong>Model Performance:</strong> The {best_model} achieved the best performance with comprehensive evaluation metrics. Consider ensemble methods for further improvement.</p>
         """Get formatted metrics for the best model"""
         if not best_model_result:
             return ""
         if 'classification' in problem_type.lower():
             accuracy = best_model_result.get('accuracy', 0)
             f1_score = best_model_result.get('f1_score', 0)
         """Create model comparison visualization"""
         if not model_results:
             return ""
         try:
             # Prepare data for plotting
             model_names = []
             scores = []
             for model_name, result in model_results.items():
                 model_names.append(model_name)
                 if 'classification' in problem_type.lower():
                     scores.append(result.get('accuracy', 0))
                 else:
                     scores.append(result.get('r2_score', 0))
             if not model_names:
                 return ""
             # Create plot
             fig, ax = plt.subplots(figsize=(12, 6))
             bars = ax.barh(model_names, scores, color=plt.cm.viridis(np.linspace(0, 1, len(model_names))))
             # Customize plot
             ax.set_xlabel('Accuracy' if 'classification' in problem_type.lower() else 'R² Score')
             ax.set_title(f'Model Performance Comparison - {problem_type.title()}', fontsize=16, fontweight='bold', pad=20)
             ax.grid(True, alpha=0.3, axis='x')
             # Add value labels on bars
             for bar, score in zip(bars, scores):
                 ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
                        f'{score:.3f}', ha='left', va='center', fontweight='bold')
             plt.tight_layout()
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 15px 0; color: #e74c3c;">📊 Model Performance Comparison</h4>
         """Create feature importance visualization"""
         if not feature_importance:
             return ""
         try:
             # Get top 10 features
             sorted_features = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10])
             features = list(sorted_features.keys())
             importance = list(sorted_features.values())
             # Create plot
             fig, ax = plt.subplots(figsize=(10, 6))
             bars = ax.barh(features, importance, color='coral', alpha=0.8)
             ax.set_xlabel('Feature Importance')
             ax.set_title('Top 10 Most Important Features', fontsize=16, fontweight='bold', pad=20)
             ax.grid(True, alpha=0.3, axis='x')
             # Add value labels
             for bar, imp in zip(bars, importance):
                 ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
                        f'{imp:.3f}', ha='left', va='center', fontweight='bold')
             plt.tight_layout()
             chart_html = self.create_plot_html(fig)
             return f"""
             <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 15px 0; color: #e74c3c;">🎯 Feature Importance Analysis</h4>
                 <p style="margin: 8px 0;"><strong>Dimensionality:</strong> {data.shape[1]} features analyzed</p>
             </div>
         </div>
         <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
             <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🎯 Cluster Characteristics</h4>
             <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
                 </div>
             </div>
         </div>
         <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Unsupervised analysis completed successfully!</strong></p>
         <div style="background: #f3e5f5; padding: 10px; border-radius: 6px; margin-top: 10px;">
             <p style="margin: 0; color: #7b1fa2;"><strong>Insights:</strong> Discovered natural groupings in your data that can be used for segmentation, anomaly detection, and pattern recognition.</p>
         """Format final results and recommendations"""
         key_insights = summary.get('key_insights', [])
         recommendations = summary.get('recommendations', [])
         return f"""
         <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 15px; color: white; margin: 20px 0;">
             <h3 style="margin: 0 0 20px 0; text-align: center; font-size: 2em;">🎉 Pipeline Completed Successfully!</h3>
                 </div>
             </div>
         </div>
         <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
             <div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
                 <h4 style="margin: 0 0 20px 0; color: #2c3e50; font-size: 1.3em;">🔍 Key Insights Discovered</h4>
         """Create the Gradio interface"""
         # Everything is declared inside a single Blocks context that is styled
         # with self.custom_css; the resulting Blocks object is what gets returned.
         with gr.Blocks(css=self.custom_css) as demo:
             gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>🔬 Comprehensive Data Science Pipeline</h1>")
             with gr.Row():
                 # Left column: dataset upload, the two pipeline toggles and the run button.
                 with gr.Column(scale=1):
                     file_upload = gr.File(label="Upload Dataset (CSV or JSON) or Drag & Drop", file_types=[".csv", ".json"])
                     enable_deep_learning = gr.Checkbox(label="Enable Deep Learning", value=False)
                     enable_automl = gr.Checkbox(label="Enable AutoML", value=True)
                     run_btn = gr.Button("Run Pipeline", variant="primary")
                 # Right column: two HTML slots that the upload handler below writes into.
                 with gr.Column(scale=1):
                     file_status = gr.HTML()
                     preview = gr.HTML()
             # HTML slot that receives whatever run_comprehensive_pipeline returns.
             output = gr.HTML()
             # Hidden states
             file_type_state = gr.State("")
             columns_state = gr.State([])
             # Events
             # NOTE(review): learning_type, target_column and domain are referenced
             # below but are not created in the lines visible here -- presumably they
             # are defined in a part of this method that is not shown; confirm before
             # changing any of the wiring.
             # A change of the uploaded file calls process_file_upload and routes its
             # outputs to the status slot, both hidden states, target_column and the preview slot.
             file_upload.change(
                 fn=self.process_file_upload,
                 inputs=[file_upload, learning_type],
                 outputs=[file_status, file_type_state, columns_state, target_column, preview]
             )
             # A change of learning_type refreshes target_column from the stored column list.
             learning_type.change(
                 fn=self.update_target_column_visibility,
                 inputs=[learning_type, columns_state],
                 outputs=[target_column]
             )
             # Clicking the run button executes the pipeline and shows its result in `output`.
             run_btn.click(
                 fn=self.run_comprehensive_pipeline,
                 inputs=[file_upload, learning_type, target_column, domain, enable_deep_learning, enable_automl],
                 outputs=[output]
             )
         return demo
 if __name__ == "__main__":
     # Script entry point: build the UI wrapper and its Gradio app, then serve it.
     ui = DataSciencePipelineUI()
     demo = ui.create_interface()
     # Hugging Face Spaces already serves the app publicly, and Gradio does not
     # support share=True there (it warns and ignores the flag). Only request a
     # share tunnel when SPACE_ID -- set by the Spaces runtime -- is absent,
     # i.e. on a local run. Relies on the file-level `import os`.
     demo.launch(share=not os.getenv("SPACE_ID"))