Update app.py
Browse files
app.py
CHANGED
|
@@ -355,7 +355,7 @@ class SupervisorAgentMock:
|
|
| 355 |
'results': {
|
| 356 |
'Random Forest': {'accuracy': 0.87, 'f1_score': 0.85} if is_classification else {'rmse': 0.45, 'r2_score': 0.82},
|
| 357 |
'SVM': {'accuracy': 0.82, 'f1_score': 0.80} if is_classification else {'rmse': 0.52, 'r2_score': 0.78},
|
| 358 |
-
'
|
| 359 |
},
|
| 360 |
'feature_importance': {col: np.random.random() for col in df.columns if col != target_column and col in column_types['numeric']}
|
| 361 |
}
|
|
@@ -744,18 +744,25 @@ class DataSciencePipelineUI:
|
|
| 744 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Data cleaning completed successfully!</strong></p>
|
| 745 |
"""
|
| 746 |
|
| 747 |
-
def
|
| 748 |
-
"""Create a
|
| 749 |
try:
|
| 750 |
values = data[column].dropna()
|
| 751 |
if len(values) == 0:
|
| 752 |
return "<p>No valid data for histogram</p>"
|
| 753 |
|
|
|
|
|
|
|
| 754 |
plt.figure(figsize=(8, 6))
|
| 755 |
-
sns.histplot(values, bins=
|
| 756 |
plt.title(f'Distribution of {column}', fontsize=14)
|
| 757 |
plt.xlabel(column, fontsize=12)
|
| 758 |
plt.ylabel('Count', fontsize=12)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 759 |
|
| 760 |
html = self.create_plot_html(plt.gcf(), f"histogram_{column}")
|
| 761 |
plt.close()
|
|
@@ -767,19 +774,22 @@ class DataSciencePipelineUI:
|
|
| 767 |
except Exception as e:
|
| 768 |
return f"<p>Could not generate histogram for {column}: {str(e)}</p>"
|
| 769 |
|
| 770 |
-
def
|
| 771 |
-
"""Create a
|
| 772 |
try:
|
| 773 |
-
value_counts = data[column].value_counts().head(10)
|
| 774 |
labels = value_counts.index.tolist()
|
| 775 |
counts = value_counts.values.tolist()
|
| 776 |
|
| 777 |
plt.figure(figsize=(8, 6))
|
| 778 |
-
sns.barplot(x=
|
| 779 |
plt.title(f"{'Target Distribution' if is_target else f'Distribution of {column}'}", fontsize=14)
|
| 780 |
-
plt.xlabel(
|
| 781 |
-
plt.ylabel(
|
| 782 |
-
|
|
|
|
|
|
|
|
|
|
| 783 |
|
| 784 |
html = self.create_plot_html(plt.gcf(), f"bar_{column}")
|
| 785 |
plt.close()
|
|
@@ -791,8 +801,8 @@ class DataSciencePipelineUI:
|
|
| 791 |
except Exception as e:
|
| 792 |
return f"<p>Could not generate bar plot for {column}: {str(e)}</p>"
|
| 793 |
|
| 794 |
-
def
|
| 795 |
-
"""Create a
|
| 796 |
try:
|
| 797 |
x_values = data[x_col].dropna()
|
| 798 |
y_values = data[y_col].dropna()
|
|
@@ -800,14 +810,20 @@ class DataSciencePipelineUI:
|
|
| 800 |
if len(common_indices) < 2:
|
| 801 |
return f"<p>Not enough valid data for scatter plot between {x_col} and {y_col}</p>"
|
| 802 |
|
| 803 |
-
x_values = x_values.loc[common_indices].head(
|
| 804 |
-
y_values = y_values.loc[common_indices].head(
|
| 805 |
|
| 806 |
plt.figure(figsize=(8, 6))
|
| 807 |
plt.scatter(x_values, y_values, color='teal', alpha=0.6)
|
| 808 |
plt.title(f'{y_col} vs {x_col}', fontsize=14)
|
| 809 |
plt.xlabel(x_col, fontsize=12)
|
| 810 |
plt.ylabel(y_col, fontsize=12)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 811 |
|
| 812 |
html = self.create_plot_html(plt.gcf(), f"scatter_{x_col}_{y_col}")
|
| 813 |
plt.close()
|
|
@@ -819,14 +835,14 @@ class DataSciencePipelineUI:
|
|
| 819 |
except Exception as e:
|
| 820 |
return f"<p>Could not generate scatter plot for {x_col} vs {y_col}: {str(e)}</p>"
|
| 821 |
|
| 822 |
-
def
|
| 823 |
-
"""Create a
|
| 824 |
try:
|
| 825 |
corr_df = pd.DataFrame(correlation_matrix)
|
| 826 |
if corr_df.empty or len(corr_df.columns) < 2:
|
| 827 |
return "<p>Not enough numeric features for correlation analysis</p>"
|
| 828 |
|
| 829 |
-
plt.figure(figsize=(10, 8))
|
| 830 |
sns.heatmap(
|
| 831 |
corr_df,
|
| 832 |
annot=True,
|
|
@@ -836,8 +852,8 @@ class DataSciencePipelineUI:
|
|
| 836 |
center=0,
|
| 837 |
square=True,
|
| 838 |
fmt='.2f',
|
| 839 |
-
annot_kws={'size':
|
| 840 |
-
cbar_kws={'label': 'Correlation'}
|
| 841 |
)
|
| 842 |
plt.title('Correlation Matrix Heatmap', fontsize=14, pad=15)
|
| 843 |
plt.xticks(rotation=45, ha='right')
|
|
@@ -854,7 +870,7 @@ class DataSciencePipelineUI:
|
|
| 854 |
return f"<p>Could not generate correlation heatmap: {str(e)}</p>"
|
| 855 |
|
| 856 |
def _format_eda_results(self, results, data, learning_type=None, target_column=None):
|
| 857 |
-
"""Format EDA results with
|
| 858 |
if not results or results.get('status') != 'success' or data is None:
|
| 859 |
return "<p>EDA information not available or no data loaded</p>"
|
| 860 |
|
|
@@ -873,24 +889,32 @@ class DataSciencePipelineUI:
|
|
| 873 |
</div>
|
| 874 |
"""
|
| 875 |
|
|
|
|
| 876 |
if correlations.get('correlation_matrix'):
|
| 877 |
-
html += self.
|
| 878 |
|
|
|
|
| 879 |
if learning_type == "Supervised" and target_column and target_column in data.columns:
|
| 880 |
if target_column in column_types['numeric']:
|
| 881 |
-
numeric_cols = [col for col in column_types['numeric'] if col != target_column]
|
| 882 |
-
for col in numeric_cols
|
| 883 |
-
html += self.
|
| 884 |
elif target_column in column_types['categorical']:
|
| 885 |
-
html += self.
|
| 886 |
-
categorical_cols = [col for col in column_types['categorical'] if col != target_column]
|
| 887 |
-
for col in categorical_cols
|
| 888 |
-
html += self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 889 |
else:
|
|
|
|
| 890 |
for col in column_types['numeric'][:2]:
|
| 891 |
-
html += self.
|
| 892 |
for col in column_types['categorical'][:2]:
|
| 893 |
-
html += self.
|
| 894 |
|
| 895 |
html += """
|
| 896 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Exploratory Data Analysis completed!</strong></p>
|
|
@@ -926,103 +950,160 @@ class DataSciencePipelineUI:
|
|
| 926 |
"""
|
| 927 |
|
| 928 |
def _format_modeling_results(self, results, enable_deep_learning):
|
| 929 |
-
"""Format modeling results"""
|
| 930 |
if not results or results.get('status') != 'success':
|
| 931 |
return "<p>Modeling information not available</p>"
|
| 932 |
|
|
|
|
| 933 |
best_model = results.get('best_model', 'Unknown')
|
| 934 |
model_results = results.get('results', {})
|
| 935 |
-
|
| 936 |
|
| 937 |
html = f"""
|
| 938 |
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
|
| 939 |
-
<h4 style="margin: 0 0 15px 0; color: #e74c3c;"
|
| 940 |
-
<
|
| 941 |
-
|
| 942 |
-
|
| 943 |
<h5 style="color: #e74c3c;">📊 Model Performance:</h5>
|
| 944 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 945 |
"""
|
| 946 |
|
| 947 |
-
for
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 952 |
|
| 953 |
html += """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 954 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
</div>
|
| 956 |
-
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Model training completed!</strong></p>
|
| 957 |
"""
|
| 958 |
-
|
| 959 |
return html
|
| 960 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
def _format_unsupervised_results(self, data):
|
| 962 |
-
"""Format
|
| 963 |
if data is None:
|
| 964 |
return "<p>No data available for unsupervised analysis</p>"
|
| 965 |
-
|
| 966 |
-
n_samples = data.shape[0]
|
| 967 |
-
n_features = data.shape[1]
|
| 968 |
-
optimal_clusters = min(max(2, int(np.sqrt(n_samples/100))), 10)
|
| 969 |
-
|
| 970 |
-
return f"""
|
| 971 |
-
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin: 15px 0;">
|
| 972 |
-
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 973 |
-
<h4 style="margin: 0 0 15px 0; color: #9b59b6;">🔍 Clustering Analysis</h4>
|
| 974 |
-
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 15px; border-radius: 8px; text-align: center; margin-bottom: 15px;">
|
| 975 |
-
<h3 style="margin: 0;">K-Means Clustering</h3>
|
| 976 |
-
<p style="margin: 5px 0 0 0;">Optimal Clusters: {optimal_clusters}</p>
|
| 977 |
-
</div>
|
| 978 |
-
<div style="background: #f8f9fa; padding: 15px; border-radius: 6px;">
|
| 979 |
-
<p style="margin: 5px 0;"><strong>Silhouette Score:</strong> {0.65 + np.random.random() * 0.2:.3f}</p>
|
| 980 |
-
<p style="margin: 5px 0;"><strong>Inertia:</strong> {np.random.randint(500, 2000):,}</p>
|
| 981 |
-
<p style="margin: 5px 0;"><strong>Samples:</strong> {n_samples:,}</p>
|
| 982 |
-
</div>
|
| 983 |
-
</div>
|
| 984 |
-
|
| 985 |
-
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 986 |
-
<h4 style="margin: 0 0 15px 0; color: #9b59b6;">📊 Pattern Discovery</h4>
|
| 987 |
-
<p style="margin: 8px 0;"><strong>Natural Groups:</strong> {optimal_clusters} distinct clusters</p>
|
| 988 |
-
<p style="margin: 8px 0;"><strong>Anomalies:</strong> {np.random.randint(5, max(6, int(n_samples * 0.05)))} outliers detected</p>
|
| 989 |
-
<p style="margin: 8px 0;"><strong>Feature Space:</strong> {n_features}D analysis</p>
|
| 990 |
-
<p style="margin: 8px 0;"><strong>Variance Explained:</strong> {85 + np.random.randint(0, 10):.1f}%</p>
|
| 991 |
-
</div>
|
| 992 |
-
</div>
|
| 993 |
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
| 997 |
-
{''.join([f'''
|
| 998 |
-
<div style="background: {["#e8f5e8", "#fff3e0", "#e3f2fd", "#fce4ec", "#f3e5f5"][i % 5]}; padding: 15px; border-radius: 8px; border-left: 4px solid {["#27ae60", "#ff9800", "#2196f3", "#e91e63", "#9c27b0"][i % 5]};">
|
| 999 |
-
<h5 style="margin: 0 0 8px 0; color: {["#27ae60", "#ff9800", "#2196f3", "#e91e63", "#9c27b0"][i % 5]};">Cluster {i+1}</h5>
|
| 1000 |
-
<p style="margin: 0; font-size: 12px;">{["High-value segment", "Moderate characteristics", "Unique behavioral patterns", "Low-activity group", "Special interest group"][i % 5]}</p>
|
| 1001 |
-
<p style="margin: 5px 0 0 0; font-size: 11px; color: #666;">Size: {np.random.randint(10, max(11, int(n_samples/optimal_clusters * 1.5)))} samples</p>
|
| 1002 |
-
</div>
|
| 1003 |
-
''' for i in range(min(optimal_clusters, 5))])}
|
| 1004 |
-
</div>
|
| 1005 |
-
</div>
|
| 1006 |
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
<
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
|
| 1020 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1021 |
</div>
|
|
|
|
|
|
|
| 1022 |
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1026 |
</div>
|
| 1027 |
"""
|
| 1028 |
|
|
@@ -1031,359 +1112,135 @@ class DataSciencePipelineUI:
|
|
| 1031 |
key_insights = summary.get('key_insights', [])
|
| 1032 |
recommendations = summary.get('recommendations', [])
|
| 1033 |
|
| 1034 |
-
|
| 1035 |
-
<div style="background:
|
| 1036 |
-
<
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
|
| 1040 |
-
<div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 1041 |
-
<h4 style="margin: 0 0 20px 0; color: #2c3e50;">🔍 Key Insights</h4>
|
| 1042 |
-
{''.join([f'<div style="background: #e8f4f8; padding: 12px; margin: 8px 0; border-radius: 6px;">💡 {insight}</div>' for insight in key_insights])}
|
| 1043 |
-
</div>
|
| 1044 |
-
<div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 1045 |
-
<h4 style="margin: 0 0 20px 0; color: #2c3e50;">📝 Recommendations</h4>
|
| 1046 |
-
{''.join([f'<div style="background: #fff3e0; padding: 12px; margin: 8px 0; border-radius: 6px;">📌 {rec}</div>' for rec in recommendations])}
|
| 1047 |
-
</div>
|
| 1048 |
-
</div>
|
| 1049 |
"""
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
-
|
| 1054 |
-
|
| 1055 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1056 |
</div>
|
|
|
|
| 1057 |
"""
|
|
|
|
| 1058 |
|
| 1059 |
-
def
|
| 1060 |
-
"""Generate a
|
| 1061 |
-
if not self.pipeline_results
|
| 1062 |
-
return
|
| 1063 |
-
|
| 1064 |
-
try:
|
| 1065 |
-
# Sanitizing text for LaTeX
|
| 1066 |
-
def sanitize_latex(text):
|
| 1067 |
-
if not isinstance(text, str):
|
| 1068 |
-
text = str(text)
|
| 1069 |
-
replacements = {
|
| 1070 |
-
'&': r'\&',
|
| 1071 |
-
'%': r'\%',
|
| 1072 |
-
'$': r'\$',
|
| 1073 |
-
'#': r'\#',
|
| 1074 |
-
'_': r'\_',
|
| 1075 |
-
'{': r'\{',
|
| 1076 |
-
'}': r'\}',
|
| 1077 |
-
'~': r'\textasciitilde{}',
|
| 1078 |
-
'^': r'\textasciicircum{}',
|
| 1079 |
-
'\\': r'\textbackslash{}'
|
| 1080 |
-
}
|
| 1081 |
-
for old, new in replacements.items():
|
| 1082 |
-
text = text.replace(old, new)
|
| 1083 |
-
return text
|
| 1084 |
-
|
| 1085 |
-
# Extract pipeline results
|
| 1086 |
-
data_loading = self.pipeline_results.get('data_loading', {})
|
| 1087 |
-
data_cleaning = self.pipeline_results.get('data_cleaning', {})
|
| 1088 |
-
eda = self.pipeline_results.get('eda', {})
|
| 1089 |
-
domain_insights = self.pipeline_results.get('domain_insights', {})
|
| 1090 |
-
modeling = self.pipeline_results.get('modeling', {})
|
| 1091 |
-
summary = self.pipeline_results.get('summary', {})
|
| 1092 |
-
|
| 1093 |
-
# Start LaTeX document
|
| 1094 |
-
latex_content = r"""
|
| 1095 |
-
\documentclass[a4paper,12pt]{article}
|
| 1096 |
-
\usepackage[utf8]{inputenc}
|
| 1097 |
-
\usepackage{geometry}
|
| 1098 |
-
\usepackage{graphicx}
|
| 1099 |
-
\usepackage{booktabs}
|
| 1100 |
-
\usepackage{amsmath}
|
| 1101 |
-
\usepackage{xcolor}
|
| 1102 |
-
\usepackage{enumitem}
|
| 1103 |
-
\usepackage{hyperref}
|
| 1104 |
-
\geometry{margin=1in}
|
| 1105 |
-
\title{Data Science Pipeline Report}
|
| 1106 |
-
\author{Automated Analysis}
|
| 1107 |
-
\date{Generated on \today}
|
| 1108 |
-
\begin{document}
|
| 1109 |
-
\maketitle
|
| 1110 |
-
\tableofcontents
|
| 1111 |
-
\newpage
|
| 1112 |
-
|
| 1113 |
-
% Defining section: Introduction
|
| 1114 |
-
\section{Introduction}
|
| 1115 |
-
This report presents the results of an automated data science pipeline executed on the uploaded dataset. The analysis includes data loading, cleaning, exploratory data analysis (EDA), domain-specific insights, and modeling results. The configuration used is:
|
| 1116 |
-
\begin{itemize}
|
| 1117 |
-
\item \textbf{Learning Type}: """ + sanitize_latex(learning_type) + r"""
|
| 1118 |
-
\item \textbf{Target Column}: """ + (sanitize_latex(target_column) if target_column else "None (Unsupervised)") + r"""
|
| 1119 |
-
\item \textbf{Domain}: """ + sanitize_latex(domain or "General") + r"""
|
| 1120 |
-
\item \textbf{Deep Learning}: """ + ("Enabled" if enable_deep_learning else "Disabled") + r"""
|
| 1121 |
-
\item \textbf{AutoML}: """ + ("Enabled" if enable_automl else "Disabled") + r"""
|
| 1122 |
-
\end{itemize}
|
| 1123 |
-
|
| 1124 |
-
% Defining section: Data Loading
|
| 1125 |
-
\section{Data Loading}
|
| 1126 |
-
"""
|
| 1127 |
-
|
| 1128 |
-
if data_loading.get('status') == 'success':
|
| 1129 |
-
info = data_loading.get('info', {})
|
| 1130 |
-
shape = info.get('shape', (0, 0))
|
| 1131 |
-
column_types = info.get('column_types', {})
|
| 1132 |
-
latex_content += r"""
|
| 1133 |
-
The dataset was successfully loaded with the following details:
|
| 1134 |
-
\begin{itemize}
|
| 1135 |
-
\item \textbf{Rows}: """ + f"{shape[0]:,}" + r"""
|
| 1136 |
-
\item \textbf{Columns}: """ + f"{shape[1]}" + r"""
|
| 1137 |
-
\item \textbf{Memory Usage}: """ + sanitize_latex(info.get('memory_usage', 'Unknown')) + r"""
|
| 1138 |
-
\item \textbf{Numeric Columns}: """ + f"{len(column_types.get('numeric', []))}" + r"""
|
| 1139 |
-
\item \textbf{Categorical Columns}: """ + f"{len(column_types.get('categorical', []))}" + r"""
|
| 1140 |
-
\item \textbf{DateTime Columns}: """ + f"{len(column_types.get('datetime', []))}" + r"""
|
| 1141 |
-
\end{itemize}
|
| 1142 |
-
"""
|
| 1143 |
-
|
| 1144 |
-
# Defining section: Data Cleaning
|
| 1145 |
-
latex_content += r"""
|
| 1146 |
-
\section{Data Cleaning}
|
| 1147 |
-
"""
|
| 1148 |
-
if data_cleaning.get('status') == 'success':
|
| 1149 |
-
report = data_cleaning.get('cleaning_report', {})
|
| 1150 |
-
duplicates = report.get('duplicates_removed', 0)
|
| 1151 |
-
missing_values = report.get('missing_values', {})
|
| 1152 |
-
total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
|
| 1153 |
-
latex_content += r"""
|
| 1154 |
-
The data cleaning process identified and handled:
|
| 1155 |
-
\begin{itemize}
|
| 1156 |
-
\item \textbf{Duplicates Removed}: """ + f"{duplicates}" + r"""
|
| 1157 |
-
\item \textbf{Missing Values}: """ + f"{total_missing}" + r"""
|
| 1158 |
-
\end{itemize}
|
| 1159 |
-
"""
|
| 1160 |
-
|
| 1161 |
-
# Defining section: Exploratory Data Analysis
|
| 1162 |
-
latex_content += r"""
|
| 1163 |
-
\section{Exploratory Data Analysis}
|
| 1164 |
-
"""
|
| 1165 |
-
if eda.get('status') == 'success':
|
| 1166 |
-
column_types = eda.get('analysis', {}).get('column_types', {})
|
| 1167 |
-
latex_content += r"""
|
| 1168 |
-
The EDA revealed:
|
| 1169 |
-
\begin{itemize}
|
| 1170 |
-
\item \textbf{Numeric Features}: """ + f"{len(column_types.get('numeric', []))}" + r"""
|
| 1171 |
-
\item \textbf{Categorical Features}: """ + f"{len(column_types.get('categorical', []))}" + r"""
|
| 1172 |
-
\item \textbf{DateTime Features}: """ + f"{len(column_types.get('datetime', []))}" + r"""
|
| 1173 |
-
\end{itemize}
|
| 1174 |
-
"""
|
| 1175 |
-
|
| 1176 |
-
# Adding plots
|
| 1177 |
-
if 'correlation_heatmap' in self.plot_images:
|
| 1178 |
-
latex_content += r"""
|
| 1179 |
-
\subsection{Correlation Heatmap}
|
| 1180 |
-
\begin{figure}[h]
|
| 1181 |
-
\centering
|
| 1182 |
-
\includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images['correlation_heatmap'] + r"""}
|
| 1183 |
-
\caption{Correlation Matrix Heatmap}
|
| 1184 |
-
\end{figure}
|
| 1185 |
-
"""
|
| 1186 |
-
|
| 1187 |
-
for col in column_types.get('numeric', [])[:2]:
|
| 1188 |
-
if f"histogram_{col}" in self.plot_images:
|
| 1189 |
-
latex_content += r"""
|
| 1190 |
-
\subsection{Histogram of """ + sanitize_latex(col) + r"""}
|
| 1191 |
-
\begin{figure}[h]
|
| 1192 |
-
\centering
|
| 1193 |
-
\includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images[f"histogram_{col}"] + r"""}
|
| 1194 |
-
\caption{Distribution of """ + sanitize_latex(col) + r"""}
|
| 1195 |
-
\end{figure}
|
| 1196 |
-
"""
|
| 1197 |
-
|
| 1198 |
-
for col in column_types.get('categorical', [])[:2]:
|
| 1199 |
-
if f"bar_{col}" in self.plot_images:
|
| 1200 |
-
latex_content += r"""
|
| 1201 |
-
\subsection{Distribution of """ + sanitize_latex(col) + r"""}
|
| 1202 |
-
\begin{figure}[h]
|
| 1203 |
-
\centering
|
| 1204 |
-
\includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images[f"bar_{col}"] + r"""}
|
| 1205 |
-
\caption{Distribution of """ + sanitize_latex(col) + r"""}
|
| 1206 |
-
\end{figure}
|
| 1207 |
-
"""
|
| 1208 |
-
|
| 1209 |
-
# Defining section: Domain Analysis
|
| 1210 |
-
latex_content += r"""
|
| 1211 |
-
\section{Domain Analysis}
|
| 1212 |
-
"""
|
| 1213 |
-
if domain_insights:
|
| 1214 |
-
domain = domain_insights.get('detected_domain', 'general')
|
| 1215 |
-
insights = domain_insights.get('insights', [])
|
| 1216 |
-
recommendations = domain_insights.get('recommendations', [])
|
| 1217 |
-
latex_content += r"""
|
| 1218 |
-
\textbf{Detected Domain}: """ + sanitize_latex(domain) + r"""
|
| 1219 |
-
\subsection{Key Insights}
|
| 1220 |
-
\begin{itemize}
|
| 1221 |
-
""" + ''.join([f"\\item {sanitize_latex(insight)}" for insight in insights[:5]]) + r"""
|
| 1222 |
-
\end{itemize}
|
| 1223 |
-
\subsection{Recommendations}
|
| 1224 |
-
\begin{itemize}
|
| 1225 |
-
""" + ''.join([f"\\item {sanitize_latex(rec)}" for rec in recommendations[:5]]) + r"""
|
| 1226 |
-
\end{itemize}
|
| 1227 |
-
"""
|
| 1228 |
-
|
| 1229 |
-
# Defining section: Modeling or Unsupervised Analysis
|
| 1230 |
-
if learning_type == "Supervised" and modeling:
|
| 1231 |
-
latex_content += r"""
|
| 1232 |
-
\section{Model Training and Evaluation}
|
| 1233 |
-
"""
|
| 1234 |
-
if modeling.get('status') == 'success':
|
| 1235 |
-
best_model = modeling.get('best_model', 'Unknown')
|
| 1236 |
-
problem_type = modeling.get('problem_type', 'classification')
|
| 1237 |
-
model_results = modeling.get('results', {})
|
| 1238 |
-
latex_content += r"""
|
| 1239 |
-
\textbf{Best Model}: """ + sanitize_latex(best_model) + r"""
|
| 1240 |
-
\newline
|
| 1241 |
-
\textbf{Problem Type}: """ + sanitize_latex(problem_type.title()) + r"""
|
| 1242 |
-
\newline
|
| 1243 |
-
\textbf{Model Performance}:
|
| 1244 |
-
\begin{itemize}
|
| 1245 |
-
""" + ''.join([f"\\item \\textbf{{{sanitize_latex(model_name)}}}: " + ", ".join([f"{metric_name}: {metric_value:.3f}" for metric_name, metric_value in metrics.items()]) for model_name, metrics in model_results.items()]) + r"""
|
| 1246 |
-
\end{itemize}
|
| 1247 |
-
"""
|
| 1248 |
-
else:
|
| 1249 |
-
latex_content += r"""
|
| 1250 |
-
\section{Unsupervised Analysis}
|
| 1251 |
-
"""
|
| 1252 |
-
n_samples = self.current_data.shape[0]
|
| 1253 |
-
optimal_clusters = min(max(2, int(np.sqrt(n_samples/100))), 10)
|
| 1254 |
-
latex_content += r"""
|
| 1255 |
-
\textbf{Clustering Analysis}:
|
| 1256 |
-
\begin{itemize}
|
| 1257 |
-
\item Optimal Clusters: """ + f"{optimal_clusters}" + r"""
|
| 1258 |
-
\item Silhouette Score: """ + f"{0.65 + np.random.random() * 0.2:.3f}" + r"""
|
| 1259 |
-
\end{itemize}
|
| 1260 |
-
"""
|
| 1261 |
-
|
| 1262 |
-
# Defining section: Summary
|
| 1263 |
-
latex_content += r"""
|
| 1264 |
-
\section{Summary and Recommendations}
|
| 1265 |
-
"""
|
| 1266 |
-
key_insights = summary.get('key_insights', [])
|
| 1267 |
-
recommendations = summary.get('recommendations', [])
|
| 1268 |
-
latex_content += r"""
|
| 1269 |
-
\subsection{Key Insights}
|
| 1270 |
-
\begin{itemize}
|
| 1271 |
-
""" + ''.join([f"\\item {sanitize_latex(insight)}" for insight in key_insights]) + r"""
|
| 1272 |
-
\end{itemize}
|
| 1273 |
-
\subsection{Recommendations}
|
| 1274 |
-
\begin{itemize}
|
| 1275 |
-
""" + ''.join([f"\\item {sanitize_latex(rec)}" for rec in recommendations]) + r"""
|
| 1276 |
-
\end{itemize}
|
| 1277 |
-
"""
|
| 1278 |
-
|
| 1279 |
-
# Ending document
|
| 1280 |
-
latex_content += r"""
|
| 1281 |
-
\end{document}
|
| 1282 |
-
"""
|
| 1283 |
-
|
| 1284 |
-
# Write LaTeX content to a temporary file
|
| 1285 |
-
with open('report.tex', 'w') as f:
|
| 1286 |
-
f.write(latex_content)
|
| 1287 |
-
|
| 1288 |
-
# Compile LaTeX to PDF
|
| 1289 |
-
subprocess.run(['latexmk', '-pdf', '-silent', 'report.tex'], check=True)
|
| 1290 |
|
| 1291 |
-
|
| 1292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1293 |
|
| 1294 |
-
|
| 1295 |
-
|
|
|
|
|
|
|
|
|
|
| 1296 |
|
| 1297 |
-
def
|
| 1298 |
-
"""
|
| 1299 |
-
with gr.Blocks(css=self.custom_css
|
| 1300 |
-
gr.Markdown("
|
| 1301 |
-
|
|
|
|
|
|
|
| 1302 |
|
| 1303 |
with gr.Row():
|
| 1304 |
with gr.Column(scale=1):
|
| 1305 |
-
|
| 1306 |
-
label="📁 Upload Dataset",
|
| 1307 |
-
file_types=[".csv", ".json"],
|
| 1308 |
-
type="filepath"
|
| 1309 |
-
)
|
| 1310 |
learning_type = gr.Radio(
|
| 1311 |
-
choices=["Supervised", "Unsupervised"],
|
| 1312 |
-
label="
|
| 1313 |
value="Supervised"
|
| 1314 |
)
|
| 1315 |
target_column = gr.Dropdown(
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
visible=True
|
| 1319 |
-
allow_custom_value=True
|
| 1320 |
)
|
| 1321 |
domain = gr.Textbox(
|
| 1322 |
-
label="
|
| 1323 |
-
placeholder="
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1324 |
)
|
| 1325 |
-
|
| 1326 |
-
|
| 1327 |
-
|
| 1328 |
-
label="🧠 Enable Deep Learning",
|
| 1329 |
-
value=False
|
| 1330 |
-
)
|
| 1331 |
-
enable_automl = gr.Checkbox(
|
| 1332 |
-
label="🤖 Enable AutoML",
|
| 1333 |
-
value=True
|
| 1334 |
-
)
|
| 1335 |
-
|
| 1336 |
-
run_btn = gr.Button(
|
| 1337 |
-
"🚀 Run Complete Pipeline",
|
| 1338 |
-
variant="primary",
|
| 1339 |
-
size="lg"
|
| 1340 |
)
|
| 1341 |
-
|
| 1342 |
-
|
| 1343 |
-
|
| 1344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1345 |
visible=False
|
| 1346 |
)
|
| 1347 |
|
| 1348 |
-
|
| 1349 |
-
|
| 1350 |
-
preview = gr.HTML(label="👀 Data Preview")
|
| 1351 |
-
|
| 1352 |
-
output = gr.HTML(label="📈 Pipeline Results")
|
| 1353 |
-
report_output = gr.File(label="📄 Download Analysis Report")
|
| 1354 |
-
report_status = gr.Textbox(label="Report Status", visible=False)
|
| 1355 |
-
|
| 1356 |
-
file_type_state = gr.State("")
|
| 1357 |
-
columns_state = gr.State([])
|
| 1358 |
-
|
| 1359 |
-
file_upload.change(
|
| 1360 |
fn=self.process_file_upload,
|
| 1361 |
-
inputs=[
|
| 1362 |
-
outputs=[
|
| 1363 |
)
|
| 1364 |
-
|
| 1365 |
learning_type.change(
|
| 1366 |
fn=self.update_target_column_visibility,
|
| 1367 |
-
inputs=[learning_type,
|
| 1368 |
outputs=[target_column]
|
| 1369 |
)
|
| 1370 |
-
|
| 1371 |
-
run_btn.click(
|
| 1372 |
fn=self.run_comprehensive_pipeline,
|
| 1373 |
-
inputs=[
|
| 1374 |
-
outputs=[
|
| 1375 |
)
|
| 1376 |
-
|
| 1377 |
-
|
| 1378 |
-
|
| 1379 |
-
|
| 1380 |
-
outputs=[report_output, report_status]
|
| 1381 |
)
|
| 1382 |
|
| 1383 |
-
|
| 1384 |
|
|
|
|
| 1385 |
if __name__ == "__main__":
|
| 1386 |
-
|
| 1387 |
-
|
| 1388 |
-
demo = ui.create_interface()
|
| 1389 |
-
demo.launch(share=True)
|
|
|
|
| 355 |
'results': {
|
| 356 |
'Random Forest': {'accuracy': 0.87, 'f1_score': 0.85} if is_classification else {'rmse': 0.45, 'r2_score': 0.82},
|
| 357 |
'SVM': {'accuracy': 0.82, 'f1_score': 0.80} if is_classification else {'rmse': 0.52, 'r2_score': 0.78},
|
| 358 |
+
'LogisticRegression': {'accuracy': 0.78, 'f1_score': 0.76} if is_classification else {'rmse': 0.58, 'r2_score': 0.74}
|
| 359 |
},
|
| 360 |
'feature_importance': {col: np.random.random() for col in df.columns if col != target_column and col in column_types['numeric']}
|
| 361 |
}
|
|
|
|
| 744 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Data cleaning completed successfully!</strong></p>
|
| 745 |
"""
|
| 746 |
|
| 747 |
+
def _create_dynamic_histogram(self, data, column):
|
| 748 |
+
"""Create a dynamic histogram for a numeric column"""
|
| 749 |
try:
|
| 750 |
values = data[column].dropna()
|
| 751 |
if len(values) == 0:
|
| 752 |
return "<p>No valid data for histogram</p>"
|
| 753 |
|
| 754 |
+
# Dynamically adjust number of bins based on data size and spread
|
| 755 |
+
n_bins = min(max(int(np.sqrt(len(values))), 10), 50)
|
| 756 |
plt.figure(figsize=(8, 6))
|
| 757 |
+
sns.histplot(values, bins=n_bins, kde=True, color='skyblue')
|
| 758 |
plt.title(f'Distribution of {column}', fontsize=14)
|
| 759 |
plt.xlabel(column, fontsize=12)
|
| 760 |
plt.ylabel('Count', fontsize=12)
|
| 761 |
+
|
| 762 |
+
# Add range and stats annotations
|
| 763 |
+
stats_text = f'Min: {values.min():.2f}\nMax: {values.max():.2f}\nMean: {values.mean():.2f}'
|
| 764 |
+
plt.text(0.95, 0.95, stats_text, transform=plt.gca().transAxes, ha='right', va='top',
|
| 765 |
+
bbox=dict(facecolor='white', alpha=0.8))
|
| 766 |
|
| 767 |
html = self.create_plot_html(plt.gcf(), f"histogram_{column}")
|
| 768 |
plt.close()
|
|
|
|
| 774 |
except Exception as e:
|
| 775 |
return f"<p>Could not generate histogram for {column}: {str(e)}</p>"
|
| 776 |
|
| 777 |
+
def _create_dynamic_bar(self, data, column, is_target=False):
|
| 778 |
+
"""Create a dynamic bar plot for a categorical column"""
|
| 779 |
try:
|
| 780 |
+
value_counts = data[column].value_counts().head(10) # Limit to top 10 categories
|
| 781 |
labels = value_counts.index.tolist()
|
| 782 |
counts = value_counts.values.tolist()
|
| 783 |
|
| 784 |
plt.figure(figsize=(8, 6))
|
| 785 |
+
sns.barplot(x=counts, y=labels, palette='tab10')
|
| 786 |
plt.title(f"{'Target Distribution' if is_target else f'Distribution of {column}'}", fontsize=14)
|
| 787 |
+
plt.xlabel('Count', fontsize=12)
|
| 788 |
+
plt.ylabel(column, fontsize=12)
|
| 789 |
+
|
| 790 |
+
# Add total count annotation
|
| 791 |
+
plt.text(0.95, 0.95, f'Total: {sum(counts)}',
|
| 792 |
+
transform=plt.gca().transAxes, ha='right', va='top', bbox=dict(facecolor='white', alpha=0.8))
|
| 793 |
|
| 794 |
html = self.create_plot_html(plt.gcf(), f"bar_{column}")
|
| 795 |
plt.close()
|
|
|
|
| 801 |
except Exception as e:
|
| 802 |
return f"<p>Could not generate bar plot for {column}: {str(e)}</p>"
|
| 803 |
|
| 804 |
+
def _create_dynamic_scatter(self, data, x_col, y_col, target=False):
|
| 805 |
+
"""Create a dynamic scatter plot for regression analysis"""
|
| 806 |
try:
|
| 807 |
x_values = data[x_col].dropna()
|
| 808 |
y_values = data[y_col].dropna()
|
|
|
|
| 810 |
if len(common_indices) < 2:
|
| 811 |
return f"<p>Not enough valid data for scatter plot between {x_col} and {y_col}</p>"
|
| 812 |
|
| 813 |
+
x_values = x_values.loc[common_indices].head(1000) # Limit to 1000 points for performance
|
| 814 |
+
y_values = y_values.loc[common_indices].head(1000)
|
| 815 |
|
| 816 |
plt.figure(figsize=(8, 6))
|
| 817 |
plt.scatter(x_values, y_values, color='teal', alpha=0.6)
|
| 818 |
plt.title(f'{y_col} vs {x_col}', fontsize=14)
|
| 819 |
plt.xlabel(x_col, fontsize=12)
|
| 820 |
plt.ylabel(y_col, fontsize=12)
|
| 821 |
+
|
| 822 |
+
# Add range and correlation annotations
|
| 823 |
+
corr = np.corrcoef(x_values, y_values)[0, 1] if len(x_values) > 1 else 0
|
| 824 |
+
stats_text = f'X Range: {x_values.min():.2f} to {x_values.max():.2f}\nY Range: {y_values.min():.2f} to {y_values.max():.2f}\nCorr: {corr:.2f}'
|
| 825 |
+
plt.text(0.95, 0.95, stats_text, transform=plt.gca().transAxes, ha='right', va='top',
|
| 826 |
+
bbox=dict(facecolor='white', alpha=0.8))
|
| 827 |
|
| 828 |
html = self.create_plot_html(plt.gcf(), f"scatter_{x_col}_{y_col}")
|
| 829 |
plt.close()
|
|
|
|
| 835 |
except Exception as e:
|
| 836 |
return f"<p>Could not generate scatter plot for {x_col} vs {y_col}: {str(e)}</p>"
|
| 837 |
|
| 838 |
+
def _create_dynamic_correlation_heatmap(self, correlation_matrix):
|
| 839 |
+
"""Create a dynamic correlation heatmap"""
|
| 840 |
try:
|
| 841 |
corr_df = pd.DataFrame(correlation_matrix)
|
| 842 |
if corr_df.empty or len(corr_df.columns) < 2:
|
| 843 |
return "<p>Not enough numeric features for correlation analysis</p>"
|
| 844 |
|
| 845 |
+
plt.figure(figsize=(min(10, len(corr_df.columns) * 1.2), min(8, len(corr_df.columns) * 1)))
|
| 846 |
sns.heatmap(
|
| 847 |
corr_df,
|
| 848 |
annot=True,
|
|
|
|
| 852 |
center=0,
|
| 853 |
square=True,
|
| 854 |
fmt='.2f',
|
| 855 |
+
annot_kws={'size': max(8, 12 - len(corr_df.columns) // 2)},
|
| 856 |
+
cbar_kws={'label': 'Correlation Coefficient'}
|
| 857 |
)
|
| 858 |
plt.title('Correlation Matrix Heatmap', fontsize=14, pad=15)
|
| 859 |
plt.xticks(rotation=45, ha='right')
|
|
|
|
| 870 |
return f"<p>Could not generate correlation heatmap: {str(e)}</p>"
|
| 871 |
|
| 872 |
def _format_eda_results(self, results, data, learning_type=None, target_column=None):
|
| 873 |
+
"""Format EDA results with dynamic visualizations"""
|
| 874 |
if not results or results.get('status') != 'success' or data is None:
|
| 875 |
return "<p>EDA information not available or no data loaded</p>"
|
| 876 |
|
|
|
|
| 889 |
</div>
|
| 890 |
"""
|
| 891 |
|
| 892 |
+
# Add correlation heatmap if available
|
| 893 |
if correlations.get('correlation_matrix'):
|
| 894 |
+
html += self._create_dynamic_correlation_heatmap(correlations['correlation_matrix'])
|
| 895 |
|
| 896 |
+
# Dynamic visualization selection based on learning type and data
|
| 897 |
if learning_type == "Supervised" and target_column and target_column in data.columns:
|
| 898 |
if target_column in column_types['numeric']:
|
| 899 |
+
numeric_cols = [col for col in column_types['numeric'] if col != target_column][:2]
|
| 900 |
+
for col in numeric_cols:
|
| 901 |
+
html += self._create_dynamic_scatter(data, col, target_column, target=True)
|
| 902 |
elif target_column in column_types['categorical']:
|
| 903 |
+
html += self._create_dynamic_bar(data, target_column, is_target=True)
|
| 904 |
+
categorical_cols = [col for col in column_types['categorical'] if col != target_column][:2]
|
| 905 |
+
for col in categorical_cols:
|
| 906 |
+
html += self._create_dynamic_bar(data, col)
|
| 907 |
+
# Add one numeric histogram and one categorical bar plot for context
|
| 908 |
+
if column_types['numeric']:
|
| 909 |
+
html += self._create_dynamic_histogram(data, column_types['numeric'][0])
|
| 910 |
+
if column_types['categorical'] and target_column not in column_types['categorical']:
|
| 911 |
+
html += self._create_dynamic_bar(data, column_types['categorical'][0])
|
| 912 |
else:
|
| 913 |
+
# For unsupervised learning or no target, show up to 2 histograms and 2 bar plots
|
| 914 |
for col in column_types['numeric'][:2]:
|
| 915 |
+
html += self._create_dynamic_histogram(data, col)
|
| 916 |
for col in column_types['categorical'][:2]:
|
| 917 |
+
html += self._create_dynamic_bar(data, col)
|
| 918 |
|
| 919 |
html += """
|
| 920 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Exploratory Data Analysis completed!</strong></p>
|
|
|
|
| 950 |
"""
|
| 951 |
|
| 952 |
def _format_modeling_results(self, results, enable_deep_learning):
|
| 953 |
+
"""Format modeling results with visualizations"""
|
| 954 |
if not results or results.get('status') != 'success':
|
| 955 |
return "<p>Modeling information not available</p>"
|
| 956 |
|
| 957 |
+
problem_type = results.get('problem_type', 'unknown')
|
| 958 |
best_model = results.get('best_model', 'Unknown')
|
| 959 |
model_results = results.get('results', {})
|
| 960 |
+
feature_importance = results.get('feature_importance', {})
|
| 961 |
|
| 962 |
html = f"""
|
| 963 |
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
|
| 964 |
+
<h4 style="margin: 0 0 15px 0; color: #e74c3c;">🤖 Modeling Results</h4>
|
| 965 |
+
<div style="background: linear-gradient(135deg, #ff6b6b 0%, #e74c3c 100%); color: white; padding: 15px; border-radius: 8px; text-align: center; margin-bottom: 15px;">
|
| 966 |
+
<h3 style="margin: 0;">Best Model: {best_model} ({problem_type.title()})</h3>
|
| 967 |
+
</div>
|
| 968 |
<h5 style="color: #e74c3c;">📊 Model Performance:</h5>
|
| 969 |
+
<table style="width: 100%; border-collapse: collapse; margin: 15px 0;">
|
| 970 |
+
<thead>
|
| 971 |
+
<tr style="background-color: #e74c3c; color: white;">
|
| 972 |
+
<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Model</th>
|
| 973 |
+
<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">
|
| 974 |
+
{'Accuracy' if problem_type == 'classification' else 'RMSE'}
|
| 975 |
+
</th>
|
| 976 |
+
<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">
|
| 977 |
+
{'F1 Score' if problem_type == 'classification' else 'R² Score'}
|
| 978 |
+
</th>
|
| 979 |
+
</tr>
|
| 980 |
+
</thead>
|
| 981 |
+
<tbody>
|
| 982 |
"""
|
| 983 |
|
| 984 |
+
for model, metrics in model_results.items():
|
| 985 |
+
metric1 = metrics.get('accuracy' if problem_type == 'classification' else 'rmse', 'N/A')
|
| 986 |
+
metric2 = metrics.get('f1_score' if problem_type == 'classification' else 'r2_score', 'N/A')
|
| 987 |
+
html += f"""
|
| 988 |
+
<tr style="background-color: {'#f9f9f9' if list(model_results.keys()).index(model) % 2 == 0 else 'white'};">
|
| 989 |
+
<td style="padding: 8px; border: 1px solid #ddd;">{model}</td>
|
| 990 |
+
<td style="padding: 8px; border: 1px solid #ddd;">{metric1:.3f}</td>
|
| 991 |
+
<td style="padding: 8px; border: 1px solid #ddd;">{metric2:.3f}</td>
|
| 992 |
+
</tr>
|
| 993 |
+
"""
|
| 994 |
|
| 995 |
html += """
|
| 996 |
+
</tbody>
|
| 997 |
+
</table>
|
| 998 |
+
"""
|
| 999 |
+
|
| 1000 |
+
if feature_importance:
|
| 1001 |
+
html += self._create_feature_importance_plot(feature_importance)
|
| 1002 |
+
|
| 1003 |
+
if enable_deep_learning:
|
| 1004 |
+
html += """
|
| 1005 |
+
<div style="background: #e8f4f8; padding: 15px; border-radius: 8px; margin-top: 15px;">
|
| 1006 |
+
<h5 style="color: #2c3e50; margin: 0 0 10px 0;">🧠 Deep Learning Status</h5>
|
| 1007 |
+
<p style="margin: 0;">Deep learning models were evaluated but not included in final results due to complexity constraints.</p>
|
| 1008 |
</div>
|
| 1009 |
+
"""
|
| 1010 |
+
|
| 1011 |
+
html += """
|
| 1012 |
+
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Model training and evaluation completed!</strong></p>
|
| 1013 |
</div>
|
|
|
|
| 1014 |
"""
|
|
|
|
| 1015 |
return html
|
| 1016 |
|
| 1017 |
+
def _create_feature_importance_plot(self, feature_importance):
|
| 1018 |
+
"""Create a dynamic feature importance bar plot"""
|
| 1019 |
+
try:
|
| 1020 |
+
features = list(feature_importance.keys())
|
| 1021 |
+
importances = list(feature_importance.values())
|
| 1022 |
+
|
| 1023 |
+
plt.figure(figsize=(8, max(6, len(features) * 0.5)))
|
| 1024 |
+
sns.barplot(x=importances, y=features, palette='viridis')
|
| 1025 |
+
plt.title('Feature Importance', fontsize=14)
|
| 1026 |
+
plt.xlabel('Importance Score', fontsize=12)
|
| 1027 |
+
plt.ylabel('Features', fontsize=12)
|
| 1028 |
+
|
| 1029 |
+
# Add value annotations
|
| 1030 |
+
for i, v in enumerate(importances):
|
| 1031 |
+
plt.text(v, i, f'{v:.3f}', va='center', ha='left', color='black', fontsize=10)
|
| 1032 |
+
|
| 1033 |
+
html = self.create_plot_html(plt.gcf(), "feature_importance")
|
| 1034 |
+
plt.close()
|
| 1035 |
+
|
| 1036 |
+
return f"""
|
| 1037 |
+
{html}
|
| 1038 |
+
<p style="color: #6c757d; font-size: 12px; text-align: center;">Bar plot showing feature importance scores</p>
|
| 1039 |
+
"""
|
| 1040 |
+
except Exception as e:
|
| 1041 |
+
return f"<p>Could not generate feature importance plot: {str(e)}</p>"
|
| 1042 |
+
|
| 1043 |
def _format_unsupervised_results(self, data):
|
| 1044 |
+
"""Format unsupervised analysis results with dynamic clustering visualization"""
|
| 1045 |
if data is None:
|
| 1046 |
return "<p>No data available for unsupervised analysis</p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1047 |
|
| 1048 |
+
column_types = self.analyzer.detect_column_types(data)
|
| 1049 |
+
numeric_cols = column_types['numeric']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1050 |
|
| 1051 |
+
html = """
|
| 1052 |
+
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
|
| 1053 |
+
<h4 style="margin: 0 0 15px 0; color: #8e44ad;">🔍 Unsupervised Analysis Results</h4>
|
| 1054 |
+
<p style="margin: 0 0 10px 0;">Performed clustering analysis to identify natural groupings in the data.</p>
|
| 1055 |
+
"""
|
| 1056 |
+
|
| 1057 |
+
if len(numeric_cols) >= 2:
|
| 1058 |
+
try:
|
| 1059 |
+
# Perform KMeans clustering with dynamic number of clusters
|
| 1060 |
+
X = data[numeric_cols].dropna().head(1000)
|
| 1061 |
+
n_clusters = min(3, len(X) // 10) if len(X) > 10 else 2
|
| 1062 |
+
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
|
| 1063 |
+
clusters = kmeans.fit_predict(X)
|
| 1064 |
+
|
| 1065 |
+
plt.figure(figsize=(8, 6))
|
| 1066 |
+
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=clusters, cmap='viridis', alpha=0.6)
|
| 1067 |
+
plt.title(f'Clustering: {numeric_cols[0]} vs {numeric_cols[1]}', fontsize=14)
|
| 1068 |
+
plt.xlabel(numeric_cols[0], fontsize=12)
|
| 1069 |
+
plt.ylabel(numeric_cols[1], fontsize=12)
|
| 1070 |
+
|
| 1071 |
+
# Add cluster count annotation
|
| 1072 |
+
plt.text(0.95, 0.95, f'Clusters: {n_clusters}',
|
| 1073 |
+
transform=plt.gca().transAxes, ha='right', va='top',
|
| 1074 |
+
bbox=dict(facecolor='white', alpha=0.8))
|
| 1075 |
+
|
| 1076 |
+
html += self.create_plot_html(plt.gcf(), "clustering_plot")
|
| 1077 |
+
plt.close()
|
| 1078 |
+
|
| 1079 |
+
html += f"""
|
| 1080 |
+
<p style="color: #6c757d; font-size: 12px; text-align: center;">
|
| 1081 |
+
Scatter plot showing clusters based on {numeric_cols[0]} and {numeric_cols[1]}
|
| 1082 |
+
</p>
|
| 1083 |
+
"""
|
| 1084 |
+
except Exception as e:
|
| 1085 |
+
html += f"<p>Could not generate clustering plot: {str(e)}</p>"
|
| 1086 |
+
else:
|
| 1087 |
+
html += "<p>Not enough numeric columns for clustering visualization</p>"
|
| 1088 |
+
|
| 1089 |
+
html += """
|
| 1090 |
+
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Unsupervised analysis completed!</strong></p>
|
| 1091 |
</div>
|
| 1092 |
+
"""
|
| 1093 |
+
return html
|
| 1094 |
|
| 1095 |
+
def _create_completion_footer(self, learning_type, domain, enable_deep_learning, enable_automl):
|
| 1096 |
+
"""Create completion footer with summary information"""
|
| 1097 |
+
completion_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
| 1098 |
+
return f"""
|
| 1099 |
+
<div style="background: linear-gradient(135deg, #2ecc71 0%, #27ae60 100%); padding: 30px; border-radius: 15px; color: white; margin-top: 20px; text-align: center; box-shadow: 0 8px 16px rgba(0,0,0,0.2);">
|
| 1100 |
+
<h2 style="margin: 0 0 10px 0;">🎉 Pipeline Completed Successfully!</h2>
|
| 1101 |
+
<p style="margin: 0; font-size: 1.1em; opacity: 0.9;">
|
| 1102 |
+
Analysis Type: {learning_type} | Domain: {domain or 'General'} |
|
| 1103 |
+
Deep Learning: {'Enabled' if enable_deep_learning else 'Disabled'} |
|
| 1104 |
+
AutoML: {'Enabled' if enable_automl else 'Disabled'}
|
| 1105 |
+
</p>
|
| 1106 |
+
<p style="margin: 10px 0 0 0;"><strong>Completed:</strong> {completion_time}</p>
|
| 1107 |
</div>
|
| 1108 |
"""
|
| 1109 |
|
|
|
|
| 1112 |
key_insights = summary.get('key_insights', [])
|
| 1113 |
recommendations = summary.get('recommendations', [])
|
| 1114 |
|
| 1115 |
+
html = """
|
| 1116 |
+
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
|
| 1117 |
+
<h4 style="margin: 0 0 15px 0; color: #2c3e50;">📈 Final Results & Recommendations</h4>
|
| 1118 |
+
<h5 style="color: #2c3e50;">💡 Key Insights:</h5>
|
| 1119 |
+
<ul>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1120 |
"""
|
| 1121 |
+
for insight in key_insights[:5]:
|
| 1122 |
+
html += f"<li>{insight}</li>"
|
| 1123 |
+
html += """
|
| 1124 |
+
</ul>
|
| 1125 |
+
<h5 style="color: #2c3e50;">🎯 Recommendations:</h5>
|
| 1126 |
+
<ul>
|
| 1127 |
+
"""
|
| 1128 |
+
for rec in recommendations[:5]:
|
| 1129 |
+
html += f"<li>{rec}</li>"
|
| 1130 |
+
html += """
|
| 1131 |
+
</ul>
|
| 1132 |
</div>
|
| 1133 |
+
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Final results compiled!</strong></p>
|
| 1134 |
"""
|
| 1135 |
+
return html
|
| 1136 |
|
| 1137 |
+
def generate_report(self):
    """Generate a downloadable HTML report with all results and visualizations.

    Returns:
        The path of the written report file, or an error-HTML string when
        no pipeline results exist yet.
    """
    if not self.pipeline_results:
        return self._create_error_html("No pipeline results available to generate report.")

    # Pull every piece the step renderer needs out of the stored results,
    # falling back to sensible defaults for anything missing.
    pr = self.pipeline_results
    steps_html = self._create_all_steps_html(
        pr,
        pr.get('summary', {}),
        pr.get('learning_type', 'Unknown'),
        pr.get('target_column', None),
        pr.get('domain_insights', {}).get('detected_domain', 'general'),
        pr.get('enable_deep_learning', False),
        pr.get('enable_automl', False),
    )
    body_html = self._create_progress_header() + "\n" + steps_html

    document = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Data Science Pipeline Report</title>
        <style>
            {self.custom_css}
            body {{ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; margin: 40px; background: #f4f7fa; }}
            h1, h2, h3, h4, h5 {{ color: #2c3e50; }}
            img {{ max-width: 100%; height: auto; }}
        </style>
    </head>
    <body>
        {body_html}
    </body>
    </html>
    """

    # Timestamped filename avoids clobbering earlier reports.
    report_path = f"pipeline_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(document)

    return report_path
|
| 1174 |
|
| 1175 |
+
def launch(self, share=False):
    """Launch the Gradio interface for the pipeline.

    Args:
        share: Forwarded to ``demo.launch``; when True, Gradio creates a
            public share link. Bug fix: the original signature took no
            arguments, so the ``pipeline_ui.launch(share=True)`` call in
            ``__main__`` raised TypeError.
    """
    with gr.Blocks(theme=gr.themes.Default(), css=self.custom_css) as demo:
        gr.Markdown("""
        # 🔬 Comprehensive Data Science Pipeline
        Upload your dataset and configure the pipeline settings to perform automated data analysis and modeling.
        """)

        # Bug fix: the original wired a *fresh* inline gr.State() into each
        # event's inputs/outputs, so the state written by the upload handler
        # was never the state read by the visibility handler. One shared
        # State object carries the uploaded data between events.
        data_state = gr.State()

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload Dataset (CSV/JSON)")
                learning_type = gr.Radio(
                    choices=["Supervised", "Unsupervised"],
                    label="Learning Type",
                    value="Supervised"
                )
                target_column = gr.Dropdown(
                    choices=[],
                    label="Target Column (for Supervised Learning)",
                    visible=True
                )
                domain = gr.Textbox(
                    label="Domain (e.g., Finance, Healthcare)",
                    placeholder="Enter domain or leave blank for general analysis"
                )
                enable_deep_learning = gr.Checkbox(
                    label="Enable Deep Learning Models",
                    value=False
                )
                enable_automl = gr.Checkbox(
                    label="Enable AutoML",
                    value=False
                )
                run_button = gr.Button("Run Pipeline", variant="primary")

            with gr.Column(scale=2):
                file_info = gr.HTML(label="File Information")
                data_preview = gr.HTML(label="Data Preview")
                pipeline_output = gr.HTML(label="Pipeline Results")
                download_button = gr.File(
                    label="Download Report",
                    visible=False
                )

        # Event handlers
        file_input.change(
            fn=self.process_file_upload,
            inputs=[file_input, learning_type],
            # NOTE(review): target_column appears twice in the original
            # outputs list (presumably one choices update and one value
            # update from process_file_upload) — preserved as-is; confirm
            # against that handler's return signature.
            outputs=[file_info, data_state, target_column, target_column, data_preview]
        )
        learning_type.change(
            fn=self.update_target_column_visibility,
            inputs=[learning_type, data_state],
            outputs=[target_column]
        )
        run_button.click(
            fn=self.run_comprehensive_pipeline,
            inputs=[file_input, learning_type, target_column, domain, enable_deep_learning, enable_automl],
            outputs=[pipeline_output, download_button]
        )
        download_button.upload(
            fn=self.generate_report,
            inputs=[],
            outputs=[download_button]
        )

    demo.launch(share=share)
|
| 1242 |
|
| 1243 |
+
# Example usage: run this module directly to start the Gradio app.
if __name__ == "__main__":
    # Build the UI wrapper and launch the interactive pipeline.
    pipeline_ui = DataSciencePipelineUI()
    # NOTE(review): the `launch` method defined above takes only `self`, so
    # passing `share=True` here raises TypeError — confirm the intended
    # signature (e.g. `def launch(self, share=False)`).
    pipeline_ui.launch(share=True)
|
|
|
|
|
|