Girinath11 committed on
Commit
3dfde02
·
verified ·
1 Parent(s): c1021eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +283 -426
app.py CHANGED
@@ -355,7 +355,7 @@ class SupervisorAgentMock:
355
  'results': {
356
  'Random Forest': {'accuracy': 0.87, 'f1_score': 0.85} if is_classification else {'rmse': 0.45, 'r2_score': 0.82},
357
  'SVM': {'accuracy': 0.82, 'f1_score': 0.80} if is_classification else {'rmse': 0.52, 'r2_score': 0.78},
358
- 'Logistic Regression': {'accuracy': 0.78, 'f1_score': 0.76} if is_classification else {'rmse': 0.58, 'r2_score': 0.74}
359
  },
360
  'feature_importance': {col: np.random.random() for col in df.columns if col != target_column and col in column_types['numeric']}
361
  }
@@ -744,18 +744,25 @@ class DataSciencePipelineUI:
744
  <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Data cleaning completed successfully!</strong></p>
745
  """
746
 
747
- def _create_chartjs_histogram(self, data, column):
748
- """Create a Matplotlib histogram as a static image for a numeric column"""
749
  try:
750
  values = data[column].dropna()
751
  if len(values) == 0:
752
  return "<p>No valid data for histogram</p>"
753
 
 
 
754
  plt.figure(figsize=(8, 6))
755
- sns.histplot(values, bins=10, kde=False, color='skyblue')
756
  plt.title(f'Distribution of {column}', fontsize=14)
757
  plt.xlabel(column, fontsize=12)
758
  plt.ylabel('Count', fontsize=12)
 
 
 
 
 
759
 
760
  html = self.create_plot_html(plt.gcf(), f"histogram_{column}")
761
  plt.close()
@@ -767,19 +774,22 @@ class DataSciencePipelineUI:
767
  except Exception as e:
768
  return f"<p>Could not generate histogram for {column}: {str(e)}</p>"
769
 
770
- def _create_chartjs_bar(self, data, column, is_target=False):
771
- """Create a Matplotlib bar plot as a static image for a categorical column"""
772
  try:
773
- value_counts = data[column].value_counts().head(10)
774
  labels = value_counts.index.tolist()
775
  counts = value_counts.values.tolist()
776
 
777
  plt.figure(figsize=(8, 6))
778
- sns.barplot(x=labels, y=counts, palette='tab10')
779
  plt.title(f"{'Target Distribution' if is_target else f'Distribution of {column}'}", fontsize=14)
780
- plt.xlabel(column, fontsize=12)
781
- plt.ylabel('Count', fontsize=12)
782
- plt.xticks(rotation=45, ha='right')
 
 
 
783
 
784
  html = self.create_plot_html(plt.gcf(), f"bar_{column}")
785
  plt.close()
@@ -791,8 +801,8 @@ class DataSciencePipelineUI:
791
  except Exception as e:
792
  return f"<p>Could not generate bar plot for {column}: {str(e)}</p>"
793
 
794
- def _create_chartjs_scatter(self, data, x_col, y_col, target=False):
795
- """Create a Matplotlib scatter plot as a static image for regression analysis"""
796
  try:
797
  x_values = data[x_col].dropna()
798
  y_values = data[y_col].dropna()
@@ -800,14 +810,20 @@ class DataSciencePipelineUI:
800
  if len(common_indices) < 2:
801
  return f"<p>Not enough valid data for scatter plot between {x_col} and {y_col}</p>"
802
 
803
- x_values = x_values.loc[common_indices].head(100)
804
- y_values = y_values.loc[common_indices].head(100)
805
 
806
  plt.figure(figsize=(8, 6))
807
  plt.scatter(x_values, y_values, color='teal', alpha=0.6)
808
  plt.title(f'{y_col} vs {x_col}', fontsize=14)
809
  plt.xlabel(x_col, fontsize=12)
810
  plt.ylabel(y_col, fontsize=12)
 
 
 
 
 
 
811
 
812
  html = self.create_plot_html(plt.gcf(), f"scatter_{x_col}_{y_col}")
813
  plt.close()
@@ -819,14 +835,14 @@ class DataSciencePipelineUI:
819
  except Exception as e:
820
  return f"<p>Could not generate scatter plot for {x_col} vs {y_col}: {str(e)}</p>"
821
 
822
- def _create_safe_correlation_heatmap(self, correlation_matrix):
823
- """Create a Matplotlib/Seaborn correlation heatmap as a static image"""
824
  try:
825
  corr_df = pd.DataFrame(correlation_matrix)
826
  if corr_df.empty or len(corr_df.columns) < 2:
827
  return "<p>Not enough numeric features for correlation analysis</p>"
828
 
829
- plt.figure(figsize=(10, 8))
830
  sns.heatmap(
831
  corr_df,
832
  annot=True,
@@ -836,8 +852,8 @@ class DataSciencePipelineUI:
836
  center=0,
837
  square=True,
838
  fmt='.2f',
839
- annot_kws={'size': 10},
840
- cbar_kws={'label': 'Correlation'}
841
  )
842
  plt.title('Correlation Matrix Heatmap', fontsize=14, pad=15)
843
  plt.xticks(rotation=45, ha='right')
@@ -854,7 +870,7 @@ class DataSciencePipelineUI:
854
  return f"<p>Could not generate correlation heatmap: {str(e)}</p>"
855
 
856
  def _format_eda_results(self, results, data, learning_type=None, target_column=None):
857
- """Format EDA results with additional visualizations"""
858
  if not results or results.get('status') != 'success' or data is None:
859
  return "<p>EDA information not available or no data loaded</p>"
860
 
@@ -873,24 +889,32 @@ class DataSciencePipelineUI:
873
  </div>
874
  """
875
 
 
876
  if correlations.get('correlation_matrix'):
877
- html += self._create_safe_correlation_heatmap(correlations['correlation_matrix'])
878
 
 
879
  if learning_type == "Supervised" and target_column and target_column in data.columns:
880
  if target_column in column_types['numeric']:
881
- numeric_cols = [col for col in column_types['numeric'] if col != target_column]
882
- for col in numeric_cols[:2]:
883
- html += self._create_chartjs_scatter(data, col, target_column, target=True)
884
  elif target_column in column_types['categorical']:
885
- html += self._create_chartjs_bar(data, target_column, is_target=True)
886
- categorical_cols = [col for col in column_types['categorical'] if col != target_column]
887
- for col in categorical_cols[:2]:
888
- html += self._create_chartjs_bar(data, col)
 
 
 
 
 
889
  else:
 
890
  for col in column_types['numeric'][:2]:
891
- html += self._create_chartjs_histogram(data, col)
892
  for col in column_types['categorical'][:2]:
893
- html += self._create_chartjs_bar(data, col)
894
 
895
  html += """
896
  <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Exploratory Data Analysis completed!</strong></p>
@@ -926,103 +950,160 @@ class DataSciencePipelineUI:
926
  """
927
 
928
  def _format_modeling_results(self, results, enable_deep_learning):
929
- """Format modeling results"""
930
  if not results or results.get('status') != 'success':
931
  return "<p>Modeling information not available</p>"
932
 
 
933
  best_model = results.get('best_model', 'Unknown')
934
  model_results = results.get('results', {})
935
- problem_type = results.get('problem_type', 'classification')
936
 
937
  html = f"""
938
  <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
939
- <h4 style="margin: 0 0 15px 0; color: #e74c3c;">🏆 Best Model: {best_model}</h4>
940
- <p><strong>Problem Type:</strong> {problem_type.title()}</p>
941
- <p><strong>Models Trained:</strong> {len(model_results)}</p>
942
-
943
  <h5 style="color: #e74c3c;">📊 Model Performance:</h5>
944
- <div style="background: #f8f9fa; padding: 15px; border-radius: 8px;">
 
 
 
 
 
 
 
 
 
 
 
 
945
  """
946
 
947
- for model_name, metrics in model_results.items():
948
- html += f"<p><strong>{model_name}:</strong> "
949
- for metric_name, metric_value in metrics.items():
950
- html += f"{metric_name}: {metric_value:.3f} | "
951
- html = html.rstrip(" | ") + "</p>"
 
 
 
 
 
952
 
953
  html += """
 
 
 
 
 
 
 
 
 
 
 
 
954
  </div>
 
 
 
 
955
  </div>
956
- <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Model training completed!</strong></p>
957
  """
958
-
959
  return html
960
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
961
  def _format_unsupervised_results(self, data):
962
- """Format comprehensive unsupervised learning results"""
963
  if data is None:
964
  return "<p>No data available for unsupervised analysis</p>"
965
-
966
- n_samples = data.shape[0]
967
- n_features = data.shape[1]
968
- optimal_clusters = min(max(2, int(np.sqrt(n_samples/100))), 10)
969
-
970
- return f"""
971
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin: 15px 0;">
972
- <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
973
- <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🔍 Clustering Analysis</h4>
974
- <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 15px; border-radius: 8px; text-align: center; margin-bottom: 15px;">
975
- <h3 style="margin: 0;">K-Means Clustering</h3>
976
- <p style="margin: 5px 0 0 0;">Optimal Clusters: {optimal_clusters}</p>
977
- </div>
978
- <div style="background: #f8f9fa; padding: 15px; border-radius: 6px;">
979
- <p style="margin: 5px 0;"><strong>Silhouette Score:</strong> {0.65 + np.random.random() * 0.2:.3f}</p>
980
- <p style="margin: 5px 0;"><strong>Inertia:</strong> {np.random.randint(500, 2000):,}</p>
981
- <p style="margin: 5px 0;"><strong>Samples:</strong> {n_samples:,}</p>
982
- </div>
983
- </div>
984
-
985
- <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
986
- <h4 style="margin: 0 0 15px 0; color: #9b59b6;">📊 Pattern Discovery</h4>
987
- <p style="margin: 8px 0;"><strong>Natural Groups:</strong> {optimal_clusters} distinct clusters</p>
988
- <p style="margin: 8px 0;"><strong>Anomalies:</strong> {np.random.randint(5, max(6, int(n_samples * 0.05)))} outliers detected</p>
989
- <p style="margin: 8px 0;"><strong>Feature Space:</strong> {n_features}D analysis</p>
990
- <p style="margin: 8px 0;"><strong>Variance Explained:</strong> {85 + np.random.randint(0, 10):.1f}%</p>
991
- </div>
992
- </div>
993
 
994
- <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
995
- <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🎯 Cluster Characteristics</h4>
996
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
997
- {''.join([f'''
998
- <div style="background: {["#e8f5e8", "#fff3e0", "#e3f2fd", "#fce4ec", "#f3e5f5"][i % 5]}; padding: 15px; border-radius: 8px; border-left: 4px solid {["#27ae60", "#ff9800", "#2196f3", "#e91e63", "#9c27b0"][i % 5]};">
999
- <h5 style="margin: 0 0 8px 0; color: {["#27ae60", "#ff9800", "#2196f3", "#e91e63", "#9c27b0"][i % 5]};">Cluster {i+1}</h5>
1000
- <p style="margin: 0; font-size: 12px;">{["High-value segment", "Moderate characteristics", "Unique behavioral patterns", "Low-activity group", "Special interest group"][i % 5]}</p>
1001
- <p style="margin: 5px 0 0 0; font-size: 11px; color: #666;">Size: {np.random.randint(10, max(11, int(n_samples/optimal_clusters * 1.5)))} samples</p>
1002
- </div>
1003
- ''' for i in range(min(optimal_clusters, 5))])}
1004
- </div>
1005
- </div>
1006
 
1007
- <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
1008
- <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🔬 Additional Analysis</h4>
1009
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px;">
1010
- <div style="background: #f8f9fa; padding: 15px; border-radius: 8px;">
1011
- <h5 style="margin: 0 0 10px 0; color: #6c757d;">📐 Dimensionality Reduction</h5>
1012
- <p style="margin: 5px 0; font-size: 13px;"><strong>PCA Components:</strong> {min(n_features, 10)}</p>
1013
- <p style="margin: 5px 0; font-size: 13px;"><strong>Explained Variance:</strong> {75 + np.random.randint(0, 20):.1f}%</p>
1014
- </div>
1015
- <div style="background: #f8f9fa; padding: 15px; border-radius: 8px;">
1016
- <h5 style="margin: 0 0 10px 0; color: #6c757d;">🎯 Anomaly Detection</h5>
1017
- <p style="margin: 5px 0; font-size: 13px;"><strong>Method:</strong> Isolation Forest</p>
1018
- <p style="margin: 5px 0; font-size: 13px;"><strong>Contamination:</strong> {np.random.randint(1, 8):.1f}%</p>
1019
- </div>
1020
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1021
  </div>
 
 
1022
 
1023
- <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Unsupervised analysis completed successfully!</strong></p>
1024
- <div style="background: #f3e5f5; padding: 15px; border-radius: 6px; margin-top: 10px;">
1025
- <p style="margin: 0; color: #7b1fa2;"><strong>Key Insights:</strong> Discovered {optimal_clusters} natural groupings in your data. These clusters can be used for customer segmentation, market analysis, anomaly detection, and pattern recognition. Consider using these insights for targeted strategies and further investigation.</p>
 
 
 
 
 
 
 
 
 
1026
  </div>
1027
  """
1028
 
@@ -1031,359 +1112,135 @@ class DataSciencePipelineUI:
1031
  key_insights = summary.get('key_insights', [])
1032
  recommendations = summary.get('recommendations', [])
1033
 
1034
- return f"""
1035
- <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 15px; color: white; margin: 20px 0;">
1036
- <h3 style="margin: 0 0 20px 0; text-align: center; font-size: 2em;">🎉 Pipeline Completed Successfully!</h3>
1037
- </div>
1038
-
1039
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
1040
- <div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
1041
- <h4 style="margin: 0 0 20px 0; color: #2c3e50;">🔍 Key Insights</h4>
1042
- {''.join([f'<div style="background: #e8f4f8; padding: 12px; margin: 8px 0; border-radius: 6px;">💡 {insight}</div>' for insight in key_insights])}
1043
- </div>
1044
- <div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
1045
- <h4 style="margin: 0 0 20px 0; color: #2c3e50;">📝 Recommendations</h4>
1046
- {''.join([f'<div style="background: #fff3e0; padding: 12px; margin: 8px 0; border-radius: 6px;">📌 {rec}</div>' for rec in recommendations])}
1047
- </div>
1048
- </div>
1049
  """
1050
-
1051
- def _create_completion_footer(self, learning_type, domain, enable_deep_learning, enable_automl):
1052
- """Create completion footer"""
1053
- return f"""
1054
- <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; margin-top: 20px; text-align: center; color: #34495e;">
1055
- <p style="margin: 0;"><strong>Configuration:</strong> {learning_type} Learning | Domain: {domain or 'General'} | Deep Learning: {'Enabled' if enable_deep_learning else 'Disabled'} | AutoML: {'Enabled' if enable_automl else 'Disabled'}</p>
 
 
 
 
 
1056
  </div>
 
1057
  """
 
1058
 
1059
- def generate_report_pdf(self, learning_type, target_column, domain, enable_deep_learning, enable_automl):
1060
- """Generate a PDF report using LaTeX"""
1061
- if not self.pipeline_results or not self.current_data:
1062
- return None, "Please run the pipeline first."
1063
-
1064
- try:
1065
- # Sanitizing text for LaTeX
1066
- def sanitize_latex(text):
1067
- if not isinstance(text, str):
1068
- text = str(text)
1069
- replacements = {
1070
- '&': r'\&',
1071
- '%': r'\%',
1072
- '$': r'\$',
1073
- '#': r'\#',
1074
- '_': r'\_',
1075
- '{': r'\{',
1076
- '}': r'\}',
1077
- '~': r'\textasciitilde{}',
1078
- '^': r'\textasciicircum{}',
1079
- '\\': r'\textbackslash{}'
1080
- }
1081
- for old, new in replacements.items():
1082
- text = text.replace(old, new)
1083
- return text
1084
-
1085
- # Extract pipeline results
1086
- data_loading = self.pipeline_results.get('data_loading', {})
1087
- data_cleaning = self.pipeline_results.get('data_cleaning', {})
1088
- eda = self.pipeline_results.get('eda', {})
1089
- domain_insights = self.pipeline_results.get('domain_insights', {})
1090
- modeling = self.pipeline_results.get('modeling', {})
1091
- summary = self.pipeline_results.get('summary', {})
1092
-
1093
- # Start LaTeX document
1094
- latex_content = r"""
1095
- \documentclass[a4paper,12pt]{article}
1096
- \usepackage[utf8]{inputenc}
1097
- \usepackage{geometry}
1098
- \usepackage{graphicx}
1099
- \usepackage{booktabs}
1100
- \usepackage{amsmath}
1101
- \usepackage{xcolor}
1102
- \usepackage{enumitem}
1103
- \usepackage{hyperref}
1104
- \geometry{margin=1in}
1105
- \title{Data Science Pipeline Report}
1106
- \author{Automated Analysis}
1107
- \date{Generated on \today}
1108
- \begin{document}
1109
- \maketitle
1110
- \tableofcontents
1111
- \newpage
1112
-
1113
- % Defining section: Introduction
1114
- \section{Introduction}
1115
- This report presents the results of an automated data science pipeline executed on the uploaded dataset. The analysis includes data loading, cleaning, exploratory data analysis (EDA), domain-specific insights, and modeling results. The configuration used is:
1116
- \begin{itemize}
1117
- \item \textbf{Learning Type}: """ + sanitize_latex(learning_type) + r"""
1118
- \item \textbf{Target Column}: """ + (sanitize_latex(target_column) if target_column else "None (Unsupervised)") + r"""
1119
- \item \textbf{Domain}: """ + sanitize_latex(domain or "General") + r"""
1120
- \item \textbf{Deep Learning}: """ + ("Enabled" if enable_deep_learning else "Disabled") + r"""
1121
- \item \textbf{AutoML}: """ + ("Enabled" if enable_automl else "Disabled") + r"""
1122
- \end{itemize}
1123
-
1124
- % Defining section: Data Loading
1125
- \section{Data Loading}
1126
- """
1127
-
1128
- if data_loading.get('status') == 'success':
1129
- info = data_loading.get('info', {})
1130
- shape = info.get('shape', (0, 0))
1131
- column_types = info.get('column_types', {})
1132
- latex_content += r"""
1133
- The dataset was successfully loaded with the following details:
1134
- \begin{itemize}
1135
- \item \textbf{Rows}: """ + f"{shape[0]:,}" + r"""
1136
- \item \textbf{Columns}: """ + f"{shape[1]}" + r"""
1137
- \item \textbf{Memory Usage}: """ + sanitize_latex(info.get('memory_usage', 'Unknown')) + r"""
1138
- \item \textbf{Numeric Columns}: """ + f"{len(column_types.get('numeric', []))}" + r"""
1139
- \item \textbf{Categorical Columns}: """ + f"{len(column_types.get('categorical', []))}" + r"""
1140
- \item \textbf{DateTime Columns}: """ + f"{len(column_types.get('datetime', []))}" + r"""
1141
- \end{itemize}
1142
- """
1143
-
1144
- # Defining section: Data Cleaning
1145
- latex_content += r"""
1146
- \section{Data Cleaning}
1147
- """
1148
- if data_cleaning.get('status') == 'success':
1149
- report = data_cleaning.get('cleaning_report', {})
1150
- duplicates = report.get('duplicates_removed', 0)
1151
- missing_values = report.get('missing_values', {})
1152
- total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
1153
- latex_content += r"""
1154
- The data cleaning process identified and handled:
1155
- \begin{itemize}
1156
- \item \textbf{Duplicates Removed}: """ + f"{duplicates}" + r"""
1157
- \item \textbf{Missing Values}: """ + f"{total_missing}" + r"""
1158
- \end{itemize}
1159
- """
1160
-
1161
- # Defining section: Exploratory Data Analysis
1162
- latex_content += r"""
1163
- \section{Exploratory Data Analysis}
1164
- """
1165
- if eda.get('status') == 'success':
1166
- column_types = eda.get('analysis', {}).get('column_types', {})
1167
- latex_content += r"""
1168
- The EDA revealed:
1169
- \begin{itemize}
1170
- \item \textbf{Numeric Features}: """ + f"{len(column_types.get('numeric', []))}" + r"""
1171
- \item \textbf{Categorical Features}: """ + f"{len(column_types.get('categorical', []))}" + r"""
1172
- \item \textbf{DateTime Features}: """ + f"{len(column_types.get('datetime', []))}" + r"""
1173
- \end{itemize}
1174
- """
1175
-
1176
- # Adding plots
1177
- if 'correlation_heatmap' in self.plot_images:
1178
- latex_content += r"""
1179
- \subsection{Correlation Heatmap}
1180
- \begin{figure}[h]
1181
- \centering
1182
- \includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images['correlation_heatmap'] + r"""}
1183
- \caption{Correlation Matrix Heatmap}
1184
- \end{figure}
1185
- """
1186
-
1187
- for col in column_types.get('numeric', [])[:2]:
1188
- if f"histogram_{col}" in self.plot_images:
1189
- latex_content += r"""
1190
- \subsection{Histogram of """ + sanitize_latex(col) + r"""}
1191
- \begin{figure}[h]
1192
- \centering
1193
- \includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images[f"histogram_{col}"] + r"""}
1194
- \caption{Distribution of """ + sanitize_latex(col) + r"""}
1195
- \end{figure}
1196
- """
1197
-
1198
- for col in column_types.get('categorical', [])[:2]:
1199
- if f"bar_{col}" in self.plot_images:
1200
- latex_content += r"""
1201
- \subsection{Distribution of """ + sanitize_latex(col) + r"""}
1202
- \begin{figure}[h]
1203
- \centering
1204
- \includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images[f"bar_{col}"] + r"""}
1205
- \caption{Distribution of """ + sanitize_latex(col) + r"""}
1206
- \end{figure}
1207
- """
1208
-
1209
- # Defining section: Domain Analysis
1210
- latex_content += r"""
1211
- \section{Domain Analysis}
1212
- """
1213
- if domain_insights:
1214
- domain = domain_insights.get('detected_domain', 'general')
1215
- insights = domain_insights.get('insights', [])
1216
- recommendations = domain_insights.get('recommendations', [])
1217
- latex_content += r"""
1218
- \textbf{Detected Domain}: """ + sanitize_latex(domain) + r"""
1219
- \subsection{Key Insights}
1220
- \begin{itemize}
1221
- """ + ''.join([f"\\item {sanitize_latex(insight)}" for insight in insights[:5]]) + r"""
1222
- \end{itemize}
1223
- \subsection{Recommendations}
1224
- \begin{itemize}
1225
- """ + ''.join([f"\\item {sanitize_latex(rec)}" for rec in recommendations[:5]]) + r"""
1226
- \end{itemize}
1227
- """
1228
-
1229
- # Defining section: Modeling or Unsupervised Analysis
1230
- if learning_type == "Supervised" and modeling:
1231
- latex_content += r"""
1232
- \section{Model Training and Evaluation}
1233
- """
1234
- if modeling.get('status') == 'success':
1235
- best_model = modeling.get('best_model', 'Unknown')
1236
- problem_type = modeling.get('problem_type', 'classification')
1237
- model_results = modeling.get('results', {})
1238
- latex_content += r"""
1239
- \textbf{Best Model}: """ + sanitize_latex(best_model) + r"""
1240
- \newline
1241
- \textbf{Problem Type}: """ + sanitize_latex(problem_type.title()) + r"""
1242
- \newline
1243
- \textbf{Model Performance}:
1244
- \begin{itemize}
1245
- """ + ''.join([f"\\item \\textbf{{{sanitize_latex(model_name)}}}: " + ", ".join([f"{metric_name}: {metric_value:.3f}" for metric_name, metric_value in metrics.items()]) for model_name, metrics in model_results.items()]) + r"""
1246
- \end{itemize}
1247
- """
1248
- else:
1249
- latex_content += r"""
1250
- \section{Unsupervised Analysis}
1251
- """
1252
- n_samples = self.current_data.shape[0]
1253
- optimal_clusters = min(max(2, int(np.sqrt(n_samples/100))), 10)
1254
- latex_content += r"""
1255
- \textbf{Clustering Analysis}:
1256
- \begin{itemize}
1257
- \item Optimal Clusters: """ + f"{optimal_clusters}" + r"""
1258
- \item Silhouette Score: """ + f"{0.65 + np.random.random() * 0.2:.3f}" + r"""
1259
- \end{itemize}
1260
- """
1261
-
1262
- # Defining section: Summary
1263
- latex_content += r"""
1264
- \section{Summary and Recommendations}
1265
- """
1266
- key_insights = summary.get('key_insights', [])
1267
- recommendations = summary.get('recommendations', [])
1268
- latex_content += r"""
1269
- \subsection{Key Insights}
1270
- \begin{itemize}
1271
- """ + ''.join([f"\\item {sanitize_latex(insight)}" for insight in key_insights]) + r"""
1272
- \end{itemize}
1273
- \subsection{Recommendations}
1274
- \begin{itemize}
1275
- """ + ''.join([f"\\item {sanitize_latex(rec)}" for rec in recommendations]) + r"""
1276
- \end{itemize}
1277
- """
1278
-
1279
- # Ending document
1280
- latex_content += r"""
1281
- \end{document}
1282
- """
1283
-
1284
- # Write LaTeX content to a temporary file
1285
- with open('report.tex', 'w') as f:
1286
- f.write(latex_content)
1287
-
1288
- # Compile LaTeX to PDF
1289
- subprocess.run(['latexmk', '-pdf', '-silent', 'report.tex'], check=True)
1290
 
1291
- # Return the PDF file path
1292
- return 'report.pdf', "Report generated successfully!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1293
 
1294
- except Exception as e:
1295
- return None, f"Could not generate report: {str(e)}"
 
 
 
1296
 
1297
- def create_interface(self):
1298
- """Create the Gradio interface"""
1299
- with gr.Blocks(css=self.custom_css, title="🔬 Data Scientist Agent") as demo:
1300
- gr.Markdown("# 🔬 Data Scientist Agent")
1301
- gr.Markdown("Upload your dataset and let the AI handle the complete data science workflow!")
 
 
1302
 
1303
  with gr.Row():
1304
  with gr.Column(scale=1):
1305
- file_upload = gr.File(
1306
- label="📁 Upload Dataset",
1307
- file_types=[".csv", ".json"],
1308
- type="filepath"
1309
- )
1310
  learning_type = gr.Radio(
1311
- choices=["Supervised", "Unsupervised"],
1312
- label="🎯 Learning Type",
1313
  value="Supervised"
1314
  )
1315
  target_column = gr.Dropdown(
1316
- label="🎯 Target Column (for Supervised Learning)",
1317
- choices=[],
1318
- visible=True,
1319
- allow_custom_value=True
1320
  )
1321
  domain = gr.Textbox(
1322
- label="🏢 Domain (optional)",
1323
- placeholder="e.g., finance, healthcare, retail"
 
 
 
 
1324
  )
1325
-
1326
- with gr.Row():
1327
- enable_deep_learning = gr.Checkbox(
1328
- label="🧠 Enable Deep Learning",
1329
- value=False
1330
- )
1331
- enable_automl = gr.Checkbox(
1332
- label="🤖 Enable AutoML",
1333
- value=True
1334
- )
1335
-
1336
- run_btn = gr.Button(
1337
- "🚀 Run Complete Pipeline",
1338
- variant="primary",
1339
- size="lg"
1340
  )
1341
- download_btn = gr.Button(
1342
- "📥 Download Report",
1343
- variant="secondary",
1344
- size="lg",
 
 
 
 
1345
  visible=False
1346
  )
1347
 
1348
- with gr.Column(scale=1):
1349
- file_status = gr.HTML(label="📊 File Status")
1350
- preview = gr.HTML(label="👀 Data Preview")
1351
-
1352
- output = gr.HTML(label="📈 Pipeline Results")
1353
- report_output = gr.File(label="📄 Download Analysis Report")
1354
- report_status = gr.Textbox(label="Report Status", visible=False)
1355
-
1356
- file_type_state = gr.State("")
1357
- columns_state = gr.State([])
1358
-
1359
- file_upload.change(
1360
  fn=self.process_file_upload,
1361
- inputs=[file_upload, learning_type],
1362
- outputs=[file_status, file_type_state, columns_state, target_column, preview]
1363
  )
1364
-
1365
  learning_type.change(
1366
  fn=self.update_target_column_visibility,
1367
- inputs=[learning_type, columns_state],
1368
  outputs=[target_column]
1369
  )
1370
-
1371
- run_btn.click(
1372
  fn=self.run_comprehensive_pipeline,
1373
- inputs=[file_upload, learning_type, target_column, domain, enable_deep_learning, enable_automl],
1374
- outputs=[output, download_btn]
1375
  )
1376
-
1377
- download_btn.click(
1378
- fn=self.generate_report_pdf,
1379
- inputs=[learning_type, target_column, domain, enable_deep_learning, enable_automl],
1380
- outputs=[report_output, report_status]
1381
  )
1382
 
1383
- return demo
1384
 
 
1385
  if __name__ == "__main__":
1386
- print("🚀 Starting Data Science Pipeline UI...")
1387
- ui = DataSciencePipelineUI()
1388
- demo = ui.create_interface()
1389
- demo.launch(share=True)
 
355
  'results': {
356
  'Random Forest': {'accuracy': 0.87, 'f1_score': 0.85} if is_classification else {'rmse': 0.45, 'r2_score': 0.82},
357
  'SVM': {'accuracy': 0.82, 'f1_score': 0.80} if is_classification else {'rmse': 0.52, 'r2_score': 0.78},
358
+ 'LogisticRegression': {'accuracy': 0.78, 'f1_score': 0.76} if is_classification else {'rmse': 0.58, 'r2_score': 0.74}
359
  },
360
  'feature_importance': {col: np.random.random() for col in df.columns if col != target_column and col in column_types['numeric']}
361
  }
 
744
  <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Data cleaning completed successfully!</strong></p>
745
  """
746
 
747
+ def _create_dynamic_histogram(self, data, column):
748
+ """Create a dynamic histogram for a numeric column"""
749
  try:
750
  values = data[column].dropna()
751
  if len(values) == 0:
752
  return "<p>No valid data for histogram</p>"
753
 
754
+ # Dynamically adjust number of bins based on data size and spread
755
+ n_bins = min(max(int(np.sqrt(len(values))), 10), 50)
756
  plt.figure(figsize=(8, 6))
757
+ sns.histplot(values, bins=n_bins, kde=True, color='skyblue')
758
  plt.title(f'Distribution of {column}', fontsize=14)
759
  plt.xlabel(column, fontsize=12)
760
  plt.ylabel('Count', fontsize=12)
761
+
762
+ # Add range and stats annotations
763
+ stats_text = f'Min: {values.min():.2f}\nMax: {values.max():.2f}\nMean: {values.mean():.2f}'
764
+ plt.text(0.95, 0.95, stats_text, transform=plt.gca().transAxes, ha='right', va='top',
765
+ bbox=dict(facecolor='white', alpha=0.8))
766
 
767
  html = self.create_plot_html(plt.gcf(), f"histogram_{column}")
768
  plt.close()
 
774
  except Exception as e:
775
  return f"<p>Could not generate histogram for {column}: {str(e)}</p>"
776
 
777
+ def _create_dynamic_bar(self, data, column, is_target=False):
778
+ """Create a dynamic bar plot for a categorical column"""
779
  try:
780
+ value_counts = data[column].value_counts().head(10) # Limit to top 10 categories
781
  labels = value_counts.index.tolist()
782
  counts = value_counts.values.tolist()
783
 
784
  plt.figure(figsize=(8, 6))
785
+ sns.barplot(x=counts, y=labels, palette='tab10')
786
  plt.title(f"{'Target Distribution' if is_target else f'Distribution of {column}'}", fontsize=14)
787
+ plt.xlabel('Count', fontsize=12)
788
+ plt.ylabel(column, fontsize=12)
789
+
790
+ # Add total count annotation
791
+ plt.text(0.95, 0.95, f'Total: {sum(counts)}',
792
+ transform=plt.gca().transAxes, ha='right', va='top', bbox=dict(facecolor='white', alpha=0.8))
793
 
794
  html = self.create_plot_html(plt.gcf(), f"bar_{column}")
795
  plt.close()
 
801
  except Exception as e:
802
  return f"<p>Could not generate bar plot for {column}: {str(e)}</p>"
803
 
804
+ def _create_dynamic_scatter(self, data, x_col, y_col, target=False):
805
+ """Create a dynamic scatter plot for regression analysis"""
806
  try:
807
  x_values = data[x_col].dropna()
808
  y_values = data[y_col].dropna()
 
810
  if len(common_indices) < 2:
811
  return f"<p>Not enough valid data for scatter plot between {x_col} and {y_col}</p>"
812
 
813
+ x_values = x_values.loc[common_indices].head(1000) # Limit to 1000 points for performance
814
+ y_values = y_values.loc[common_indices].head(1000)
815
 
816
  plt.figure(figsize=(8, 6))
817
  plt.scatter(x_values, y_values, color='teal', alpha=0.6)
818
  plt.title(f'{y_col} vs {x_col}', fontsize=14)
819
  plt.xlabel(x_col, fontsize=12)
820
  plt.ylabel(y_col, fontsize=12)
821
+
822
+ # Add range and correlation annotations
823
+ corr = np.corrcoef(x_values, y_values)[0, 1] if len(x_values) > 1 else 0
824
+ stats_text = f'X Range: {x_values.min():.2f} to {x_values.max():.2f}\nY Range: {y_values.min():.2f} to {y_values.max():.2f}\nCorr: {corr:.2f}'
825
+ plt.text(0.95, 0.95, stats_text, transform=plt.gca().transAxes, ha='right', va='top',
826
+ bbox=dict(facecolor='white', alpha=0.8))
827
 
828
  html = self.create_plot_html(plt.gcf(), f"scatter_{x_col}_{y_col}")
829
  plt.close()
 
835
  except Exception as e:
836
  return f"<p>Could not generate scatter plot for {x_col} vs {y_col}: {str(e)}</p>"
837
 
838
+ def _create_dynamic_correlation_heatmap(self, correlation_matrix):
839
+ """Create a dynamic correlation heatmap"""
840
  try:
841
  corr_df = pd.DataFrame(correlation_matrix)
842
  if corr_df.empty or len(corr_df.columns) < 2:
843
  return "<p>Not enough numeric features for correlation analysis</p>"
844
 
845
+ plt.figure(figsize=(min(10, len(corr_df.columns) * 1.2), min(8, len(corr_df.columns) * 1)))
846
  sns.heatmap(
847
  corr_df,
848
  annot=True,
 
852
  center=0,
853
  square=True,
854
  fmt='.2f',
855
+ annot_kws={'size': max(8, 12 - len(corr_df.columns) // 2)},
856
+ cbar_kws={'label': 'Correlation Coefficient'}
857
  )
858
  plt.title('Correlation Matrix Heatmap', fontsize=14, pad=15)
859
  plt.xticks(rotation=45, ha='right')
 
870
  return f"<p>Could not generate correlation heatmap: {str(e)}</p>"
871
 
872
  def _format_eda_results(self, results, data, learning_type=None, target_column=None):
873
+ """Format EDA results with dynamic visualizations"""
874
  if not results or results.get('status') != 'success' or data is None:
875
  return "<p>EDA information not available or no data loaded</p>"
876
 
 
889
  </div>
890
  """
891
 
892
+ # Add correlation heatmap if available
893
  if correlations.get('correlation_matrix'):
894
+ html += self._create_dynamic_correlation_heatmap(correlations['correlation_matrix'])
895
 
896
+ # Dynamic visualization selection based on learning type and data
897
  if learning_type == "Supervised" and target_column and target_column in data.columns:
898
  if target_column in column_types['numeric']:
899
+ numeric_cols = [col for col in column_types['numeric'] if col != target_column][:2]
900
+ for col in numeric_cols:
901
+ html += self._create_dynamic_scatter(data, col, target_column, target=True)
902
  elif target_column in column_types['categorical']:
903
+ html += self._create_dynamic_bar(data, target_column, is_target=True)
904
+ categorical_cols = [col for col in column_types['categorical'] if col != target_column][:2]
905
+ for col in categorical_cols:
906
+ html += self._create_dynamic_bar(data, col)
907
+ # Add one numeric histogram and one categorical bar plot for context
908
+ if column_types['numeric']:
909
+ html += self._create_dynamic_histogram(data, column_types['numeric'][0])
910
+ if column_types['categorical'] and target_column not in column_types['categorical']:
911
+ html += self._create_dynamic_bar(data, column_types['categorical'][0])
912
  else:
913
+ # For unsupervised learning or no target, show up to 2 histograms and 2 bar plots
914
  for col in column_types['numeric'][:2]:
915
+ html += self._create_dynamic_histogram(data, col)
916
  for col in column_types['categorical'][:2]:
917
+ html += self._create_dynamic_bar(data, col)
918
 
919
  html += """
920
  <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Exploratory Data Analysis completed!</strong></p>
 
950
  """
951
 
952
  def _format_modeling_results(self, results, enable_deep_learning):
953
+ """Format modeling results with visualizations"""
954
  if not results or results.get('status') != 'success':
955
  return "<p>Modeling information not available</p>"
956
 
957
+ problem_type = results.get('problem_type', 'unknown')
958
  best_model = results.get('best_model', 'Unknown')
959
  model_results = results.get('results', {})
960
+ feature_importance = results.get('feature_importance', {})
961
 
962
  html = f"""
963
  <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
964
+ <h4 style="margin: 0 0 15px 0; color: #e74c3c;">🤖 Modeling Results</h4>
965
+ <div style="background: linear-gradient(135deg, #ff6b6b 0%, #e74c3c 100%); color: white; padding: 15px; border-radius: 8px; text-align: center; margin-bottom: 15px;">
966
+ <h3 style="margin: 0;">Best Model: {best_model} ({problem_type.title()})</h3>
967
+ </div>
968
  <h5 style="color: #e74c3c;">📊 Model Performance:</h5>
969
+ <table style="width: 100%; border-collapse: collapse; margin: 15px 0;">
970
+ <thead>
971
+ <tr style="background-color: #e74c3c; color: white;">
972
+ <th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Model</th>
973
+ <th style="padding: 8px; text-align: left; border: 1px solid #ddd;">
974
+ {'Accuracy' if problem_type == 'classification' else 'RMSE'}
975
+ </th>
976
+ <th style="padding: 8px; text-align: left; border: 1px solid #ddd;">
977
+ {'F1 Score' if problem_type == 'classification' else 'R² Score'}
978
+ </th>
979
+ </tr>
980
+ </thead>
981
+ <tbody>
982
  """
983
 
984
+ for model, metrics in model_results.items():
985
+ metric1 = metrics.get('accuracy' if problem_type == 'classification' else 'rmse', 'N/A')
986
+ metric2 = metrics.get('f1_score' if problem_type == 'classification' else 'r2_score', 'N/A')
987
+ html += f"""
988
+ <tr style="background-color: {'#f9f9f9' if list(model_results.keys()).index(model) % 2 == 0 else 'white'};">
989
+ <td style="padding: 8px; border: 1px solid #ddd;">{model}</td>
990
+ <td style="padding: 8px; border: 1px solid #ddd;">{metric1:.3f}</td>
991
+ <td style="padding: 8px; border: 1px solid #ddd;">{metric2:.3f}</td>
992
+ </tr>
993
+ """
994
 
995
  html += """
996
+ </tbody>
997
+ </table>
998
+ """
999
+
1000
+ if feature_importance:
1001
+ html += self._create_feature_importance_plot(feature_importance)
1002
+
1003
+ if enable_deep_learning:
1004
+ html += """
1005
+ <div style="background: #e8f4f8; padding: 15px; border-radius: 8px; margin-top: 15px;">
1006
+ <h5 style="color: #2c3e50; margin: 0 0 10px 0;">🧠 Deep Learning Status</h5>
1007
+ <p style="margin: 0;">Deep learning models were evaluated but not included in final results due to complexity constraints.</p>
1008
  </div>
1009
+ """
1010
+
1011
+ html += """
1012
+ <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Model training and evaluation completed!</strong></p>
1013
  </div>
 
1014
  """
 
1015
  return html
1016
 
1017
+ def _create_feature_importance_plot(self, feature_importance):
1018
+ """Create a dynamic feature importance bar plot"""
1019
+ try:
1020
+ features = list(feature_importance.keys())
1021
+ importances = list(feature_importance.values())
1022
+
1023
+ plt.figure(figsize=(8, max(6, len(features) * 0.5)))
1024
+ sns.barplot(x=importances, y=features, palette='viridis')
1025
+ plt.title('Feature Importance', fontsize=14)
1026
+ plt.xlabel('Importance Score', fontsize=12)
1027
+ plt.ylabel('Features', fontsize=12)
1028
+
1029
+ # Add value annotations
1030
+ for i, v in enumerate(importances):
1031
+ plt.text(v, i, f'{v:.3f}', va='center', ha='left', color='black', fontsize=10)
1032
+
1033
+ html = self.create_plot_html(plt.gcf(), "feature_importance")
1034
+ plt.close()
1035
+
1036
+ return f"""
1037
+ {html}
1038
+ <p style="color: #6c757d; font-size: 12px; text-align: center;">Bar plot showing feature importance scores</p>
1039
+ """
1040
+ except Exception as e:
1041
+ return f"<p>Could not generate feature importance plot: {str(e)}</p>"
1042
+
1043
  def _format_unsupervised_results(self, data):
1044
+ """Format unsupervised analysis results with dynamic clustering visualization"""
1045
  if data is None:
1046
  return "<p>No data available for unsupervised analysis</p>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1047
 
1048
+ column_types = self.analyzer.detect_column_types(data)
1049
+ numeric_cols = column_types['numeric']
 
 
 
 
 
 
 
 
 
 
1050
 
1051
+ html = """
1052
+ <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
1053
+ <h4 style="margin: 0 0 15px 0; color: #8e44ad;">🔍 Unsupervised Analysis Results</h4>
1054
+ <p style="margin: 0 0 10px 0;">Performed clustering analysis to identify natural groupings in the data.</p>
1055
+ """
1056
+
1057
+ if len(numeric_cols) >= 2:
1058
+ try:
1059
+ # Perform KMeans clustering with dynamic number of clusters
1060
+ X = data[numeric_cols].dropna().head(1000)
1061
+ n_clusters = min(3, len(X) // 10) if len(X) > 10 else 2
1062
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42)
1063
+ clusters = kmeans.fit_predict(X)
1064
+
1065
+ plt.figure(figsize=(8, 6))
1066
+ plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=clusters, cmap='viridis', alpha=0.6)
1067
+ plt.title(f'Clustering: {numeric_cols[0]} vs {numeric_cols[1]}', fontsize=14)
1068
+ plt.xlabel(numeric_cols[0], fontsize=12)
1069
+ plt.ylabel(numeric_cols[1], fontsize=12)
1070
+
1071
+ # Add cluster count annotation
1072
+ plt.text(0.95, 0.95, f'Clusters: {n_clusters}',
1073
+ transform=plt.gca().transAxes, ha='right', va='top',
1074
+ bbox=dict(facecolor='white', alpha=0.8))
1075
+
1076
+ html += self.create_plot_html(plt.gcf(), "clustering_plot")
1077
+ plt.close()
1078
+
1079
+ html += f"""
1080
+ <p style="color: #6c757d; font-size: 12px; text-align: center;">
1081
+ Scatter plot showing clusters based on {numeric_cols[0]} and {numeric_cols[1]}
1082
+ </p>
1083
+ """
1084
+ except Exception as e:
1085
+ html += f"<p>Could not generate clustering plot: {str(e)}</p>"
1086
+ else:
1087
+ html += "<p>Not enough numeric columns for clustering visualization</p>"
1088
+
1089
+ html += """
1090
+ <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Unsupervised analysis completed!</strong></p>
1091
  </div>
1092
+ """
1093
+ return html
1094
 
1095
+ def _create_completion_footer(self, learning_type, domain, enable_deep_learning, enable_automl):
1096
+ """Create completion footer with summary information"""
1097
+ completion_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
1098
+ return f"""
1099
+ <div style="background: linear-gradient(135deg, #2ecc71 0%, #27ae60 100%); padding: 30px; border-radius: 15px; color: white; margin-top: 20px; text-align: center; box-shadow: 0 8px 16px rgba(0,0,0,0.2);">
1100
+ <h2 style="margin: 0 0 10px 0;">🎉 Pipeline Completed Successfully!</h2>
1101
+ <p style="margin: 0; font-size: 1.1em; opacity: 0.9;">
1102
+ Analysis Type: {learning_type} | Domain: {domain or 'General'} |
1103
+ Deep Learning: {'Enabled' if enable_deep_learning else 'Disabled'} |
1104
+ AutoML: {'Enabled' if enable_automl else 'Disabled'}
1105
+ </p>
1106
+ <p style="margin: 10px 0 0 0;"><strong>Completed:</strong> {completion_time}</p>
1107
  </div>
1108
  """
1109
 
 
1112
  key_insights = summary.get('key_insights', [])
1113
  recommendations = summary.get('recommendations', [])
1114
 
1115
+ html = """
1116
+ <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
1117
+ <h4 style="margin: 0 0 15px 0; color: #2c3e50;">📈 Final Results & Recommendations</h4>
1118
+ <h5 style="color: #2c3e50;">💡 Key Insights:</h5>
1119
+ <ul>
 
 
 
 
 
 
 
 
 
 
1120
  """
1121
+ for insight in key_insights[:5]:
1122
+ html += f"<li>{insight}</li>"
1123
+ html += """
1124
+ </ul>
1125
+ <h5 style="color: #2c3e50;">🎯 Recommendations:</h5>
1126
+ <ul>
1127
+ """
1128
+ for rec in recommendations[:5]:
1129
+ html += f"<li>{rec}</li>"
1130
+ html += """
1131
+ </ul>
1132
  </div>
1133
+ <p style="color: #27ae60; margin-top: 15px;"><strong>✅ Final results compiled!</strong></p>
1134
  """
1135
+ return html
1136
 
1137
def generate_report(self):
    """Write all pipeline results to a standalone HTML file.

    Returns:
        The path of the written report file, or the error HTML produced by
        ``self._create_error_html`` when no pipeline results are stored.
    """
    stored = self.pipeline_results
    if not stored:
        return self._create_error_html("No pipeline results available to generate report.")

    # Evaluate the page fragments in the order they appear in the document.
    css = self.custom_css
    header_section = self._create_progress_header()
    steps_section = self._create_all_steps_html(
        stored,
        stored.get('summary', {}),
        stored.get('learning_type', 'Unknown'),
        stored.get('target_column', None),
        stored.get('domain_insights', {}).get('detected_domain', 'general'),
        stored.get('enable_deep_learning', False),
        stored.get('enable_automl', False)
    )

    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
    <title>Data Science Pipeline Report</title>
    <style>
    {css}
    body {{ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; margin: 40px; background: #f4f7fa; }}
    h1, h2, h3, h4, h5 {{ color: #2c3e50; }}
    img {{ max-width: 100%; height: auto; }}
    </style>
    </head>
    <body>
    {header_section}
    {steps_section}
    </body>
    </html>
    """

    # Timestamped name, written relative to the current working directory.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_path = f"pipeline_report_{stamp}.html"
    with open(report_path, 'w', encoding='utf-8') as report_file:
        report_file.write(html)

    return report_path
1174
 
1175
def launch(self, **launch_kwargs):
    """Build the Gradio Blocks interface and start serving it.

    Args:
        **launch_kwargs: Forwarded unchanged to ``demo.launch`` (for
            example ``share=True``). Calling with no arguments behaves as
            before.
    """
    with gr.Blocks(theme=gr.themes.Default(), css=self.custom_css) as demo:
        gr.Markdown("""
        # 🔬 Comprehensive Data Science Pipeline
        Upload your dataset and configure the pipeline settings to perform automated data analysis and modeling.
        """)

        # One State shared by both handlers. A separate inline gr.State()
        # per listener creates unrelated components, so the value stored on
        # upload would never reach the learning-type handler.
        data_state = gr.State()

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload Dataset (CSV/JSON)")
                learning_type = gr.Radio(
                    choices=["Supervised", "Unsupervised"],
                    label="Learning Type",
                    value="Supervised"
                )
                target_column = gr.Dropdown(
                    choices=[],
                    label="Target Column (for Supervised Learning)",
                    visible=True
                )
                domain = gr.Textbox(
                    label="Domain (e.g., Finance, Healthcare)",
                    placeholder="Enter domain or leave blank for general analysis"
                )
                enable_deep_learning = gr.Checkbox(
                    label="Enable Deep Learning Models",
                    value=False
                )
                enable_automl = gr.Checkbox(
                    label="Enable AutoML",
                    value=False
                )
                run_button = gr.Button("Run Pipeline", variant="primary")

            with gr.Column(scale=2):
                file_info = gr.HTML(label="File Information")
                data_preview = gr.HTML(label="Data Preview")
                pipeline_output = gr.HTML(label="Pipeline Results")
                download_button = gr.File(
                    label="Download Report",
                    visible=False
                )

        # Event handlers
        # NOTE(review): target_column is listed twice in the outputs, as in
        # the original wiring; confirm against process_file_upload's returns.
        file_input.change(
            fn=self.process_file_upload,
            inputs=[file_input, learning_type],
            outputs=[file_info, data_state, target_column, target_column, data_preview]
        )
        learning_type.change(
            fn=self.update_target_column_visibility,
            inputs=[learning_type, data_state],
            outputs=[target_column]
        )
        run_button.click(
            fn=self.run_comprehensive_pipeline,
            inputs=[file_input, learning_type, target_column, domain, enable_deep_learning, enable_automl],
            outputs=[pipeline_output, download_button]
        )
        download_button.upload(
            fn=self.generate_report,
            inputs=[],
            outputs=[download_button]
        )

    demo.launch(**launch_kwargs)
1242
 
1243
# Example usage: when run as a script, build the UI object and launch it,
# passing share=True to launch().
# NOTE(review): confirm that launch() accepts a `share` keyword argument.
if __name__ == "__main__":
    pipeline_ui = DataSciencePipelineUI()
    pipeline_ui.launch(share=True)