Update app.py
Browse files
app.py
CHANGED
|
@@ -355,7 +355,7 @@ class SupervisorAgentMock:
|
|
| 355 |
'results': {
|
| 356 |
'Random Forest': {'accuracy': 0.87, 'f1_score': 0.85} if is_classification else {'rmse': 0.45, 'r2_score': 0.82},
|
| 357 |
'SVM': {'accuracy': 0.82, 'f1_score': 0.80} if is_classification else {'rmse': 0.52, 'r2_score': 0.78},
|
| 358 |
-
'
|
| 359 |
},
|
| 360 |
'feature_importance': {col: np.random.random() for col in df.columns if col != target_column and col in column_types['numeric']}
|
| 361 |
}
|
|
@@ -744,18 +744,25 @@ class DataSciencePipelineUI:
|
|
| 744 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Data cleaning completed successfully!</strong></p>
|
| 745 |
"""
|
| 746 |
|
| 747 |
-
def
|
| 748 |
-
"""Create a
|
| 749 |
try:
|
| 750 |
values = data[column].dropna()
|
| 751 |
if len(values) == 0:
|
| 752 |
return "<p>No valid data for histogram</p>"
|
| 753 |
|
|
|
|
|
|
|
| 754 |
plt.figure(figsize=(8, 6))
|
| 755 |
-
sns.histplot(values, bins=
|
| 756 |
plt.title(f'Distribution of {column}', fontsize=14)
|
| 757 |
plt.xlabel(column, fontsize=12)
|
| 758 |
plt.ylabel('Count', fontsize=12)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 759 |
|
| 760 |
html = self.create_plot_html(plt.gcf(), f"histogram_{column}")
|
| 761 |
plt.close()
|
|
@@ -767,19 +774,22 @@ class DataSciencePipelineUI:
|
|
| 767 |
except Exception as e:
|
| 768 |
return f"<p>Could not generate histogram for {column}: {str(e)}</p>"
|
| 769 |
|
| 770 |
-
def
|
| 771 |
-
"""Create a
|
| 772 |
try:
|
| 773 |
-
value_counts = data[column].value_counts().head(10)
|
| 774 |
labels = value_counts.index.tolist()
|
| 775 |
counts = value_counts.values.tolist()
|
| 776 |
|
| 777 |
plt.figure(figsize=(8, 6))
|
| 778 |
-
sns.barplot(x=
|
| 779 |
plt.title(f"{'Target Distribution' if is_target else f'Distribution of {column}'}", fontsize=14)
|
| 780 |
-
plt.xlabel(
|
| 781 |
-
plt.ylabel(
|
| 782 |
-
|
|
|
|
|
|
|
|
|
|
| 783 |
|
| 784 |
html = self.create_plot_html(plt.gcf(), f"bar_{column}")
|
| 785 |
plt.close()
|
|
@@ -791,8 +801,8 @@ class DataSciencePipelineUI:
|
|
| 791 |
except Exception as e:
|
| 792 |
return f"<p>Could not generate bar plot for {column}: {str(e)}</p>"
|
| 793 |
|
| 794 |
-
def
|
| 795 |
-
"""Create a
|
| 796 |
try:
|
| 797 |
x_values = data[x_col].dropna()
|
| 798 |
y_values = data[y_col].dropna()
|
|
@@ -800,14 +810,20 @@ class DataSciencePipelineUI:
|
|
| 800 |
if len(common_indices) < 2:
|
| 801 |
return f"<p>Not enough valid data for scatter plot between {x_col} and {y_col}</p>"
|
| 802 |
|
| 803 |
-
x_values = x_values.loc[common_indices].head(
|
| 804 |
-
y_values = y_values.loc[common_indices].head(
|
| 805 |
|
| 806 |
plt.figure(figsize=(8, 6))
|
| 807 |
plt.scatter(x_values, y_values, color='teal', alpha=0.6)
|
| 808 |
plt.title(f'{y_col} vs {x_col}', fontsize=14)
|
| 809 |
plt.xlabel(x_col, fontsize=12)
|
| 810 |
plt.ylabel(y_col, fontsize=12)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 811 |
|
| 812 |
html = self.create_plot_html(plt.gcf(), f"scatter_{x_col}_{y_col}")
|
| 813 |
plt.close()
|
|
@@ -819,14 +835,14 @@ class DataSciencePipelineUI:
|
|
| 819 |
except Exception as e:
|
| 820 |
return f"<p>Could not generate scatter plot for {x_col} vs {y_col}: {str(e)}</p>"
|
| 821 |
|
| 822 |
-
def
|
| 823 |
-
"""Create a
|
| 824 |
try:
|
| 825 |
corr_df = pd.DataFrame(correlation_matrix)
|
| 826 |
if corr_df.empty or len(corr_df.columns) < 2:
|
| 827 |
return "<p>Not enough numeric features for correlation analysis</p>"
|
| 828 |
|
| 829 |
-
plt.figure(figsize=(10, 8))
|
| 830 |
sns.heatmap(
|
| 831 |
corr_df,
|
| 832 |
annot=True,
|
|
@@ -836,8 +852,8 @@ class DataSciencePipelineUI:
|
|
| 836 |
center=0,
|
| 837 |
square=True,
|
| 838 |
fmt='.2f',
|
| 839 |
-
annot_kws={'size':
|
| 840 |
-
cbar_kws={'label': 'Correlation'}
|
| 841 |
)
|
| 842 |
plt.title('Correlation Matrix Heatmap', fontsize=14, pad=15)
|
| 843 |
plt.xticks(rotation=45, ha='right')
|
|
@@ -854,7 +870,7 @@ class DataSciencePipelineUI:
|
|
| 854 |
return f"<p>Could not generate correlation heatmap: {str(e)}</p>"
|
| 855 |
|
| 856 |
def _format_eda_results(self, results, data, learning_type=None, target_column=None):
|
| 857 |
-
"""Format EDA results with
|
| 858 |
if not results or results.get('status') != 'success' or data is None:
|
| 859 |
return "<p>EDA information not available or no data loaded</p>"
|
| 860 |
|
|
@@ -873,24 +889,32 @@ class DataSciencePipelineUI:
|
|
| 873 |
</div>
|
| 874 |
"""
|
| 875 |
|
|
|
|
| 876 |
if correlations.get('correlation_matrix'):
|
| 877 |
-
html += self.
|
| 878 |
|
|
|
|
| 879 |
if learning_type == "Supervised" and target_column and target_column in data.columns:
|
| 880 |
if target_column in column_types['numeric']:
|
| 881 |
-
numeric_cols = [col for col in column_types['numeric'] if col != target_column]
|
| 882 |
-
for col in numeric_cols
|
| 883 |
-
html += self.
|
| 884 |
elif target_column in column_types['categorical']:
|
| 885 |
-
html += self.
|
| 886 |
-
categorical_cols = [col for col in column_types['categorical'] if col != target_column]
|
| 887 |
-
for col in categorical_cols
|
| 888 |
-
html += self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 889 |
else:
|
|
|
|
| 890 |
for col in column_types['numeric'][:2]:
|
| 891 |
-
html += self.
|
| 892 |
for col in column_types['categorical'][:2]:
|
| 893 |
-
html += self.
|
| 894 |
|
| 895 |
html += """
|
| 896 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Exploratory Data Analysis completed!</strong></p>
|
|
@@ -926,103 +950,160 @@ class DataSciencePipelineUI:
|
|
| 926 |
"""
|
| 927 |
|
| 928 |
def _format_modeling_results(self, results, enable_deep_learning):
|
| 929 |
-
"""Format modeling results"""
|
| 930 |
if not results or results.get('status') != 'success':
|
| 931 |
return "<p>Modeling information not available</p>"
|
| 932 |
|
|
|
|
| 933 |
best_model = results.get('best_model', 'Unknown')
|
| 934 |
model_results = results.get('results', {})
|
| 935 |
-
|
| 936 |
|
| 937 |
html = f"""
|
| 938 |
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
|
| 939 |
-
<h4 style="margin: 0 0 15px 0; color: #e74c3c;"
|
| 940 |
-
<
|
| 941 |
-
|
| 942 |
-
|
| 943 |
<h5 style="color: #e74c3c;">📊 Model Performance:</h5>
|
| 944 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 945 |
"""
|
| 946 |
|
| 947 |
-
for
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 952 |
|
| 953 |
html += """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 954 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
</div>
|
| 956 |
-
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Model training completed!</strong></p>
|
| 957 |
"""
|
| 958 |
-
|
| 959 |
return html
|
| 960 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
def _format_unsupervised_results(self, data):
|
| 962 |
-
"""Format
|
| 963 |
if data is None:
|
| 964 |
return "<p>No data available for unsupervised analysis</p>"
|
| 965 |
-
|
| 966 |
-
n_samples = data.shape[0]
|
| 967 |
-
n_features = data.shape[1]
|
| 968 |
-
optimal_clusters = min(max(2, int(np.sqrt(n_samples/100))), 10)
|
| 969 |
-
|
| 970 |
-
return f"""
|
| 971 |
-
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin: 15px 0;">
|
| 972 |
-
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 973 |
-
<h4 style="margin: 0 0 15px 0; color: #9b59b6;">🔍 Clustering Analysis</h4>
|
| 974 |
-
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 15px; border-radius: 8px; text-align: center; margin-bottom: 15px;">
|
| 975 |
-
<h3 style="margin: 0;">K-Means Clustering</h3>
|
| 976 |
-
<p style="margin: 5px 0 0 0;">Optimal Clusters: {optimal_clusters}</p>
|
| 977 |
-
</div>
|
| 978 |
-
<div style="background: #f8f9fa; padding: 15px; border-radius: 6px;">
|
| 979 |
-
<p style="margin: 5px 0;"><strong>Silhouette Score:</strong> {0.65 + np.random.random() * 0.2:.3f}</p>
|
| 980 |
-
<p style="margin: 5px 0;"><strong>Inertia:</strong> {np.random.randint(500, 2000):,}</p>
|
| 981 |
-
<p style="margin: 5px 0;"><strong>Samples:</strong> {n_samples:,}</p>
|
| 982 |
-
</div>
|
| 983 |
-
</div>
|
| 984 |
-
|
| 985 |
-
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 986 |
-
<h4 style="margin: 0 0 15px 0; color: #9b59b6;">📊 Pattern Discovery</h4>
|
| 987 |
-
<p style="margin: 8px 0;"><strong>Natural Groups:</strong> {optimal_clusters} distinct clusters</p>
|
| 988 |
-
<p style="margin: 8px 0;"><strong>Anomalies:</strong> {np.random.randint(5, max(6, int(n_samples * 0.05)))} outliers detected</p>
|
| 989 |
-
<p style="margin: 8px 0;"><strong>Feature Space:</strong> {n_features}D analysis</p>
|
| 990 |
-
<p style="margin: 8px 0;"><strong>Variance Explained:</strong> {85 + np.random.randint(0, 10):.1f}%</p>
|
| 991 |
-
</div>
|
| 992 |
-
</div>
|
| 993 |
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
| 997 |
-
{''.join([f'''
|
| 998 |
-
<div style="background: {["#e8f5e8", "#fff3e0", "#e3f2fd", "#fce4ec", "#f3e5f5"][i % 5]}; padding: 15px; border-radius: 8px; border-left: 4px solid {["#27ae60", "#ff9800", "#2196f3", "#e91e63", "#9c27b0"][i % 5]};">
|
| 999 |
-
<h5 style="margin: 0 0 8px 0; color: {["#27ae60", "#ff9800", "#2196f3", "#e91e63", "#9c27b0"][i % 5]};">Cluster {i+1}</h5>
|
| 1000 |
-
<p style="margin: 0; font-size: 12px;">{["High-value segment", "Moderate characteristics", "Unique behavioral patterns", "Low-activity group", "Special interest group"][i % 5]}</p>
|
| 1001 |
-
<p style="margin: 5px 0 0 0; font-size: 11px; color: #666;">Size: {np.random.randint(10, max(11, int(n_samples/optimal_clusters * 1.5)))} samples</p>
|
| 1002 |
-
</div>
|
| 1003 |
-
''' for i in range(min(optimal_clusters, 5))])}
|
| 1004 |
-
</div>
|
| 1005 |
-
</div>
|
| 1006 |
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
<
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
|
| 1020 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1021 |
</div>
|
|
|
|
|
|
|
| 1022 |
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1026 |
</div>
|
| 1027 |
"""
|
| 1028 |
|
|
@@ -1031,359 +1112,135 @@ class DataSciencePipelineUI:
|
|
| 1031 |
key_insights = summary.get('key_insights', [])
|
| 1032 |
recommendations = summary.get('recommendations', [])
|
| 1033 |
|
| 1034 |
-
|
| 1035 |
-
<div style="background:
|
| 1036 |
-
<
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
|
| 1040 |
-
<div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 1041 |
-
<h4 style="margin: 0 0 20px 0; color: #2c3e50;">🔍 Key Insights</h4>
|
| 1042 |
-
{''.join([f'<div style="background: #e8f4f8; padding: 12px; margin: 8px 0; border-radius: 6px;">💡 {insight}</div>' for insight in key_insights])}
|
| 1043 |
-
</div>
|
| 1044 |
-
<div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 1045 |
-
<h4 style="margin: 0 0 20px 0; color: #2c3e50;">📝 Recommendations</h4>
|
| 1046 |
-
{''.join([f'<div style="background: #fff3e0; padding: 12px; margin: 8px 0; border-radius: 6px;">📌 {rec}</div>' for rec in recommendations])}
|
| 1047 |
-
</div>
|
| 1048 |
-
</div>
|
| 1049 |
"""
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
-
|
| 1054 |
-
|
| 1055 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1056 |
</div>
|
|
|
|
| 1057 |
"""
|
|
|
|
| 1058 |
|
| 1059 |
-
def
|
| 1060 |
-
"""Generate a
|
| 1061 |
-
if not self.pipeline_results
|
| 1062 |
-
return
|
| 1063 |
-
|
| 1064 |
-
try:
|
| 1065 |
-
# Sanitizing text for LaTeX
|
| 1066 |
-
def sanitize_latex(text):
|
| 1067 |
-
if not isinstance(text, str):
|
| 1068 |
-
text = str(text)
|
| 1069 |
-
replacements = {
|
| 1070 |
-
'&': r'\&',
|
| 1071 |
-
'%': r'\%',
|
| 1072 |
-
'$': r'\$',
|
| 1073 |
-
'#': r'\#',
|
| 1074 |
-
'_': r'\_',
|
| 1075 |
-
'{': r'\{',
|
| 1076 |
-
'}': r'\}',
|
| 1077 |
-
'~': r'\textasciitilde{}',
|
| 1078 |
-
'^': r'\textasciicircum{}',
|
| 1079 |
-
'\\': r'\textbackslash{}'
|
| 1080 |
-
}
|
| 1081 |
-
for old, new in replacements.items():
|
| 1082 |
-
text = text.replace(old, new)
|
| 1083 |
-
return text
|
| 1084 |
-
|
| 1085 |
-
# Extract pipeline results
|
| 1086 |
-
data_loading = self.pipeline_results.get('data_loading', {})
|
| 1087 |
-
data_cleaning = self.pipeline_results.get('data_cleaning', {})
|
| 1088 |
-
eda = self.pipeline_results.get('eda', {})
|
| 1089 |
-
domain_insights = self.pipeline_results.get('domain_insights', {})
|
| 1090 |
-
modeling = self.pipeline_results.get('modeling', {})
|
| 1091 |
-
summary = self.pipeline_results.get('summary', {})
|
| 1092 |
-
|
| 1093 |
-
# Start LaTeX document
|
| 1094 |
-
latex_content = r"""
|
| 1095 |
-
\documentclass[a4paper,12pt]{article}
|
| 1096 |
-
\usepackage[utf8]{inputenc}
|
| 1097 |
-
\usepackage{geometry}
|
| 1098 |
-
\usepackage{graphicx}
|
| 1099 |
-
\usepackage{booktabs}
|
| 1100 |
-
\usepackage{amsmath}
|
| 1101 |
-
\usepackage{xcolor}
|
| 1102 |
-
\usepackage{enumitem}
|
| 1103 |
-
\usepackage{hyperref}
|
| 1104 |
-
\geometry{margin=1in}
|
| 1105 |
-
\title{Data Science Pipeline Report}
|
| 1106 |
-
\author{Automated Analysis}
|
| 1107 |
-
\date{Generated on \today}
|
| 1108 |
-
\begin{document}
|
| 1109 |
-
\maketitle
|
| 1110 |
-
\tableofcontents
|
| 1111 |
-
\newpage
|
| 1112 |
-
|
| 1113 |
-
% Defining section: Introduction
|
| 1114 |
-
\section{Introduction}
|
| 1115 |
-
This report presents the results of an automated data science pipeline executed on the uploaded dataset. The analysis includes data loading, cleaning, exploratory data analysis (EDA), domain-specific insights, and modeling results. The configuration used is:
|
| 1116 |
-
\begin{itemize}
|
| 1117 |
-
\item \textbf{Learning Type}: """ + sanitize_latex(learning_type) + r"""
|
| 1118 |
-
\item \textbf{Target Column}: """ + (sanitize_latex(target_column) if target_column else "None (Unsupervised)") + r"""
|
| 1119 |
-
\item \textbf{Domain}: """ + sanitize_latex(domain or "General") + r"""
|
| 1120 |
-
\item \textbf{Deep Learning}: """ + ("Enabled" if enable_deep_learning else "Disabled") + r"""
|
| 1121 |
-
\item \textbf{AutoML}: """ + ("Enabled" if enable_automl else "Disabled") + r"""
|
| 1122 |
-
\end{itemize}
|
| 1123 |
-
|
| 1124 |
-
% Defining section: Data Loading
|
| 1125 |
-
\section{Data Loading}
|
| 1126 |
-
"""
|
| 1127 |
-
|
| 1128 |
-
if data_loading.get('status') == 'success':
|
| 1129 |
-
info = data_loading.get('info', {})
|
| 1130 |
-
shape = info.get('shape', (0, 0))
|
| 1131 |
-
column_types = info.get('column_types', {})
|
| 1132 |
-
latex_content += r"""
|
| 1133 |
-
The dataset was successfully loaded with the following details:
|
| 1134 |
-
\begin{itemize}
|
| 1135 |
-
\item \textbf{Rows}: """ + f"{shape[0]:,}" + r"""
|
| 1136 |
-
\item \textbf{Columns}: """ + f"{shape[1]}" + r"""
|
| 1137 |
-
\item \textbf{Memory Usage}: """ + sanitize_latex(info.get('memory_usage', 'Unknown')) + r"""
|
| 1138 |
-
\item \textbf{Numeric Columns}: """ + f"{len(column_types.get('numeric', []))}" + r"""
|
| 1139 |
-
\item \textbf{Categorical Columns}: """ + f"{len(column_types.get('categorical', []))}" + r"""
|
| 1140 |
-
\item \textbf{DateTime Columns}: """ + f"{len(column_types.get('datetime', []))}" + r"""
|
| 1141 |
-
\end{itemize}
|
| 1142 |
-
"""
|
| 1143 |
-
|
| 1144 |
-
# Defining section: Data Cleaning
|
| 1145 |
-
latex_content += r"""
|
| 1146 |
-
\section{Data Cleaning}
|
| 1147 |
-
"""
|
| 1148 |
-
if data_cleaning.get('status') == 'success':
|
| 1149 |
-
report = data_cleaning.get('cleaning_report', {})
|
| 1150 |
-
duplicates = report.get('duplicates_removed', 0)
|
| 1151 |
-
missing_values = report.get('missing_values', {})
|
| 1152 |
-
total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
|
| 1153 |
-
latex_content += r"""
|
| 1154 |
-
The data cleaning process identified and handled:
|
| 1155 |
-
\begin{itemize}
|
| 1156 |
-
\item \textbf{Duplicates Removed}: """ + f"{duplicates}" + r"""
|
| 1157 |
-
\item \textbf{Missing Values}: """ + f"{total_missing}" + r"""
|
| 1158 |
-
\end{itemize}
|
| 1159 |
-
"""
|
| 1160 |
-
|
| 1161 |
-
# Defining section: Exploratory Data Analysis
|
| 1162 |
-
latex_content += r"""
|
| 1163 |
-
\section{Exploratory Data Analysis}
|
| 1164 |
-
"""
|
| 1165 |
-
if eda.get('status') == 'success':
|
| 1166 |
-
column_types = eda.get('analysis', {}).get('column_types', {})
|
| 1167 |
-
latex_content += r"""
|
| 1168 |
-
The EDA revealed:
|
| 1169 |
-
\begin{itemize}
|
| 1170 |
-
\item \textbf{Numeric Features}: """ + f"{len(column_types.get('numeric', []))}" + r"""
|
| 1171 |
-
\item \textbf{Categorical Features}: """ + f"{len(column_types.get('categorical', []))}" + r"""
|
| 1172 |
-
\item \textbf{DateTime Features}: """ + f"{len(column_types.get('datetime', []))}" + r"""
|
| 1173 |
-
\end{itemize}
|
| 1174 |
-
"""
|
| 1175 |
-
|
| 1176 |
-
# Adding plots
|
| 1177 |
-
if 'correlation_heatmap' in self.plot_images:
|
| 1178 |
-
latex_content += r"""
|
| 1179 |
-
\subsection{Correlation Heatmap}
|
| 1180 |
-
\begin{figure}[h]
|
| 1181 |
-
\centering
|
| 1182 |
-
\includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images['correlation_heatmap'] + r"""}
|
| 1183 |
-
\caption{Correlation Matrix Heatmap}
|
| 1184 |
-
\end{figure}
|
| 1185 |
-
"""
|
| 1186 |
-
|
| 1187 |
-
for col in column_types.get('numeric', [])[:2]:
|
| 1188 |
-
if f"histogram_{col}" in self.plot_images:
|
| 1189 |
-
latex_content += r"""
|
| 1190 |
-
\subsection{Histogram of """ + sanitize_latex(col) + r"""}
|
| 1191 |
-
\begin{figure}[h]
|
| 1192 |
-
\centering
|
| 1193 |
-
\includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images[f"histogram_{col}"] + r"""}
|
| 1194 |
-
\caption{Distribution of """ + sanitize_latex(col) + r"""}
|
| 1195 |
-
\end{figure}
|
| 1196 |
-
"""
|
| 1197 |
-
|
| 1198 |
-
for col in column_types.get('categorical', [])[:2]:
|
| 1199 |
-
if f"bar_{col}" in self.plot_images:
|
| 1200 |
-
latex_content += r"""
|
| 1201 |
-
\subsection{Distribution of """ + sanitize_latex(col) + r"""}
|
| 1202 |
-
\begin{figure}[h]
|
| 1203 |
-
\centering
|
| 1204 |
-
\includegraphics[width=0.8\textwidth]{data:image/png;base64,""" + self.plot_images[f"bar_{col}"] + r"""}
|
| 1205 |
-
\caption{Distribution of """ + sanitize_latex(col) + r"""}
|
| 1206 |
-
\end{figure}
|
| 1207 |
-
"""
|
| 1208 |
-
|
| 1209 |
-
# Defining section: Domain Analysis
|
| 1210 |
-
latex_content += r"""
|
| 1211 |
-
\section{Domain Analysis}
|
| 1212 |
-
"""
|
| 1213 |
-
if domain_insights:
|
| 1214 |
-
domain = domain_insights.get('detected_domain', 'general')
|
| 1215 |
-
insights = domain_insights.get('insights', [])
|
| 1216 |
-
recommendations = domain_insights.get('recommendations', [])
|
| 1217 |
-
latex_content += r"""
|
| 1218 |
-
\textbf{Detected Domain}: """ + sanitize_latex(domain) + r"""
|
| 1219 |
-
\subsection{Key Insights}
|
| 1220 |
-
\begin{itemize}
|
| 1221 |
-
""" + ''.join([f"\\item {sanitize_latex(insight)}" for insight in insights[:5]]) + r"""
|
| 1222 |
-
\end{itemize}
|
| 1223 |
-
\subsection{Recommendations}
|
| 1224 |
-
\begin{itemize}
|
| 1225 |
-
""" + ''.join([f"\\item {sanitize_latex(rec)}" for rec in recommendations[:5]]) + r"""
|
| 1226 |
-
\end{itemize}
|
| 1227 |
-
"""
|
| 1228 |
-
|
| 1229 |
-
# Defining section: Modeling or Unsupervised Analysis
|
| 1230 |
-
if learning_type == "Supervised" and modeling:
|
| 1231 |
-
latex_content += r"""
|
| 1232 |
-
\section{Model Training and Evaluation}
|
| 1233 |
-
"""
|
| 1234 |
-
if modeling.get('status') == 'success':
|
| 1235 |
-
best_model = modeling.get('best_model', 'Unknown')
|
| 1236 |
-
problem_type = modeling.get('problem_type', 'classification')
|
| 1237 |
-
model_results = modeling.get('results', {})
|
| 1238 |
-
latex_content += r"""
|
| 1239 |
-
\textbf{Best Model}: """ + sanitize_latex(best_model) + r"""
|
| 1240 |
-
\newline
|
| 1241 |
-
\textbf{Problem Type}: """ + sanitize_latex(problem_type.title()) + r"""
|
| 1242 |
-
\newline
|
| 1243 |
-
\textbf{Model Performance}:
|
| 1244 |
-
\begin{itemize}
|
| 1245 |
-
""" + ''.join([f"\\item \\textbf{{{sanitize_latex(model_name)}}}: " + ", ".join([f"{metric_name}: {metric_value:.3f}" for metric_name, metric_value in metrics.items()]) for model_name, metrics in model_results.items()]) + r"""
|
| 1246 |
-
\end{itemize}
|
| 1247 |
-
"""
|
| 1248 |
-
else:
|
| 1249 |
-
latex_content += r"""
|
| 1250 |
-
\section{Unsupervised Analysis}
|
| 1251 |
-
"""
|
| 1252 |
-
n_samples = self.current_data.shape[0]
|
| 1253 |
-
optimal_clusters = min(max(2, int(np.sqrt(n_samples/100))), 10)
|
| 1254 |
-
latex_content += r"""
|
| 1255 |
-
\textbf{Clustering Analysis}:
|
| 1256 |
-
\begin{itemize}
|
| 1257 |
-
\item Optimal Clusters: """ + f"{optimal_clusters}" + r"""
|
| 1258 |
-
\item Silhouette Score: """ + f"{0.65 + np.random.random() * 0.2:.3f}" + r"""
|
| 1259 |
-
\end{itemize}
|
| 1260 |
-
"""
|
| 1261 |
-
|
| 1262 |
-
# Defining section: Summary
|
| 1263 |
-
latex_content += r"""
|
| 1264 |
-
\section{Summary and Recommendations}
|
| 1265 |
-
"""
|
| 1266 |
-
key_insights = summary.get('key_insights', [])
|
| 1267 |
-
recommendations = summary.get('recommendations', [])
|
| 1268 |
-
latex_content += r"""
|
| 1269 |
-
\subsection{Key Insights}
|
| 1270 |
-
\begin{itemize}
|
| 1271 |
-
""" + ''.join([f"\\item {sanitize_latex(insight)}" for insight in key_insights]) + r"""
|
| 1272 |
-
\end{itemize}
|
| 1273 |
-
\subsection{Recommendations}
|
| 1274 |
-
\begin{itemize}
|
| 1275 |
-
""" + ''.join([f"\\item {sanitize_latex(rec)}" for rec in recommendations]) + r"""
|
| 1276 |
-
\end{itemize}
|
| 1277 |
-
"""
|
| 1278 |
-
|
| 1279 |
-
# Ending document
|
| 1280 |
-
latex_content += r"""
|
| 1281 |
-
\end{document}
|
| 1282 |
-
"""
|
| 1283 |
-
|
| 1284 |
-
# Write LaTeX content to a temporary file
|
| 1285 |
-
with open('report.tex', 'w') as f:
|
| 1286 |
-
f.write(latex_content)
|
| 1287 |
-
|
| 1288 |
-
# Compile LaTeX to PDF
|
| 1289 |
-
subprocess.run(['latexmk', '-pdf', '-silent', 'report.tex'], check=True)
|
| 1290 |
|
| 1291 |
-
|
| 1292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1293 |
|
| 1294 |
-
|
| 1295 |
-
|
|
|
|
|
|
|
|
|
|
| 1296 |
|
| 1297 |
-
def
|
| 1298 |
-
"""
|
| 1299 |
-
with gr.Blocks(css=self.custom_css
|
| 1300 |
-
gr.Markdown("
|
| 1301 |
-
|
|
|
|
|
|
|
| 1302 |
|
| 1303 |
with gr.Row():
|
| 1304 |
with gr.Column(scale=1):
|
| 1305 |
-
|
| 1306 |
-
label="📁 Upload Dataset",
|
| 1307 |
-
file_types=[".csv", ".json"],
|
| 1308 |
-
type="filepath"
|
| 1309 |
-
)
|
| 1310 |
learning_type = gr.Radio(
|
| 1311 |
-
choices=["Supervised", "Unsupervised"],
|
| 1312 |
-
label="
|
| 1313 |
value="Supervised"
|
| 1314 |
)
|
| 1315 |
target_column = gr.Dropdown(
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
visible=True
|
| 1319 |
-
allow_custom_value=True
|
| 1320 |
)
|
| 1321 |
domain = gr.Textbox(
|
| 1322 |
-
label="
|
| 1323 |
-
placeholder="
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1324 |
)
|
| 1325 |
-
|
| 1326 |
-
|
| 1327 |
-
|
| 1328 |
-
label="🧠 Enable Deep Learning",
|
| 1329 |
-
value=False
|
| 1330 |
-
)
|
| 1331 |
-
enable_automl = gr.Checkbox(
|
| 1332 |
-
label="🤖 Enable AutoML",
|
| 1333 |
-
value=True
|
| 1334 |
-
)
|
| 1335 |
-
|
| 1336 |
-
run_btn = gr.Button(
|
| 1337 |
-
"🚀 Run Complete Pipeline",
|
| 1338 |
-
variant="primary",
|
| 1339 |
-
size="lg"
|
| 1340 |
)
|
| 1341 |
-
|
| 1342 |
-
|
| 1343 |
-
|
| 1344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1345 |
visible=False
|
| 1346 |
)
|
| 1347 |
|
| 1348 |
-
|
| 1349 |
-
|
| 1350 |
-
preview = gr.HTML(label="👀 Data Preview")
|
| 1351 |
-
|
| 1352 |
-
output = gr.HTML(label="📈 Pipeline Results")
|
| 1353 |
-
report_output = gr.File(label="📄 Download Analysis Report")
|
| 1354 |
-
report_status = gr.Textbox(label="Report Status", visible=False)
|
| 1355 |
-
|
| 1356 |
-
file_type_state = gr.State("")
|
| 1357 |
-
columns_state = gr.State([])
|
| 1358 |
-
|
| 1359 |
-
file_upload.change(
|
| 1360 |
fn=self.process_file_upload,
|
| 1361 |
-
inputs=[
|
| 1362 |
-
outputs=[
|
| 1363 |
)
|
| 1364 |
-
|
| 1365 |
learning_type.change(
|
| 1366 |
fn=self.update_target_column_visibility,
|
| 1367 |
-
inputs=[learning_type,
|
| 1368 |
outputs=[target_column]
|
| 1369 |
)
|
| 1370 |
-
|
| 1371 |
-
run_btn.click(
|
| 1372 |
fn=self.run_comprehensive_pipeline,
|
| 1373 |
-
inputs=[
|
| 1374 |
-
outputs=[
|
| 1375 |
)
|
| 1376 |
-
|
| 1377 |
-
|
| 1378 |
-
|
| 1379 |
-
|
| 1380 |
-
outputs=[report_output, report_status]
|
| 1381 |
)
|
| 1382 |
|
| 1383 |
-
|
| 1384 |
|
|
|
|
| 1385 |
if __name__ == "__main__":
|
| 1386 |
-
|
| 1387 |
-
|
| 1388 |
-
demo = ui.create_interface()
|
| 1389 |
-
demo.launch(share=True)
|
|
|
|
| 355 |
'results': {
|
| 356 |
'Random Forest': {'accuracy': 0.87, 'f1_score': 0.85} if is_classification else {'rmse': 0.45, 'r2_score': 0.82},
|
| 357 |
'SVM': {'accuracy': 0.82, 'f1_score': 0.80} if is_classification else {'rmse': 0.52, 'r2_score': 0.78},
|
| 358 |
+
'LogisticRegression': {'accuracy': 0.78, 'f1_score': 0.76} if is_classification else {'rmse': 0.58, 'r2_score': 0.74}
|
| 359 |
},
|
| 360 |
'feature_importance': {col: np.random.random() for col in df.columns if col != target_column and col in column_types['numeric']}
|
| 361 |
}
|
|
|
|
| 744 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Data cleaning completed successfully!</strong></p>
|
| 745 |
"""
|
| 746 |
|
| 747 |
+
def _create_dynamic_histogram(self, data, column):
|
| 748 |
+
"""Create a dynamic histogram for a numeric column"""
|
| 749 |
try:
|
| 750 |
values = data[column].dropna()
|
| 751 |
if len(values) == 0:
|
| 752 |
return "<p>No valid data for histogram</p>"
|
| 753 |
|
| 754 |
+
# Dynamically adjust number of bins based on data size and spread
|
| 755 |
+
n_bins = min(max(int(np.sqrt(len(values))), 10), 50)
|
| 756 |
plt.figure(figsize=(8, 6))
|
| 757 |
+
sns.histplot(values, bins=n_bins, kde=True, color='skyblue')
|
| 758 |
plt.title(f'Distribution of {column}', fontsize=14)
|
| 759 |
plt.xlabel(column, fontsize=12)
|
| 760 |
plt.ylabel('Count', fontsize=12)
|
| 761 |
+
|
| 762 |
+
# Add range and stats annotations
|
| 763 |
+
stats_text = f'Min: {values.min():.2f}\nMax: {values.max():.2f}\nMean: {values.mean():.2f}'
|
| 764 |
+
plt.text(0.95, 0.95, stats_text, transform=plt.gca().transAxes, ha='right', va='top',
|
| 765 |
+
bbox=dict(facecolor='white', alpha=0.8))
|
| 766 |
|
| 767 |
html = self.create_plot_html(plt.gcf(), f"histogram_{column}")
|
| 768 |
plt.close()
|
|
|
|
| 774 |
except Exception as e:
|
| 775 |
return f"<p>Could not generate histogram for {column}: {str(e)}</p>"
|
| 776 |
|
| 777 |
+
def _create_dynamic_bar(self, data, column, is_target=False):
|
| 778 |
+
"""Create a dynamic bar plot for a categorical column"""
|
| 779 |
try:
|
| 780 |
+
value_counts = data[column].value_counts().head(10) # Limit to top 10 categories
|
| 781 |
labels = value_counts.index.tolist()
|
| 782 |
counts = value_counts.values.tolist()
|
| 783 |
|
| 784 |
plt.figure(figsize=(8, 6))
|
| 785 |
+
sns.barplot(x=counts, y=labels, palette='tab10')
|
| 786 |
plt.title(f"{'Target Distribution' if is_target else f'Distribution of {column}'}", fontsize=14)
|
| 787 |
+
plt.xlabel('Count', fontsize=12)
|
| 788 |
+
plt.ylabel(column, fontsize=12)
|
| 789 |
+
|
| 790 |
+
# Add total count annotation
|
| 791 |
+
plt.text(0.95, 0.95, f'Total: {sum(counts)}',
|
| 792 |
+
transform=plt.gca().transAxes, ha='right', va='top', bbox=dict(facecolor='white', alpha=0.8))
|
| 793 |
|
| 794 |
html = self.create_plot_html(plt.gcf(), f"bar_{column}")
|
| 795 |
plt.close()
|
|
|
|
| 801 |
except Exception as e:
|
| 802 |
return f"<p>Could not generate bar plot for {column}: {str(e)}</p>"
|
| 803 |
|
| 804 |
+
def _create_dynamic_scatter(self, data, x_col, y_col, target=False):
|
| 805 |
+
"""Create a dynamic scatter plot for regression analysis"""
|
| 806 |
try:
|
| 807 |
x_values = data[x_col].dropna()
|
| 808 |
y_values = data[y_col].dropna()
|
|
|
|
| 810 |
if len(common_indices) < 2:
|
| 811 |
return f"<p>Not enough valid data for scatter plot between {x_col} and {y_col}</p>"
|
| 812 |
|
| 813 |
+
x_values = x_values.loc[common_indices].head(1000) # Limit to 1000 points for performance
|
| 814 |
+
y_values = y_values.loc[common_indices].head(1000)
|
| 815 |
|
| 816 |
plt.figure(figsize=(8, 6))
|
| 817 |
plt.scatter(x_values, y_values, color='teal', alpha=0.6)
|
| 818 |
plt.title(f'{y_col} vs {x_col}', fontsize=14)
|
| 819 |
plt.xlabel(x_col, fontsize=12)
|
| 820 |
plt.ylabel(y_col, fontsize=12)
|
| 821 |
+
|
| 822 |
+
# Add range and correlation annotations
|
| 823 |
+
corr = np.corrcoef(x_values, y_values)[0, 1] if len(x_values) > 1 else 0
|
| 824 |
+
stats_text = f'X Range: {x_values.min():.2f} to {x_values.max():.2f}\nY Range: {y_values.min():.2f} to {y_values.max():.2f}\nCorr: {corr:.2f}'
|
| 825 |
+
plt.text(0.95, 0.95, stats_text, transform=plt.gca().transAxes, ha='right', va='top',
|
| 826 |
+
bbox=dict(facecolor='white', alpha=0.8))
|
| 827 |
|
| 828 |
html = self.create_plot_html(plt.gcf(), f"scatter_{x_col}_{y_col}")
|
| 829 |
plt.close()
|
|
|
|
| 835 |
except Exception as e:
|
| 836 |
return f"<p>Could not generate scatter plot for {x_col} vs {y_col}: {str(e)}</p>"
|
| 837 |
|
| 838 |
+
def _create_dynamic_correlation_heatmap(self, correlation_matrix):
|
| 839 |
+
"""Create a dynamic correlation heatmap"""
|
| 840 |
try:
|
| 841 |
corr_df = pd.DataFrame(correlation_matrix)
|
| 842 |
if corr_df.empty or len(corr_df.columns) < 2:
|
| 843 |
return "<p>Not enough numeric features for correlation analysis</p>"
|
| 844 |
|
| 845 |
+
plt.figure(figsize=(min(10, len(corr_df.columns) * 1.2), min(8, len(corr_df.columns) * 1)))
|
| 846 |
sns.heatmap(
|
| 847 |
corr_df,
|
| 848 |
annot=True,
|
|
|
|
| 852 |
center=0,
|
| 853 |
square=True,
|
| 854 |
fmt='.2f',
|
| 855 |
+
annot_kws={'size': max(8, 12 - len(corr_df.columns) // 2)},
|
| 856 |
+
cbar_kws={'label': 'Correlation Coefficient'}
|
| 857 |
)
|
| 858 |
plt.title('Correlation Matrix Heatmap', fontsize=14, pad=15)
|
| 859 |
plt.xticks(rotation=45, ha='right')
|
|
|
|
| 870 |
return f"<p>Could not generate correlation heatmap: {str(e)}</p>"
|
| 871 |
|
| 872 |
def _format_eda_results(self, results, data, learning_type=None, target_column=None):
|
| 873 |
+
"""Format EDA results with dynamic visualizations"""
|
| 874 |
if not results or results.get('status') != 'success' or data is None:
|
| 875 |
return "<p>EDA information not available or no data loaded</p>"
|
| 876 |
|
|
|
|
| 889 |
</div>
|
| 890 |
"""
|
| 891 |
|
| 892 |
+
# Add correlation heatmap if available
|
| 893 |
if correlations.get('correlation_matrix'):
|
| 894 |
+
html += self._create_dynamic_correlation_heatmap(correlations['correlation_matrix'])
|
| 895 |
|
| 896 |
+
# Dynamic visualization selection based on learning type and data
|
| 897 |
if learning_type == "Supervised" and target_column and target_column in data.columns:
|
| 898 |
if target_column in column_types['numeric']:
|
| 899 |
+
numeric_cols = [col for col in column_types['numeric'] if col != target_column][:2]
|
| 900 |
+
for col in numeric_cols:
|
| 901 |
+
html += self._create_dynamic_scatter(data, col, target_column, target=True)
|
| 902 |
elif target_column in column_types['categorical']:
|
| 903 |
+
html += self._create_dynamic_bar(data, target_column, is_target=True)
|
| 904 |
+
categorical_cols = [col for col in column_types['categorical'] if col != target_column][:2]
|
| 905 |
+
for col in categorical_cols:
|
| 906 |
+
html += self._create_dynamic_bar(data, col)
|
| 907 |
+
# Add one numeric histogram and one categorical bar plot for context
|
| 908 |
+
if column_types['numeric']:
|
| 909 |
+
html += self._create_dynamic_histogram(data, column_types['numeric'][0])
|
| 910 |
+
if column_types['categorical'] and target_column not in column_types['categorical']:
|
| 911 |
+
html += self._create_dynamic_bar(data, column_types['categorical'][0])
|
| 912 |
else:
|
| 913 |
+
# For unsupervised learning or no target, show up to 2 histograms and 2 bar plots
|
| 914 |
for col in column_types['numeric'][:2]:
|
| 915 |
+
html += self._create_dynamic_histogram(data, col)
|
| 916 |
for col in column_types['categorical'][:2]:
|
| 917 |
+
html += self._create_dynamic_bar(data, col)
|
| 918 |
|
| 919 |
html += """
|
| 920 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Exploratory Data Analysis completed!</strong></p>
|
|
|
|
| 950 |
"""
|
| 951 |
|
| 952 |
def _format_modeling_results(self, results, enable_deep_learning):
|
| 953 |
+
"""Format modeling results with visualizations"""
|
| 954 |
if not results or results.get('status') != 'success':
|
| 955 |
return "<p>Modeling information not available</p>"
|
| 956 |
|
| 957 |
+
problem_type = results.get('problem_type', 'unknown')
|
| 958 |
best_model = results.get('best_model', 'Unknown')
|
| 959 |
model_results = results.get('results', {})
|
| 960 |
+
feature_importance = results.get('feature_importance', {})
|
| 961 |
|
| 962 |
html = f"""
|
| 963 |
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
|
| 964 |
+
<h4 style="margin: 0 0 15px 0; color: #e74c3c;">🤖 Modeling Results</h4>
|
| 965 |
+
<div style="background: linear-gradient(135deg, #ff6b6b 0%, #e74c3c 100%); color: white; padding: 15px; border-radius: 8px; text-align: center; margin-bottom: 15px;">
|
| 966 |
+
<h3 style="margin: 0;">Best Model: {best_model} ({problem_type.title()})</h3>
|
| 967 |
+
</div>
|
| 968 |
<h5 style="color: #e74c3c;">📊 Model Performance:</h5>
|
| 969 |
+
<table style="width: 100%; border-collapse: collapse; margin: 15px 0;">
|
| 970 |
+
<thead>
|
| 971 |
+
<tr style="background-color: #e74c3c; color: white;">
|
| 972 |
+
<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Model</th>
|
| 973 |
+
<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">
|
| 974 |
+
{'Accuracy' if problem_type == 'classification' else 'RMSE'}
|
| 975 |
+
</th>
|
| 976 |
+
<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">
|
| 977 |
+
{'F1 Score' if problem_type == 'classification' else 'R² Score'}
|
| 978 |
+
</th>
|
| 979 |
+
</tr>
|
| 980 |
+
</thead>
|
| 981 |
+
<tbody>
|
| 982 |
"""
|
| 983 |
|
| 984 |
+
for model, metrics in model_results.items():
|
| 985 |
+
metric1 = metrics.get('accuracy' if problem_type == 'classification' else 'rmse', 'N/A')
|
| 986 |
+
metric2 = metrics.get('f1_score' if problem_type == 'classification' else 'r2_score', 'N/A')
|
| 987 |
+
html += f"""
|
| 988 |
+
<tr style="background-color: {'#f9f9f9' if list(model_results.keys()).index(model) % 2 == 0 else 'white'};">
|
| 989 |
+
<td style="padding: 8px; border: 1px solid #ddd;">{model}</td>
|
| 990 |
+
<td style="padding: 8px; border: 1px solid #ddd;">{metric1:.3f}</td>
|
| 991 |
+
<td style="padding: 8px; border: 1px solid #ddd;">{metric2:.3f}</td>
|
| 992 |
+
</tr>
|
| 993 |
+
"""
|
| 994 |
|
| 995 |
html += """
|
| 996 |
+
</tbody>
|
| 997 |
+
</table>
|
| 998 |
+
"""
|
| 999 |
+
|
| 1000 |
+
if feature_importance:
|
| 1001 |
+
html += self._create_feature_importance_plot(feature_importance)
|
| 1002 |
+
|
| 1003 |
+
if enable_deep_learning:
|
| 1004 |
+
html += """
|
| 1005 |
+
<div style="background: #e8f4f8; padding: 15px; border-radius: 8px; margin-top: 15px;">
|
| 1006 |
+
<h5 style="color: #2c3e50; margin: 0 0 10px 0;">🧠 Deep Learning Status</h5>
|
| 1007 |
+
<p style="margin: 0;">Deep learning models were evaluated but not included in final results due to complexity constraints.</p>
|
| 1008 |
</div>
|
| 1009 |
+
"""
|
| 1010 |
+
|
| 1011 |
+
html += """
|
| 1012 |
+
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Model training and evaluation completed!</strong></p>
|
| 1013 |
</div>
|
|
|
|
| 1014 |
"""
|
|
|
|
| 1015 |
return html
|
| 1016 |
|
| 1017 |
+
def _create_feature_importance_plot(self, feature_importance):
|
| 1018 |
+
"""Create a dynamic feature importance bar plot"""
|
| 1019 |
+
try:
|
| 1020 |
+
features = list(feature_importance.keys())
|
| 1021 |
+
importances = list(feature_importance.values())
|
| 1022 |
+
|
| 1023 |
+
plt.figure(figsize=(8, max(6, len(features) * 0.5)))
|
| 1024 |
+
sns.barplot(x=importances, y=features, palette='viridis')
|
| 1025 |
+
plt.title('Feature Importance', fontsize=14)
|
| 1026 |
+
plt.xlabel('Importance Score', fontsize=12)
|
| 1027 |
+
plt.ylabel('Features', fontsize=12)
|
| 1028 |
+
|
| 1029 |
+
# Add value annotations
|
| 1030 |
+
for i, v in enumerate(importances):
|
| 1031 |
+
plt.text(v, i, f'{v:.3f}', va='center', ha='left', color='black', fontsize=10)
|
| 1032 |
+
|
| 1033 |
+
html = self.create_plot_html(plt.gcf(), "feature_importance")
|
| 1034 |
+
plt.close()
|
| 1035 |
+
|
| 1036 |
+
return f"""
|
| 1037 |
+
{html}
|
| 1038 |
+
<p style="color: #6c757d; font-size: 12px; text-align: center;">Bar plot showing feature importance scores</p>
|
| 1039 |
+
"""
|
| 1040 |
+
except Exception as e:
|
| 1041 |
+
return f"<p>Could not generate feature importance plot: {str(e)}</p>"
|
| 1042 |
+
|
| 1043 |
def _format_unsupervised_results(self, data):
|
| 1044 |
+
"""Format unsupervised analysis results with dynamic clustering visualization"""
|
| 1045 |
if data is None:
|
| 1046 |
return "<p>No data available for unsupervised analysis</p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1047 |
|
| 1048 |
+
column_types = self.analyzer.detect_column_types(data)
|
| 1049 |
+
numeric_cols = column_types['numeric']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1050 |
|
| 1051 |
+
html = """
|
| 1052 |
+
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
|
| 1053 |
+
<h4 style="margin: 0 0 15px 0; color: #8e44ad;">🔍 Unsupervised Analysis Results</h4>
|
| 1054 |
+
<p style="margin: 0 0 10px 0;">Performed clustering analysis to identify natural groupings in the data.</p>
|
| 1055 |
+
"""
|
| 1056 |
+
|
| 1057 |
+
if len(numeric_cols) >= 2:
|
| 1058 |
+
try:
|
| 1059 |
+
# Perform KMeans clustering with dynamic number of clusters
|
| 1060 |
+
X = data[numeric_cols].dropna().head(1000)
|
| 1061 |
+
n_clusters = min(3, len(X) // 10) if len(X) > 10 else 2
|
| 1062 |
+
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
|
| 1063 |
+
clusters = kmeans.fit_predict(X)
|
| 1064 |
+
|
| 1065 |
+
plt.figure(figsize=(8, 6))
|
| 1066 |
+
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=clusters, cmap='viridis', alpha=0.6)
|
| 1067 |
+
plt.title(f'Clustering: {numeric_cols[0]} vs {numeric_cols[1]}', fontsize=14)
|
| 1068 |
+
plt.xlabel(numeric_cols[0], fontsize=12)
|
| 1069 |
+
plt.ylabel(numeric_cols[1], fontsize=12)
|
| 1070 |
+
|
| 1071 |
+
# Add cluster count annotation
|
| 1072 |
+
plt.text(0.95, 0.95, f'Clusters: {n_clusters}',
|
| 1073 |
+
transform=plt.gca().transAxes, ha='right', va='top',
|
| 1074 |
+
bbox=dict(facecolor='white', alpha=0.8))
|
| 1075 |
+
|
| 1076 |
+
html += self.create_plot_html(plt.gcf(), "clustering_plot")
|
| 1077 |
+
plt.close()
|
| 1078 |
+
|
| 1079 |
+
html += f"""
|
| 1080 |
+
<p style="color: #6c757d; font-size: 12px; text-align: center;">
|
| 1081 |
+
Scatter plot showing clusters based on {numeric_cols[0]} and {numeric_cols[1]}
|
| 1082 |
+
</p>
|
| 1083 |
+
"""
|
| 1084 |
+
except Exception as e:
|
| 1085 |
+
html += f"<p>Could not generate clustering plot: {str(e)}</p>"
|
| 1086 |
+
else:
|
| 1087 |
+
html += "<p>Not enough numeric columns for clustering visualization</p>"
|
| 1088 |
+
|
| 1089 |
+
html += """
|
| 1090 |
+
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Unsupervised analysis completed!</strong></p>
|
| 1091 |
</div>
|
| 1092 |
+
"""
|
| 1093 |
+
return html
|
| 1094 |
|
| 1095 |
+
def _create_completion_footer(self, learning_type, domain, enable_deep_learning, enable_automl):
|
| 1096 |
+
"""Create completion footer with summary information"""
|
| 1097 |
+
completion_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
| 1098 |
+
return f"""
|
| 1099 |
+
<div style="background: linear-gradient(135deg, #2ecc71 0%, #27ae60 100%); padding: 30px; border-radius: 15px; color: white; margin-top: 20px; text-align: center; box-shadow: 0 8px 16px rgba(0,0,0,0.2);">
|
| 1100 |
+
<h2 style="margin: 0 0 10px 0;">🎉 Pipeline Completed Successfully!</h2>
|
| 1101 |
+
<p style="margin: 0; font-size: 1.1em; opacity: 0.9;">
|
| 1102 |
+
Analysis Type: {learning_type} | Domain: {domain or 'General'} |
|
| 1103 |
+
Deep Learning: {'Enabled' if enable_deep_learning else 'Disabled'} |
|
| 1104 |
+
AutoML: {'Enabled' if enable_automl else 'Disabled'}
|
| 1105 |
+
</p>
|
| 1106 |
+
<p style="margin: 10px 0 0 0;"><strong>Completed:</strong> {completion_time}</p>
|
| 1107 |
</div>
|
| 1108 |
"""
|
| 1109 |
|
|
|
|
| 1112 |
key_insights = summary.get('key_insights', [])
|
| 1113 |
recommendations = summary.get('recommendations', [])
|
| 1114 |
|
| 1115 |
+
html = """
|
| 1116 |
+
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 15px 0;">
|
| 1117 |
+
<h4 style="margin: 0 0 15px 0; color: #2c3e50;">📈 Final Results & Recommendations</h4>
|
| 1118 |
+
<h5 style="color: #2c3e50;">💡 Key Insights:</h5>
|
| 1119 |
+
<ul>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1120 |
"""
|
| 1121 |
+
for insight in key_insights[:5]:
|
| 1122 |
+
html += f"<li>{insight}</li>"
|
| 1123 |
+
html += """
|
| 1124 |
+
</ul>
|
| 1125 |
+
<h5 style="color: #2c3e50;">🎯 Recommendations:</h5>
|
| 1126 |
+
<ul>
|
| 1127 |
+
"""
|
| 1128 |
+
for rec in recommendations[:5]:
|
| 1129 |
+
html += f"<li>{rec}</li>"
|
| 1130 |
+
html += """
|
| 1131 |
+
</ul>
|
| 1132 |
</div>
|
| 1133 |
+
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Final results compiled!</strong></p>
|
| 1134 |
"""
|
| 1135 |
+
return html
|
| 1136 |
|
| 1137 |
+
def generate_report(self):
    """Generate a downloadable HTML report with all results and visualizations.

    Returns:
        The path of the written report file, or an error-HTML string when
        no pipeline results exist yet.
    """
    if not self.pipeline_results:
        return self._create_error_html("No pipeline results available to generate report.")

    # Pull every piece the step renderer needs out of the stored results,
    # falling back to sensible defaults for anything missing.
    pr = self.pipeline_results
    steps_html = self._create_all_steps_html(
        pr,
        pr.get('summary', {}),
        pr.get('learning_type', 'Unknown'),
        pr.get('target_column', None),
        pr.get('domain_insights', {}).get('detected_domain', 'general'),
        pr.get('enable_deep_learning', False),
        pr.get('enable_automl', False),
    )
    body_html = self._create_progress_header() + "\n" + steps_html

    document = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Data Science Pipeline Report</title>
        <style>
            {self.custom_css}
            body {{ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; margin: 40px; background: #f4f7fa; }}
            h1, h2, h3, h4, h5 {{ color: #2c3e50; }}
            img {{ max-width: 100%; height: auto; }}
        </style>
    </head>
    <body>
        {body_html}
    </body>
    </html>
    """

    # Timestamped filename avoids clobbering earlier reports.
    report_path = f"pipeline_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(document)

    return report_path
|
| 1174 |
|
| 1175 |
+
def launch(self, share=False):
    """Launch the Gradio interface for the pipeline.

    Args:
        share: Forwarded to ``demo.launch``; when True, Gradio creates a
            public share link. Bug fix: the original signature took no
            arguments, so the ``pipeline_ui.launch(share=True)`` call in
            ``__main__`` raised TypeError.
    """
    with gr.Blocks(theme=gr.themes.Default(), css=self.custom_css) as demo:
        gr.Markdown("""
        # 🔬 Comprehensive Data Science Pipeline
        Upload your dataset and configure the pipeline settings to perform automated data analysis and modeling.
        """)

        # Bug fix: the original wired a *fresh* inline gr.State() into each
        # event's inputs/outputs, so the state written by the upload handler
        # was never the state read by the visibility handler. One shared
        # State object carries the uploaded data between events.
        data_state = gr.State()

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(label="Upload Dataset (CSV/JSON)")
                learning_type = gr.Radio(
                    choices=["Supervised", "Unsupervised"],
                    label="Learning Type",
                    value="Supervised"
                )
                target_column = gr.Dropdown(
                    choices=[],
                    label="Target Column (for Supervised Learning)",
                    visible=True
                )
                domain = gr.Textbox(
                    label="Domain (e.g., Finance, Healthcare)",
                    placeholder="Enter domain or leave blank for general analysis"
                )
                enable_deep_learning = gr.Checkbox(
                    label="Enable Deep Learning Models",
                    value=False
                )
                enable_automl = gr.Checkbox(
                    label="Enable AutoML",
                    value=False
                )
                run_button = gr.Button("Run Pipeline", variant="primary")

            with gr.Column(scale=2):
                file_info = gr.HTML(label="File Information")
                data_preview = gr.HTML(label="Data Preview")
                pipeline_output = gr.HTML(label="Pipeline Results")
                download_button = gr.File(
                    label="Download Report",
                    visible=False
                )

        # Event handlers
        file_input.change(
            fn=self.process_file_upload,
            inputs=[file_input, learning_type],
            # NOTE(review): target_column appears twice in the original
            # outputs list (presumably one choices update and one value
            # update from process_file_upload) — preserved as-is; confirm
            # against that handler's return signature.
            outputs=[file_info, data_state, target_column, target_column, data_preview]
        )
        learning_type.change(
            fn=self.update_target_column_visibility,
            inputs=[learning_type, data_state],
            outputs=[target_column]
        )
        run_button.click(
            fn=self.run_comprehensive_pipeline,
            inputs=[file_input, learning_type, target_column, domain, enable_deep_learning, enable_automl],
            outputs=[pipeline_output, download_button]
        )
        download_button.upload(
            fn=self.generate_report,
            inputs=[],
            outputs=[download_button]
        )

    demo.launch(share=share)
|
| 1242 |
|
| 1243 |
+
# Example usage: run this module directly to start the Gradio app.
if __name__ == "__main__":
    # Build the UI wrapper and launch the interactive pipeline.
    pipeline_ui = DataSciencePipelineUI()
    # NOTE(review): the `launch` method defined above takes only `self`, so
    # passing `share=True` here raises TypeError — confirm the intended
    # signature (e.g. `def launch(self, share=False)`).
    pipeline_ui.launch(share=True)
|
|
|
|
|
|