Update app.py
Browse files
app.py
CHANGED
|
@@ -1,43 +1,152 @@
|
|
| 1 |
-
import
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
-
import json
|
| 5 |
-
from io import BytesIO
|
| 6 |
-
import base64
|
| 7 |
import os
|
| 8 |
-
import time
|
| 9 |
-
from datetime import datetime
|
| 10 |
-
import warnings
|
| 11 |
-
warnings.filterwarnings('ignore')
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
import matplotlib
|
| 16 |
-
matplotlib.use('Agg') #
|
| 17 |
-
import matplotlib.pyplot as plt
|
| 18 |
-
import seaborn as sns
|
| 19 |
-
except ImportError:
|
| 20 |
-
import subprocess
|
| 21 |
-
import sys
|
| 22 |
-
subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib", "seaborn"])
|
| 23 |
-
import matplotlib
|
| 24 |
-
matplotlib.use('Agg')
|
| 25 |
import matplotlib.pyplot as plt
|
| 26 |
import seaborn as sns
|
| 27 |
-
|
| 28 |
-
# Import plotly with error handling
|
| 29 |
-
try:
|
| 30 |
-
import plotly.graph_objects as go
|
| 31 |
-
import plotly.express as px
|
| 32 |
-
from plotly.subplots import make_subplots
|
| 33 |
-
except ImportError:
|
| 34 |
-
import subprocess
|
| 35 |
-
import sys
|
| 36 |
-
subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly"])
|
| 37 |
import plotly.graph_objects as go
|
| 38 |
import plotly.express as px
|
| 39 |
from plotly.subplots import make_subplots
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
# Import your comprehensive pipeline
|
| 42 |
try:
|
| 43 |
from supervisor_agent import SupervisorAgent
|
|
@@ -46,32 +155,32 @@ except ImportError:
|
|
| 46 |
|
| 47 |
class DataSciencePipelineUI:
|
| 48 |
"""Advanced UI for the comprehensive data science pipeline"""
|
| 49 |
-
|
| 50 |
def __init__(self):
|
| 51 |
try:
|
| 52 |
self.supervisor = SupervisorAgent()
|
| 53 |
except:
|
| 54 |
# Fallback mock implementation if supervisor_agent isn't available
|
| 55 |
self.supervisor = self._create_mock_supervisor()
|
| 56 |
-
|
| 57 |
self.current_data = None
|
| 58 |
self.pipeline_results = None
|
| 59 |
-
|
| 60 |
# UI State
|
| 61 |
self.processing_step = 0
|
| 62 |
self.total_steps = 6
|
| 63 |
-
|
| 64 |
# Styling
|
| 65 |
self.custom_css = """
|
| 66 |
-
.main-container {
|
| 67 |
-
max-width: 1400px;
|
| 68 |
-
margin: 0 auto;
|
| 69 |
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
| 70 |
}
|
| 71 |
-
.step-container {
|
| 72 |
-
margin: 15px 0;
|
| 73 |
-
padding: 20px;
|
| 74 |
-
border-radius: 12px;
|
| 75 |
border-left: 5px solid #3498db;
|
| 76 |
background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
|
| 77 |
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
|
|
@@ -126,7 +235,7 @@ class DataSciencePipelineUI:
|
|
| 126 |
'info': {'shape': (1000, 10), 'columns': ['col1', 'col2'], 'dtypes': {'col1': 'float64'}}
|
| 127 |
},
|
| 128 |
'data_cleaning': {
|
| 129 |
-
'status': 'success',
|
| 130 |
'cleaning_report': {'duplicates_removed': 5, 'missing_values': {'col1': 10}}
|
| 131 |
}
|
| 132 |
},
|
|
@@ -152,12 +261,12 @@ class DataSciencePipelineUI:
|
|
| 152 |
"""Enhanced file processing with detailed analysis"""
|
| 153 |
if file_obj is None:
|
| 154 |
return "β No file uploaded", "", [], gr.update(visible=False), ""
|
| 155 |
-
|
| 156 |
try:
|
| 157 |
file_path = file_obj.name
|
| 158 |
file_name = os.path.basename(file_path)
|
| 159 |
file_extension = os.path.splitext(file_name)[1].lower()
|
| 160 |
-
|
| 161 |
# Load data based on file type
|
| 162 |
if file_extension == '.csv':
|
| 163 |
df = pd.read_csv(file_path)
|
|
@@ -167,24 +276,24 @@ class DataSciencePipelineUI:
|
|
| 167 |
file_type = 'json'
|
| 168 |
else:
|
| 169 |
return "β Unsupported file type. Please upload CSV or JSON files only.", "", [], gr.update(visible=False), ""
|
| 170 |
-
|
| 171 |
# Store the data
|
| 172 |
self.current_data = df
|
| 173 |
-
|
| 174 |
# Detailed file analysis
|
| 175 |
file_size = os.path.getsize(file_path) / 1024 # KB
|
| 176 |
memory_usage = df.memory_usage(deep=True).sum() / 1024**2 # MB
|
| 177 |
missing_count = df.isnull().sum().sum()
|
| 178 |
duplicate_count = df.duplicated().sum()
|
| 179 |
-
|
| 180 |
# Data type analysis
|
| 181 |
numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
|
| 182 |
categorical_cols = len(df.select_dtypes(include=['object']).columns)
|
| 183 |
datetime_cols = len(df.select_dtypes(include=['datetime64']).columns)
|
| 184 |
-
|
| 185 |
# Create preview table HTML
|
| 186 |
preview_html = self._create_data_preview(df)
|
| 187 |
-
|
| 188 |
file_info = f"""
|
| 189 |
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; color: white; margin: 10px 0;">
|
| 190 |
<h3 style="margin: 0 0 15px 0;">π File Upload Successful!</h3>
|
|
@@ -216,25 +325,25 @@ class DataSciencePipelineUI:
|
|
| 216 |
</div>
|
| 217 |
</div>
|
| 218 |
"""
|
| 219 |
-
|
| 220 |
columns = df.columns.tolist()
|
| 221 |
target_update = gr.update(visible=(learning_type == "Supervised"), choices=columns, value=columns[0] if columns and learning_type == "Supervised" else "")
|
| 222 |
-
|
| 223 |
return (
|
| 224 |
-
file_info,
|
| 225 |
-
file_type,
|
| 226 |
-
columns,
|
| 227 |
target_update,
|
| 228 |
preview_html
|
| 229 |
)
|
| 230 |
-
|
| 231 |
except Exception as e:
|
| 232 |
return f"β Error processing file: {str(e)}", "", [], gr.update(visible=False), ""
|
| 233 |
|
| 234 |
def _create_data_preview(self, df):
|
| 235 |
"""Create HTML preview of the data"""
|
| 236 |
preview_df = df.head(10)
|
| 237 |
-
|
| 238 |
html = """
|
| 239 |
<div style="background: white; padding: 20px; border-radius: 10px; margin: 15px 0; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 240 |
<h4 style="color: #2c3e50; margin-bottom: 15px;">π Data Preview (First 10 rows)</h4>
|
|
@@ -243,12 +352,12 @@ class DataSciencePipelineUI:
|
|
| 243 |
<thead>
|
| 244 |
<tr style="background-color: #3498db; color: white;">
|
| 245 |
"""
|
| 246 |
-
|
| 247 |
# Add headers
|
| 248 |
for col in preview_df.columns:
|
| 249 |
html += f"<th style='padding: 8px; text-align: left; border: 1px solid #ddd;'>{col}</th>"
|
| 250 |
html += "</tr></thead><tbody>"
|
| 251 |
-
|
| 252 |
# Add rows
|
| 253 |
for idx, row in preview_df.iterrows():
|
| 254 |
html += f"<tr style='background-color: {'#f9f9f9' if idx % 2 == 0 else 'white'};'>"
|
|
@@ -260,10 +369,10 @@ class DataSciencePipelineUI:
|
|
| 260 |
cell_value = f"{value:.3f}" if isinstance(value, float) else str(value)
|
| 261 |
else:
|
| 262 |
cell_value = str(value)[:50] + "..." if len(str(value)) > 50 else str(value)
|
| 263 |
-
|
| 264 |
html += f"<td style='padding: 8px; border: 1px solid #ddd;'>{cell_value}</td>"
|
| 265 |
html += "</tr>"
|
| 266 |
-
|
| 267 |
html += "</tbody></table></div></div>"
|
| 268 |
return html
|
| 269 |
|
|
@@ -278,27 +387,27 @@ class DataSciencePipelineUI:
|
|
| 278 |
"""Run the complete comprehensive pipeline with advanced features"""
|
| 279 |
if file_obj is None:
|
| 280 |
return self._create_error_html("Please upload a file first.")
|
| 281 |
-
|
| 282 |
if learning_type == "Supervised" and not target_column:
|
| 283 |
return self._create_error_html("Please select a target column for supervised learning.")
|
| 284 |
-
|
| 285 |
try:
|
| 286 |
# Initialize progress tracking
|
| 287 |
progress_html = self._create_progress_header()
|
| 288 |
-
|
| 289 |
file_path = file_obj.name
|
| 290 |
file_extension = os.path.splitext(file_path)[1].lower().replace('.', '')
|
| 291 |
-
|
| 292 |
# Step 1: Data Loading
|
| 293 |
step1_html = self._create_step_html(
|
| 294 |
-
1, "π Data Loading", "loading",
|
| 295 |
"Loading and validating your dataset..."
|
| 296 |
)
|
| 297 |
progress_html += step1_html
|
| 298 |
-
|
| 299 |
# Simulate some processing time for better UX
|
| 300 |
time.sleep(1)
|
| 301 |
-
|
| 302 |
# Execute data loading
|
| 303 |
try:
|
| 304 |
# Use your actual SupervisorAgent
|
|
@@ -307,52 +416,52 @@ class DataSciencePipelineUI:
|
|
| 307 |
'target_column': target_column if target_column else None,
|
| 308 |
'domain': domain.lower() if domain else 'general'
|
| 309 |
}
|
| 310 |
-
|
| 311 |
result = self.supervisor.execute_pipeline(
|
| 312 |
data_source=file_path,
|
| 313 |
**pipeline_kwargs
|
| 314 |
)
|
| 315 |
-
|
| 316 |
if result['status'] != 'success':
|
| 317 |
return self._create_error_html(f"Pipeline failed: {result.get('error', 'Unknown error')}")
|
| 318 |
-
|
| 319 |
self.pipeline_results = result['pipeline_results']
|
| 320 |
summary = result['summary']
|
| 321 |
-
|
| 322 |
except Exception as e:
|
| 323 |
# Fallback to demonstration mode
|
| 324 |
result = self._create_demo_results(self.current_data, target_column, learning_type, domain)
|
| 325 |
self.pipeline_results = result['pipeline_results']
|
| 326 |
summary = result['summary']
|
| 327 |
-
|
| 328 |
# Update Step 1 - Completed
|
| 329 |
step1_complete = self._create_step_html(
|
| 330 |
1, "π Data Loading", "completed",
|
| 331 |
self._format_data_loading_results(self.pipeline_results.get('data_loading', {}))
|
| 332 |
)
|
| 333 |
progress_html = progress_html.replace(step1_html, step1_complete)
|
| 334 |
-
|
| 335 |
# Step 2: Data Cleaning
|
| 336 |
step2_html = self._create_step_html(
|
| 337 |
2, "π§Ή Data Cleaning", "completed",
|
| 338 |
self._format_data_cleaning_results(self.pipeline_results.get('data_cleaning', {}))
|
| 339 |
)
|
| 340 |
progress_html += step2_html
|
| 341 |
-
|
| 342 |
# Step 3: Exploratory Data Analysis
|
| 343 |
step3_html = self._create_step_html(
|
| 344 |
3, "π Exploratory Data Analysis", "completed",
|
| 345 |
self._format_eda_results(self.pipeline_results.get('eda', {}), self.current_data)
|
| 346 |
)
|
| 347 |
progress_html += step3_html
|
| 348 |
-
|
| 349 |
# Step 4: Feature Engineering & Domain Insights
|
| 350 |
step4_html = self._create_step_html(
|
| 351 |
4, "βοΈ Feature Engineering & Domain Analysis", "completed",
|
| 352 |
self._format_domain_results(self.pipeline_results.get('domain_insights', {}))
|
| 353 |
)
|
| 354 |
progress_html += step4_html
|
| 355 |
-
|
| 356 |
# Step 5: Model Training
|
| 357 |
if learning_type == "Supervised" and target_column:
|
| 358 |
step5_html = self._create_step_html(
|
|
@@ -366,20 +475,20 @@ class DataSciencePipelineUI:
|
|
| 366 |
self._format_unsupervised_results(self.current_data)
|
| 367 |
)
|
| 368 |
progress_html += step5_html
|
| 369 |
-
|
| 370 |
# Step 6: Results & Insights
|
| 371 |
step6_html = self._create_step_html(
|
| 372 |
6, "π Results & Recommendations", "completed",
|
| 373 |
self._format_final_results(summary, self.pipeline_results)
|
| 374 |
)
|
| 375 |
progress_html += step6_html
|
| 376 |
-
|
| 377 |
# Add completion footer
|
| 378 |
completion_html = self._create_completion_footer(learning_type, domain, enable_deep_learning, enable_automl)
|
| 379 |
progress_html += completion_html
|
| 380 |
-
|
| 381 |
return progress_html
|
| 382 |
-
|
| 383 |
except Exception as e:
|
| 384 |
return self._create_error_html(f"Pipeline execution failed: {str(e)}")
|
| 385 |
|
|
@@ -394,7 +503,7 @@ class DataSciencePipelineUI:
|
|
| 394 |
def _create_demo_results(self, data, target_column, learning_type, domain):
|
| 395 |
"""Create demonstration results when actual pipeline fails"""
|
| 396 |
from datetime import datetime
|
| 397 |
-
|
| 398 |
# Mock comprehensive results
|
| 399 |
return {
|
| 400 |
'status': 'success',
|
|
@@ -478,9 +587,9 @@ class DataSciencePipelineUI:
|
|
| 478 |
'completed': {'color': '#27ae60', 'icon': 'β
', 'bg': '#d4edda'},
|
| 479 |
'error': {'color': '#e74c3c', 'icon': 'β', 'bg': '#f8d7da'}
|
| 480 |
}
|
| 481 |
-
|
| 482 |
config = status_config.get(status, status_config['loading'])
|
| 483 |
-
|
| 484 |
return f"""
|
| 485 |
<div style="margin: 20px 0; padding: 25px; background: {config['bg']}; border-left: 6px solid {config['color']}; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 486 |
<div style="display: flex; align-items: center; margin-bottom: 15px;">
|
|
@@ -502,16 +611,16 @@ class DataSciencePipelineUI:
|
|
| 502 |
"""Format data loading results"""
|
| 503 |
if not results or results.get('status') != 'success':
|
| 504 |
return "<p>Data loading information not available</p>"
|
| 505 |
-
|
| 506 |
info = results.get('info', {})
|
| 507 |
shape = info.get('shape', (0, 0))
|
| 508 |
columns = info.get('columns', [])
|
| 509 |
dtypes = info.get('dtypes', {})
|
| 510 |
-
|
| 511 |
# Count data types
|
| 512 |
numeric_cols = sum(1 for dtype in dtypes.values() if 'int' in str(dtype) or 'float' in str(dtype))
|
| 513 |
categorical_cols = sum(1 for dtype in dtypes.values() if 'object' in str(dtype))
|
| 514 |
-
|
| 515 |
return f"""
|
| 516 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 517 |
<div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
@@ -541,15 +650,15 @@ class DataSciencePipelineUI:
|
|
| 541 |
"""Format data cleaning results"""
|
| 542 |
if not results or results.get('status') != 'success':
|
| 543 |
return "<p>Data cleaning information not available</p>"
|
| 544 |
-
|
| 545 |
report = results.get('cleaning_report', {})
|
| 546 |
duplicates = report.get('duplicates_removed', 0)
|
| 547 |
missing_values = report.get('missing_values', {})
|
| 548 |
outliers = report.get('outliers_handled', {})
|
| 549 |
-
|
| 550 |
total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
|
| 551 |
total_outliers = sum(outliers.values()) if isinstance(outliers, dict) else 0
|
| 552 |
-
|
| 553 |
return f"""
|
| 554 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 555 |
<div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
@@ -560,21 +669,21 @@ class DataSciencePipelineUI:
|
|
| 560 |
</div>
|
| 561 |
<div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 562 |
<h4 style="margin: 0 0 10px 0; color: #e67e22;">π Data Quality</h4>
|
| 563 |
-
<p style="margin: 5px 0;"><strong>Overall Quality:</strong>
|
| 564 |
<span style="color: #27ae60; font-weight: bold;">
|
| 565 |
{85 + np.random.randint(0, 15):.1f}%
|
| 566 |
</span>
|
| 567 |
</p>
|
| 568 |
-
<p style="margin: 5px 0;"><strong>Completeness:</strong>
|
| 569 |
<span style="color: #27ae60;">
|
| 570 |
{95 + np.random.randint(0, 5):.1f}%
|
| 571 |
</span>
|
| 572 |
</p>
|
| 573 |
</div>
|
| 574 |
</div>
|
| 575 |
-
|
| 576 |
{self._create_missing_values_chart(missing_values) if missing_values else ""}
|
| 577 |
-
|
| 578 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Data cleaning completed successfully!</strong></p>
|
| 579 |
<div style="background: #e8f5e8; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 580 |
<p style="margin: 0; color: #2d5a2d;"><strong>Cleaning Strategy:</strong> Applied median imputation for numeric features and mode imputation for categorical features. Outliers were capped using IQR method.</p>
|
|
@@ -585,31 +694,31 @@ class DataSciencePipelineUI:
|
|
| 585 |
"""Create a visual representation of missing values"""
|
| 586 |
if not missing_values or not any(missing_values.values()):
|
| 587 |
return ""
|
| 588 |
-
|
| 589 |
# Filter out columns with no missing values
|
| 590 |
missing_data = {k: v for k, v in missing_values.items() if v > 0}
|
| 591 |
-
|
| 592 |
if not missing_data:
|
| 593 |
return ""
|
| 594 |
-
|
| 595 |
try:
|
| 596 |
# Create a simple matplotlib bar chart
|
| 597 |
fig, ax = plt.subplots(figsize=(10, 6))
|
| 598 |
columns = list(missing_data.keys())[:10] # Limit to 10 columns
|
| 599 |
values = [missing_data[col] for col in columns]
|
| 600 |
-
|
| 601 |
bars = ax.bar(columns, values, color='#e74c3c', alpha=0.7)
|
| 602 |
ax.set_xlabel('Columns')
|
| 603 |
ax.set_ylabel('Missing Values Count')
|
| 604 |
ax.set_title('Missing Values by Column (Before Cleaning)')
|
| 605 |
plt.xticks(rotation=45, ha='right')
|
| 606 |
plt.tight_layout()
|
| 607 |
-
|
| 608 |
# Add value labels on bars
|
| 609 |
for bar, value in zip(bars, values):
|
| 610 |
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
|
| 611 |
str(value), ha='center', va='bottom')
|
| 612 |
-
|
| 613 |
chart_html = self.create_plot_html(fig)
|
| 614 |
return f"""
|
| 615 |
<div style="background: white; padding: 15px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
@@ -624,11 +733,11 @@ class DataSciencePipelineUI:
|
|
| 624 |
"""Format EDA results with visualizations"""
|
| 625 |
if not results or results.get('status') != 'success':
|
| 626 |
return "<p>EDA information not available</p>"
|
| 627 |
-
|
| 628 |
analysis = results.get('analysis', {})
|
| 629 |
correlations = analysis.get('correlations', {})
|
| 630 |
correlation_matrix = correlations.get('correlation_matrix', {})
|
| 631 |
-
|
| 632 |
eda_html = f"""
|
| 633 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 634 |
<div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
@@ -644,47 +753,47 @@ class DataSciencePipelineUI:
|
|
| 644 |
</div>
|
| 645 |
</div>
|
| 646 |
"""
|
| 647 |
-
|
| 648 |
# Add correlation heatmap if available
|
| 649 |
if correlation_matrix:
|
| 650 |
eda_html += self._create_correlation_heatmap(correlation_matrix)
|
| 651 |
-
|
| 652 |
# Add distribution plots
|
| 653 |
eda_html += self._create_distribution_plots(data)
|
| 654 |
-
|
| 655 |
eda_html += """
|
| 656 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Exploratory Data Analysis completed!</strong></p>
|
| 657 |
<div style="background: #f0e6ff; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 658 |
<p style="margin: 0; color: #6a1b9a;"><strong>Key Insights:</strong> Statistical analysis reveals data patterns, correlations, and distributions that will guide feature engineering and model selection.</p>
|
| 659 |
</div>
|
| 660 |
"""
|
| 661 |
-
|
| 662 |
return eda_html
|
| 663 |
|
| 664 |
def _create_correlation_heatmap(self, correlation_matrix):
|
| 665 |
"""Create correlation heatmap visualization"""
|
| 666 |
if not correlation_matrix:
|
| 667 |
return ""
|
| 668 |
-
|
| 669 |
try:
|
| 670 |
corr_df = pd.DataFrame(correlation_matrix)
|
| 671 |
if corr_df.empty or len(corr_df.columns) < 2:
|
| 672 |
return ""
|
| 673 |
-
|
| 674 |
fig, ax = plt.subplots(figsize=(10, 8))
|
| 675 |
mask = np.triu(np.ones_like(corr_df, dtype=bool)) # Mask upper triangle
|
| 676 |
-
sns.heatmap(corr_df, mask=mask, annot=True, cmap='RdBu_r', center=0,
|
| 677 |
square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax)
|
| 678 |
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
|
| 679 |
plt.tight_layout()
|
| 680 |
-
|
| 681 |
chart_html = self.create_plot_html(fig)
|
| 682 |
return f"""
|
| 683 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 684 |
<h4 style="margin: 0 0 15px 0; color: #9b59b6;">π Correlation Analysis</h4>
|
| 685 |
{chart_html}
|
| 686 |
<p style="margin-top: 10px; font-size: 12px; color: #666;">
|
| 687 |
-
<strong>Interpretation:</strong> Red indicates negative correlation, blue indicates positive correlation.
|
| 688 |
Values closer to Β±1 indicate stronger relationships.
|
| 689 |
</p>
|
| 690 |
</div>
|
|
@@ -696,13 +805,13 @@ class DataSciencePipelineUI:
|
|
| 696 |
"""Create distribution plots for key variables"""
|
| 697 |
try:
|
| 698 |
numeric_cols = data.select_dtypes(include=[np.number]).columns[:4] # Limit to 4 plots
|
| 699 |
-
|
| 700 |
if len(numeric_cols) == 0:
|
| 701 |
return "<p>No numeric columns found for distribution analysis</p>"
|
| 702 |
-
|
| 703 |
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
|
| 704 |
axes = axes.flatten()
|
| 705 |
-
|
| 706 |
for i, col in enumerate(numeric_cols):
|
| 707 |
if i < 4:
|
| 708 |
sns.histplot(data[col].dropna(), kde=True, ax=axes[i], color='skyblue', alpha=0.7)
|
|
@@ -710,14 +819,14 @@ class DataSciencePipelineUI:
|
|
| 710 |
axes[i].set_xlabel(col)
|
| 711 |
axes[i].set_ylabel('Frequency')
|
| 712 |
axes[i].grid(True, alpha=0.3)
|
| 713 |
-
|
| 714 |
# Hide empty subplots
|
| 715 |
for i in range(len(numeric_cols), 4):
|
| 716 |
axes[i].set_visible(False)
|
| 717 |
-
|
| 718 |
plt.suptitle('Feature Distributions', fontsize=16, fontweight='bold', y=1.02)
|
| 719 |
plt.tight_layout()
|
| 720 |
-
|
| 721 |
chart_html = self.create_plot_html(fig)
|
| 722 |
return f"""
|
| 723 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
@@ -735,11 +844,11 @@ class DataSciencePipelineUI:
|
|
| 735 |
"""Format domain analysis results"""
|
| 736 |
if not results:
|
| 737 |
return "<p>Domain analysis information not available</p>"
|
| 738 |
-
|
| 739 |
domain = results.get('detected_domain', 'general')
|
| 740 |
insights = results.get('insights', [])
|
| 741 |
recommendations = results.get('recommendations', [])
|
| 742 |
-
|
| 743 |
return f"""
|
| 744 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 745 |
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
@@ -757,14 +866,14 @@ class DataSciencePipelineUI:
|
|
| 757 |
</ul>
|
| 758 |
</div>
|
| 759 |
</div>
|
| 760 |
-
|
| 761 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 762 |
<h4 style="margin: 0 0 15px 0; color: #1abc9c;">π― Recommendations</h4>
|
| 763 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px;">
|
| 764 |
{''.join([f'<div style="background: #e8f5e8; padding: 12px; border-radius: 6px; border-left: 4px solid #27ae60;"><span style="color: #27ae60; font-weight: bold;">β’</span> {rec}</div>' for rec in recommendations[:6]])}
|
| 765 |
</div>
|
| 766 |
</div>
|
| 767 |
-
|
| 768 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Domain analysis and feature engineering recommendations completed!</strong></p>
|
| 769 |
<div style="background: #e0f7fa; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 770 |
<p style="margin: 0; color: #00695c;"><strong>Feature Engineering:</strong> Applied domain-specific transformations and created relevant features based on {domain} domain expertise.</p>
|
|
@@ -775,18 +884,18 @@ class DataSciencePipelineUI:
|
|
| 775 |
"""Format modeling results with comprehensive metrics"""
|
| 776 |
if not results or results.get('status') != 'success':
|
| 777 |
return self._format_unsupervised_results(self.current_data)
|
| 778 |
-
|
| 779 |
problem_type = results.get('problem_type', 'classification')
|
| 780 |
best_model = results.get('best_model', 'Unknown')
|
| 781 |
model_results = results.get('results', {})
|
| 782 |
feature_importance = results.get('feature_importance', {})
|
| 783 |
-
|
| 784 |
# Create model comparison chart
|
| 785 |
model_comparison_html = self._create_model_comparison_chart(model_results, problem_type)
|
| 786 |
-
|
| 787 |
# Create feature importance chart
|
| 788 |
feature_importance_html = self._create_feature_importance_chart(feature_importance)
|
| 789 |
-
|
| 790 |
return f"""
|
| 791 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 792 |
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
@@ -805,10 +914,10 @@ class DataSciencePipelineUI:
|
|
| 805 |
<p style="margin: 8px 0;"><strong>Features Used:</strong> {len(feature_importance) if feature_importance else 'N/A'}</p>
|
| 806 |
</div>
|
| 807 |
</div>
|
| 808 |
-
|
| 809 |
{model_comparison_html}
|
| 810 |
{feature_importance_html}
|
| 811 |
-
|
| 812 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 813 |
<h4 style="margin: 0 0 15px 0; color: #e74c3c;">π§ͺ Training Details</h4>
|
| 814 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
|
@@ -826,7 +935,7 @@ class DataSciencePipelineUI:
|
|
| 826 |
</div>
|
| 827 |
</div>
|
| 828 |
</div>
|
| 829 |
-
|
| 830 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Model training and evaluation completed successfully!</strong></p>
|
| 831 |
<div style="background: #fef5e7; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 832 |
<p style="margin: 0; color: #d68910;"><strong>Model Performance:</strong> The {best_model} achieved the best performance with comprehensive evaluation metrics. Consider ensemble methods for further improvement.</p>
|
|
@@ -837,7 +946,7 @@ class DataSciencePipelineUI:
|
|
| 837 |
"""Get formatted metrics for the best model"""
|
| 838 |
if not best_model_result:
|
| 839 |
return ""
|
| 840 |
-
|
| 841 |
if 'classification' in problem_type.lower():
|
| 842 |
accuracy = best_model_result.get('accuracy', 0)
|
| 843 |
f1_score = best_model_result.get('f1_score', 0)
|
|
@@ -861,39 +970,39 @@ class DataSciencePipelineUI:
|
|
| 861 |
"""Create model comparison visualization"""
|
| 862 |
if not model_results:
|
| 863 |
return ""
|
| 864 |
-
|
| 865 |
try:
|
| 866 |
# Prepare data for plotting
|
| 867 |
model_names = []
|
| 868 |
scores = []
|
| 869 |
-
|
| 870 |
for model_name, result in model_results.items():
|
| 871 |
model_names.append(model_name)
|
| 872 |
if 'classification' in problem_type.lower():
|
| 873 |
scores.append(result.get('accuracy', 0))
|
| 874 |
else:
|
| 875 |
scores.append(result.get('r2_score', 0))
|
| 876 |
-
|
| 877 |
if not model_names:
|
| 878 |
return ""
|
| 879 |
-
|
| 880 |
# Create plot
|
| 881 |
fig, ax = plt.subplots(figsize=(12, 6))
|
| 882 |
bars = ax.barh(model_names, scores, color=plt.cm.viridis(np.linspace(0, 1, len(model_names))))
|
| 883 |
-
|
| 884 |
# Customize plot
|
| 885 |
ax.set_xlabel('Accuracy' if 'classification' in problem_type.lower() else 'RΒ² Score')
|
| 886 |
ax.set_title(f'Model Performance Comparison - {problem_type.title()}', fontsize=16, fontweight='bold', pad=20)
|
| 887 |
ax.grid(True, alpha=0.3, axis='x')
|
| 888 |
-
|
| 889 |
# Add value labels on bars
|
| 890 |
for bar, score in zip(bars, scores):
|
| 891 |
ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
|
| 892 |
f'{score:.3f}', ha='left', va='center', fontweight='bold')
|
| 893 |
-
|
| 894 |
plt.tight_layout()
|
| 895 |
chart_html = self.create_plot_html(fig)
|
| 896 |
-
|
| 897 |
return f"""
|
| 898 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 899 |
<h4 style="margin: 0 0 15px 0; color: #e74c3c;">π Model Performance Comparison</h4>
|
|
@@ -912,30 +1021,30 @@ class DataSciencePipelineUI:
|
|
| 912 |
"""Create feature importance visualization"""
|
| 913 |
if not feature_importance:
|
| 914 |
return ""
|
| 915 |
-
|
| 916 |
try:
|
| 917 |
# Get top 10 features
|
| 918 |
sorted_features = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10])
|
| 919 |
-
|
| 920 |
features = list(sorted_features.keys())
|
| 921 |
importance = list(sorted_features.values())
|
| 922 |
-
|
| 923 |
# Create plot
|
| 924 |
fig, ax = plt.subplots(figsize=(10, 6))
|
| 925 |
bars = ax.barh(features, importance, color='coral', alpha=0.8)
|
| 926 |
-
|
| 927 |
ax.set_xlabel('Feature Importance')
|
| 928 |
ax.set_title('Top 10 Most Important Features', fontsize=16, fontweight='bold', pad=20)
|
| 929 |
ax.grid(True, alpha=0.3, axis='x')
|
| 930 |
-
|
| 931 |
# Add value labels
|
| 932 |
for bar, imp in zip(bars, importance):
|
| 933 |
ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
|
| 934 |
f'{imp:.3f}', ha='left', va='center', fontweight='bold')
|
| 935 |
-
|
| 936 |
plt.tight_layout()
|
| 937 |
chart_html = self.create_plot_html(fig)
|
| 938 |
-
|
| 939 |
return f"""
|
| 940 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 941 |
<h4 style="margin: 0 0 15px 0; color: #e74c3c;">π― Feature Importance Analysis</h4>
|
|
@@ -972,7 +1081,7 @@ class DataSciencePipelineUI:
|
|
| 972 |
<p style="margin: 8px 0;"><strong>Dimensionality:</strong> {data.shape[1]} features analyzed</p>
|
| 973 |
</div>
|
| 974 |
</div>
|
| 975 |
-
|
| 976 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 977 |
<h4 style="margin: 0 0 15px 0; color: #9b59b6;">π― Cluster Characteristics</h4>
|
| 978 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
|
@@ -990,7 +1099,7 @@ class DataSciencePipelineUI:
|
|
| 990 |
</div>
|
| 991 |
</div>
|
| 992 |
</div>
|
| 993 |
-
|
| 994 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Unsupervised analysis completed successfully!</strong></p>
|
| 995 |
<div style="background: #f3e5f5; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 996 |
<p style="margin: 0; color: #7b1fa2;"><strong>Insights:</strong> Discovered natural groupings in your data that can be used for segmentation, anomaly detection, and pattern recognition.</p>
|
|
@@ -1001,7 +1110,7 @@ class DataSciencePipelineUI:
|
|
| 1001 |
"""Format final results and recommendations"""
|
| 1002 |
key_insights = summary.get('key_insights', [])
|
| 1003 |
recommendations = summary.get('recommendations', [])
|
| 1004 |
-
|
| 1005 |
return f"""
|
| 1006 |
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 15px; color: white; margin: 20px 0;">
|
| 1007 |
<h3 style="margin: 0 0 20px 0; text-align: center; font-size: 2em;">π Pipeline Completed Successfully!</h3>
|
|
@@ -1024,7 +1133,7 @@ class DataSciencePipelineUI:
|
|
| 1024 |
</div>
|
| 1025 |
</div>
|
| 1026 |
</div>
|
| 1027 |
-
|
| 1028 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
|
| 1029 |
<div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 1030 |
<h4 style="margin: 0 0 20px 0; color: #2c3e50; font-size: 1.3em;">π Key Insights Discovered</h4>
|
|
@@ -1053,7 +1162,7 @@ class DataSciencePipelineUI:
|
|
| 1053 |
"""Create the Gradio interface"""
|
| 1054 |
with gr.Blocks(css=self.custom_css) as demo:
|
| 1055 |
gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>π¬ Comprehensive Data Science Pipeline</h1>")
|
| 1056 |
-
|
| 1057 |
with gr.Row():
|
| 1058 |
with gr.Column(scale=1):
|
| 1059 |
file_upload = gr.File(label="Upload Dataset (CSV or JSON) or Drag & Drop", file_types=[".csv", ".json"])
|
|
@@ -1063,39 +1172,39 @@ class DataSciencePipelineUI:
|
|
| 1063 |
enable_deep_learning = gr.Checkbox(label="Enable Deep Learning", value=False)
|
| 1064 |
enable_automl = gr.Checkbox(label="Enable AutoML", value=True)
|
| 1065 |
run_btn = gr.Button("Run Pipeline", variant="primary")
|
| 1066 |
-
|
| 1067 |
with gr.Column(scale=1):
|
| 1068 |
file_status = gr.HTML()
|
| 1069 |
preview = gr.HTML()
|
| 1070 |
-
|
| 1071 |
output = gr.HTML()
|
| 1072 |
-
|
| 1073 |
# Hidden states
|
| 1074 |
file_type_state = gr.State("")
|
| 1075 |
columns_state = gr.State([])
|
| 1076 |
-
|
| 1077 |
# Events
|
| 1078 |
file_upload.change(
|
| 1079 |
fn=self.process_file_upload,
|
| 1080 |
inputs=[file_upload, learning_type],
|
| 1081 |
outputs=[file_status, file_type_state, columns_state, target_column, preview]
|
| 1082 |
)
|
| 1083 |
-
|
| 1084 |
learning_type.change(
|
| 1085 |
fn=self.update_target_column_visibility,
|
| 1086 |
inputs=[learning_type, columns_state],
|
| 1087 |
outputs=[target_column]
|
| 1088 |
)
|
| 1089 |
-
|
| 1090 |
run_btn.click(
|
| 1091 |
fn=self.run_comprehensive_pipeline,
|
| 1092 |
inputs=[file_upload, learning_type, target_column, domain, enable_deep_learning, enable_automl],
|
| 1093 |
outputs=[output]
|
| 1094 |
)
|
| 1095 |
-
|
| 1096 |
return demo
|
| 1097 |
|
| 1098 |
if __name__ == "__main__":
    # Script entry point: build the Gradio Blocks app from the UI class and
    # start serving it. `share=True` is forwarded to Gradio's launch() call.
    demo = DataSciencePipelineUI().create_interface()
    demo.launch(share=True)
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
import sys
|
| 3 |
+
import importlib
|
|
|
|
|
|
|
|
|
|
| 4 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
def install_package(package):
    """Install a single requirement with pip, reporting progress on stdout.

    Runs ``<current interpreter> -m pip install <package>`` in a child
    process with pip's ``--quiet`` and ``--no-warn-script-location`` flags.
    The call is best-effort: a failure is printed and signalled through the
    return value instead of being raised.

    Args:
        package: Requirement specifier handed to pip unchanged,
            e.g. ``"numpy>=1.21.0"``.

    Returns:
        True if the pip process exited successfully, False otherwise.
    """
    # sys.executable targets the interpreter that is running this script,
    # rather than whichever `pip` happens to be first on PATH.
    command = [
        sys.executable, "-m", "pip", "install", package,
        "--quiet", "--no-warn-script-location",
    ]
    try:
        print(f"📦 Installing {package}...")
        subprocess.check_call(command)
        print(f"✅ Successfully installed {package}")
        return True
    except Exception as e:
        # Deliberately broad: one failed install must not abort the caller,
        # which only needs to know whether this requirement succeeded.
        print(f"❌ Failed to install {package}: {e}")
        return False
|
| 16 |
+
|
| 17 |
+
def install_all_packages(packages=None):
    """Install a list of pip requirements one by one and report the outcome.

    Each requirement is passed to ``install_package``; progress and a final
    tally are printed to stdout. A failed requirement does not stop the
    remaining ones from being attempted.

    Args:
        packages: Optional iterable of pip requirement specifiers. When
            omitted (``None``), the built-in requirement list below is used,
            which is the behaviour of the original zero-argument call.

    Returns:
        True if every requirement installed successfully (trivially True for
        an empty iterable), False if at least one failed.
    """
    if packages is None:
        packages = [
            # Core packages
            "numpy>=1.21.0",
            "pandas>=1.3.0",

            # Visualization
            "matplotlib>=3.4.0",
            "seaborn>=0.11.0",
            "plotly>=5.0.0",

            # Machine Learning
            "scikit-learn>=1.0.0",

            # Deep Learning (heavy packages)
            "tensorflow>=2.8.0",
            "keras>=2.8.0",

            # Boosting libraries (heavy packages)
            "xgboost>=1.5.0",
            "lightgbm>=3.3.0",
            "catboost>=1.0.0",

            # Utilities
            "requests>=2.25.0",
            "openpyxl>=3.0.0",

            # Interface
            "gradio>=4.0.0",
        ]
    else:
        # Materialize once so len() works and one-shot iterables are safe.
        packages = list(packages)

    total = len(packages)

    # NOTE(review): the leading "π" in the messages below looks like a
    # mis-decoded emoji; the literals are kept as found -- confirm against
    # the original file before changing them.
    print("π Starting installation of all required packages...")
    print(f"π Total packages to install: {total}")

    success_count = 0
    for i, package in enumerate(packages, 1):
        print(f"\n[{i}/{total}] Processing {package}")
        if install_package(package):
            success_count += 1

    print(f"\nπ Installation completed! {success_count}/{total} packages installed successfully.")
    return success_count == total
|
| 60 |
+
|
| 61 |
+
# Install all packages at startup
|
| 62 |
+
install_all_packages()
|
| 63 |
+
|
| 64 |
+
# Now import all packages
|
| 65 |
+
print("\nπ₯ Importing all packages...")
|
| 66 |
+
|
| 67 |
try:
|
| 68 |
+
# Core packages
|
| 69 |
+
import gradio as gr
|
| 70 |
+
import pandas as pd
|
| 71 |
+
import numpy as np
|
| 72 |
+
print("β
Core packages imported")
|
| 73 |
+
except ImportError as e:
|
| 74 |
+
print(f"β Core packages import failed: {e}")
|
| 75 |
+
|
| 76 |
+
try:
|
| 77 |
+
# Visualization packages
|
| 78 |
import matplotlib
|
| 79 |
+
matplotlib.use('Agg') # Non-interactive backend for web
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
import matplotlib.pyplot as plt
|
| 81 |
import seaborn as sns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
import plotly.graph_objects as go
|
| 83 |
import plotly.express as px
|
| 84 |
from plotly.subplots import make_subplots
|
| 85 |
+
print("β
Visualization packages imported")
|
| 86 |
+
except ImportError as e:
|
| 87 |
+
print(f"β Visualization packages import failed: {e}")
|
| 88 |
+
|
| 89 |
+
try:
|
| 90 |
+
# Machine Learning packages
|
| 91 |
+
from sklearn.model_selection import train_test_split, cross_val_score
|
| 92 |
+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
| 93 |
+
from sklearn.linear_model import LogisticRegression, LinearRegression
|
| 94 |
+
from sklearn.svm import SVC, SVR
|
| 95 |
+
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
|
| 96 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| 97 |
+
from sklearn.cluster import KMeans
|
| 98 |
+
print("β
Scikit-learn imported")
|
| 99 |
+
except ImportError as e:
|
| 100 |
+
print(f"β Scikit-learn import failed: {e}")
|
| 101 |
|
| 102 |
+
try:
|
| 103 |
+
# Deep Learning packages
|
| 104 |
+
import tensorflow as tf
|
| 105 |
+
from tensorflow import keras
|
| 106 |
+
from tensorflow.keras.models import Sequential
|
| 107 |
+
from tensorflow.keras.layers import Dense, LSTM, Conv2D
|
| 108 |
+
print("β
TensorFlow and Keras imported")
|
| 109 |
+
except ImportError as e:
|
| 110 |
+
print(f"β οΈ TensorFlow/Keras import failed (optional): {e}")
|
| 111 |
+
|
| 112 |
+
try:
|
| 113 |
+
# Boosting libraries
|
| 114 |
+
import xgboost as xgb
|
| 115 |
+
print("β
XGBoost imported")
|
| 116 |
+
except ImportError as e:
|
| 117 |
+
print(f"β οΈ XGBoost import failed (optional): {e}")
|
| 118 |
+
|
| 119 |
+
try:
|
| 120 |
+
import lightgbm as lgb
|
| 121 |
+
print("β
LightGBM imported")
|
| 122 |
+
except ImportError as e:
|
| 123 |
+
print(f"β οΈ LightGBM import failed (optional): {e}")
|
| 124 |
+
|
| 125 |
+
try:
|
| 126 |
+
import catboost as cb
|
| 127 |
+
from catboost import CatBoostClassifier, CatBoostRegressor
|
| 128 |
+
print("β
CatBoost imported")
|
| 129 |
+
except ImportError as e:
|
| 130 |
+
print(f"β οΈ CatBoost import failed (optional): {e}")
|
| 131 |
+
|
| 132 |
+
try:
|
| 133 |
+
# Utility packages
|
| 134 |
+
import requests
|
| 135 |
+
import openpyxl
|
| 136 |
+
print("β
Utility packages imported")
|
| 137 |
+
except ImportError as e:
|
| 138 |
+
print(f"β Utility packages import failed: {e}")
|
| 139 |
+
|
| 140 |
+
# Standard library imports (no installation needed)
|
| 141 |
+
import json
|
| 142 |
+
from io import BytesIO
|
| 143 |
+
import base64
|
| 144 |
+
import time
|
| 145 |
+
from datetime import datetime
|
| 146 |
+
import warnings
|
| 147 |
+
warnings.filterwarnings('ignore')
|
| 148 |
+
|
| 149 |
+
print("π All package imports completed!")
|
| 150 |
# Import your comprehensive pipeline
|
| 151 |
try:
|
| 152 |
from supervisor_agent import SupervisorAgent
|
|
|
|
| 155 |
|
| 156 |
class DataSciencePipelineUI:
|
| 157 |
"""Advanced UI for the comprehensive data science pipeline"""
|
| 158 |
+
|
| 159 |
def __init__(self):
|
| 160 |
try:
|
| 161 |
self.supervisor = SupervisorAgent()
|
| 162 |
except:
|
| 163 |
# Fallback mock implementation if supervisor_agent isn't available
|
| 164 |
self.supervisor = self._create_mock_supervisor()
|
| 165 |
+
|
| 166 |
self.current_data = None
|
| 167 |
self.pipeline_results = None
|
| 168 |
+
|
| 169 |
# UI State
|
| 170 |
self.processing_step = 0
|
| 171 |
self.total_steps = 6
|
| 172 |
+
|
| 173 |
# Styling
|
| 174 |
self.custom_css = """
|
| 175 |
+
.main-container {
|
| 176 |
+
max-width: 1400px;
|
| 177 |
+
margin: 0 auto;
|
| 178 |
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
| 179 |
}
|
| 180 |
+
.step-container {
|
| 181 |
+
margin: 15px 0;
|
| 182 |
+
padding: 20px;
|
| 183 |
+
border-radius: 12px;
|
| 184 |
border-left: 5px solid #3498db;
|
| 185 |
background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
|
| 186 |
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
|
|
|
|
| 235 |
'info': {'shape': (1000, 10), 'columns': ['col1', 'col2'], 'dtypes': {'col1': 'float64'}}
|
| 236 |
},
|
| 237 |
'data_cleaning': {
|
| 238 |
+
'status': 'success',
|
| 239 |
'cleaning_report': {'duplicates_removed': 5, 'missing_values': {'col1': 10}}
|
| 240 |
}
|
| 241 |
},
|
|
|
|
| 261 |
"""Enhanced file processing with detailed analysis"""
|
| 262 |
if file_obj is None:
|
| 263 |
return "β No file uploaded", "", [], gr.update(visible=False), ""
|
| 264 |
+
|
| 265 |
try:
|
| 266 |
file_path = file_obj.name
|
| 267 |
file_name = os.path.basename(file_path)
|
| 268 |
file_extension = os.path.splitext(file_name)[1].lower()
|
| 269 |
+
|
| 270 |
# Load data based on file type
|
| 271 |
if file_extension == '.csv':
|
| 272 |
df = pd.read_csv(file_path)
|
|
|
|
| 276 |
file_type = 'json'
|
| 277 |
else:
|
| 278 |
return "β Unsupported file type. Please upload CSV or JSON files only.", "", [], gr.update(visible=False), ""
|
| 279 |
+
|
| 280 |
# Store the data
|
| 281 |
self.current_data = df
|
| 282 |
+
|
| 283 |
# Detailed file analysis
|
| 284 |
file_size = os.path.getsize(file_path) / 1024 # KB
|
| 285 |
memory_usage = df.memory_usage(deep=True).sum() / 1024**2 # MB
|
| 286 |
missing_count = df.isnull().sum().sum()
|
| 287 |
duplicate_count = df.duplicated().sum()
|
| 288 |
+
|
| 289 |
# Data type analysis
|
| 290 |
numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
|
| 291 |
categorical_cols = len(df.select_dtypes(include=['object']).columns)
|
| 292 |
datetime_cols = len(df.select_dtypes(include=['datetime64']).columns)
|
| 293 |
+
|
| 294 |
# Create preview table HTML
|
| 295 |
preview_html = self._create_data_preview(df)
|
| 296 |
+
|
| 297 |
file_info = f"""
|
| 298 |
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; color: white; margin: 10px 0;">
|
| 299 |
<h3 style="margin: 0 0 15px 0;">π File Upload Successful!</h3>
|
|
|
|
| 325 |
</div>
|
| 326 |
</div>
|
| 327 |
"""
|
| 328 |
+
|
| 329 |
columns = df.columns.tolist()
|
| 330 |
target_update = gr.update(visible=(learning_type == "Supervised"), choices=columns, value=columns[0] if columns and learning_type == "Supervised" else "")
|
| 331 |
+
|
| 332 |
return (
|
| 333 |
+
file_info,
|
| 334 |
+
file_type,
|
| 335 |
+
columns,
|
| 336 |
target_update,
|
| 337 |
preview_html
|
| 338 |
)
|
| 339 |
+
|
| 340 |
except Exception as e:
|
| 341 |
return f"β Error processing file: {str(e)}", "", [], gr.update(visible=False), ""
|
| 342 |
|
| 343 |
def _create_data_preview(self, df):
|
| 344 |
"""Create HTML preview of the data"""
|
| 345 |
preview_df = df.head(10)
|
| 346 |
+
|
| 347 |
html = """
|
| 348 |
<div style="background: white; padding: 20px; border-radius: 10px; margin: 15px 0; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 349 |
<h4 style="color: #2c3e50; margin-bottom: 15px;">π Data Preview (First 10 rows)</h4>
|
|
|
|
| 352 |
<thead>
|
| 353 |
<tr style="background-color: #3498db; color: white;">
|
| 354 |
"""
|
| 355 |
+
|
| 356 |
# Add headers
|
| 357 |
for col in preview_df.columns:
|
| 358 |
html += f"<th style='padding: 8px; text-align: left; border: 1px solid #ddd;'>{col}</th>"
|
| 359 |
html += "</tr></thead><tbody>"
|
| 360 |
+
|
| 361 |
# Add rows
|
| 362 |
for idx, row in preview_df.iterrows():
|
| 363 |
html += f"<tr style='background-color: {'#f9f9f9' if idx % 2 == 0 else 'white'};'>"
|
|
|
|
| 369 |
cell_value = f"{value:.3f}" if isinstance(value, float) else str(value)
|
| 370 |
else:
|
| 371 |
cell_value = str(value)[:50] + "..." if len(str(value)) > 50 else str(value)
|
| 372 |
+
|
| 373 |
html += f"<td style='padding: 8px; border: 1px solid #ddd;'>{cell_value}</td>"
|
| 374 |
html += "</tr>"
|
| 375 |
+
|
| 376 |
html += "</tbody></table></div></div>"
|
| 377 |
return html
|
| 378 |
|
|
|
|
| 387 |
"""Run the complete comprehensive pipeline with advanced features"""
|
| 388 |
if file_obj is None:
|
| 389 |
return self._create_error_html("Please upload a file first.")
|
| 390 |
+
|
| 391 |
if learning_type == "Supervised" and not target_column:
|
| 392 |
return self._create_error_html("Please select a target column for supervised learning.")
|
| 393 |
+
|
| 394 |
try:
|
| 395 |
# Initialize progress tracking
|
| 396 |
progress_html = self._create_progress_header()
|
| 397 |
+
|
| 398 |
file_path = file_obj.name
|
| 399 |
file_extension = os.path.splitext(file_path)[1].lower().replace('.', '')
|
| 400 |
+
|
| 401 |
# Step 1: Data Loading
|
| 402 |
step1_html = self._create_step_html(
|
| 403 |
+
1, "π Data Loading", "loading",
|
| 404 |
"Loading and validating your dataset..."
|
| 405 |
)
|
| 406 |
progress_html += step1_html
|
| 407 |
+
|
| 408 |
# Simulate some processing time for better UX
|
| 409 |
time.sleep(1)
|
| 410 |
+
|
| 411 |
# Execute data loading
|
| 412 |
try:
|
| 413 |
# Use your actual SupervisorAgent
|
|
|
|
| 416 |
'target_column': target_column if target_column else None,
|
| 417 |
'domain': domain.lower() if domain else 'general'
|
| 418 |
}
|
| 419 |
+
|
| 420 |
result = self.supervisor.execute_pipeline(
|
| 421 |
data_source=file_path,
|
| 422 |
**pipeline_kwargs
|
| 423 |
)
|
| 424 |
+
|
| 425 |
if result['status'] != 'success':
|
| 426 |
return self._create_error_html(f"Pipeline failed: {result.get('error', 'Unknown error')}")
|
| 427 |
+
|
| 428 |
self.pipeline_results = result['pipeline_results']
|
| 429 |
summary = result['summary']
|
| 430 |
+
|
| 431 |
except Exception as e:
|
| 432 |
# Fallback to demonstration mode
|
| 433 |
result = self._create_demo_results(self.current_data, target_column, learning_type, domain)
|
| 434 |
self.pipeline_results = result['pipeline_results']
|
| 435 |
summary = result['summary']
|
| 436 |
+
|
| 437 |
# Update Step 1 - Completed
|
| 438 |
step1_complete = self._create_step_html(
|
| 439 |
1, "π Data Loading", "completed",
|
| 440 |
self._format_data_loading_results(self.pipeline_results.get('data_loading', {}))
|
| 441 |
)
|
| 442 |
progress_html = progress_html.replace(step1_html, step1_complete)
|
| 443 |
+
|
| 444 |
# Step 2: Data Cleaning
|
| 445 |
step2_html = self._create_step_html(
|
| 446 |
2, "π§Ή Data Cleaning", "completed",
|
| 447 |
self._format_data_cleaning_results(self.pipeline_results.get('data_cleaning', {}))
|
| 448 |
)
|
| 449 |
progress_html += step2_html
|
| 450 |
+
|
| 451 |
# Step 3: Exploratory Data Analysis
|
| 452 |
step3_html = self._create_step_html(
|
| 453 |
3, "π Exploratory Data Analysis", "completed",
|
| 454 |
self._format_eda_results(self.pipeline_results.get('eda', {}), self.current_data)
|
| 455 |
)
|
| 456 |
progress_html += step3_html
|
| 457 |
+
|
| 458 |
# Step 4: Feature Engineering & Domain Insights
|
| 459 |
step4_html = self._create_step_html(
|
| 460 |
4, "βοΈ Feature Engineering & Domain Analysis", "completed",
|
| 461 |
self._format_domain_results(self.pipeline_results.get('domain_insights', {}))
|
| 462 |
)
|
| 463 |
progress_html += step4_html
|
| 464 |
+
|
| 465 |
# Step 5: Model Training
|
| 466 |
if learning_type == "Supervised" and target_column:
|
| 467 |
step5_html = self._create_step_html(
|
|
|
|
| 475 |
self._format_unsupervised_results(self.current_data)
|
| 476 |
)
|
| 477 |
progress_html += step5_html
|
| 478 |
+
|
| 479 |
# Step 6: Results & Insights
|
| 480 |
step6_html = self._create_step_html(
|
| 481 |
6, "π Results & Recommendations", "completed",
|
| 482 |
self._format_final_results(summary, self.pipeline_results)
|
| 483 |
)
|
| 484 |
progress_html += step6_html
|
| 485 |
+
|
| 486 |
# Add completion footer
|
| 487 |
completion_html = self._create_completion_footer(learning_type, domain, enable_deep_learning, enable_automl)
|
| 488 |
progress_html += completion_html
|
| 489 |
+
|
| 490 |
return progress_html
|
| 491 |
+
|
| 492 |
except Exception as e:
|
| 493 |
return self._create_error_html(f"Pipeline execution failed: {str(e)}")
|
| 494 |
|
|
|
|
| 503 |
def _create_demo_results(self, data, target_column, learning_type, domain):
|
| 504 |
"""Create demonstration results when actual pipeline fails"""
|
| 505 |
from datetime import datetime
|
| 506 |
+
|
| 507 |
# Mock comprehensive results
|
| 508 |
return {
|
| 509 |
'status': 'success',
|
|
|
|
| 587 |
'completed': {'color': '#27ae60', 'icon': 'β
', 'bg': '#d4edda'},
|
| 588 |
'error': {'color': '#e74c3c', 'icon': 'β', 'bg': '#f8d7da'}
|
| 589 |
}
|
| 590 |
+
|
| 591 |
config = status_config.get(status, status_config['loading'])
|
| 592 |
+
|
| 593 |
return f"""
|
| 594 |
<div style="margin: 20px 0; padding: 25px; background: {config['bg']}; border-left: 6px solid {config['color']}; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 595 |
<div style="display: flex; align-items: center; margin-bottom: 15px;">
|
|
|
|
| 611 |
"""Format data loading results"""
|
| 612 |
if not results or results.get('status') != 'success':
|
| 613 |
return "<p>Data loading information not available</p>"
|
| 614 |
+
|
| 615 |
info = results.get('info', {})
|
| 616 |
shape = info.get('shape', (0, 0))
|
| 617 |
columns = info.get('columns', [])
|
| 618 |
dtypes = info.get('dtypes', {})
|
| 619 |
+
|
| 620 |
# Count data types
|
| 621 |
numeric_cols = sum(1 for dtype in dtypes.values() if 'int' in str(dtype) or 'float' in str(dtype))
|
| 622 |
categorical_cols = sum(1 for dtype in dtypes.values() if 'object' in str(dtype))
|
| 623 |
+
|
| 624 |
return f"""
|
| 625 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 626 |
<div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
|
|
| 650 |
"""Format data cleaning results"""
|
| 651 |
if not results or results.get('status') != 'success':
|
| 652 |
return "<p>Data cleaning information not available</p>"
|
| 653 |
+
|
| 654 |
report = results.get('cleaning_report', {})
|
| 655 |
duplicates = report.get('duplicates_removed', 0)
|
| 656 |
missing_values = report.get('missing_values', {})
|
| 657 |
outliers = report.get('outliers_handled', {})
|
| 658 |
+
|
| 659 |
total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
|
| 660 |
total_outliers = sum(outliers.values()) if isinstance(outliers, dict) else 0
|
| 661 |
+
|
| 662 |
return f"""
|
| 663 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 664 |
<div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
|
|
| 669 |
</div>
|
| 670 |
<div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 671 |
<h4 style="margin: 0 0 10px 0; color: #e67e22;">π Data Quality</h4>
|
| 672 |
+
<p style="margin: 5px 0;"><strong>Overall Quality:</strong>
|
| 673 |
<span style="color: #27ae60; font-weight: bold;">
|
| 674 |
{85 + np.random.randint(0, 15):.1f}%
|
| 675 |
</span>
|
| 676 |
</p>
|
| 677 |
+
<p style="margin: 5px 0;"><strong>Completeness:</strong>
|
| 678 |
<span style="color: #27ae60;">
|
| 679 |
{95 + np.random.randint(0, 5):.1f}%
|
| 680 |
</span>
|
| 681 |
</p>
|
| 682 |
</div>
|
| 683 |
</div>
|
| 684 |
+
|
| 685 |
{self._create_missing_values_chart(missing_values) if missing_values else ""}
|
| 686 |
+
|
| 687 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Data cleaning completed successfully!</strong></p>
|
| 688 |
<div style="background: #e8f5e8; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 689 |
<p style="margin: 0; color: #2d5a2d;"><strong>Cleaning Strategy:</strong> Applied median imputation for numeric features and mode imputation for categorical features. Outliers were capped using IQR method.</p>
|
|
|
|
| 694 |
"""Create a visual representation of missing values"""
|
| 695 |
if not missing_values or not any(missing_values.values()):
|
| 696 |
return ""
|
| 697 |
+
|
| 698 |
# Filter out columns with no missing values
|
| 699 |
missing_data = {k: v for k, v in missing_values.items() if v > 0}
|
| 700 |
+
|
| 701 |
if not missing_data:
|
| 702 |
return ""
|
| 703 |
+
|
| 704 |
try:
|
| 705 |
# Create a simple matplotlib bar chart
|
| 706 |
fig, ax = plt.subplots(figsize=(10, 6))
|
| 707 |
columns = list(missing_data.keys())[:10] # Limit to 10 columns
|
| 708 |
values = [missing_data[col] for col in columns]
|
| 709 |
+
|
| 710 |
bars = ax.bar(columns, values, color='#e74c3c', alpha=0.7)
|
| 711 |
ax.set_xlabel('Columns')
|
| 712 |
ax.set_ylabel('Missing Values Count')
|
| 713 |
ax.set_title('Missing Values by Column (Before Cleaning)')
|
| 714 |
plt.xticks(rotation=45, ha='right')
|
| 715 |
plt.tight_layout()
|
| 716 |
+
|
| 717 |
# Add value labels on bars
|
| 718 |
for bar, value in zip(bars, values):
|
| 719 |
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
|
| 720 |
str(value), ha='center', va='bottom')
|
| 721 |
+
|
| 722 |
chart_html = self.create_plot_html(fig)
|
| 723 |
return f"""
|
| 724 |
<div style="background: white; padding: 15px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
|
|
| 733 |
"""Format EDA results with visualizations"""
|
| 734 |
if not results or results.get('status') != 'success':
|
| 735 |
return "<p>EDA information not available</p>"
|
| 736 |
+
|
| 737 |
analysis = results.get('analysis', {})
|
| 738 |
correlations = analysis.get('correlations', {})
|
| 739 |
correlation_matrix = correlations.get('correlation_matrix', {})
|
| 740 |
+
|
| 741 |
eda_html = f"""
|
| 742 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 743 |
<div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
|
|
| 753 |
</div>
|
| 754 |
</div>
|
| 755 |
"""
|
| 756 |
+
|
| 757 |
# Add correlation heatmap if available
|
| 758 |
if correlation_matrix:
|
| 759 |
eda_html += self._create_correlation_heatmap(correlation_matrix)
|
| 760 |
+
|
| 761 |
# Add distribution plots
|
| 762 |
eda_html += self._create_distribution_plots(data)
|
| 763 |
+
|
| 764 |
eda_html += """
|
| 765 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Exploratory Data Analysis completed!</strong></p>
|
| 766 |
<div style="background: #f0e6ff; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 767 |
<p style="margin: 0; color: #6a1b9a;"><strong>Key Insights:</strong> Statistical analysis reveals data patterns, correlations, and distributions that will guide feature engineering and model selection.</p>
|
| 768 |
</div>
|
| 769 |
"""
|
| 770 |
+
|
| 771 |
return eda_html
|
| 772 |
|
| 773 |
def _create_correlation_heatmap(self, correlation_matrix):
|
| 774 |
"""Create correlation heatmap visualization"""
|
| 775 |
if not correlation_matrix:
|
| 776 |
return ""
|
| 777 |
+
|
| 778 |
try:
|
| 779 |
corr_df = pd.DataFrame(correlation_matrix)
|
| 780 |
if corr_df.empty or len(corr_df.columns) < 2:
|
| 781 |
return ""
|
| 782 |
+
|
| 783 |
fig, ax = plt.subplots(figsize=(10, 8))
|
| 784 |
mask = np.triu(np.ones_like(corr_df, dtype=bool)) # Mask upper triangle
|
| 785 |
+
sns.heatmap(corr_df, mask=mask, annot=True, cmap='RdBu_r', center=0,
|
| 786 |
square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax)
|
| 787 |
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
|
| 788 |
plt.tight_layout()
|
| 789 |
+
|
| 790 |
chart_html = self.create_plot_html(fig)
|
| 791 |
return f"""
|
| 792 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 793 |
<h4 style="margin: 0 0 15px 0; color: #9b59b6;">π Correlation Analysis</h4>
|
| 794 |
{chart_html}
|
| 795 |
<p style="margin-top: 10px; font-size: 12px; color: #666;">
|
| 796 |
+
<strong>Interpretation:</strong> Red indicates negative correlation, blue indicates positive correlation.
|
| 797 |
Values closer to Β±1 indicate stronger relationships.
|
| 798 |
</p>
|
| 799 |
</div>
|
|
|
|
| 805 |
"""Create distribution plots for key variables"""
|
| 806 |
try:
|
| 807 |
numeric_cols = data.select_dtypes(include=[np.number]).columns[:4] # Limit to 4 plots
|
| 808 |
+
|
| 809 |
if len(numeric_cols) == 0:
|
| 810 |
return "<p>No numeric columns found for distribution analysis</p>"
|
| 811 |
+
|
| 812 |
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
|
| 813 |
axes = axes.flatten()
|
| 814 |
+
|
| 815 |
for i, col in enumerate(numeric_cols):
|
| 816 |
if i < 4:
|
| 817 |
sns.histplot(data[col].dropna(), kde=True, ax=axes[i], color='skyblue', alpha=0.7)
|
|
|
|
| 819 |
axes[i].set_xlabel(col)
|
| 820 |
axes[i].set_ylabel('Frequency')
|
| 821 |
axes[i].grid(True, alpha=0.3)
|
| 822 |
+
|
| 823 |
# Hide empty subplots
|
| 824 |
for i in range(len(numeric_cols), 4):
|
| 825 |
axes[i].set_visible(False)
|
| 826 |
+
|
| 827 |
plt.suptitle('Feature Distributions', fontsize=16, fontweight='bold', y=1.02)
|
| 828 |
plt.tight_layout()
|
| 829 |
+
|
| 830 |
chart_html = self.create_plot_html(fig)
|
| 831 |
return f"""
|
| 832 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
|
|
| 844 |
"""Format domain analysis results"""
|
| 845 |
if not results:
|
| 846 |
return "<p>Domain analysis information not available</p>"
|
| 847 |
+
|
| 848 |
domain = results.get('detected_domain', 'general')
|
| 849 |
insights = results.get('insights', [])
|
| 850 |
recommendations = results.get('recommendations', [])
|
| 851 |
+
|
| 852 |
return f"""
|
| 853 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 854 |
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
|
|
| 866 |
</ul>
|
| 867 |
</div>
|
| 868 |
</div>
|
| 869 |
+
|
| 870 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 871 |
<h4 style="margin: 0 0 15px 0; color: #1abc9c;">π― Recommendations</h4>
|
| 872 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px;">
|
| 873 |
{''.join([f'<div style="background: #e8f5e8; padding: 12px; border-radius: 6px; border-left: 4px solid #27ae60;"><span style="color: #27ae60; font-weight: bold;">β’</span> {rec}</div>' for rec in recommendations[:6]])}
|
| 874 |
</div>
|
| 875 |
</div>
|
| 876 |
+
|
| 877 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Domain analysis and feature engineering recommendations completed!</strong></p>
|
| 878 |
<div style="background: #e0f7fa; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 879 |
<p style="margin: 0; color: #00695c;"><strong>Feature Engineering:</strong> Applied domain-specific transformations and created relevant features based on {domain} domain expertise.</p>
|
|
|
|
| 884 |
"""Format modeling results with comprehensive metrics"""
|
| 885 |
if not results or results.get('status') != 'success':
|
| 886 |
return self._format_unsupervised_results(self.current_data)
|
| 887 |
+
|
| 888 |
problem_type = results.get('problem_type', 'classification')
|
| 889 |
best_model = results.get('best_model', 'Unknown')
|
| 890 |
model_results = results.get('results', {})
|
| 891 |
feature_importance = results.get('feature_importance', {})
|
| 892 |
+
|
| 893 |
# Create model comparison chart
|
| 894 |
model_comparison_html = self._create_model_comparison_chart(model_results, problem_type)
|
| 895 |
+
|
| 896 |
# Create feature importance chart
|
| 897 |
feature_importance_html = self._create_feature_importance_chart(feature_importance)
|
| 898 |
+
|
| 899 |
return f"""
|
| 900 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
|
| 901 |
<div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
|
|
|
| 914 |
<p style="margin: 8px 0;"><strong>Features Used:</strong> {len(feature_importance) if feature_importance else 'N/A'}</p>
|
| 915 |
</div>
|
| 916 |
</div>
|
| 917 |
+
|
| 918 |
{model_comparison_html}
|
| 919 |
{feature_importance_html}
|
| 920 |
+
|
| 921 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 922 |
<h4 style="margin: 0 0 15px 0; color: #e74c3c;">π§ͺ Training Details</h4>
|
| 923 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
|
|
|
| 935 |
</div>
|
| 936 |
</div>
|
| 937 |
</div>
|
| 938 |
+
|
| 939 |
<p style="color: #27ae60; margin-top: 15px;"><strong>β
Model training and evaluation completed successfully!</strong></p>
|
| 940 |
<div style="background: #fef5e7; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 941 |
<p style="margin: 0; color: #d68910;"><strong>Model Performance:</strong> The {best_model} achieved the best performance with comprehensive evaluation metrics. Consider ensemble methods for further improvement.</p>
|
|
|
|
| 946 |
"""Get formatted metrics for the best model"""
|
| 947 |
if not best_model_result:
|
| 948 |
return ""
|
| 949 |
+
|
| 950 |
if 'classification' in problem_type.lower():
|
| 951 |
accuracy = best_model_result.get('accuracy', 0)
|
| 952 |
f1_score = best_model_result.get('f1_score', 0)
|
|
|
|
| 970 |
"""Create model comparison visualization"""
|
| 971 |
if not model_results:
|
| 972 |
return ""
|
| 973 |
+
|
| 974 |
try:
|
| 975 |
# Prepare data for plotting
|
| 976 |
model_names = []
|
| 977 |
scores = []
|
| 978 |
+
|
| 979 |
for model_name, result in model_results.items():
|
| 980 |
model_names.append(model_name)
|
| 981 |
if 'classification' in problem_type.lower():
|
| 982 |
scores.append(result.get('accuracy', 0))
|
| 983 |
else:
|
| 984 |
scores.append(result.get('r2_score', 0))
|
| 985 |
+
|
| 986 |
if not model_names:
|
| 987 |
return ""
|
| 988 |
+
|
| 989 |
# Create plot
|
| 990 |
fig, ax = plt.subplots(figsize=(12, 6))
|
| 991 |
bars = ax.barh(model_names, scores, color=plt.cm.viridis(np.linspace(0, 1, len(model_names))))
|
| 992 |
+
|
| 993 |
# Customize plot
|
| 994 |
ax.set_xlabel('Accuracy' if 'classification' in problem_type.lower() else 'RΒ² Score')
|
| 995 |
ax.set_title(f'Model Performance Comparison - {problem_type.title()}', fontsize=16, fontweight='bold', pad=20)
|
| 996 |
ax.grid(True, alpha=0.3, axis='x')
|
| 997 |
+
|
| 998 |
# Add value labels on bars
|
| 999 |
for bar, score in zip(bars, scores):
|
| 1000 |
ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
|
| 1001 |
f'{score:.3f}', ha='left', va='center', fontweight='bold')
|
| 1002 |
+
|
| 1003 |
plt.tight_layout()
|
| 1004 |
chart_html = self.create_plot_html(fig)
|
| 1005 |
+
|
| 1006 |
return f"""
|
| 1007 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 1008 |
<h4 style="margin: 0 0 15px 0; color: #e74c3c;">π Model Performance Comparison</h4>
|
|
|
|
| 1021 |
"""Create feature importance visualization"""
|
| 1022 |
if not feature_importance:
|
| 1023 |
return ""
|
| 1024 |
+
|
| 1025 |
try:
|
| 1026 |
# Get top 10 features
|
| 1027 |
sorted_features = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10])
|
| 1028 |
+
|
| 1029 |
features = list(sorted_features.keys())
|
| 1030 |
importance = list(sorted_features.values())
|
| 1031 |
+
|
| 1032 |
# Create plot
|
| 1033 |
fig, ax = plt.subplots(figsize=(10, 6))
|
| 1034 |
bars = ax.barh(features, importance, color='coral', alpha=0.8)
|
| 1035 |
+
|
| 1036 |
ax.set_xlabel('Feature Importance')
|
| 1037 |
ax.set_title('Top 10 Most Important Features', fontsize=16, fontweight='bold', pad=20)
|
| 1038 |
ax.grid(True, alpha=0.3, axis='x')
|
| 1039 |
+
|
| 1040 |
# Add value labels
|
| 1041 |
for bar, imp in zip(bars, importance):
|
| 1042 |
ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
|
| 1043 |
f'{imp:.3f}', ha='left', va='center', fontweight='bold')
|
| 1044 |
+
|
| 1045 |
plt.tight_layout()
|
| 1046 |
chart_html = self.create_plot_html(fig)
|
| 1047 |
+
|
| 1048 |
return f"""
|
| 1049 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 1050 |
<h4 style="margin: 0 0 15px 0; color: #e74c3c;">🎯 Feature Importance Analysis</h4>
|
|
|
|
| 1081 |
<p style="margin: 8px 0;"><strong>Dimensionality:</strong> {data.shape[1]} features analyzed</p>
|
| 1082 |
</div>
|
| 1083 |
</div>
|
| 1084 |
+
|
| 1085 |
<div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 1086 |
<h4 style="margin: 0 0 15px 0; color: #9b59b6;">🎯 Cluster Characteristics</h4>
|
| 1087 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
|
|
|
| 1099 |
</div>
|
| 1100 |
</div>
|
| 1101 |
</div>
|
| 1102 |
+
|
| 1103 |
<p style="color: #27ae60; margin-top: 15px;"><strong>✅ Unsupervised analysis completed successfully!</strong></p>
|
| 1104 |
<div style="background: #f3e5f5; padding: 10px; border-radius: 6px; margin-top: 10px;">
|
| 1105 |
<p style="margin: 0; color: #7b1fa2;"><strong>Insights:</strong> Discovered natural groupings in your data that can be used for segmentation, anomaly detection, and pattern recognition.</p>
|
|
|
|
| 1110 |
"""Format final results and recommendations"""
|
| 1111 |
key_insights = summary.get('key_insights', [])
|
| 1112 |
recommendations = summary.get('recommendations', [])
|
| 1113 |
+
|
| 1114 |
return f"""
|
| 1115 |
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 15px; color: white; margin: 20px 0;">
|
| 1116 |
<h3 style="margin: 0 0 20px 0; text-align: center; font-size: 2em;">π Pipeline Completed Successfully!</h3>
|
|
|
|
| 1133 |
</div>
|
| 1134 |
</div>
|
| 1135 |
</div>
|
| 1136 |
+
|
| 1137 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
|
| 1138 |
<div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 1139 |
<h4 style="margin: 0 0 20px 0; color: #2c3e50; font-size: 1.3em;">π Key Insights Discovered</h4>
|
|
|
|
| 1162 |
"""Create the Gradio interface"""
|
| 1163 |
with gr.Blocks(css=self.custom_css) as demo:
|
| 1164 |
gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>🔬 Comprehensive Data Science Pipeline</h1>")
|
| 1165 |
+
|
| 1166 |
with gr.Row():
|
| 1167 |
with gr.Column(scale=1):
|
| 1168 |
file_upload = gr.File(label="Upload Dataset (CSV or JSON) or Drag & Drop", file_types=[".csv", ".json"])
|
|
|
|
| 1172 |
enable_deep_learning = gr.Checkbox(label="Enable Deep Learning", value=False)
|
| 1173 |
enable_automl = gr.Checkbox(label="Enable AutoML", value=True)
|
| 1174 |
run_btn = gr.Button("Run Pipeline", variant="primary")
|
| 1175 |
+
|
| 1176 |
with gr.Column(scale=1):
|
| 1177 |
file_status = gr.HTML()
|
| 1178 |
preview = gr.HTML()
|
| 1179 |
+
|
| 1180 |
output = gr.HTML()
|
| 1181 |
+
|
| 1182 |
# Hidden states
|
| 1183 |
file_type_state = gr.State("")
|
| 1184 |
columns_state = gr.State([])
|
| 1185 |
+
|
| 1186 |
# Events
|
| 1187 |
file_upload.change(
|
| 1188 |
fn=self.process_file_upload,
|
| 1189 |
inputs=[file_upload, learning_type],
|
| 1190 |
outputs=[file_status, file_type_state, columns_state, target_column, preview]
|
| 1191 |
)
|
| 1192 |
+
|
| 1193 |
learning_type.change(
|
| 1194 |
fn=self.update_target_column_visibility,
|
| 1195 |
inputs=[learning_type, columns_state],
|
| 1196 |
outputs=[target_column]
|
| 1197 |
)
|
| 1198 |
+
|
| 1199 |
run_btn.click(
|
| 1200 |
fn=self.run_comprehensive_pipeline,
|
| 1201 |
inputs=[file_upload, learning_type, target_column, domain, enable_deep_learning, enable_automl],
|
| 1202 |
outputs=[output]
|
| 1203 |
)
|
| 1204 |
+
|
| 1205 |
return demo
|
| 1206 |
|
| 1207 |
if __name__ == "__main__":
|
| 1208 |
ui = DataSciencePipelineUI()
|
| 1209 |
demo = ui.create_interface()
|
| 1210 |
+
demo.launch(share=True)
|