Girinath11 committed on
Commit
0d15f54
·
verified ·
1 Parent(s): 08c90b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +265 -156
app.py CHANGED
@@ -1,43 +1,152 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import numpy as np
4
- import json
5
- from io import BytesIO
6
- import base64
7
  import os
8
- import time
9
- from datetime import datetime
10
- import warnings
11
- warnings.filterwarnings('ignore')
12
 
13
- # Install and import matplotlib with error handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  try:
 
 
 
 
 
 
 
 
 
 
15
  import matplotlib
16
- matplotlib.use('Agg') # Use non-interactive backend for web deployment
17
- import matplotlib.pyplot as plt
18
- import seaborn as sns
19
- except ImportError:
20
- import subprocess
21
- import sys
22
- subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib", "seaborn"])
23
- import matplotlib
24
- matplotlib.use('Agg')
25
  import matplotlib.pyplot as plt
26
  import seaborn as sns
27
-
28
- # Import plotly with error handling
29
- try:
30
- import plotly.graph_objects as go
31
- import plotly.express as px
32
- from plotly.subplots import make_subplots
33
- except ImportError:
34
- import subprocess
35
- import sys
36
- subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly"])
37
  import plotly.graph_objects as go
38
  import plotly.express as px
39
  from plotly.subplots import make_subplots
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  # Import your comprehensive pipeline
42
  try:
43
  from supervisor_agent import SupervisorAgent
@@ -46,32 +155,32 @@ except ImportError:
46
 
47
  class DataSciencePipelineUI:
48
  """Advanced UI for the comprehensive data science pipeline"""
49
-
50
  def __init__(self):
51
  try:
52
  self.supervisor = SupervisorAgent()
53
  except:
54
  # Fallback mock implementation if supervisor_agent isn't available
55
  self.supervisor = self._create_mock_supervisor()
56
-
57
  self.current_data = None
58
  self.pipeline_results = None
59
-
60
  # UI State
61
  self.processing_step = 0
62
  self.total_steps = 6
63
-
64
  # Styling
65
  self.custom_css = """
66
- .main-container {
67
- max-width: 1400px;
68
- margin: 0 auto;
69
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
70
  }
71
- .step-container {
72
- margin: 15px 0;
73
- padding: 20px;
74
- border-radius: 12px;
75
  border-left: 5px solid #3498db;
76
  background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
77
  box-shadow: 0 4px 6px rgba(0,0,0,0.1);
@@ -126,7 +235,7 @@ class DataSciencePipelineUI:
126
  'info': {'shape': (1000, 10), 'columns': ['col1', 'col2'], 'dtypes': {'col1': 'float64'}}
127
  },
128
  'data_cleaning': {
129
- 'status': 'success',
130
  'cleaning_report': {'duplicates_removed': 5, 'missing_values': {'col1': 10}}
131
  }
132
  },
@@ -152,12 +261,12 @@ class DataSciencePipelineUI:
152
  """Enhanced file processing with detailed analysis"""
153
  if file_obj is None:
154
  return "❌ No file uploaded", "", [], gr.update(visible=False), ""
155
-
156
  try:
157
  file_path = file_obj.name
158
  file_name = os.path.basename(file_path)
159
  file_extension = os.path.splitext(file_name)[1].lower()
160
-
161
  # Load data based on file type
162
  if file_extension == '.csv':
163
  df = pd.read_csv(file_path)
@@ -167,24 +276,24 @@ class DataSciencePipelineUI:
167
  file_type = 'json'
168
  else:
169
  return "❌ Unsupported file type. Please upload CSV or JSON files only.", "", [], gr.update(visible=False), ""
170
-
171
  # Store the data
172
  self.current_data = df
173
-
174
  # Detailed file analysis
175
  file_size = os.path.getsize(file_path) / 1024 # KB
176
  memory_usage = df.memory_usage(deep=True).sum() / 1024**2 # MB
177
  missing_count = df.isnull().sum().sum()
178
  duplicate_count = df.duplicated().sum()
179
-
180
  # Data type analysis
181
  numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
182
  categorical_cols = len(df.select_dtypes(include=['object']).columns)
183
  datetime_cols = len(df.select_dtypes(include=['datetime64']).columns)
184
-
185
  # Create preview table HTML
186
  preview_html = self._create_data_preview(df)
187
-
188
  file_info = f"""
189
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; color: white; margin: 10px 0;">
190
  <h3 style="margin: 0 0 15px 0;">πŸ“Š File Upload Successful!</h3>
@@ -216,25 +325,25 @@ class DataSciencePipelineUI:
216
  </div>
217
  </div>
218
  """
219
-
220
  columns = df.columns.tolist()
221
  target_update = gr.update(visible=(learning_type == "Supervised"), choices=columns, value=columns[0] if columns and learning_type == "Supervised" else "")
222
-
223
  return (
224
- file_info,
225
- file_type,
226
- columns,
227
  target_update,
228
  preview_html
229
  )
230
-
231
  except Exception as e:
232
  return f"❌ Error processing file: {str(e)}", "", [], gr.update(visible=False), ""
233
 
234
  def _create_data_preview(self, df):
235
  """Create HTML preview of the data"""
236
  preview_df = df.head(10)
237
-
238
  html = """
239
  <div style="background: white; padding: 20px; border-radius: 10px; margin: 15px 0; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
240
  <h4 style="color: #2c3e50; margin-bottom: 15px;">πŸ“‹ Data Preview (First 10 rows)</h4>
@@ -243,12 +352,12 @@ class DataSciencePipelineUI:
243
  <thead>
244
  <tr style="background-color: #3498db; color: white;">
245
  """
246
-
247
  # Add headers
248
  for col in preview_df.columns:
249
  html += f"<th style='padding: 8px; text-align: left; border: 1px solid #ddd;'>{col}</th>"
250
  html += "</tr></thead><tbody>"
251
-
252
  # Add rows
253
  for idx, row in preview_df.iterrows():
254
  html += f"<tr style='background-color: {'#f9f9f9' if idx % 2 == 0 else 'white'};'>"
@@ -260,10 +369,10 @@ class DataSciencePipelineUI:
260
  cell_value = f"{value:.3f}" if isinstance(value, float) else str(value)
261
  else:
262
  cell_value = str(value)[:50] + "..." if len(str(value)) > 50 else str(value)
263
-
264
  html += f"<td style='padding: 8px; border: 1px solid #ddd;'>{cell_value}</td>"
265
  html += "</tr>"
266
-
267
  html += "</tbody></table></div></div>"
268
  return html
269
 
@@ -278,27 +387,27 @@ class DataSciencePipelineUI:
278
  """Run the complete comprehensive pipeline with advanced features"""
279
  if file_obj is None:
280
  return self._create_error_html("Please upload a file first.")
281
-
282
  if learning_type == "Supervised" and not target_column:
283
  return self._create_error_html("Please select a target column for supervised learning.")
284
-
285
  try:
286
  # Initialize progress tracking
287
  progress_html = self._create_progress_header()
288
-
289
  file_path = file_obj.name
290
  file_extension = os.path.splitext(file_path)[1].lower().replace('.', '')
291
-
292
  # Step 1: Data Loading
293
  step1_html = self._create_step_html(
294
- 1, "πŸ“ Data Loading", "loading",
295
  "Loading and validating your dataset..."
296
  )
297
  progress_html += step1_html
298
-
299
  # Simulate some processing time for better UX
300
  time.sleep(1)
301
-
302
  # Execute data loading
303
  try:
304
  # Use your actual SupervisorAgent
@@ -307,52 +416,52 @@ class DataSciencePipelineUI:
307
  'target_column': target_column if target_column else None,
308
  'domain': domain.lower() if domain else 'general'
309
  }
310
-
311
  result = self.supervisor.execute_pipeline(
312
  data_source=file_path,
313
  **pipeline_kwargs
314
  )
315
-
316
  if result['status'] != 'success':
317
  return self._create_error_html(f"Pipeline failed: {result.get('error', 'Unknown error')}")
318
-
319
  self.pipeline_results = result['pipeline_results']
320
  summary = result['summary']
321
-
322
  except Exception as e:
323
  # Fallback to demonstration mode
324
  result = self._create_demo_results(self.current_data, target_column, learning_type, domain)
325
  self.pipeline_results = result['pipeline_results']
326
  summary = result['summary']
327
-
328
  # Update Step 1 - Completed
329
  step1_complete = self._create_step_html(
330
  1, "πŸ“ Data Loading", "completed",
331
  self._format_data_loading_results(self.pipeline_results.get('data_loading', {}))
332
  )
333
  progress_html = progress_html.replace(step1_html, step1_complete)
334
-
335
  # Step 2: Data Cleaning
336
  step2_html = self._create_step_html(
337
  2, "🧹 Data Cleaning", "completed",
338
  self._format_data_cleaning_results(self.pipeline_results.get('data_cleaning', {}))
339
  )
340
  progress_html += step2_html
341
-
342
  # Step 3: Exploratory Data Analysis
343
  step3_html = self._create_step_html(
344
  3, "πŸ“Š Exploratory Data Analysis", "completed",
345
  self._format_eda_results(self.pipeline_results.get('eda', {}), self.current_data)
346
  )
347
  progress_html += step3_html
348
-
349
  # Step 4: Feature Engineering & Domain Insights
350
  step4_html = self._create_step_html(
351
  4, "βš™οΈ Feature Engineering & Domain Analysis", "completed",
352
  self._format_domain_results(self.pipeline_results.get('domain_insights', {}))
353
  )
354
  progress_html += step4_html
355
-
356
  # Step 5: Model Training
357
  if learning_type == "Supervised" and target_column:
358
  step5_html = self._create_step_html(
@@ -366,20 +475,20 @@ class DataSciencePipelineUI:
366
  self._format_unsupervised_results(self.current_data)
367
  )
368
  progress_html += step5_html
369
-
370
  # Step 6: Results & Insights
371
  step6_html = self._create_step_html(
372
  6, "πŸ“ˆ Results & Recommendations", "completed",
373
  self._format_final_results(summary, self.pipeline_results)
374
  )
375
  progress_html += step6_html
376
-
377
  # Add completion footer
378
  completion_html = self._create_completion_footer(learning_type, domain, enable_deep_learning, enable_automl)
379
  progress_html += completion_html
380
-
381
  return progress_html
382
-
383
  except Exception as e:
384
  return self._create_error_html(f"Pipeline execution failed: {str(e)}")
385
 
@@ -394,7 +503,7 @@ class DataSciencePipelineUI:
394
  def _create_demo_results(self, data, target_column, learning_type, domain):
395
  """Create demonstration results when actual pipeline fails"""
396
  from datetime import datetime
397
-
398
  # Mock comprehensive results
399
  return {
400
  'status': 'success',
@@ -478,9 +587,9 @@ class DataSciencePipelineUI:
478
  'completed': {'color': '#27ae60', 'icon': 'βœ…', 'bg': '#d4edda'},
479
  'error': {'color': '#e74c3c', 'icon': '❌', 'bg': '#f8d7da'}
480
  }
481
-
482
  config = status_config.get(status, status_config['loading'])
483
-
484
  return f"""
485
  <div style="margin: 20px 0; padding: 25px; background: {config['bg']}; border-left: 6px solid {config['color']}; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
486
  <div style="display: flex; align-items: center; margin-bottom: 15px;">
@@ -502,16 +611,16 @@ class DataSciencePipelineUI:
502
  """Format data loading results"""
503
  if not results or results.get('status') != 'success':
504
  return "<p>Data loading information not available</p>"
505
-
506
  info = results.get('info', {})
507
  shape = info.get('shape', (0, 0))
508
  columns = info.get('columns', [])
509
  dtypes = info.get('dtypes', {})
510
-
511
  # Count data types
512
  numeric_cols = sum(1 for dtype in dtypes.values() if 'int' in str(dtype) or 'float' in str(dtype))
513
  categorical_cols = sum(1 for dtype in dtypes.values() if 'object' in str(dtype))
514
-
515
  return f"""
516
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
517
  <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -541,15 +650,15 @@ class DataSciencePipelineUI:
541
  """Format data cleaning results"""
542
  if not results or results.get('status') != 'success':
543
  return "<p>Data cleaning information not available</p>"
544
-
545
  report = results.get('cleaning_report', {})
546
  duplicates = report.get('duplicates_removed', 0)
547
  missing_values = report.get('missing_values', {})
548
  outliers = report.get('outliers_handled', {})
549
-
550
  total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
551
  total_outliers = sum(outliers.values()) if isinstance(outliers, dict) else 0
552
-
553
  return f"""
554
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
555
  <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -560,21 +669,21 @@ class DataSciencePipelineUI:
560
  </div>
561
  <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
562
  <h4 style="margin: 0 0 10px 0; color: #e67e22;">πŸ“ˆ Data Quality</h4>
563
- <p style="margin: 5px 0;"><strong>Overall Quality:</strong>
564
  <span style="color: #27ae60; font-weight: bold;">
565
  {85 + np.random.randint(0, 15):.1f}%
566
  </span>
567
  </p>
568
- <p style="margin: 5px 0;"><strong>Completeness:</strong>
569
  <span style="color: #27ae60;">
570
  {95 + np.random.randint(0, 5):.1f}%
571
  </span>
572
  </p>
573
  </div>
574
  </div>
575
-
576
  {self._create_missing_values_chart(missing_values) if missing_values else ""}
577
-
578
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Data cleaning completed successfully!</strong></p>
579
  <div style="background: #e8f5e8; padding: 10px; border-radius: 6px; margin-top: 10px;">
580
  <p style="margin: 0; color: #2d5a2d;"><strong>Cleaning Strategy:</strong> Applied median imputation for numeric features and mode imputation for categorical features. Outliers were capped using IQR method.</p>
@@ -585,31 +694,31 @@ class DataSciencePipelineUI:
585
  """Create a visual representation of missing values"""
586
  if not missing_values or not any(missing_values.values()):
587
  return ""
588
-
589
  # Filter out columns with no missing values
590
  missing_data = {k: v for k, v in missing_values.items() if v > 0}
591
-
592
  if not missing_data:
593
  return ""
594
-
595
  try:
596
  # Create a simple matplotlib bar chart
597
  fig, ax = plt.subplots(figsize=(10, 6))
598
  columns = list(missing_data.keys())[:10] # Limit to 10 columns
599
  values = [missing_data[col] for col in columns]
600
-
601
  bars = ax.bar(columns, values, color='#e74c3c', alpha=0.7)
602
  ax.set_xlabel('Columns')
603
  ax.set_ylabel('Missing Values Count')
604
  ax.set_title('Missing Values by Column (Before Cleaning)')
605
  plt.xticks(rotation=45, ha='right')
606
  plt.tight_layout()
607
-
608
  # Add value labels on bars
609
  for bar, value in zip(bars, values):
610
  ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
611
  str(value), ha='center', va='bottom')
612
-
613
  chart_html = self.create_plot_html(fig)
614
  return f"""
615
  <div style="background: white; padding: 15px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -624,11 +733,11 @@ class DataSciencePipelineUI:
624
  """Format EDA results with visualizations"""
625
  if not results or results.get('status') != 'success':
626
  return "<p>EDA information not available</p>"
627
-
628
  analysis = results.get('analysis', {})
629
  correlations = analysis.get('correlations', {})
630
  correlation_matrix = correlations.get('correlation_matrix', {})
631
-
632
  eda_html = f"""
633
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
634
  <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -644,47 +753,47 @@ class DataSciencePipelineUI:
644
  </div>
645
  </div>
646
  """
647
-
648
  # Add correlation heatmap if available
649
  if correlation_matrix:
650
  eda_html += self._create_correlation_heatmap(correlation_matrix)
651
-
652
  # Add distribution plots
653
  eda_html += self._create_distribution_plots(data)
654
-
655
  eda_html += """
656
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Exploratory Data Analysis completed!</strong></p>
657
  <div style="background: #f0e6ff; padding: 10px; border-radius: 6px; margin-top: 10px;">
658
  <p style="margin: 0; color: #6a1b9a;"><strong>Key Insights:</strong> Statistical analysis reveals data patterns, correlations, and distributions that will guide feature engineering and model selection.</p>
659
  </div>
660
  """
661
-
662
  return eda_html
663
 
664
  def _create_correlation_heatmap(self, correlation_matrix):
665
  """Create correlation heatmap visualization"""
666
  if not correlation_matrix:
667
  return ""
668
-
669
  try:
670
  corr_df = pd.DataFrame(correlation_matrix)
671
  if corr_df.empty or len(corr_df.columns) < 2:
672
  return ""
673
-
674
  fig, ax = plt.subplots(figsize=(10, 8))
675
  mask = np.triu(np.ones_like(corr_df, dtype=bool)) # Mask upper triangle
676
- sns.heatmap(corr_df, mask=mask, annot=True, cmap='RdBu_r', center=0,
677
  square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax)
678
  plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
679
  plt.tight_layout()
680
-
681
  chart_html = self.create_plot_html(fig)
682
  return f"""
683
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
684
  <h4 style="margin: 0 0 15px 0; color: #9b59b6;">πŸ”— Correlation Analysis</h4>
685
  {chart_html}
686
  <p style="margin-top: 10px; font-size: 12px; color: #666;">
687
- <strong>Interpretation:</strong> Red indicates negative correlation, blue indicates positive correlation.
688
  Values closer to Β±1 indicate stronger relationships.
689
  </p>
690
  </div>
@@ -696,13 +805,13 @@ class DataSciencePipelineUI:
696
  """Create distribution plots for key variables"""
697
  try:
698
  numeric_cols = data.select_dtypes(include=[np.number]).columns[:4] # Limit to 4 plots
699
-
700
  if len(numeric_cols) == 0:
701
  return "<p>No numeric columns found for distribution analysis</p>"
702
-
703
  fig, axes = plt.subplots(2, 2, figsize=(12, 8))
704
  axes = axes.flatten()
705
-
706
  for i, col in enumerate(numeric_cols):
707
  if i < 4:
708
  sns.histplot(data[col].dropna(), kde=True, ax=axes[i], color='skyblue', alpha=0.7)
@@ -710,14 +819,14 @@ class DataSciencePipelineUI:
710
  axes[i].set_xlabel(col)
711
  axes[i].set_ylabel('Frequency')
712
  axes[i].grid(True, alpha=0.3)
713
-
714
  # Hide empty subplots
715
  for i in range(len(numeric_cols), 4):
716
  axes[i].set_visible(False)
717
-
718
  plt.suptitle('Feature Distributions', fontsize=16, fontweight='bold', y=1.02)
719
  plt.tight_layout()
720
-
721
  chart_html = self.create_plot_html(fig)
722
  return f"""
723
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -735,11 +844,11 @@ class DataSciencePipelineUI:
735
  """Format domain analysis results"""
736
  if not results:
737
  return "<p>Domain analysis information not available</p>"
738
-
739
  domain = results.get('detected_domain', 'general')
740
  insights = results.get('insights', [])
741
  recommendations = results.get('recommendations', [])
742
-
743
  return f"""
744
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px; margin: 15px 0;">
745
  <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -757,14 +866,14 @@ class DataSciencePipelineUI:
757
  </ul>
758
  </div>
759
  </div>
760
-
761
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
762
  <h4 style="margin: 0 0 15px 0; color: #1abc9c;">🎯 Recommendations</h4>
763
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px;">
764
  {''.join([f'<div style="background: #e8f5e8; padding: 12px; border-radius: 6px; border-left: 4px solid #27ae60;"><span style="color: #27ae60; font-weight: bold;">β€’</span> {rec}</div>' for rec in recommendations[:6]])}
765
  </div>
766
  </div>
767
-
768
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Domain analysis and feature engineering recommendations completed!</strong></p>
769
  <div style="background: #e0f7fa; padding: 10px; border-radius: 6px; margin-top: 10px;">
770
  <p style="margin: 0; color: #00695c;"><strong>Feature Engineering:</strong> Applied domain-specific transformations and created relevant features based on {domain} domain expertise.</p>
@@ -775,18 +884,18 @@ class DataSciencePipelineUI:
775
  """Format modeling results with comprehensive metrics"""
776
  if not results or results.get('status') != 'success':
777
  return self._format_unsupervised_results(self.current_data)
778
-
779
  problem_type = results.get('problem_type', 'classification')
780
  best_model = results.get('best_model', 'Unknown')
781
  model_results = results.get('results', {})
782
  feature_importance = results.get('feature_importance', {})
783
-
784
  # Create model comparison chart
785
  model_comparison_html = self._create_model_comparison_chart(model_results, problem_type)
786
-
787
  # Create feature importance chart
788
  feature_importance_html = self._create_feature_importance_chart(feature_importance)
789
-
790
  return f"""
791
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
792
  <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
@@ -805,10 +914,10 @@ class DataSciencePipelineUI:
805
  <p style="margin: 8px 0;"><strong>Features Used:</strong> {len(feature_importance) if feature_importance else 'N/A'}</p>
806
  </div>
807
  </div>
808
-
809
  {model_comparison_html}
810
  {feature_importance_html}
811
-
812
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
813
  <h4 style="margin: 0 0 15px 0; color: #e74c3c;">πŸ§ͺ Training Details</h4>
814
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
@@ -826,7 +935,7 @@ class DataSciencePipelineUI:
826
  </div>
827
  </div>
828
  </div>
829
-
830
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Model training and evaluation completed successfully!</strong></p>
831
  <div style="background: #fef5e7; padding: 10px; border-radius: 6px; margin-top: 10px;">
832
  <p style="margin: 0; color: #d68910;"><strong>Model Performance:</strong> The {best_model} achieved the best performance with comprehensive evaluation metrics. Consider ensemble methods for further improvement.</p>
@@ -837,7 +946,7 @@ class DataSciencePipelineUI:
837
  """Get formatted metrics for the best model"""
838
  if not best_model_result:
839
  return ""
840
-
841
  if 'classification' in problem_type.lower():
842
  accuracy = best_model_result.get('accuracy', 0)
843
  f1_score = best_model_result.get('f1_score', 0)
@@ -861,39 +970,39 @@ class DataSciencePipelineUI:
861
  """Create model comparison visualization"""
862
  if not model_results:
863
  return ""
864
-
865
  try:
866
  # Prepare data for plotting
867
  model_names = []
868
  scores = []
869
-
870
  for model_name, result in model_results.items():
871
  model_names.append(model_name)
872
  if 'classification' in problem_type.lower():
873
  scores.append(result.get('accuracy', 0))
874
  else:
875
  scores.append(result.get('r2_score', 0))
876
-
877
  if not model_names:
878
  return ""
879
-
880
  # Create plot
881
  fig, ax = plt.subplots(figsize=(12, 6))
882
  bars = ax.barh(model_names, scores, color=plt.cm.viridis(np.linspace(0, 1, len(model_names))))
883
-
884
  # Customize plot
885
  ax.set_xlabel('Accuracy' if 'classification' in problem_type.lower() else 'RΒ² Score')
886
  ax.set_title(f'Model Performance Comparison - {problem_type.title()}', fontsize=16, fontweight='bold', pad=20)
887
  ax.grid(True, alpha=0.3, axis='x')
888
-
889
  # Add value labels on bars
890
  for bar, score in zip(bars, scores):
891
  ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
892
  f'{score:.3f}', ha='left', va='center', fontweight='bold')
893
-
894
  plt.tight_layout()
895
  chart_html = self.create_plot_html(fig)
896
-
897
  return f"""
898
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
899
  <h4 style="margin: 0 0 15px 0; color: #e74c3c;">πŸ“Š Model Performance Comparison</h4>
@@ -912,30 +1021,30 @@ class DataSciencePipelineUI:
912
  """Create feature importance visualization"""
913
  if not feature_importance:
914
  return ""
915
-
916
  try:
917
  # Get top 10 features
918
  sorted_features = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10])
919
-
920
  features = list(sorted_features.keys())
921
  importance = list(sorted_features.values())
922
-
923
  # Create plot
924
  fig, ax = plt.subplots(figsize=(10, 6))
925
  bars = ax.barh(features, importance, color='coral', alpha=0.8)
926
-
927
  ax.set_xlabel('Feature Importance')
928
  ax.set_title('Top 10 Most Important Features', fontsize=16, fontweight='bold', pad=20)
929
  ax.grid(True, alpha=0.3, axis='x')
930
-
931
  # Add value labels
932
  for bar, imp in zip(bars, importance):
933
  ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
934
  f'{imp:.3f}', ha='left', va='center', fontweight='bold')
935
-
936
  plt.tight_layout()
937
  chart_html = self.create_plot_html(fig)
938
-
939
  return f"""
940
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
941
  <h4 style="margin: 0 0 15px 0; color: #e74c3c;">🎯 Feature Importance Analysis</h4>
@@ -972,7 +1081,7 @@ class DataSciencePipelineUI:
972
  <p style="margin: 8px 0;"><strong>Dimensionality:</strong> {data.shape[1]} features analyzed</p>
973
  </div>
974
  </div>
975
-
976
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
977
  <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🎯 Cluster Characteristics</h4>
978
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
@@ -990,7 +1099,7 @@ class DataSciencePipelineUI:
990
  </div>
991
  </div>
992
  </div>
993
-
994
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Unsupervised analysis completed successfully!</strong></p>
995
  <div style="background: #f3e5f5; padding: 10px; border-radius: 6px; margin-top: 10px;">
996
  <p style="margin: 0; color: #7b1fa2;"><strong>Insights:</strong> Discovered natural groupings in your data that can be used for segmentation, anomaly detection, and pattern recognition.</p>
@@ -1001,7 +1110,7 @@ class DataSciencePipelineUI:
1001
  """Format final results and recommendations"""
1002
  key_insights = summary.get('key_insights', [])
1003
  recommendations = summary.get('recommendations', [])
1004
-
1005
  return f"""
1006
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 15px; color: white; margin: 20px 0;">
1007
  <h3 style="margin: 0 0 20px 0; text-align: center; font-size: 2em;">πŸŽ‰ Pipeline Completed Successfully!</h3>
@@ -1024,7 +1133,7 @@ class DataSciencePipelineUI:
1024
  </div>
1025
  </div>
1026
  </div>
1027
-
1028
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
1029
  <div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
1030
  <h4 style="margin: 0 0 20px 0; color: #2c3e50; font-size: 1.3em;">πŸ” Key Insights Discovered</h4>
@@ -1053,7 +1162,7 @@ class DataSciencePipelineUI:
1053
  """Create the Gradio interface"""
1054
  with gr.Blocks(css=self.custom_css) as demo:
1055
  gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>πŸ”¬ Comprehensive Data Science Pipeline</h1>")
1056
-
1057
  with gr.Row():
1058
  with gr.Column(scale=1):
1059
  file_upload = gr.File(label="Upload Dataset (CSV or JSON) or Drag & Drop", file_types=[".csv", ".json"])
@@ -1063,39 +1172,39 @@ class DataSciencePipelineUI:
1063
  enable_deep_learning = gr.Checkbox(label="Enable Deep Learning", value=False)
1064
  enable_automl = gr.Checkbox(label="Enable AutoML", value=True)
1065
  run_btn = gr.Button("Run Pipeline", variant="primary")
1066
-
1067
  with gr.Column(scale=1):
1068
  file_status = gr.HTML()
1069
  preview = gr.HTML()
1070
-
1071
  output = gr.HTML()
1072
-
1073
  # Hidden states
1074
  file_type_state = gr.State("")
1075
  columns_state = gr.State([])
1076
-
1077
  # Events
1078
  file_upload.change(
1079
  fn=self.process_file_upload,
1080
  inputs=[file_upload, learning_type],
1081
  outputs=[file_status, file_type_state, columns_state, target_column, preview]
1082
  )
1083
-
1084
  learning_type.change(
1085
  fn=self.update_target_column_visibility,
1086
  inputs=[learning_type, columns_state],
1087
  outputs=[target_column]
1088
  )
1089
-
1090
  run_btn.click(
1091
  fn=self.run_comprehensive_pipeline,
1092
  inputs=[file_upload, learning_type, target_column, domain, enable_deep_learning, enable_automl],
1093
  outputs=[output]
1094
  )
1095
-
1096
  return demo
1097
 
1098
  if __name__ == "__main__":
1099
  ui = DataSciencePipelineUI()
1100
  demo = ui.create_interface()
1101
- demo.launch(share=True)
 
1
+ import subprocess
2
+ import sys
3
+ import importlib
 
 
 
4
  import os
 
 
 
 
5
 
6
def install_package(package):
    """Install a single package with pip and report the outcome on stdout.

    Runs ``<current interpreter> -m pip install <package>`` in a child
    process, so the package is installed for the interpreter that is
    executing this script.

    Args:
        package: Requirement specifier passed to pip unchanged, for
            example ``"numpy>=1.21.0"``.

    Returns:
        True when pip exits with status 0; False when pip reports a
        failure or the child process cannot be started.
    """
    print(f"📦 Installing {package}...")
    command = [
        sys.executable, "-m", "pip", "install", package,
        "--quiet", "--no-warn-script-location",
    ]
    try:
        # An argument list (no shell) keeps specifiers such as ">=" from
        # being interpreted as shell redirection.
        subprocess.check_call(command)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Best-effort installer: a non-zero pip exit, a process that cannot
        # be spawned, or an argument the OS rejects is reported and turned
        # into False so the caller can keep going with the next package.
        print(f"❌ Failed to install {package}: {e}")
        return False
    print(f"✅ Successfully installed {package}")
    return True
16
+
17
def install_all_packages(packages=None):
    """Install the app's requirements, one ``install_package`` call each.

    Progress is printed to stdout. A failed package does not stop the
    loop; the remaining packages are still attempted.

    Args:
        packages: Optional iterable of pip requirement specifiers. When
            omitted, the built-in list below is used, which is the
            behavior of the original zero-argument call.

    Returns:
        True if every requested package installed successfully, False if
        at least one failed. An empty request returns True.
    """
    if packages is None:
        packages = [
            # Core packages
            "numpy>=1.21.0",
            "pandas>=1.3.0",

            # Visualization
            "matplotlib>=3.4.0",
            "seaborn>=0.11.0",
            "plotly>=5.0.0",

            # Machine Learning
            "scikit-learn>=1.0.0",

            # Deep Learning (heavy packages)
            "tensorflow>=2.8.0",
            "keras>=2.8.0",

            # Boosting libraries (heavy packages)
            "xgboost>=1.5.0",
            "lightgbm>=3.3.0",
            "catboost>=1.0.0",

            # Utilities
            "requests>=2.25.0",
            "openpyxl>=3.0.0",

            # Interface
            "gradio>=4.0.0",
        ]
    else:
        # Materialize once so generators can be counted and iterated.
        packages = list(packages)

    total = len(packages)
    print("🚀 Starting installation of all required packages...")
    print(f"📋 Total packages to install: {total}")

    success_count = 0
    for i, package in enumerate(packages, 1):
        print(f"\n[{i}/{total}] Processing {package}")
        if install_package(package):
            success_count += 1

    print(f"\n🎉 Installation completed! {success_count}/{total} packages installed successfully.")
    return success_count == total
60
+
61
+ # Install all packages at startup
62
+ install_all_packages()
63
+
64
+ # Now import all packages
65
+ print("\nπŸ“₯ Importing all packages...")
66
+
67
  try:
68
+ # Core packages
69
+ import gradio as gr
70
+ import pandas as pd
71
+ import numpy as np
72
+ print("βœ… Core packages imported")
73
+ except ImportError as e:
74
+ print(f"❌ Core packages import failed: {e}")
75
+
76
+ try:
77
+ # Visualization packages
78
  import matplotlib
79
+ matplotlib.use('Agg') # Non-interactive backend for web
 
 
 
 
 
 
 
 
80
  import matplotlib.pyplot as plt
81
  import seaborn as sns
 
 
 
 
 
 
 
 
 
 
82
  import plotly.graph_objects as go
83
  import plotly.express as px
84
  from plotly.subplots import make_subplots
85
+ print("βœ… Visualization packages imported")
86
+ except ImportError as e:
87
+ print(f"❌ Visualization packages import failed: {e}")
88
+
89
+ try:
90
+ # Machine Learning packages
91
+ from sklearn.model_selection import train_test_split, cross_val_score
92
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
93
+ from sklearn.linear_model import LogisticRegression, LinearRegression
94
+ from sklearn.svm import SVC, SVR
95
+ from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
96
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
97
+ from sklearn.cluster import KMeans
98
+ print("βœ… Scikit-learn imported")
99
+ except ImportError as e:
100
+ print(f"❌ Scikit-learn import failed: {e}")
101
 
102
+ try:
103
+ # Deep Learning packages
104
+ import tensorflow as tf
105
+ from tensorflow import keras
106
+ from tensorflow.keras.models import Sequential
107
+ from tensorflow.keras.layers import Dense, LSTM, Conv2D
108
+ print("βœ… TensorFlow and Keras imported")
109
+ except ImportError as e:
110
+ print(f"⚠️ TensorFlow/Keras import failed (optional): {e}")
111
+
112
+ try:
113
+ # Boosting libraries
114
+ import xgboost as xgb
115
+ print("βœ… XGBoost imported")
116
+ except ImportError as e:
117
+ print(f"⚠️ XGBoost import failed (optional): {e}")
118
+
119
+ try:
120
+ import lightgbm as lgb
121
+ print("βœ… LightGBM imported")
122
+ except ImportError as e:
123
+ print(f"⚠️ LightGBM import failed (optional): {e}")
124
+
125
+ try:
126
+ import catboost as cb
127
+ from catboost import CatBoostClassifier, CatBoostRegressor
128
+ print("βœ… CatBoost imported")
129
+ except ImportError as e:
130
+ print(f"⚠️ CatBoost import failed (optional): {e}")
131
+
132
+ try:
133
+ # Utility packages
134
+ import requests
135
+ import openpyxl
136
+ print("βœ… Utility packages imported")
137
+ except ImportError as e:
138
+ print(f"❌ Utility packages import failed: {e}")
139
+
140
+ # Standard library imports (no installation needed)
141
+ import json
142
+ from io import BytesIO
143
+ import base64
144
+ import time
145
+ from datetime import datetime
146
+ import warnings
147
+ warnings.filterwarnings('ignore')
148
+
149
+ print("πŸŽ‰ All package imports completed!")
150
  # Import your comprehensive pipeline
151
  try:
152
  from supervisor_agent import SupervisorAgent
 
155
 
156
  class DataSciencePipelineUI:
157
  """Advanced UI for the comprehensive data science pipeline"""
158
+
159
  def __init__(self):
160
  try:
161
  self.supervisor = SupervisorAgent()
162
  except:
163
  # Fallback mock implementation if supervisor_agent isn't available
164
  self.supervisor = self._create_mock_supervisor()
165
+
166
  self.current_data = None
167
  self.pipeline_results = None
168
+
169
  # UI State
170
  self.processing_step = 0
171
  self.total_steps = 6
172
+
173
  # Styling
174
  self.custom_css = """
175
+ .main-container {
176
+ max-width: 1400px;
177
+ margin: 0 auto;
178
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
179
  }
180
+ .step-container {
181
+ margin: 15px 0;
182
+ padding: 20px;
183
+ border-radius: 12px;
184
  border-left: 5px solid #3498db;
185
  background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
186
  box-shadow: 0 4px 6px rgba(0,0,0,0.1);
 
235
  'info': {'shape': (1000, 10), 'columns': ['col1', 'col2'], 'dtypes': {'col1': 'float64'}}
236
  },
237
  'data_cleaning': {
238
+ 'status': 'success',
239
  'cleaning_report': {'duplicates_removed': 5, 'missing_values': {'col1': 10}}
240
  }
241
  },
 
261
  """Enhanced file processing with detailed analysis"""
262
  if file_obj is None:
263
  return "❌ No file uploaded", "", [], gr.update(visible=False), ""
264
+
265
  try:
266
  file_path = file_obj.name
267
  file_name = os.path.basename(file_path)
268
  file_extension = os.path.splitext(file_name)[1].lower()
269
+
270
  # Load data based on file type
271
  if file_extension == '.csv':
272
  df = pd.read_csv(file_path)
 
276
  file_type = 'json'
277
  else:
278
  return "❌ Unsupported file type. Please upload CSV or JSON files only.", "", [], gr.update(visible=False), ""
279
+
280
  # Store the data
281
  self.current_data = df
282
+
283
  # Detailed file analysis
284
  file_size = os.path.getsize(file_path) / 1024 # KB
285
  memory_usage = df.memory_usage(deep=True).sum() / 1024**2 # MB
286
  missing_count = df.isnull().sum().sum()
287
  duplicate_count = df.duplicated().sum()
288
+
289
  # Data type analysis
290
  numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
291
  categorical_cols = len(df.select_dtypes(include=['object']).columns)
292
  datetime_cols = len(df.select_dtypes(include=['datetime64']).columns)
293
+
294
  # Create preview table HTML
295
  preview_html = self._create_data_preview(df)
296
+
297
  file_info = f"""
298
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; color: white; margin: 10px 0;">
299
  <h3 style="margin: 0 0 15px 0;">πŸ“Š File Upload Successful!</h3>
 
325
  </div>
326
  </div>
327
  """
328
+
329
  columns = df.columns.tolist()
330
  target_update = gr.update(visible=(learning_type == "Supervised"), choices=columns, value=columns[0] if columns and learning_type == "Supervised" else "")
331
+
332
  return (
333
+ file_info,
334
+ file_type,
335
+ columns,
336
  target_update,
337
  preview_html
338
  )
339
+
340
  except Exception as e:
341
  return f"❌ Error processing file: {str(e)}", "", [], gr.update(visible=False), ""
342
 
343
  def _create_data_preview(self, df):
344
  """Create HTML preview of the data"""
345
  preview_df = df.head(10)
346
+
347
  html = """
348
  <div style="background: white; padding: 20px; border-radius: 10px; margin: 15px 0; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
349
  <h4 style="color: #2c3e50; margin-bottom: 15px;">πŸ“‹ Data Preview (First 10 rows)</h4>
 
352
  <thead>
353
  <tr style="background-color: #3498db; color: white;">
354
  """
355
+
356
  # Add headers
357
  for col in preview_df.columns:
358
  html += f"<th style='padding: 8px; text-align: left; border: 1px solid #ddd;'>{col}</th>"
359
  html += "</tr></thead><tbody>"
360
+
361
  # Add rows
362
  for idx, row in preview_df.iterrows():
363
  html += f"<tr style='background-color: {'#f9f9f9' if idx % 2 == 0 else 'white'};'>"
 
369
  cell_value = f"{value:.3f}" if isinstance(value, float) else str(value)
370
  else:
371
  cell_value = str(value)[:50] + "..." if len(str(value)) > 50 else str(value)
372
+
373
  html += f"<td style='padding: 8px; border: 1px solid #ddd;'>{cell_value}</td>"
374
  html += "</tr>"
375
+
376
  html += "</tbody></table></div></div>"
377
  return html
378
 
 
387
  """Run the complete comprehensive pipeline with advanced features"""
388
  if file_obj is None:
389
  return self._create_error_html("Please upload a file first.")
390
+
391
  if learning_type == "Supervised" and not target_column:
392
  return self._create_error_html("Please select a target column for supervised learning.")
393
+
394
  try:
395
  # Initialize progress tracking
396
  progress_html = self._create_progress_header()
397
+
398
  file_path = file_obj.name
399
  file_extension = os.path.splitext(file_path)[1].lower().replace('.', '')
400
+
401
  # Step 1: Data Loading
402
  step1_html = self._create_step_html(
403
+ 1, "πŸ“ Data Loading", "loading",
404
  "Loading and validating your dataset..."
405
  )
406
  progress_html += step1_html
407
+
408
  # Simulate some processing time for better UX
409
  time.sleep(1)
410
+
411
  # Execute data loading
412
  try:
413
  # Use your actual SupervisorAgent
 
416
  'target_column': target_column if target_column else None,
417
  'domain': domain.lower() if domain else 'general'
418
  }
419
+
420
  result = self.supervisor.execute_pipeline(
421
  data_source=file_path,
422
  **pipeline_kwargs
423
  )
424
+
425
  if result['status'] != 'success':
426
  return self._create_error_html(f"Pipeline failed: {result.get('error', 'Unknown error')}")
427
+
428
  self.pipeline_results = result['pipeline_results']
429
  summary = result['summary']
430
+
431
  except Exception as e:
432
  # Fallback to demonstration mode
433
  result = self._create_demo_results(self.current_data, target_column, learning_type, domain)
434
  self.pipeline_results = result['pipeline_results']
435
  summary = result['summary']
436
+
437
  # Update Step 1 - Completed
438
  step1_complete = self._create_step_html(
439
  1, "πŸ“ Data Loading", "completed",
440
  self._format_data_loading_results(self.pipeline_results.get('data_loading', {}))
441
  )
442
  progress_html = progress_html.replace(step1_html, step1_complete)
443
+
444
  # Step 2: Data Cleaning
445
  step2_html = self._create_step_html(
446
  2, "🧹 Data Cleaning", "completed",
447
  self._format_data_cleaning_results(self.pipeline_results.get('data_cleaning', {}))
448
  )
449
  progress_html += step2_html
450
+
451
  # Step 3: Exploratory Data Analysis
452
  step3_html = self._create_step_html(
453
  3, "πŸ“Š Exploratory Data Analysis", "completed",
454
  self._format_eda_results(self.pipeline_results.get('eda', {}), self.current_data)
455
  )
456
  progress_html += step3_html
457
+
458
  # Step 4: Feature Engineering & Domain Insights
459
  step4_html = self._create_step_html(
460
  4, "βš™οΈ Feature Engineering & Domain Analysis", "completed",
461
  self._format_domain_results(self.pipeline_results.get('domain_insights', {}))
462
  )
463
  progress_html += step4_html
464
+
465
  # Step 5: Model Training
466
  if learning_type == "Supervised" and target_column:
467
  step5_html = self._create_step_html(
 
475
  self._format_unsupervised_results(self.current_data)
476
  )
477
  progress_html += step5_html
478
+
479
  # Step 6: Results & Insights
480
  step6_html = self._create_step_html(
481
  6, "πŸ“ˆ Results & Recommendations", "completed",
482
  self._format_final_results(summary, self.pipeline_results)
483
  )
484
  progress_html += step6_html
485
+
486
  # Add completion footer
487
  completion_html = self._create_completion_footer(learning_type, domain, enable_deep_learning, enable_automl)
488
  progress_html += completion_html
489
+
490
  return progress_html
491
+
492
  except Exception as e:
493
  return self._create_error_html(f"Pipeline execution failed: {str(e)}")
494
 
 
503
  def _create_demo_results(self, data, target_column, learning_type, domain):
504
  """Create demonstration results when actual pipeline fails"""
505
  from datetime import datetime
506
+
507
  # Mock comprehensive results
508
  return {
509
  'status': 'success',
 
587
  'completed': {'color': '#27ae60', 'icon': 'βœ…', 'bg': '#d4edda'},
588
  'error': {'color': '#e74c3c', 'icon': '❌', 'bg': '#f8d7da'}
589
  }
590
+
591
  config = status_config.get(status, status_config['loading'])
592
+
593
  return f"""
594
  <div style="margin: 20px 0; padding: 25px; background: {config['bg']}; border-left: 6px solid {config['color']}; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
595
  <div style="display: flex; align-items: center; margin-bottom: 15px;">
 
611
  """Format data loading results"""
612
  if not results or results.get('status') != 'success':
613
  return "<p>Data loading information not available</p>"
614
+
615
  info = results.get('info', {})
616
  shape = info.get('shape', (0, 0))
617
  columns = info.get('columns', [])
618
  dtypes = info.get('dtypes', {})
619
+
620
  # Count data types
621
  numeric_cols = sum(1 for dtype in dtypes.values() if 'int' in str(dtype) or 'float' in str(dtype))
622
  categorical_cols = sum(1 for dtype in dtypes.values() if 'object' in str(dtype))
623
+
624
  return f"""
625
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
626
  <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
 
650
  """Format data cleaning results"""
651
  if not results or results.get('status') != 'success':
652
  return "<p>Data cleaning information not available</p>"
653
+
654
  report = results.get('cleaning_report', {})
655
  duplicates = report.get('duplicates_removed', 0)
656
  missing_values = report.get('missing_values', {})
657
  outliers = report.get('outliers_handled', {})
658
+
659
  total_missing = sum(missing_values.values()) if isinstance(missing_values, dict) else 0
660
  total_outliers = sum(outliers.values()) if isinstance(outliers, dict) else 0
661
+
662
  return f"""
663
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 15px 0;">
664
  <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
 
669
  </div>
670
  <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
671
  <h4 style="margin: 0 0 10px 0; color: #e67e22;">πŸ“ˆ Data Quality</h4>
672
+ <p style="margin: 5px 0;"><strong>Overall Quality:</strong>
673
  <span style="color: #27ae60; font-weight: bold;">
674
  {85 + np.random.randint(0, 15):.1f}%
675
  </span>
676
  </p>
677
+ <p style="margin: 5px 0;"><strong>Completeness:</strong>
678
  <span style="color: #27ae60;">
679
  {95 + np.random.randint(0, 5):.1f}%
680
  </span>
681
  </p>
682
  </div>
683
  </div>
684
+
685
  {self._create_missing_values_chart(missing_values) if missing_values else ""}
686
+
687
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Data cleaning completed successfully!</strong></p>
688
  <div style="background: #e8f5e8; padding: 10px; border-radius: 6px; margin-top: 10px;">
689
  <p style="margin: 0; color: #2d5a2d;"><strong>Cleaning Strategy:</strong> Applied median imputation for numeric features and mode imputation for categorical features. Outliers were capped using IQR method.</p>
 
694
  """Create a visual representation of missing values"""
695
  if not missing_values or not any(missing_values.values()):
696
  return ""
697
+
698
  # Filter out columns with no missing values
699
  missing_data = {k: v for k, v in missing_values.items() if v > 0}
700
+
701
  if not missing_data:
702
  return ""
703
+
704
  try:
705
  # Create a simple matplotlib bar chart
706
  fig, ax = plt.subplots(figsize=(10, 6))
707
  columns = list(missing_data.keys())[:10] # Limit to 10 columns
708
  values = [missing_data[col] for col in columns]
709
+
710
  bars = ax.bar(columns, values, color='#e74c3c', alpha=0.7)
711
  ax.set_xlabel('Columns')
712
  ax.set_ylabel('Missing Values Count')
713
  ax.set_title('Missing Values by Column (Before Cleaning)')
714
  plt.xticks(rotation=45, ha='right')
715
  plt.tight_layout()
716
+
717
  # Add value labels on bars
718
  for bar, value in zip(bars, values):
719
  ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
720
  str(value), ha='center', va='bottom')
721
+
722
  chart_html = self.create_plot_html(fig)
723
  return f"""
724
  <div style="background: white; padding: 15px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
 
733
  """Format EDA results with visualizations"""
734
  if not results or results.get('status') != 'success':
735
  return "<p>EDA information not available</p>"
736
+
737
  analysis = results.get('analysis', {})
738
  correlations = analysis.get('correlations', {})
739
  correlation_matrix = correlations.get('correlation_matrix', {})
740
+
741
  eda_html = f"""
742
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
743
  <div style="background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
 
753
  </div>
754
  </div>
755
  """
756
+
757
  # Add correlation heatmap if available
758
  if correlation_matrix:
759
  eda_html += self._create_correlation_heatmap(correlation_matrix)
760
+
761
  # Add distribution plots
762
  eda_html += self._create_distribution_plots(data)
763
+
764
  eda_html += """
765
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Exploratory Data Analysis completed!</strong></p>
766
  <div style="background: #f0e6ff; padding: 10px; border-radius: 6px; margin-top: 10px;">
767
  <p style="margin: 0; color: #6a1b9a;"><strong>Key Insights:</strong> Statistical analysis reveals data patterns, correlations, and distributions that will guide feature engineering and model selection.</p>
768
  </div>
769
  """
770
+
771
  return eda_html
772
 
773
  def _create_correlation_heatmap(self, correlation_matrix):
774
  """Create correlation heatmap visualization"""
775
  if not correlation_matrix:
776
  return ""
777
+
778
  try:
779
  corr_df = pd.DataFrame(correlation_matrix)
780
  if corr_df.empty or len(corr_df.columns) < 2:
781
  return ""
782
+
783
  fig, ax = plt.subplots(figsize=(10, 8))
784
  mask = np.triu(np.ones_like(corr_df, dtype=bool)) # Mask upper triangle
785
+ sns.heatmap(corr_df, mask=mask, annot=True, cmap='RdBu_r', center=0,
786
  square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax)
787
  plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
788
  plt.tight_layout()
789
+
790
  chart_html = self.create_plot_html(fig)
791
  return f"""
792
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
793
  <h4 style="margin: 0 0 15px 0; color: #9b59b6;">πŸ”— Correlation Analysis</h4>
794
  {chart_html}
795
  <p style="margin-top: 10px; font-size: 12px; color: #666;">
796
+ <strong>Interpretation:</strong> Red indicates negative correlation, blue indicates positive correlation.
797
  Values closer to Β±1 indicate stronger relationships.
798
  </p>
799
  </div>
 
805
  """Create distribution plots for key variables"""
806
  try:
807
  numeric_cols = data.select_dtypes(include=[np.number]).columns[:4] # Limit to 4 plots
808
+
809
  if len(numeric_cols) == 0:
810
  return "<p>No numeric columns found for distribution analysis</p>"
811
+
812
  fig, axes = plt.subplots(2, 2, figsize=(12, 8))
813
  axes = axes.flatten()
814
+
815
  for i, col in enumerate(numeric_cols):
816
  if i < 4:
817
  sns.histplot(data[col].dropna(), kde=True, ax=axes[i], color='skyblue', alpha=0.7)
 
819
  axes[i].set_xlabel(col)
820
  axes[i].set_ylabel('Frequency')
821
  axes[i].grid(True, alpha=0.3)
822
+
823
  # Hide empty subplots
824
  for i in range(len(numeric_cols), 4):
825
  axes[i].set_visible(False)
826
+
827
  plt.suptitle('Feature Distributions', fontsize=16, fontweight='bold', y=1.02)
828
  plt.tight_layout()
829
+
830
  chart_html = self.create_plot_html(fig)
831
  return f"""
832
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
 
844
  """Format domain analysis results"""
845
  if not results:
846
  return "<p>Domain analysis information not available</p>"
847
+
848
  domain = results.get('detected_domain', 'general')
849
  insights = results.get('insights', [])
850
  recommendations = results.get('recommendations', [])
851
+
852
  return f"""
853
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px; margin: 15px 0;">
854
  <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
 
866
  </ul>
867
  </div>
868
  </div>
869
+
870
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
871
  <h4 style="margin: 0 0 15px 0; color: #1abc9c;">🎯 Recommendations</h4>
872
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px;">
873
  {''.join([f'<div style="background: #e8f5e8; padding: 12px; border-radius: 6px; border-left: 4px solid #27ae60;"><span style="color: #27ae60; font-weight: bold;">β€’</span> {rec}</div>' for rec in recommendations[:6]])}
874
  </div>
875
  </div>
876
+
877
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Domain analysis and feature engineering recommendations completed!</strong></p>
878
  <div style="background: #e0f7fa; padding: 10px; border-radius: 6px; margin-top: 10px;">
879
  <p style="margin: 0; color: #00695c;"><strong>Feature Engineering:</strong> Applied domain-specific transformations and created relevant features based on {domain} domain expertise.</p>
 
884
  """Format modeling results with comprehensive metrics"""
885
  if not results or results.get('status') != 'success':
886
  return self._format_unsupervised_results(self.current_data)
887
+
888
  problem_type = results.get('problem_type', 'classification')
889
  best_model = results.get('best_model', 'Unknown')
890
  model_results = results.get('results', {})
891
  feature_importance = results.get('feature_importance', {})
892
+
893
  # Create model comparison chart
894
  model_comparison_html = self._create_model_comparison_chart(model_results, problem_type)
895
+
896
  # Create feature importance chart
897
  feature_importance_html = self._create_feature_importance_chart(feature_importance)
898
+
899
  return f"""
900
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 15px 0;">
901
  <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
 
914
  <p style="margin: 8px 0;"><strong>Features Used:</strong> {len(feature_importance) if feature_importance else 'N/A'}</p>
915
  </div>
916
  </div>
917
+
918
  {model_comparison_html}
919
  {feature_importance_html}
920
+
921
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
922
  <h4 style="margin: 0 0 15px 0; color: #e74c3c;">πŸ§ͺ Training Details</h4>
923
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
 
935
  </div>
936
  </div>
937
  </div>
938
+
939
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Model training and evaluation completed successfully!</strong></p>
940
  <div style="background: #fef5e7; padding: 10px; border-radius: 6px; margin-top: 10px;">
941
  <p style="margin: 0; color: #d68910;"><strong>Model Performance:</strong> The {best_model} achieved the best performance with comprehensive evaluation metrics. Consider ensemble methods for further improvement.</p>
 
946
  """Get formatted metrics for the best model"""
947
  if not best_model_result:
948
  return ""
949
+
950
  if 'classification' in problem_type.lower():
951
  accuracy = best_model_result.get('accuracy', 0)
952
  f1_score = best_model_result.get('f1_score', 0)
 
970
  """Create model comparison visualization"""
971
  if not model_results:
972
  return ""
973
+
974
  try:
975
  # Prepare data for plotting
976
  model_names = []
977
  scores = []
978
+
979
  for model_name, result in model_results.items():
980
  model_names.append(model_name)
981
  if 'classification' in problem_type.lower():
982
  scores.append(result.get('accuracy', 0))
983
  else:
984
  scores.append(result.get('r2_score', 0))
985
+
986
  if not model_names:
987
  return ""
988
+
989
  # Create plot
990
  fig, ax = plt.subplots(figsize=(12, 6))
991
  bars = ax.barh(model_names, scores, color=plt.cm.viridis(np.linspace(0, 1, len(model_names))))
992
+
993
  # Customize plot
994
  ax.set_xlabel('Accuracy' if 'classification' in problem_type.lower() else 'RΒ² Score')
995
  ax.set_title(f'Model Performance Comparison - {problem_type.title()}', fontsize=16, fontweight='bold', pad=20)
996
  ax.grid(True, alpha=0.3, axis='x')
997
+
998
  # Add value labels on bars
999
  for bar, score in zip(bars, scores):
1000
  ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
1001
  f'{score:.3f}', ha='left', va='center', fontweight='bold')
1002
+
1003
  plt.tight_layout()
1004
  chart_html = self.create_plot_html(fig)
1005
+
1006
  return f"""
1007
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
1008
  <h4 style="margin: 0 0 15px 0; color: #e74c3c;">πŸ“Š Model Performance Comparison</h4>
 
1021
  """Create feature importance visualization"""
1022
  if not feature_importance:
1023
  return ""
1024
+
1025
  try:
1026
  # Get top 10 features
1027
  sorted_features = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10])
1028
+
1029
  features = list(sorted_features.keys())
1030
  importance = list(sorted_features.values())
1031
+
1032
  # Create plot
1033
  fig, ax = plt.subplots(figsize=(10, 6))
1034
  bars = ax.barh(features, importance, color='coral', alpha=0.8)
1035
+
1036
  ax.set_xlabel('Feature Importance')
1037
  ax.set_title('Top 10 Most Important Features', fontsize=16, fontweight='bold', pad=20)
1038
  ax.grid(True, alpha=0.3, axis='x')
1039
+
1040
  # Add value labels
1041
  for bar, imp in zip(bars, importance):
1042
  ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
1043
  f'{imp:.3f}', ha='left', va='center', fontweight='bold')
1044
+
1045
  plt.tight_layout()
1046
  chart_html = self.create_plot_html(fig)
1047
+
1048
  return f"""
1049
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
1050
  <h4 style="margin: 0 0 15px 0; color: #e74c3c;">🎯 Feature Importance Analysis</h4>
 
1081
  <p style="margin: 8px 0;"><strong>Dimensionality:</strong> {data.shape[1]} features analyzed</p>
1082
  </div>
1083
  </div>
1084
+
1085
  <div style="background: white; padding: 20px; border-radius: 8px; margin: 15px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
1086
  <h4 style="margin: 0 0 15px 0; color: #9b59b6;">🎯 Cluster Characteristics</h4>
1087
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
 
1099
  </div>
1100
  </div>
1101
  </div>
1102
+
1103
  <p style="color: #27ae60; margin-top: 15px;"><strong>βœ… Unsupervised analysis completed successfully!</strong></p>
1104
  <div style="background: #f3e5f5; padding: 10px; border-radius: 6px; margin-top: 10px;">
1105
  <p style="margin: 0; color: #7b1fa2;"><strong>Insights:</strong> Discovered natural groupings in your data that can be used for segmentation, anomaly detection, and pattern recognition.</p>
 
1110
  """Format final results and recommendations"""
1111
  key_insights = summary.get('key_insights', [])
1112
  recommendations = summary.get('recommendations', [])
1113
+
1114
  return f"""
1115
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 15px; color: white; margin: 20px 0;">
1116
  <h3 style="margin: 0 0 20px 0; text-align: center; font-size: 2em;">πŸŽ‰ Pipeline Completed Successfully!</h3>
 
1133
  </div>
1134
  </div>
1135
  </div>
1136
+
1137
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 20px; margin: 20px 0;">
1138
  <div style="background: white; padding: 25px; border-radius: 12px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
1139
  <h4 style="margin: 0 0 20px 0; color: #2c3e50; font-size: 1.3em;">πŸ” Key Insights Discovered</h4>
 
1162
  """Create the Gradio interface"""
1163
  with gr.Blocks(css=self.custom_css) as demo:
1164
  gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>πŸ”¬ Comprehensive Data Science Pipeline</h1>")
1165
+
1166
  with gr.Row():
1167
  with gr.Column(scale=1):
1168
  file_upload = gr.File(label="Upload Dataset (CSV or JSON) or Drag & Drop", file_types=[".csv", ".json"])
 
1172
  enable_deep_learning = gr.Checkbox(label="Enable Deep Learning", value=False)
1173
  enable_automl = gr.Checkbox(label="Enable AutoML", value=True)
1174
  run_btn = gr.Button("Run Pipeline", variant="primary")
1175
+
1176
  with gr.Column(scale=1):
1177
  file_status = gr.HTML()
1178
  preview = gr.HTML()
1179
+
1180
  output = gr.HTML()
1181
+
1182
  # Hidden states
1183
  file_type_state = gr.State("")
1184
  columns_state = gr.State([])
1185
+
1186
  # Events
1187
  file_upload.change(
1188
  fn=self.process_file_upload,
1189
  inputs=[file_upload, learning_type],
1190
  outputs=[file_status, file_type_state, columns_state, target_column, preview]
1191
  )
1192
+
1193
  learning_type.change(
1194
  fn=self.update_target_column_visibility,
1195
  inputs=[learning_type, columns_state],
1196
  outputs=[target_column]
1197
  )
1198
+
1199
  run_btn.click(
1200
  fn=self.run_comprehensive_pipeline,
1201
  inputs=[file_upload, learning_type, target_column, domain, enable_deep_learning, enable_automl],
1202
  outputs=[output]
1203
  )
1204
+
1205
  return demo
1206
 
1207
  if __name__ == "__main__":
1208
  ui = DataSciencePipelineUI()
1209
  demo = ui.create_interface()
1210
+ demo.launch(share=True)