entropy25 commited on
Commit
fd475db
·
verified ·
1 Parent(s): 156a07c

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +977 -512
analyzer.py CHANGED
@@ -1,37 +1,28 @@
1
- """
2
- Data Analysis Platform
3
- Copyright (c) 2025 JEAN YOUNG
4
- All rights reserved.
5
-
6
- This software is proprietary and confidential.
7
- Unauthorized copying, distribution, or use is prohibited.
8
- """
9
  import streamlit as st
10
  import pandas as pd
11
  import numpy as np
12
  import plotly.express as px
13
  import plotly.graph_objects as go
 
 
14
  from typing import Dict, List, Any, Optional
15
  import os
16
  from dotenv import load_dotenv
17
  from data_handler import *
18
- from io import BytesIO
19
 
20
- # Load environment variables
21
- load_dotenv()
22
-
23
- # Optional AI Integration
24
  try:
25
- import openai
26
- OPENAI_AVAILABLE = True
 
 
 
 
27
  except ImportError:
28
- OPENAI_AVAILABLE = False
29
 
30
- try:
31
- import google.generativeai as genai
32
- GEMINI_AVAILABLE = True
33
- except ImportError:
34
- GEMINI_AVAILABLE = False
35
 
36
  class AIAssistant:
37
  """AI-powered analysis assistant"""
@@ -40,30 +31,22 @@ class AIAssistant:
40
  self.openai_key = os.getenv('OPENAI_API_KEY')
41
  self.gemini_key = os.getenv('GOOGLE_API_KEY')
42
 
43
- if self.gemini_key and GEMINI_AVAILABLE:
44
- genai.configure(api_key=self.gemini_key)
45
- self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
46
-
47
- def get_available_models(self) -> List[str]:
48
- """Get list of available AI models"""
49
- models = []
50
- if self.openai_key and OPENAI_AVAILABLE:
51
- models.append("OpenAI GPT")
52
- if self.gemini_key and GEMINI_AVAILABLE:
53
- models.append("Google Gemini")
54
- return models
55
 
56
  def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
57
  """Get AI analysis of insights"""
58
-
59
- # Prepare data summary
60
  summary = f"""
61
  Dataset Summary:
62
  - Shape: {df.shape}
63
  - Columns: {list(df.columns)}
64
- - Data types: {df.dtypes.value_counts().to_dict()}
65
 
66
- Key Insights Found:
67
  """
68
 
69
  for insight in insights:
@@ -71,42 +54,33 @@ class AIAssistant:
71
 
72
  prompt = f"""
73
  As a senior data scientist, analyze this dataset and provide:
74
-
75
- 1. Business implications of the findings
76
- 2. Potential opportunities or risks
77
- 3. Recommendations for decision-making
78
  4. Suggestions for further analysis
79
 
80
  {summary}
81
-
82
- Provide actionable insights in a professional format.
83
  """
84
 
85
  try:
86
- if model == "Google Gemini" and hasattr(self, 'gemini_model'):
87
  response = self.gemini_model.generate_content(prompt)
88
  return response.text
89
- elif model == "OpenAI GPT" and self.openai_key:
90
- client = openai.OpenAI(api_key=self.openai_key)
91
- response = client.chat.completions.create(
92
- model="gpt-3.5-turbo",
93
- messages=[{"role": "user", "content": prompt}]
94
- )
95
- return response.choices[0].message.content
96
  else:
97
- return "AI analysis not available. Please configure API keys."
98
  except Exception as e:
99
  return f"AI Analysis Error: {str(e)}"
100
 
101
  class DataAnalysisWorkflow:
102
- """Optimized data analysis workflow with caching and pagination"""
103
 
104
  def __init__(self, df: pd.DataFrame):
105
  self.df = df
 
106
  self.stats = calculate_basic_stats(df)
107
  self.column_types = get_column_types(df)
108
  self.insights = []
109
- self.page_size = 1000 # For pagination
110
 
111
  def add_insight(self, insight: str, stage: int):
112
  """Add insight to analysis report"""
@@ -116,586 +90,1077 @@ class DataAnalysisWorkflow:
116
  'timestamp': pd.Timestamp.now()
117
  })
118
 
119
- def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
120
- """Get paginated data for display"""
121
- start_idx = page * self.page_size
122
- end_idx = start_idx + self.page_size
123
- return self.df.iloc[start_idx:end_idx]
124
-
125
  def stage_1_overview(self):
126
- """Stage 1: Data Overview with caching"""
127
  st.subheader("📊 Data Overview")
128
 
129
- # Data Quality Score
130
- quality_metrics = calculate_data_quality_score(self.df)
131
  col1, col2, col3, col4 = st.columns(4)
132
  with col1:
133
- st.metric("Rows", f"{self.stats['shape'][0]:,}")
134
  with col2:
135
- st.metric("Columns", f"{self.stats['shape'][1]:,}")
136
  with col3:
137
- st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100")
 
138
  with col4:
139
- st.metric("Grade", quality_metrics['grade'])
140
 
141
- if quality_metrics['issues']:
142
- st.warning("Quality Issues Found:")
143
- for issue in quality_metrics['issues']:
144
- st.write(f"• {issue}")
145
-
146
- # Memory Usage and Optimization
147
- st.subheader("Memory Analysis")
148
- memory_opt = calculate_memory_optimization(self.df)
149
- col1, col2 = st.columns(2)
150
- with col1:
151
- st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
152
- with col2:
153
- if memory_opt['potential_savings_mb'] > 0:
154
- st.metric("Potential Savings",
155
- f"{memory_opt['potential_savings_mb']:.1f} MB",
156
- f"{memory_opt['potential_savings_pct']:.1f}%")
157
-
158
- if st.button("Show Optimization Details"):
159
- st.dataframe(pd.DataFrame(memory_opt['suggestions']))
160
-
161
- # Column Cardinality Analysis
162
- st.subheader("Column Cardinality Analysis")
163
- cardinality_df = calculate_column_cardinality(self.df)
164
-
165
- # Filter options
166
- col_types = cardinality_df['Type'].unique()
167
- selected_types = st.multiselect("Filter by Column Type",
168
- col_types,
169
- default=col_types)
170
-
171
- filtered_df = cardinality_df[cardinality_df['Type'].isin(selected_types)]
172
- st.dataframe(filtered_df, use_container_width=True)
173
-
174
- # Highlight important findings
175
- id_cols = filtered_df[filtered_df['Type'] == 'Unique Identifier']['Column'].tolist()
176
- if id_cols:
177
- st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")
178
-
179
- const_cols = filtered_df[filtered_df['Type'] == 'Constant']['Column'].tolist()
180
- if const_cols:
181
- st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")
182
-
183
- # Data types visualization
184
  if self.stats['dtypes']:
185
- st.subheader("Data Types Distribution")
186
- fig = px.pie(values=list(self.stats['dtypes'].values()),
187
- names=list(self.stats['dtypes'].keys()),
188
- title="Data Types")
189
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- # Sample data with pagination
192
- st.subheader("Sample Data")
193
- total_pages = (len(self.df) - 1) // self.page_size + 1
 
 
 
 
194
 
195
- if total_pages > 1:
196
- page = st.slider("Page", 0, total_pages - 1, 0)
197
- sample_data = self.get_paginated_data(page)
198
- st.write(f"Showing rows {page * self.page_size + 1} to {min((page + 1) * self.page_size, len(self.df))}")
 
 
 
199
  else:
200
- sample_data = self.df.head(10)
201
-
202
- st.dataframe(sample_data, use_container_width=True)
 
 
 
203
 
204
  # Missing values analysis
205
  missing_df = calculate_missing_data(self.df)
206
  if not missing_df.empty:
207
  st.subheader("Missing Values Analysis")
 
 
 
 
 
 
 
 
 
 
 
 
208
  st.dataframe(missing_df, use_container_width=True)
209
 
210
  worst_column = missing_df.iloc[0]['Column']
211
  worst_percentage = missing_df.iloc[0]['Missing %']
212
  self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
213
  else:
214
- st.success("✅ No missing values found!")
215
- self.add_insight("Dataset has no missing values - excellent data quality", 1)
216
-
217
- # Add insights about data quality and cardinality
218
- if quality_metrics['score'] < 80:
219
- self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
220
-
221
- if memory_opt['potential_savings_pct'] > 20:
222
- self.add_insight(f"Potential memory optimization of {memory_opt['potential_savings_pct']:.1f}% identified", 1)
223
-
224
- if id_cols:
225
- self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
226
 
227
  def stage_2_exploration(self):
228
- """Stage 2: Exploratory Data Analysis with caching"""
229
  st.subheader("🔍 Exploratory Data Analysis")
230
 
231
  numeric_cols = self.column_types['numeric']
232
  categorical_cols = self.column_types['categorical']
233
 
234
- # Numeric analysis
235
  if numeric_cols:
236
- st.subheader("Numeric Variables")
237
- selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
238
 
239
- col1, col2 = st.columns(2)
240
- with col1:
241
- fig = px.histogram(self.df, x=selected_numeric,
242
- title=f"Distribution of {selected_numeric}")
243
- st.plotly_chart(fig, use_container_width=True)
244
-
245
- with col2:
246
- fig = px.box(self.df, y=selected_numeric,
247
- title=f"Box Plot of {selected_numeric}")
248
- st.plotly_chart(fig, use_container_width=True)
249
 
250
- # Statistical summary
251
- st.subheader("Statistical Summary")
252
- summary_stats = self.df[numeric_cols].describe()
253
- st.dataframe(summary_stats, use_container_width=True)
254
-
255
- # Correlation analysis
256
- if len(numeric_cols) > 1:
257
- st.subheader("Correlation Analysis")
258
- corr_matrix = calculate_correlation_matrix(self.df)
259
- if not corr_matrix.empty:
260
- fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
261
- title="Correlation Matrix")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
263
 
264
- # Find highest correlation
265
- corr_values = []
 
 
 
 
 
 
 
 
 
 
 
 
266
  for i in range(len(corr_matrix.columns)):
267
  for j in range(i+1, len(corr_matrix.columns)):
268
- corr_values.append(abs(corr_matrix.iloc[i, j]))
 
 
 
 
 
 
 
269
 
270
- if corr_values:
271
- max_corr = max(corr_values)
272
- self.add_insight(f"Maximum correlation coefficient: {max_corr:.3f}", 2)
 
 
 
 
 
 
 
273
 
274
- # Categorical analysis
275
  if categorical_cols:
276
- st.subheader("Categorical Variables")
277
  selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
278
 
279
- value_counts = get_value_counts(self.df, selected_categorical)
280
- fig = px.bar(x=value_counts.index, y=value_counts.values,
281
- title=f"Top 10 {selected_categorical} Values")
282
- st.plotly_chart(fig, use_container_width=True)
283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  total_categories = self.df[selected_categorical].nunique()
285
- self.add_insight(f"Column '{selected_categorical}' has {total_categories} unique categories", 2)
 
 
 
 
 
 
286
 
287
- def stage_3_cleaning(self):
288
- """Stage 3: Data Quality Assessment"""
289
  st.subheader("🧹 Data Quality Assessment")
290
 
291
- cleaning_actions = []
292
- cleaning_history = []
293
 
294
- # Missing values handling
295
  if self.stats['missing_values'] > 0:
296
- st.subheader("Missing Values Treatment")
297
- missing_df = calculate_missing_data(self.df)
298
- st.dataframe(missing_df, use_container_width=True)
299
-
300
- col1, col2 = st.columns(2)
301
- with col1:
302
- selected_col = st.selectbox("Select column to handle missing values:",
303
- missing_df['Column'].tolist())
304
- with col2:
305
- fill_method = st.selectbox("Choose fill method:",
306
- ["Drop rows", "Mean", "Median", "Mode", "Custom value"])
307
-
308
- if st.button("Apply Missing Value Treatment"):
309
- try:
310
- if fill_method == "Drop rows":
311
- self.df = self.df.dropna(subset=[selected_col])
312
- cleaning_history.append(f"Dropped rows with missing values in {selected_col}")
313
- else:
314
- if fill_method == "Mean":
315
- fill_value = self.df[selected_col].mean()
316
- elif fill_method == "Median":
317
- fill_value = self.df[selected_col].median()
318
- elif fill_method == "Mode":
319
- fill_value = self.df[selected_col].mode()[0]
320
- else: # Custom value
321
- fill_value = st.number_input("Enter custom value:", value=0.0)
322
-
323
- self.df[selected_col] = self.df[selected_col].fillna(fill_value)
324
- cleaning_history.append(f"Filled missing values in {selected_col} with {fill_method}")
325
-
326
- st.success("✅ Missing values handled successfully!")
327
- except Exception as e:
328
- st.error(f"Error handling missing values: {str(e)}")
329
 
330
- # Duplicates handling
331
  if self.stats['duplicates'] > 0:
332
- st.subheader("Duplicate Rows")
333
- st.warning(f"Found {self.stats['duplicates']} duplicate rows")
334
-
335
- if st.button("Remove Duplicate Rows"):
336
- original_len = len(self.df)
337
- self.df = self.df.drop_duplicates()
338
- removed = original_len - len(self.df)
339
- cleaning_history.append(f"Removed {removed} duplicate rows")
340
- st.success(f"✅ Removed {removed} duplicate rows")
341
  else:
342
- st.success("✅ No duplicate rows found")
343
-
344
- # Mixed type detection and handling
345
- mixed_types = detect_mixed_types(self.df)
346
- if mixed_types:
347
- st.subheader("Mixed Data Types")
348
- mixed_df = pd.DataFrame(mixed_types)
349
- st.dataframe(mixed_df, use_container_width=True)
350
-
351
- selected_col = st.selectbox("Select column to fix data type:",
352
- [item['column'] for item in mixed_types])
353
-
354
- fix_method = st.selectbox("Choose fix method:",
355
- ["Convert to numeric", "Convert to string"])
356
-
357
- if st.button("Fix Data Type"):
358
- try:
359
- if fix_method == "Convert to numeric":
360
- self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
361
- else:
362
- self.df[selected_col] = self.df[selected_col].astype(str)
363
-
364
- cleaning_history.append(f"Fixed data type for {selected_col} to {fix_method}")
365
- st.success("✅ Data type fixed successfully!")
366
- except Exception as e:
367
- st.error(f"Error fixing data type: {str(e)}")
368
 
369
- # Outlier detection and handling
370
  numeric_cols = self.column_types['numeric']
371
  if numeric_cols:
372
  st.subheader("Outlier Detection")
373
- selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
374
 
375
- outliers = calculate_outliers(self.df, selected_col)
376
- outlier_count = len(outliers)
 
 
 
 
 
 
 
 
377
 
378
- if outlier_count > 0:
379
- st.warning(f"Found {outlier_count} potential outliers in '{selected_col}'")
380
- st.dataframe(outliers[[selected_col]].head(100), use_container_width=True)
381
-
382
- treatment_method = st.selectbox("Choose outlier treatment method:",
383
- ["None", "Remove", "Cap at percentiles"])
384
-
385
- if treatment_method != "None" and st.button("Apply Outlier Treatment"):
386
- try:
387
- if treatment_method == "Remove":
388
- self.df = self.df[~self.df.index.isin(outliers.index)]
389
- cleaning_history.append(f"Removed {outlier_count} outliers from {selected_col}")
390
- else: # Cap at percentiles
391
- Q1 = self.df[selected_col].quantile(0.25)
392
- Q3 = self.df[selected_col].quantile(0.75)
393
- IQR = Q3 - Q1
394
- lower_bound = Q1 - 1.5 * IQR
395
- upper_bound = Q3 + 1.5 * IQR
396
-
397
- self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
398
- cleaning_history.append(f"Capped outliers in {selected_col} at percentiles")
399
-
400
- st.success(" Outliers handled successfully!")
401
- except Exception as e:
402
- st.error(f"Error handling outliers: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  else:
404
- st.success(f" No outliers detected in '{selected_col}'")
405
-
406
- # Cleaning History
407
- if cleaning_history:
408
- st.subheader("Cleaning Operations History")
409
- for i, operation in enumerate(cleaning_history, 1):
410
- st.write(f"{i}. {operation}")
411
- self.add_insight(f"Performed {len(cleaning_history)} data cleaning operations", 3)
412
-
413
- # Summary
414
- if cleaning_actions:
415
- st.subheader("Remaining Action Items")
416
- for i, action in enumerate(cleaning_actions, 1):
417
- st.write(f"{i}. {action}")
418
- self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
419
  else:
420
- st.success(" Data quality is excellent!")
421
- self.add_insight("No major data quality issues found", 3)
422
 
423
- def stage_4_analysis(self):
424
- """Stage 4: Advanced Analysis"""
425
  st.subheader("🔬 Advanced Analysis")
426
 
427
  numeric_cols = self.column_types['numeric']
428
  categorical_cols = self.column_types['categorical']
429
 
430
- # Relationship analysis
431
  if len(numeric_cols) >= 2:
432
- st.subheader("Variable Relationships")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
 
 
434
  col1, col2 = st.columns(2)
435
  with col1:
436
- x_var = st.selectbox("X Variable:", numeric_cols)
437
  with col2:
438
- y_var = st.selectbox("Y Variable:",
439
- [col for col in numeric_cols if col != x_var])
440
 
441
- # Sample data for performance if dataset is large
 
 
 
 
 
 
 
442
  sample_size = min(5000, len(self.df))
443
- sample_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
444
 
445
- fig = px.scatter(sample_df, x=x_var, y=y_var,
446
- title=f"Relationship: {x_var} vs {y_var}")
 
 
 
 
 
 
 
 
447
  st.plotly_chart(fig, use_container_width=True)
448
 
 
449
  correlation = self.df[x_var].corr(self.df[y_var])
450
- st.metric("Correlation", f"{correlation:.3f}")
451
 
452
- if abs(correlation) > 0.7:
453
- strength = "Strong"
454
- elif abs(correlation) > 0.3:
455
- strength = "Moderate"
456
- else:
457
- strength = "Weak"
 
 
 
 
 
 
 
 
458
 
459
- direction = "positive" if correlation > 0 else "negative"
460
- st.write(f"**Result:** {strength} {direction} correlation")
461
- self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)
462
 
463
- # Group analysis
464
  if categorical_cols and numeric_cols:
465
- st.subheader("Group Analysis")
466
 
467
  col1, col2 = st.columns(2)
468
  with col1:
469
- group_var = st.selectbox("Group by:", categorical_cols)
470
  with col2:
471
- metric_var = st.selectbox("Analyze:", numeric_cols)
472
 
 
473
  group_stats = calculate_group_stats(self.df, group_var, metric_var)
474
- st.dataframe(group_stats, use_container_width=True)
475
 
476
- # Sample for visualization if too many groups
477
  unique_groups = self.df[group_var].nunique()
 
478
  if unique_groups <= 20:
479
- fig = px.box(self.df, x=group_var, y=metric_var,
480
- title=f"{metric_var} by {group_var}")
481
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  else:
483
- st.info(f"Too many groups ({unique_groups}) for visualization. Showing statistics only.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484
 
485
- best_group = group_stats['mean'].idxmax()
486
- best_value = group_stats.loc[best_group, 'mean']
487
- self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
 
489
- def stage_5_summary(self):
490
- """Stage 5: Summary and Export"""
491
- st.subheader("📈 Analysis Summary")
492
 
493
- # Key metrics
494
- col1, col2, col3 = st.columns(3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  with col1:
496
- st.metric("Total Insights", len(self.insights))
497
  with col2:
498
- quality = "High" if self.stats['missing_values'] == 0 else "Medium"
499
  st.metric("Data Quality", quality)
500
  with col3:
501
- st.metric("Analysis Complete", "")
 
 
 
 
502
 
503
- # Insights summary
504
- st.subheader("Key Insights")
505
- for i, insight in enumerate(self.insights, 1):
506
- st.write(f"{i}. **Stage {insight['stage']}:** {insight['insight']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
 
508
  # Export options
509
- st.subheader("Export Results")
510
- export_format = st.selectbox("Choose export format:",
511
- ["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])
512
-
513
- if export_format == "Text Report":
514
- report = self.generate_text_report()
515
- st.download_button(
516
- label="Download Text Report",
517
- data=report,
518
- file_name="analysis_report.txt",
519
- mime="text/plain"
520
- )
521
 
522
- elif export_format == "Markdown Report":
523
- report = self.generate_markdown_report()
524
- st.download_button(
525
- label="Download Markdown Report",
526
- data=report,
527
- file_name="analysis_report.md",
528
- mime="text/markdown"
529
- )
530
 
531
- elif export_format == "Python Code":
532
- code = self.generate_python_code()
533
- st.code(code, language="python")
534
- st.download_button(
535
- label="Download Python Script",
536
- data=code,
537
- file_name="analysis_script.py",
538
- mime="text/plain"
539
- )
540
 
541
- else: # Cleaned Data
542
- # Offer different export formats
543
- data_format = st.selectbox("Choose data format:",
544
- ["CSV", "Excel", "Parquet"])
545
-
546
- if st.button("Export Data"):
547
- try:
548
- if data_format == "CSV":
549
- csv = self.df.to_csv(index=False)
550
- st.download_button(
551
- label="Download CSV",
552
- data=csv,
553
- file_name="cleaned_data.csv",
554
- mime="text/csv"
555
- )
556
- elif data_format == "Excel":
557
- excel_buffer = BytesIO()
558
- self.df.to_excel(excel_buffer, index=False)
559
- excel_data = excel_buffer.getvalue()
560
- st.download_button(
561
- label="Download Excel",
562
- data=excel_data,
563
- file_name="cleaned_data.xlsx",
564
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
565
- )
566
- else: # Parquet
567
- parquet_buffer = BytesIO()
568
- self.df.to_parquet(parquet_buffer, index=False)
569
- parquet_data = parquet_buffer.getvalue()
570
- st.download_button(
571
- label="Download Parquet",
572
- data=parquet_data,
573
- file_name="cleaned_data.parquet",
574
- mime="application/octet-stream"
575
- )
576
- except Exception as e:
577
- st.error(f"Error exporting data: {str(e)}")
578
 
579
- def generate_text_report(self) -> str:
580
- """Generate text analysis report"""
581
- report = f"""DATA ANALYSIS REPORT
582
- ==================
 
583
 
584
- Dataset Overview:
585
- - Rows: {self.stats['shape'][0]:,}
586
- - Columns: {self.stats['shape'][1]:,}
587
- - Missing Values: {self.stats['missing_values']:,}
588
- - Memory Usage: {self.stats['memory_usage']:.1f} MB
 
589
 
590
- Key Insights:
 
591
  """
592
- for insight in self.insights:
593
- report += f"\n- Stage {insight['stage']}: {insight['insight']}"
594
 
595
- report += f"\n\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}"
596
- return report
597
-
598
- def generate_markdown_report(self) -> str:
599
- """Generate markdown analysis report"""
600
- report = f"""# Data Analysis Report
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
 
602
- ## Dataset Overview
603
- * **Rows:** {self.stats['shape'][0]:,}
604
- * **Columns:** {self.stats['shape'][1]:,}
605
- * **Missing Values:** {self.stats['missing_values']:,}
606
- * **Memory Usage:** {self.stats['memory_usage']:.1f} MB
 
607
 
608
- ## Data Types
609
- ```
610
- {pd.DataFrame(self.stats['dtypes'].items(), columns=['Type', 'Count']).to_markdown()}
611
- ```
 
 
 
 
 
 
 
612
 
613
- ## Key Insights
 
 
614
  """
615
- # Group insights by stage
616
- for stage in range(1, 6):
617
- stage_insights = [i for i in self.insights if i['stage'] == stage]
618
- if stage_insights:
619
- report += f"\n### Stage {stage}\n"
620
- for insight in stage_insights:
621
- report += f"* {insight['insight']}\n"
622
-
623
- report += f"\n\n*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*"
624
- return report
625
-
626
- def generate_python_code(self) -> str:
627
- """Generate reproducible Python code"""
628
- code = """import pandas as pd
629
- import numpy as np
630
- import plotly.express as px
631
- from typing import Dict, List, Any
632
 
633
- # Load and prepare data
634
- df = pd.read_csv('your_data.csv') # Update with your data source
 
 
 
635
 
636
- # Basic statistics
637
- def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
638
- return {
639
- 'shape': df.shape,
640
- 'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
641
- 'missing_values': int(df.isnull().sum().sum()),
642
- 'dtypes': df.dtypes.value_counts().to_dict(),
643
- 'duplicates': int(df.duplicated().sum())
644
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
 
646
- stats = calculate_basic_stats(df)
647
- print("\\nBasic Statistics:")
648
- print(f"- Shape: {stats['shape']}")
649
- print(f"- Memory Usage: {stats['memory_usage']:.1f} MB")
650
- print(f"- Missing Values: {stats['missing_values']}")
651
- print(f"- Duplicates: {stats['duplicates']}")
 
652
 
 
 
653
  """
654
- # Add data cleaning operations if any were performed
655
- if hasattr(self, 'cleaning_history'):
656
- code += "\n# Data Cleaning\n"
657
- for operation in self.cleaning_history:
658
- if "missing values" in operation.lower():
659
- code += "# Handle missing values\n"
660
- code += "df = df.fillna(method='ffill') # Update with your chosen method\n"
661
- elif "duplicate" in operation.lower():
662
- code += "# Remove duplicates\n"
663
- code += "df = df.drop_duplicates()\n"
664
- elif "outlier" in operation.lower():
665
- code += """# Handle outliers
666
- def remove_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
667
- Q1 = df[column].quantile(0.25)
668
- Q3 = df[column].quantile(0.75)
669
- IQR = Q3 - Q1
670
- return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
671
-
672
- # Apply to numeric columns as needed
673
- numeric_cols = df.select_dtypes(include=[np.number]).columns
674
- for col in numeric_cols:
675
- df = remove_outliers(df, col)
676
  """
 
 
 
677
 
678
- # Add visualization code
679
- code += """
680
- # Visualizations
681
- def plot_missing_values(df: pd.DataFrame):
682
- missing = df.isnull().sum()
683
- if missing.sum() > 0:
684
- missing = missing[missing > 0]
685
- fig = px.bar(x=missing.index, y=missing.values,
686
- title='Missing Values by Column')
687
- fig.show()
688
 
689
- def plot_correlations(df: pd.DataFrame):
690
- numeric_cols = df.select_dtypes(include=[np.number]).columns
691
- if len(numeric_cols) > 1:
692
- corr = df[numeric_cols].corr()
693
- fig = px.imshow(corr, title='Correlation Matrix')
694
- fig.show()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
 
696
- # Generate plots
697
- plot_missing_values(df)
698
- plot_correlations(df)
699
  """
700
 
701
- return code
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import plotly.express as px
5
  import plotly.graph_objects as go
6
+ import plotly.figure_factory as ff
7
+ from plotly.subplots import make_subplots
8
  from typing import Dict, List, Any, Optional
9
  import os
10
  from dotenv import load_dotenv
11
  from data_handler import *
 
12
 
13
+ # ML imports
 
 
 
14
  try:
15
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
16
+ from sklearn.linear_model import LogisticRegression, LinearRegression
17
+ from sklearn.model_selection import train_test_split
18
+ from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
19
+ from sklearn.preprocessing import LabelEncoder
20
+ ML_AVAILABLE = True
21
  except ImportError:
22
+ ML_AVAILABLE = False
23
 
24
+ # Load environment variables
25
+ load_dotenv()
 
 
 
26
 
27
  class AIAssistant:
28
  """AI-powered analysis assistant"""
 
31
  self.openai_key = os.getenv('OPENAI_API_KEY')
32
  self.gemini_key = os.getenv('GOOGLE_API_KEY')
33
 
34
+ try:
35
+ import google.generativeai as genai
36
+ if self.gemini_key:
37
+ genai.configure(api_key=self.gemini_key)
38
+ self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
39
+ except ImportError:
40
+ pass
 
 
 
 
 
41
 
42
  def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
43
  """Get AI analysis of insights"""
 
 
44
  summary = f"""
45
  Dataset Summary:
46
  - Shape: {df.shape}
47
  - Columns: {list(df.columns)}
 
48
 
49
+ Key Insights:
50
  """
51
 
52
  for insight in insights:
 
54
 
55
  prompt = f"""
56
  As a senior data scientist, analyze this dataset and provide:
57
+ 1. Business implications
58
+ 2. Key opportunities and risks
59
+ 3. Actionable recommendations
 
60
  4. Suggestions for further analysis
61
 
62
  {summary}
 
 
63
  """
64
 
65
  try:
66
+ if hasattr(self, 'gemini_model'):
67
  response = self.gemini_model.generate_content(prompt)
68
  return response.text
 
 
 
 
 
 
 
69
  else:
70
+ return "AI analysis not available. Please configure API keys in .env file."
71
  except Exception as e:
72
  return f"AI Analysis Error: {str(e)}"
73
 
74
  class DataAnalysisWorkflow:
75
+ """Enhanced data analysis workflow with ML capabilities"""
76
 
77
  def __init__(self, df: pd.DataFrame):
78
  self.df = df
79
+ self.original_df = df.copy() # Keep original for reference
80
  self.stats = calculate_basic_stats(df)
81
  self.column_types = get_column_types(df)
82
  self.insights = []
83
+ self.ml_results = {}
84
 
85
  def add_insight(self, insight: str, stage: int):
86
  """Add insight to analysis report"""
 
90
  'timestamp': pd.Timestamp.now()
91
  })
92
 
 
 
 
 
 
 
93
  def stage_1_overview(self):
94
+ """Stage 1: Enhanced Data Overview"""
95
  st.subheader("📊 Data Overview")
96
 
97
+ # Key metrics with better formatting
 
98
  col1, col2, col3, col4 = st.columns(4)
99
  with col1:
100
+ st.metric("Total Rows", f"{self.stats['shape'][0]:,}")
101
  with col2:
102
+ st.metric("Total Columns", f"{self.stats['shape'][1]:,}")
103
  with col3:
104
+ missing_pct = (self.stats['missing_values'] / (self.stats['shape'][0] * self.stats['shape'][1])) * 100
105
+ st.metric("Missing Values", f"{self.stats['missing_values']:,}", f"{missing_pct:.1f}%")
106
  with col4:
107
+ st.metric("Memory Usage", f"{self.stats['memory_usage']:.1f} MB")
108
 
109
+ # Enhanced data types visualization
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  if self.stats['dtypes']:
111
+ col1, col2 = st.columns(2)
112
+
113
+ with col1:
114
+ fig = px.pie(
115
+ values=list(self.stats['dtypes'].values()),
116
+ names=list(self.stats['dtypes'].keys()),
117
+ title="Data Types Distribution",
118
+ color_discrete_sequence=px.colors.qualitative.Set3
119
+ )
120
+ st.plotly_chart(fig, use_container_width=True)
121
+
122
+ with col2:
123
+ # Column overview table
124
+ column_info = []
125
+ for col in self.df.columns:
126
+ column_info.append({
127
+ 'Column': col,
128
+ 'Type': str(self.df[col].dtype),
129
+ 'Non-Null': self.df[col].notna().sum(),
130
+ 'Unique': self.df[col].nunique()
131
+ })
132
+
133
+ info_df = pd.DataFrame(column_info)
134
+ st.subheader("Column Details")
135
+ st.dataframe(info_df, use_container_width=True, height=300)
136
 
137
+ # Enhanced data preview
138
+ st.subheader("Data Preview")
139
+ preview_option = st.radio(
140
+ "Preview type:",
141
+ ["First 10 rows", "Last 10 rows", "Random sample", "Custom range"],
142
+ horizontal=True
143
+ )
144
 
145
+ if preview_option == "First 10 rows":
146
+ st.dataframe(self.df.head(10), use_container_width=True)
147
+ elif preview_option == "Last 10 rows":
148
+ st.dataframe(self.df.tail(10), use_container_width=True)
149
+ elif preview_option == "Random sample":
150
+ sample_size = min(10, len(self.df))
151
+ st.dataframe(self.df.sample(n=sample_size), use_container_width=True)
152
  else:
153
+ col1, col2 = st.columns(2)
154
+ with col1:
155
+ start_row = st.number_input("Start row", 0, len(self.df)-1, 0)
156
+ with col2:
157
+ end_row = st.number_input("End row", start_row+1, len(self.df), min(start_row+10, len(self.df)))
158
+ st.dataframe(self.df.iloc[start_row:end_row], use_container_width=True)
159
 
160
  # Missing values analysis
161
  missing_df = calculate_missing_data(self.df)
162
  if not missing_df.empty:
163
  st.subheader("Missing Values Analysis")
164
+
165
+ # Visualize missing values
166
+ fig = px.bar(
167
+ missing_df,
168
+ x='Column',
169
+ y='Missing %',
170
+ title="Missing Values by Column",
171
+ color='Missing %',
172
+ color_continuous_scale='Reds'
173
+ )
174
+ st.plotly_chart(fig, use_container_width=True)
175
+
176
  st.dataframe(missing_df, use_container_width=True)
177
 
178
  worst_column = missing_df.iloc[0]['Column']
179
  worst_percentage = missing_df.iloc[0]['Missing %']
180
  self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
181
  else:
182
+ st.success("✅ No missing values found - Excellent data quality!")
183
+ self.add_insight("Dataset has perfect completeness with no missing values", 1)
 
 
 
 
 
 
 
 
 
 
184
 
185
def stage_2_exploration(self):
    """Stage 2: Enhanced Exploratory Data Analysis.

    Renders distribution plots (histogram + Q-Q for a single column,
    stacked histograms for several), descriptive statistics, a
    correlation heatmap with a ranked correlation table, and a
    categorical breakdown (bar + pie). Records notable findings via
    ``self.add_insight``.
    """
    st.subheader("🔍 Exploratory Data Analysis")

    numeric_cols = self.column_types['numeric']
    categorical_cols = self.column_types['categorical']

    # --- Numeric analysis -------------------------------------------------
    if numeric_cols:
        st.subheader("Numeric Variables Analysis")

        selected_numerics = st.multiselect(
            "Select numeric columns for analysis:",
            numeric_cols,
            default=numeric_cols[:3] if len(numeric_cols) >= 3 else numeric_cols
        )

        if selected_numerics:
            st.subheader("Distribution Analysis")

            if len(selected_numerics) == 1:
                col = selected_numerics[0]
                col1, col2 = st.columns(2)

                with col1:
                    fig = px.histogram(
                        self.df,
                        x=col,
                        marginal="box",
                        title=f"Distribution of {col}",
                        nbins=50
                    )
                    st.plotly_chart(fig, use_container_width=True)

                with col2:
                    # Q-Q plot against the normal distribution.
                    from scipy import stats
                    fig = go.Figure()

                    # probplot cannot handle NaN values.
                    clean_data = self.df[col].dropna()
                    if len(clean_data) > 0:
                        # qq[0] = (theoretical, ordered sample),
                        # qq[1] = (slope, intercept, r) of the fit line.
                        qq = stats.probplot(clean_data, dist="norm")
                        fig.add_trace(go.Scatter(
                            x=qq[0][0],
                            y=qq[0][1],
                            mode='markers',
                            name='Data points'
                        ))
                        fig.add_trace(go.Scatter(
                            x=qq[0][0],
                            y=qq[1][1] + qq[1][0] * qq[0][0],
                            mode='lines',
                            name='Normal distribution line',
                            line=dict(color='red')
                        ))
                        fig.update_layout(
                            title=f"Q-Q Plot: {col}",
                            xaxis_title="Theoretical Quantiles",
                            yaxis_title="Sample Quantiles"
                        )
                        st.plotly_chart(fig, use_container_width=True)

            else:
                # One stacked histogram subplot per selected column.
                # Imported locally: make_subplots is not guaranteed to be
                # imported at module level.
                from plotly.subplots import make_subplots
                fig = make_subplots(
                    rows=len(selected_numerics),
                    cols=1,
                    subplot_titles=selected_numerics,
                    vertical_spacing=0.05
                )
                for i, col in enumerate(selected_numerics, 1):
                    fig.add_trace(
                        go.Histogram(x=self.df[col], name=col, nbinsx=30),
                        row=i, col=1
                    )
                fig.update_layout(height=200 * len(selected_numerics), showlegend=False)
                st.plotly_chart(fig, use_container_width=True)

            # Descriptive statistics for the chosen columns.
            st.subheader("Statistical Summary")
            summary_stats = self.df[selected_numerics].describe()
            st.dataframe(summary_stats, use_container_width=True)

            # --- Correlation analysis ------------------------------------
            if len(selected_numerics) > 1:
                st.subheader("Correlation Analysis")
                corr_matrix = self.df[selected_numerics].corr()

                fig = px.imshow(
                    corr_matrix,
                    text_auto=True,
                    aspect="auto",
                    title="Correlation Matrix",
                    color_continuous_scale='RdBu',
                    zmin=-1, zmax=1
                )
                fig.update_layout(height=500)
                st.plotly_chart(fig, use_container_width=True)

                # Collect upper-triangle pairs with a meaningful signal.
                corr_pairs = []
                for i in range(len(corr_matrix.columns)):
                    for j in range(i + 1, len(corr_matrix.columns)):
                        corr_val = corr_matrix.iloc[i, j]
                        if abs(corr_val) > 0.1:  # only meaningful correlations
                            corr_pairs.append({
                                'Variable 1': corr_matrix.columns[i],
                                'Variable 2': corr_matrix.columns[j],
                                'Correlation': corr_val,
                                'Strength': ('Strong' if abs(corr_val) > 0.7
                                             else 'Moderate' if abs(corr_val) > 0.3
                                             else 'Weak')
                            })

                if corr_pairs:
                    corr_df = pd.DataFrame(corr_pairs).sort_values(
                        'Correlation', key=abs, ascending=False)
                    st.subheader("Top Correlations")
                    st.dataframe(corr_df, use_container_width=True)

                    strongest = corr_df.iloc[0]
                    self.add_insight(
                        f"Strongest correlation: {strongest['Variable 1']} vs "
                        f"{strongest['Variable 2']} ({strongest['Correlation']:.3f})",
                        2
                    )

    # --- Categorical analysis ---------------------------------------------
    if categorical_cols:
        st.subheader("Categorical Variables Analysis")
        selected_categorical = st.selectbox("Select categorical column:", categorical_cols)

        value_counts = get_value_counts(self.df, selected_categorical, 15)  # Top 15

        col1, col2 = st.columns(2)

        with col1:
            fig = px.bar(
                x=value_counts.values,
                y=value_counts.index,
                orientation='h',
                title=f"Top Categories in {selected_categorical}",
                color=value_counts.values,
                color_continuous_scale='viridis'
            )
            fig.update_layout(height=400, yaxis={'categoryorder': 'total ascending'})
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            # Pie of the top-5 categories, lumping the remainder as "Others".
            top_5 = value_counts.head(5)
            others = value_counts.iloc[5:].sum() if len(value_counts) > 5 else 0

            if others > 0:
                pie_data = list(top_5.values) + [others]
                pie_labels = list(top_5.index) + ['Others']
            else:
                pie_data = list(top_5.values)
                pie_labels = list(top_5.index)

            fig = px.pie(
                values=pie_data,
                names=pie_labels,
                title=f"Distribution of {selected_categorical}",
                color_discrete_sequence=px.colors.qualitative.Set3
            )
            st.plotly_chart(fig, use_container_width=True)

        # Headline category statistics.
        total_categories = self.df[selected_categorical].nunique()
        most_common = value_counts.index[0]
        most_common_pct = (value_counts.iloc[0] / len(self.df)) * 100

        st.metric("Total Unique Categories", total_categories)
        st.metric("Most Common Category", f"{most_common} ({most_common_pct:.1f}%)")

        self.add_insight(
            f"Column '{selected_categorical}' has {total_categories} categories, "
            f"dominated by '{most_common}' ({most_common_pct:.1f}%)",
            2
        )
def stage_3_quality_check(self):
    """Stage 3: Enhanced Data Quality Assessment.

    Computes a 0-100 quality score by deducting points for missing
    values, duplicate rows, outliers, and type-consistency problems,
    then renders the findings and recommended actions.
    """
    st.subheader("🧹 Data Quality Assessment")

    quality_score = 100
    issues = []

    # Missing values check: deduct up to 30 points (2 points per percent).
    if self.stats['missing_values'] > 0:
        missing_pct = (self.stats['missing_values'] /
                       (self.stats['shape'][0] * self.stats['shape'][1])) * 100
        st.warning(f"⚠️ Found {self.stats['missing_values']:,} missing values ({missing_pct:.2f}%)")
        quality_score -= min(missing_pct * 2, 30)
        issues.append("Missing values detected")
    else:
        st.success("✅ No missing values")

    # Duplicates check: deduct up to 25 points (3 points per percent).
    if self.stats['duplicates'] > 0:
        dup_pct = (self.stats['duplicates'] / self.stats['shape'][0]) * 100
        st.warning(f"⚠️ Found {self.stats['duplicates']:,} duplicate rows ({dup_pct:.2f}%)")
        quality_score -= min(dup_pct * 3, 25)
        issues.append("Duplicate rows found")
    else:
        st.success("✅ No duplicate rows")

    # Outlier detection with per-column summary and drill-down view.
    numeric_cols = self.column_types['numeric']
    if numeric_cols:
        st.subheader("Outlier Detection")

        outlier_summary = []
        for col in numeric_cols:
            outliers = calculate_outliers(self.df, col)
            outlier_pct = (len(outliers) / len(self.df)) * 100
            outlier_summary.append({
                'Column': col,
                'Outliers': len(outliers),
                'Percentage': outlier_pct,
                'Status': ('⚠️ High' if outlier_pct > 10
                           else '⚡ Medium' if outlier_pct > 5
                           else '✅ Low')
            })

        outlier_df = pd.DataFrame(outlier_summary)
        st.dataframe(outlier_df, use_container_width=True)

        selected_col = st.selectbox("Select column for detailed outlier analysis:", numeric_cols)

        col1, col2 = st.columns(2)

        with col1:
            fig = px.box(
                self.df,
                y=selected_col,
                title=f"Box Plot: {selected_col}",
                points="outliers"
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            outliers = calculate_outliers(self.df, selected_col)
            if len(outliers) > 0:
                st.metric("Outliers Found", len(outliers))
                st.metric("Outlier Percentage", f"{len(outliers)/len(self.df)*100:.2f}%")

                if len(outliers) <= 100:  # show values only when manageable
                    st.subheader("Outlier Values")
                    st.dataframe(outliers[[selected_col]].head(20), use_container_width=True)
            else:
                st.success("✅ No outliers detected")

        # Deduct the average per-column outlier percentage, capped at 20.
        total_outlier_pct = sum(row['Percentage'] for row in outlier_summary) / len(outlier_summary)
        quality_score -= min(total_outlier_pct, 20)

    # Data consistency checks on categorical columns (sampled for speed).
    st.subheader("Data Consistency Analysis")

    consistency_issues = []

    # Mixed Python types within one object column.
    for col in self.column_types['categorical']:
        unique_types = set(type(x).__name__ for x in self.df[col].dropna().head(100))
        if len(unique_types) > 1:
            consistency_issues.append(f"Mixed data types in column '{col}': {unique_types}")

    # Columns mixing pure-numeric strings with alphabetic strings.
    for col in self.column_types['categorical']:
        sample_values = self.df[col].dropna().head(50).astype(str)
        if sample_values.str.contains(r'^[0-9]+$').any() and sample_values.str.contains(r'[a-zA-Z]').any():
            consistency_issues.append(f"Mixed numeric/text patterns in column '{col}'")

    if consistency_issues:
        for issue in consistency_issues:
            st.warning(f"⚠️ {issue}")
        quality_score -= len(consistency_issues) * 5
    else:
        st.success("✅ Data types are consistent")

    # Overall quality score, clamped to [0, 100].
    st.subheader("Overall Data Quality Score")
    quality_score = max(0, min(100, quality_score))

    col1, col2, col3 = st.columns(3)
    with col2:
        if quality_score >= 90:
            st.success(f"🏆 Excellent Quality: {quality_score:.0f}/100")
            quality_level = "Excellent"
        elif quality_score >= 75:
            st.info(f"👍 Good Quality: {quality_score:.0f}/100")
            quality_level = "Good"
        elif quality_score >= 60:
            st.warning(f"⚠️ Fair Quality: {quality_score:.0f}/100")
            quality_level = "Fair"
        else:
            st.error(f" Poor Quality: {quality_score:.0f}/100")
            quality_level = "Poor"

    # Action recommendations for each recorded issue.
    if issues:
        st.subheader("📋 Recommended Actions")
        for i, issue in enumerate(issues, 1):
            st.write(f"{i}. Address {issue}")

        self.add_insight(
            f"Data quality: {quality_level} ({quality_score:.0f}/100) - "
            f"{len(issues)} issues identified",
            3
        )
    else:
        st.success("🎉 No major data quality issues found!")
        self.add_insight(f"Excellent data quality ({quality_score:.0f}/100) with no major issues", 3)
def stage_4_advanced_analysis(self):
    """Stage 4: Advanced Statistical Analysis.

    Offers a scatter-plot matrix, a detailed pairwise scatter with OLS
    trendline and marginals, and group-vs-metric comparison (box /
    violin plots plus group statistics). Adds insights on the strongest
    relationships and best/worst performing groups.
    """
    st.subheader("🔬 Advanced Analysis")

    numeric_cols = self.column_types['numeric']
    categorical_cols = self.column_types['categorical']

    # --- Advanced relationship analysis -----------------------------------
    if len(numeric_cols) >= 2:
        st.subheader("🔗 Advanced Relationship Analysis")

        # Scatter plot matrix (needs 3+ numeric columns to be useful).
        if len(numeric_cols) >= 3:
            st.subheader("Scatter Plot Matrix")
            selected_vars = st.multiselect(
                "Select variables for scatter plot matrix:",
                numeric_cols,
                default=numeric_cols[:4] if len(numeric_cols) >= 4 else numeric_cols
            )

            if len(selected_vars) >= 2:
                # Sample for rendering performance.
                sample_size = min(1000, len(self.df))
                sample_df = (self.df[selected_vars].sample(n=sample_size)
                             if len(self.df) > sample_size else self.df[selected_vars])

                fig = px.scatter_matrix(
                    sample_df,
                    dimensions=selected_vars,
                    title="Scatter Plot Matrix"
                )
                fig.update_layout(height=600)
                st.plotly_chart(fig, use_container_width=True)

        # Pairwise analysis with optional categorical coloring.
        st.subheader("Detailed Pairwise Analysis")
        col1, col2 = st.columns(2)
        with col1:
            x_var = st.selectbox("X Variable:", numeric_cols, key="x_var_advanced")
        with col2:
            y_var = st.selectbox("Y Variable:",
                                 [col for col in numeric_cols if col != x_var],
                                 key="y_var_advanced")

        color_var = None
        if categorical_cols:
            use_color = st.checkbox("Color by categorical variable")
            if use_color:
                color_var = st.selectbox("Color variable:", categorical_cols)

        # Sample for rendering performance.
        sample_size = min(5000, len(self.df))
        plot_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df

        fig = px.scatter(
            plot_df,
            x=x_var,
            y=y_var,
            color=color_var,
            title=f"Advanced Analysis: {x_var} vs {y_var}",
            trendline="ols",
            marginal_x="histogram",
            marginal_y="histogram"
        )
        st.plotly_chart(fig, use_container_width=True)

        # Correlation metrics for the chosen pair.
        correlation = self.df[x_var].corr(self.df[y_var])

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Correlation", f"{correlation:.3f}")
        with col2:
            r_squared = correlation ** 2
            st.metric("R²", f"{r_squared:.3f}")
        with col3:
            if abs(correlation) > 0.7:
                strength = "Strong"
            elif abs(correlation) > 0.3:
                strength = "Moderate"
            else:
                strength = "Weak"
            st.metric("Relationship", strength)

        self.add_insight(
            f"Advanced analysis: {strength} relationship between {x_var} and "
            f"{y_var} (r={correlation:.3f})",
            4
        )

    # --- Group comparison analysis ----------------------------------------
    if categorical_cols and numeric_cols:
        st.subheader("📊 Group Comparison Analysis")

        col1, col2 = st.columns(2)
        with col1:
            group_var = st.selectbox("Group by:", categorical_cols, key="group_var_advanced")
        with col2:
            metric_var = st.selectbox("Analyze metric:", numeric_cols, key="metric_var_advanced")

        group_stats = calculate_group_stats(self.df, group_var, metric_var)

        unique_groups = self.df[group_var].nunique()

        if unique_groups <= 20:
            col1, col2 = st.columns(2)

            with col1:
                fig = px.box(
                    self.df,
                    x=group_var,
                    y=metric_var,
                    title=f"{metric_var} Distribution by {group_var}",
                    points="outliers"
                )
                fig.update_xaxes(tickangle=45)
                st.plotly_chart(fig, use_container_width=True)

            with col2:
                fig = px.violin(
                    self.df,
                    x=group_var,
                    y=metric_var,
                    title=f"{metric_var} Density by {group_var}",
                    box=True
                )
                fig.update_xaxes(tickangle=45)
                st.plotly_chart(fig, use_container_width=True)

            st.subheader("Statistical Comparison")
            st.dataframe(group_stats, use_container_width=True)

            # Highlight the extreme groups by mean.
            best_group = group_stats['mean'].idxmax()
            best_value = group_stats.loc[best_group, 'mean']
            worst_group = group_stats['mean'].idxmin()
            worst_value = group_stats.loc[worst_group, 'mean']

            col1, col2 = st.columns(2)
            with col1:
                st.metric("Best Performing Group", best_group, f"Avg: {best_value:.2f}")
            with col2:
                st.metric("Lowest Performing Group", worst_group, f"Avg: {worst_value:.2f}")

            self.add_insight(
                f"Group analysis: '{best_group}' performs best with average "
                f"{metric_var} of {best_value:.2f}",
                4
            )
        else:
            st.info(f"Too many groups ({unique_groups}) for detailed visualization. "
                    f"Showing summary statistics only.")
            st.dataframe(group_stats.head(15), use_container_width=True)
def stage_5_ml_modeling(self):
    """Stage 5: Machine Learning Modeling.

    Guides the user through target/feature selection, trains baseline
    models via ``_train_ml_models``, and renders their metrics via
    ``_display_ml_results``. Requires scikit-learn (``ML_AVAILABLE``)
    and at least 2 numeric columns.
    """
    st.subheader("🤖 Machine Learning Modeling")

    if not ML_AVAILABLE:
        st.warning("⚠️ Machine Learning libraries not available. "
                   "Please install scikit-learn to use this feature.")
        st.code("pip install scikit-learn")
        return

    numeric_cols = self.column_types['numeric']
    categorical_cols = self.column_types['categorical']

    if len(numeric_cols) < 2:
        st.warning("⚠️ Need at least 2 numeric columns for ML modeling.")
        return

    st.info("🎯 Automated machine learning model training and evaluation")

    # --- Model configuration ----------------------------------------------
    st.subheader("Model Configuration")

    col1, col2 = st.columns(2)
    with col1:
        target_column = st.selectbox(
            "Select target variable (what to predict):",
            numeric_cols + categorical_cols
        )
    with col2:
        model_type = st.radio(
            "Problem type:",
            ["Auto-detect", "Regression", "Classification"]
        )

    # Feature selection (numeric features only, excluding the target).
    available_features = [col for col in numeric_cols if col != target_column]
    if len(available_features) == 0:
        st.error("❌ No suitable features available for modeling.")
        return

    selected_features = st.multiselect(
        "Select features (leave empty for auto-selection):",
        available_features,
        default=available_features[:5] if len(available_features) >= 5 else available_features
    )

    if not selected_features:
        selected_features = available_features[:10]  # auto-select top 10

    if st.button("🚀 Train Models", type="primary"):
        try:
            with st.spinner("Training machine learning models..."):
                self._train_ml_models(target_column, selected_features, model_type)

            st.success("✅ Models trained successfully!")

        except Exception as e:
            st.error(f"❌ Model training failed: {str(e)}")

    # Display results if a previous training run populated them.
    if hasattr(self, 'ml_results') and self.ml_results:
        self._display_ml_results()
def _train_ml_models(self, target_col: str, feature_cols: List[str], model_type: str):
    """Train baseline ML models and store results in ``self.ml_results``.

    Args:
        target_col: Name of the column to predict.
        feature_cols: Numeric feature column names.
        model_type: "Auto-detect", "Regression", or "Classification".

    Side effects:
        Sets ``self.ml_results`` and records an insight for the best model.
    """
    # Prepare data (copies so the source frame is never mutated).
    X = self.df[feature_cols].copy()
    y = self.df[target_col].copy()

    # Handle missing values: mean-impute features; mean/mode-impute target.
    X = X.fillna(X.mean())
    if y.dtype in ['int64', 'float64']:
        y = y.fillna(y.mean())
    else:
        mode = y.mode()
        # Guard: mode() is empty when the column is entirely NaN.
        y = y.fillna(mode[0]) if not mode.empty else y.dropna()

    # Auto-detect problem type: treat low-cardinality/object targets as
    # classification.
    if model_type == "Auto-detect":
        if y.dtype == 'object' or y.nunique() < 10:
            model_type = "Classification"
        else:
            model_type = "Regression"

    # Encode a categorical target for sklearn classifiers.
    label_encoder = None
    if model_type == "Classification" and y.dtype == 'object':
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)

    # Stratify only when every class has at least 2 members; otherwise
    # train_test_split raises ValueError.
    stratify = None
    if model_type == "Classification":
        class_counts = pd.Series(y).value_counts()
        if len(class_counts) > 0 and class_counts.min() >= 2:
            stratify = y

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=stratify
    )

    # Baseline model zoo per problem type.
    results = {}
    if model_type == "Regression":
        models = {
            "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
            "Linear Regression": LinearRegression()
        }
    else:
        models = {
            "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
            "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000)
        }

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if model_type == "Regression":
            r2 = r2_score(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            results[name] = {
                "R² Score": r2,
                "MSE": mse,
                "RMSE": np.sqrt(mse)
            }
        else:
            accuracy = accuracy_score(y_test, y_pred)
            results[name] = {
                "Accuracy": accuracy
            }

        # Feature importance (tree-based models only).
        if hasattr(model, 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': feature_cols,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            results[name]['feature_importance'] = feature_importance

    self.ml_results = {
        'model_type': model_type,
        'target_column': target_col,
        'feature_columns': feature_cols,
        'results': results,
        'label_encoder': label_encoder,
        'test_size': len(X_test)
    }

    # Record the best model as an insight.
    best_model = max(
        results.keys(),
        key=lambda x: results[x]['R² Score'] if model_type == "Regression" else results[x]['Accuracy']
    )
    best_score = (results[best_model]['R² Score'] if model_type == "Regression"
                  else results[best_model]['Accuracy'])

    self.add_insight(
        f"ML modeling: Best {model_type.lower()} model is {best_model} "
        f"with score {best_score:.3f}",
        5
    )
def _display_ml_results(self):
    """Render ML modeling results stored in ``self.ml_results``.

    Shows a per-model metric table and bar chart, feature-importance
    analysis for the best model, and plain-language recommendations.
    """
    st.subheader("🎯 Model Performance Results")

    results = self.ml_results['results']
    model_type = self.ml_results['model_type']

    # Flatten metrics into one row per model (importance frames excluded).
    performance_data = []
    for model_name, metrics in results.items():
        row = {'Model': model_name}
        for metric, value in metrics.items():
            if metric != 'feature_importance':
                row[metric] = value
        performance_data.append(row)

    performance_df = pd.DataFrame(performance_data)
    st.dataframe(performance_df, use_container_width=True)

    # Pick the headline metric per problem type.
    if model_type == "Regression":
        metric_to_plot = "R² Score"
    else:
        metric_to_plot = "Accuracy"

    fig = px.bar(
        performance_df,
        x='Model',
        y=metric_to_plot,
        title=f"Model Performance Comparison ({metric_to_plot})",
        color=metric_to_plot,
        color_continuous_scale='viridis'
    )
    st.plotly_chart(fig, use_container_width=True)

    # --- Feature importance ------------------------------------------------
    st.subheader("🔍 Feature Importance Analysis")

    best_model = max(results.keys(), key=lambda x: results[x][metric_to_plot])

    if 'feature_importance' in results[best_model]:
        importance_df = results[best_model]['feature_importance']

        col1, col2 = st.columns(2)

        with col1:
            fig = px.bar(
                importance_df.head(10),
                x='importance',
                y='feature',
                orientation='h',
                title=f"Top 10 Feature Importance ({best_model})",
                color='importance',
                color_continuous_scale='plasma'
            )
            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("Feature Rankings")
            st.dataframe(importance_df.head(10), use_container_width=True)

        # Record the dominant feature.
        top_feature = importance_df.iloc[0]['feature']
        top_importance = importance_df.iloc[0]['importance']
        self.add_insight(
            f"Most important feature: '{top_feature}' (importance: {top_importance:.3f})", 5)

    # --- Recommendations ---------------------------------------------------
    st.subheader("📋 Model Recommendations")

    best_score = results[best_model][metric_to_plot]

    if model_type == "Regression":
        if best_score > 0.8:
            st.success(f"🏆 Excellent model performance! {best_model} explains "
                       f"{best_score*100:.1f}% of the variance.")
        elif best_score > 0.6:
            st.info(f"👍 Good model performance. {best_model} explains "
                    f"{best_score*100:.1f}% of the variance.")
        else:
            st.warning("⚠️ Model performance could be improved. Consider feature "
                       "engineering or more advanced models.")
    else:
        if best_score > 0.9:
            st.success(f"🏆 Excellent classification accuracy: {best_score*100:.1f}%")
        elif best_score > 0.8:
            st.info(f"👍 Good classification accuracy: {best_score*100:.1f}%")
        else:
            st.warning(f"⚠️ Classification accuracy could be improved: {best_score*100:.1f}%")
def stage_6_summary(self):
    """Stage 6: Enhanced Summary and Export.

    Shows headline metrics, an insights timeline grouped by stage, an
    optional AI-generated executive summary, and download buttons for
    the full report, the data summary CSV, and the ML report.
    """
    st.subheader("📈 Analysis Summary & Export")

    # Key metrics overview.
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Insights Generated", len(self.insights))
    with col2:
        quality = ("High" if self.stats['missing_values'] == 0
                   and self.stats['duplicates'] == 0 else "Medium")
        st.metric("Data Quality", quality)
    with col3:
        analysis_completeness = "100%" if len(self.insights) >= 5 else f"{len(self.insights)*20}%"
        st.metric("Analysis Complete", analysis_completeness)
    with col4:
        ml_status = "✅" if hasattr(self, 'ml_results') and self.ml_results else "➖"
        st.metric("ML Models", ml_status)

    # Insights timeline grouped by the stage that produced them.
    st.subheader("🔍 Key Insights Timeline")

    insights_by_stage = {}
    for insight in self.insights:
        stage = insight['stage']
        if stage not in insights_by_stage:
            insights_by_stage[stage] = []
        insights_by_stage[stage].append(insight)

    stage_names = {
        1: "📊 Data Overview",
        2: "🔍 Exploration",
        3: "🧹 Quality Check",
        4: "🔬 Advanced Analysis",
        5: "🤖 ML Modeling",
        6: "📈 Summary"
    }

    for stage_num in sorted(insights_by_stage.keys()):
        label = stage_names.get(stage_num, f'Stage {stage_num}')
        with st.expander(f"{label} - {len(insights_by_stage[stage_num])} insights"):
            for i, insight in enumerate(insights_by_stage[stage_num], 1):
                st.write(f"{i}. {insight['insight']}")
                st.caption(f"Generated: {insight['timestamp'].strftime('%H:%M:%S')}")

    # AI executive summary (stored on self for later export).
    st.subheader("🤖 AI-Powered Executive Summary")

    ai_assistant = AIAssistant()

    if st.button("Generate AI Summary", type="primary"):
        with st.spinner("Generating AI-powered analysis summary..."):
            ai_summary = ai_assistant.analyze_insights(self.df, self.insights)

        st.markdown("### 📋 Executive Summary")
        st.markdown(ai_summary)

        self.ai_summary = ai_summary

    # --- Export options -----------------------------------------------------
    st.subheader("📥 Export Results")

    col1, col2, col3 = st.columns(3)

    with col1:
        if st.button("📄 Generate Report"):
            report = self._generate_comprehensive_report()
            st.download_button(
                label="📥 Download Analysis Report",
                data=report,
                file_name=f"analysis_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt",
                mime="text/plain"
            )

    with col2:
        if st.button("📊 Export Data Summary"):
            summary_data = self._generate_data_summary()
            st.download_button(
                label="📥 Download Data Summary (CSV)",
                data=summary_data.to_csv(index=False),
                file_name=f"data_summary_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv"
            )

    with col3:
        # ML export is only offered after a successful training run.
        if hasattr(self, 'ml_results') and self.ml_results:
            if st.button("🤖 Export ML Results"):
                ml_report = self._generate_ml_report()
                st.download_button(
                    label="📥 Download ML Report",
                    data=ml_report,
                    file_name=f"ml_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt",
                    mime="text/plain"
                )

    # Celebrate a completed analysis.
    if len(self.insights) >= 5:
        st.balloons()
        st.success("🎉 Comprehensive analysis completed successfully!")
+ def _generate_comprehensive_report(self) -> str:
990
+ """Generate comprehensive analysis report"""
991
+ report = f"""
992
+ COMPREHENSIVE DATA ANALYSIS REPORT
993
+ {'='*50}
994
 
995
+ DATASET OVERVIEW
996
+ {'-'*20}
997
+ Dataset Shape: {self.stats['shape'][0]:,} rows × {self.stats['shape'][1]:,} columns
998
+ Memory Usage: {self.stats['memory_usage']:.2f} MB
999
+ Missing Values: {self.stats['missing_values']:,} ({self.stats['missing_values']/(self.stats['shape'][0]*self.stats['shape'][1])*100:.2f}%)
1000
+ • Duplicate Rows: {self.stats['duplicates']:,}
1001
 
1002
+ DATA TYPES DISTRIBUTION
1003
+ {'-'*25}
1004
  """
1005
+ for dtype, count in self.stats['dtypes'].items():
1006
+ report += f" {dtype}: {count} columns\n"
1007
 
1008
+ report += f"""
1009
+ KEY INSIGHTS BY ANALYSIS STAGE
1010
+ {'-'*35}
1011
+ """
1012
+
1013
+ stage_names = {
1014
+ 1: "Data Overview",
1015
+ 2: "Exploratory Analysis",
1016
+ 3: "Quality Assessment",
1017
+ 4: "Advanced Analysis",
1018
+ 5: "Machine Learning",
1019
+ 6: "Summary"
1020
+ }
1021
+
1022
+ for i, insight in enumerate(self.insights, 1):
1023
+ stage_name = stage_names.get(insight['stage'], f"Stage {insight['stage']}")
1024
+ report += f"\n{i}. [{stage_name}] {insight['insight']}"
1025
+
1026
+ # Add ML results if available
1027
+ if hasattr(self, 'ml_results') and self.ml_results:
1028
+ report += f"""
1029
 
1030
+ MACHINE LEARNING RESULTS
1031
+ {'-'*25}
1032
+ Problem Type: {self.ml_results['model_type']}
1033
+ Target Variable: {self.ml_results['target_column']}
1034
+ Features Used: {len(self.ml_results['feature_columns'])}
1035
+ • Test Set Size: {self.ml_results['test_size']} samples
1036
 
1037
+ Model Performance:
1038
+ """
1039
+ for model_name, metrics in self.ml_results['results'].items():
1040
+ report += f"\n{model_name}:\n"
1041
+ for metric, value in metrics.items():
1042
+ if metric != 'feature_importance':
1043
+ report += f" • {metric}: {value:.4f}\n"
1044
+
1045
+ # Add AI summary if available
1046
+ if hasattr(self, 'ai_summary'):
1047
+ report += f"""
1048
 
1049
+ AI-POWERED EXECUTIVE SUMMARY
1050
+ {'-'*30}
1051
+ {self.ai_summary}
1052
  """
1053
+
1054
+ report += f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1055
 
1056
+ ANALYSIS METADATA
1057
+ {'-'*18}
1058
+ • Total Insights Generated: {len(self.insights)}
1059
+ • Analysis Completion Time: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
1060
+ • Platform: Enhanced Data Analysis Platform v2.0
1061
 
1062
+ {'-'*50}
1063
+ Report generated automatically by Enhanced Data Analysis Platform
1064
+ """
1065
+
1066
+ return report
1067
+
1068
+ def _generate_data_summary(self) -> pd.DataFrame:
1069
+ """Generate data summary for export"""
1070
+ summary_data = []
1071
+
1072
+ for col in self.df.columns:
1073
+ col_info = {
1074
+ 'Column': col,
1075
+ 'Data_Type': str(self.df[col].dtype),
1076
+ 'Non_Null_Count': self.df[col].notna().sum(),
1077
+ 'Missing_Count': self.df[col].isna().sum(),
1078
+ 'Missing_Percentage': (self.df[col].isna().sum() / len(self.df)) * 100,
1079
+ 'Unique_Values': self.df[col].nunique(),
1080
+ 'Most_Common_Value': str(self.df[col].mode().iloc[0]) if not self.df[col].mode().empty else 'N/A'
1081
+ }
1082
+
1083
+ if self.df[col].dtype in ['int64', 'float64']:
1084
+ col_info.update({
1085
+ 'Mean': self.df[col].mean(),
1086
+ 'Median': self.df[col].median(),
1087
+ 'Std_Dev': self.df[col].std(),
1088
+ 'Min_Value': self.df[col].min(),
1089
+ 'Max_Value': self.df[col].max()
1090
+ })
1091
+
1092
+ summary_data.append(col_info)
1093
+
1094
+ return pd.DataFrame(summary_data)
1095
+
1096
+ def _generate_ml_report(self) -> str:
1097
+ """Generate ML-specific report"""
1098
+ if not hasattr(self, 'ml_results') or not self.ml_results:
1099
+ return "No ML results available."
1100
+
1101
+ ml_report = f"""
1102
+ MACHINE LEARNING ANALYSIS REPORT
1103
+ {'='*40}
1104
 
1105
+ MODEL CONFIGURATION
1106
+ {'-'*20}
1107
+ Problem Type: {self.ml_results['model_type']}
1108
+ Target Variable: {self.ml_results['target_column']}
1109
+ Number of Features: {len(self.ml_results['feature_columns'])}
1110
+ Features Used: {', '.join(self.ml_results['feature_columns'])}
1111
+ • Test Set Size: {self.ml_results['test_size']} samples
1112
 
1113
+ MODEL PERFORMANCE RESULTS
1114
+ {'-'*27}
1115
  """
1116
+
1117
+ for model_name, metrics in self.ml_results['results'].items():
1118
+ ml_report += f"\n{model_name}:\n"
1119
+ for metric, value in metrics.items():
1120
+ if metric != 'feature_importance':
1121
+ ml_report += f" {metric}: {value:.6f}\n"
1122
+
1123
+ # Add feature importance for best model
1124
+ best_model = max(self.ml_results['results'].keys(), key=lambda x:
1125
+ list(self.ml_results['results'][x].values())[0] if isinstance(list(self.ml_results['results'][x].values())[0], (int, float)) else 0
1126
+ )
1127
+
1128
+ if 'feature_importance' in self.ml_results['results'][best_model]:
1129
+ ml_report += f"""
1130
+ FEATURE IMPORTANCE ANALYSIS ({best_model})
1131
+ {'-'*35}
 
 
 
 
 
 
1132
  """
1133
+ importance_df = self.ml_results['results'][best_model]['feature_importance']
1134
+ for _, row in importance_df.head(10).iterrows():
1135
+ ml_report += f"• {row['feature']}: {row['importance']:.6f}\n"
1136
 
1137
+ ml_report += f"""
 
 
 
 
 
 
 
 
 
1138
 
1139
+ RECOMMENDATIONS
1140
+ {'-'*15}
1141
+ """
1142
+
1143
+ if self.ml_results['model_type'] == "Regression":
1144
+ best_score = max([metrics.get('R² Score', 0) for metrics in self.ml_results['results'].values()])
1145
+ if best_score > 0.8:
1146
+ ml_report += "• Excellent model performance - ready for production use\n"
1147
+ elif best_score > 0.6:
1148
+ ml_report += "• Good model performance - consider feature engineering for improvement\n"
1149
+ else:
1150
+ ml_report += "• Model performance needs improvement - try advanced algorithms or more features\n"
1151
+ else:
1152
+ best_score = max([metrics.get('Accuracy', 0) for metrics in self.ml_results['results'].values()])
1153
+ if best_score > 0.9:
1154
+ ml_report += "• Excellent classification accuracy - model ready for deployment\n"
1155
+ elif best_score > 0.8:
1156
+ ml_report += "• Good classification performance - minor optimizations recommended\n"
1157
+ else:
1158
+ ml_report += "• Classification accuracy needs improvement - consider ensemble methods\n"
1159
+
1160
+ ml_report += f"""
1161
 
1162
+ {'-'*40}
1163
+ ML Report generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
 
1164
  """
1165
 
1166
+ return ml_report