RamAi2026 commited on
Commit
da8e446
·
verified ·
1 Parent(s): 717a86b

Upload 13 files

Browse files
Files changed (13) hide show
  1. README.MD +0 -0
  2. app.py +1023 -0
  3. chatbot.py +1051 -0
  4. data_preprocessing.py +387 -0
  5. data_quality.py +252 -0
  6. dataset_overview.py +1159 -0
  7. explainability.py +176 -0
  8. insights.py +369 -0
  9. ml_pipeline.py +940 -0
  10. requirements.txt +16 -0
  11. statistical_analysis.py +928 -0
  12. utils.py +208 -0
  13. visualization.py +435 -0
README.MD ADDED
Binary file (7.64 kB). View file
 
app.py ADDED
@@ -0,0 +1,1023 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.io as pio
4
+ import traceback
5
+ import sys
6
+ from datetime import datetime
7
+
8
+ from data_preprocessing import preprocess_data
9
+ from insights import generate_business_insights
10
+ from dataset_overview import eda_analysis # Updated import
11
+ from visualization import auto_visualizations
12
+ from ml_pipeline import run_ml_pipeline
13
+ from statistical_analysis import statistical_analysis
14
+ from data_quality import quality_report
15
+ from chatbot import data_chatbot
16
+
17
+ # Set plotly template
18
+ pio.templates.default = "plotly_white"
19
+
20
+ # ---------------------------------------
21
+ # PAGE CONFIG
22
+ # ---------------------------------------
23
+
24
+ st.set_page_config(
25
+ page_title="AI Data Analyst Pro",
26
+ layout="wide",
27
+ page_icon="📊",
28
+ initial_sidebar_state="expanded"
29
+ )
30
+
31
+ # ---------------------------------------
32
+ # CUSTOM ERROR HANDLER
33
+ # ---------------------------------------
34
+
35
class StreamlitExceptionHandler:
    """Custom exception handler for Streamlit.

    Turns a raised exception into a user-friendly markdown message with
    remediation hints, keyed on the exception's class name.
    """

    @staticmethod
    def handle_exception(e, context="application"):
        """Handle exceptions with user-friendly messages.

        Parameters
        ----------
        e : Exception
            The exception that was raised.
        context : str
            Human-readable label for where the error occurred
            (e.g. "file upload", "EDA").

        Returns
        -------
        str
            Markdown text describing the error and possible solutions,
            suitable for rendering with ``st.markdown``.
        """
        # type(e).__name__ yields the bare class name (e.g. "EmptyDataError",
        # never "pd.errors.EmptyDataError") — the matches below must use the
        # bare names or the pandas-specific branches are unreachable.
        error_type = type(e).__name__
        error_msg = str(e)

        # Create user-friendly error message
        user_message = f"""
### ❌ An error occurred in the {context}

**Error Type:** {error_type}

**What happened:** {error_msg if error_msg else "An unexpected error occurred"}

**Possible solutions:**
"""

        # Add specific solutions based on error type
        if "MemoryError" in error_type:
            user_message += """
- Your dataset might be too large. Try uploading a smaller file.
- Close other applications to free up memory.
- Consider sampling your data before uploading.
"""
        elif "KeyError" in error_type or "IndexError" in error_type:
            user_message += """
- The requested column or index doesn't exist in your dataset.
- Check if you've selected valid columns for the operation.
- Try refreshing the page and uploading your data again.
"""
        elif "ValueError" in error_type:
            user_message += """
- The data values don't match the expected format.
- Check for invalid values in your dataset (e.g., text in numeric columns).
- Ensure your data types are correct for the selected operation.
"""
        elif "TypeError" in error_type:
            user_message += """
- There's a mismatch in data types.
- Check if you're mixing numeric and text data in operations.
- Use the preprocessing tab to convert data types appropriately.
"""
        elif "FileNotFoundError" in error_type:
            user_message += """
- The file couldn't be found. Please upload it again.
- Check if the file path is correct.
"""
        elif "PermissionError" in error_type:
            user_message += """
- Permission denied when accessing the file.
- Make sure the file isn't open in another program.
"""
        # BUGFIX: previously compared against "pd.errors.EmptyDataError" /
        # "pd.errors.ParserError", which never appear in __name__, so these
        # branches could never fire.
        elif "EmptyDataError" in error_type:
            user_message += """
- The uploaded file is empty.
- Please upload a file containing data.
"""
        elif "ParserError" in error_type:
            user_message += """
- Couldn't parse the file. Check if it's a valid CSV or Excel file.
- Ensure the file format matches the selected file type.
"""
        else:
            user_message += """
- Try refreshing the page and uploading your data again.
- Check if your data format is compatible with the operation.
- If the problem persists, try with a smaller sample of your data.
"""

        # Add technical details in an expander for debugging
        user_message += f"""

**Technical Details:**
"""

        return user_message
114
+
115
+ # Initialize session state for error tracking
116
+ if "error_log" not in st.session_state:
117
+ st.session_state.error_log = []
118
+
119
+ if "last_successful_operation" not in st.session_state:
120
+ st.session_state.last_successful_operation = None
121
+
122
+ # ---------------------------------------
123
+ # ADVANCED CSS WITH RESPONSIVE DESIGN
124
+ # ---------------------------------------
125
+
126
+ st.markdown("""
127
+ <style>
128
+ /* Global Styles */
129
+ .main {
130
+ padding: 0rem 1rem;
131
+ }
132
+
133
+ /* Header Styling */
134
+ .header-container {
135
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
136
+ padding: 2rem;
137
+ border-radius: 20px;
138
+ margin-bottom: 2rem;
139
+ color: white;
140
+ text-align: center;
141
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
142
+ }
143
+
144
+ .header-title {
145
+ font-size: 2.5rem;
146
+ font-weight: 700;
147
+ margin-bottom: 0.5rem;
148
+ animation: fadeInDown 1s;
149
+ }
150
+
151
+ .header-subtitle {
152
+ font-size: 1.1rem;
153
+ opacity: 0.95;
154
+ animation: fadeInUp 1s;
155
+ }
156
+
157
+ /* Card Styling */
158
+ .custom-card {
159
+ background: white;
160
+ padding: 1.5rem;
161
+ border-radius: 15px;
162
+ box-shadow: 0 5px 15px rgba(0,0,0,0.08);
163
+ margin-bottom: 1.5rem;
164
+ border: 1px solid rgba(0,0,0,0.05);
165
+ transition: transform 0.3s, box-shadow 0.3s;
166
+ }
167
+
168
+ .custom-card:hover {
169
+ transform: translateY(-5px);
170
+ box-shadow: 0 8px 25px rgba(0,0,0,0.15);
171
+ }
172
+
173
+ /* Error Message Styling */
174
+ .error-container {
175
+ background: linear-gradient(135deg, #ff6b6b 0%, #ff4757 100%);
176
+ color: white;
177
+ padding: 1.5rem;
178
+ border-radius: 15px;
179
+ margin: 1rem 0;
180
+ box-shadow: 0 10px 30px rgba(255, 71, 87, 0.3);
181
+ animation: slideInRight 0.5s;
182
+ }
183
+
184
+ .error-title {
185
+ font-size: 1.5rem;
186
+ font-weight: 700;
187
+ margin-bottom: 1rem;
188
+ }
189
+
190
+ .error-solution {
191
+ background: rgba(255, 255, 255, 0.2);
192
+ padding: 1rem;
193
+ border-radius: 10px;
194
+ margin-top: 1rem;
195
+ }
196
+
197
+ /* Success Message Styling */
198
+ .success-container {
199
+ background: linear-gradient(135deg, #51cf66 0%, #37b24d 100%);
200
+ color: white;
201
+ padding: 1rem;
202
+ border-radius: 10px;
203
+ margin: 1rem 0;
204
+ animation: fadeInUp 0.5s;
205
+ }
206
+
207
+ /* Warning Message Styling */
208
+ .warning-container {
209
+ background: linear-gradient(135deg, #ffd43b 0%, #fcc419 100%);
210
+ color: #2c3e50;
211
+ padding: 1rem;
212
+ border-radius: 10px;
213
+ margin: 1rem 0;
214
+ animation: fadeInUp 0.5s;
215
+ }
216
+
217
+ /* Metric Cards */
218
+ .metric-card {
219
+ background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
220
+ padding: 1.2rem;
221
+ border-radius: 12px;
222
+ text-align: center;
223
+ border-left: 4px solid #667eea;
224
+ }
225
+
226
+ .metric-value {
227
+ font-size: 2rem;
228
+ font-weight: 700;
229
+ color: #2c3e50;
230
+ margin: 0.5rem 0;
231
+ }
232
+
233
+ .metric-label {
234
+ font-size: 0.9rem;
235
+ color: #7f8c8d;
236
+ text-transform: uppercase;
237
+ letter-spacing: 1px;
238
+ }
239
+
240
+ /* Chatbot Styling */
241
+ .chat-container {
242
+ max-width: 800px;
243
+ margin: 2rem auto;
244
+ background: #f8f9fa;
245
+ border-radius: 20px;
246
+ padding: 1.5rem;
247
+ box-shadow: 0 5px 20px rgba(0,0,0,0.1);
248
+ }
249
+
250
+ .user-message {
251
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
252
+ color: white;
253
+ padding: 12px 18px;
254
+ border-radius: 20px 20px 5px 20px;
255
+ margin: 10px 0;
256
+ max-width: 80%;
257
+ margin-left: auto;
258
+ animation: slideInRight 0.5s;
259
+ }
260
+
261
+ .bot-message {
262
+ background: white;
263
+ color: #2c3e50;
264
+ padding: 12px 18px;
265
+ border-radius: 20px 20px 20px 5px;
266
+ margin: 10px 0;
267
+ max-width: 80%;
268
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
269
+ animation: slideInLeft 0.5s;
270
+ }
271
+
272
+ /* Loading Spinner */
273
+ .custom-spinner {
274
+ border: 4px solid #f3f3f3;
275
+ border-top: 4px solid #667eea;
276
+ border-radius: 50%;
277
+ width: 40px;
278
+ height: 40px;
279
+ animation: spin 1s linear infinite;
280
+ margin: 20px auto;
281
+ }
282
+
283
+ @keyframes spin {
284
+ 0% { transform: rotate(0deg); }
285
+ 100% { transform: rotate(360deg); }
286
+ }
287
+
288
+ /* Animations */
289
+ @keyframes fadeInDown {
290
+ from {
291
+ opacity: 0;
292
+ transform: translateY(-20px);
293
+ }
294
+ to {
295
+ opacity: 1;
296
+ transform: translateY(0);
297
+ }
298
+ }
299
+
300
+ @keyframes fadeInUp {
301
+ from {
302
+ opacity: 0;
303
+ transform: translateY(20px);
304
+ }
305
+ to {
306
+ opacity: 1;
307
+ transform: translateY(0);
308
+ }
309
+ }
310
+
311
+ @keyframes slideInRight {
312
+ from {
313
+ opacity: 0;
314
+ transform: translateX(30px);
315
+ }
316
+ to {
317
+ opacity: 1;
318
+ transform: translateX(0);
319
+ }
320
+ }
321
+
322
+ @keyframes slideInLeft {
323
+ from {
324
+ opacity: 0;
325
+ transform: translateX(-30px);
326
+ }
327
+ to {
328
+ opacity: 1;
329
+ transform: translateX(0);
330
+ }
331
+ }
332
+
333
+ /* Responsive Design */
334
+ @media (max-width: 768px) {
335
+ .header-title {
336
+ font-size: 1.8rem;
337
+ }
338
+
339
+ .metric-value {
340
+ font-size: 1.5rem;
341
+ }
342
+
343
+ .user-message, .bot-message {
344
+ max-width: 95%;
345
+ }
346
+ }
347
+
348
+ /* Sidebar Styling */
349
+ .css-1d391kg {
350
+ background: linear-gradient(180deg, #f8f9fa 0%, #e9ecef 100%);
351
+ }
352
+
353
+ /* Button Styling */
354
+ .stButton > button {
355
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
356
+ color: white;
357
+ border: none;
358
+ padding: 0.5rem 2rem;
359
+ border-radius: 25px;
360
+ font-weight: 600;
361
+ transition: transform 0.2s, box-shadow 0.2s;
362
+ }
363
+
364
+ .stButton > button:hover {
365
+ transform: translateY(-2px);
366
+ box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
367
+ }
368
+
369
+ .stButton > button:disabled {
370
+ opacity: 0.5;
371
+ cursor: not-allowed;
372
+ }
373
+
374
+ /* Progress Bar */
375
+ .stProgress > div > div {
376
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
377
+ }
378
+
379
+ /* Tabs */
380
+ .stTabs [data-baseweb="tab-list"] {
381
+ gap: 2rem;
382
+ background-color: #f8f9fa;
383
+ padding: 0.5rem;
384
+ border-radius: 30px;
385
+ }
386
+
387
+ .stTabs [data-baseweb="tab"] {
388
+ border-radius: 25px;
389
+ padding: 0.5rem 2rem;
390
+ }
391
+
392
+ /* Tooltip */
393
+ .tooltip {
394
+ position: relative;
395
+ display: inline-block;
396
+ cursor: help;
397
+ }
398
+
399
+ .tooltip .tooltiptext {
400
+ visibility: hidden;
401
+ width: 200px;
402
+ background-color: #555;
403
+ color: #fff;
404
+ text-align: center;
405
+ border-radius: 6px;
406
+ padding: 5px;
407
+ position: absolute;
408
+ z-index: 1;
409
+ bottom: 125%;
410
+ left: 50%;
411
+ margin-left: -100px;
412
+ opacity: 0;
413
+ transition: opacity 0.3s;
414
+ }
415
+
416
+ .tooltip:hover .tooltiptext {
417
+ visibility: visible;
418
+ opacity: 1;
419
+ }
420
+ </style>
421
+ """, unsafe_allow_html=True)
422
+
423
+ # ---------------------------------------
424
+ # HEADER WITH ANIMATION
425
+ # ---------------------------------------
426
+
427
+ st.markdown("""
428
+ <div class="header-container">
429
+ <div class="header-title">📊 AI Data Analyst Pro</div>
430
+ <div class="header-subtitle">Intelligent Data Analysis & Visualization Platform</div>
431
+ </div>
432
+ """, unsafe_allow_html=True)
433
+
434
+ # ---------------------------------------
435
+ # SESSION STATE INITIALIZATION
436
+ # ---------------------------------------
437
+
438
+ if "data" not in st.session_state:
439
+ st.session_state.data = None
440
+
441
+ if "processed_data" not in st.session_state:
442
+ st.session_state.processed_data = None
443
+
444
+ if "uploaded_file_name" not in st.session_state:
445
+ st.session_state.uploaded_file_name = None
446
+
447
+ if "upload_error" not in st.session_state:
448
+ st.session_state.upload_error = None
449
+
450
+ if "data_loaded" not in st.session_state:
451
+ st.session_state.data_loaded = False
452
+
453
+ if "operation_status" not in st.session_state:
454
+ st.session_state.operation_status = {}
455
+
456
+ # ---------------------------------------
457
+ # HELPER FUNCTIONS
458
+ # ---------------------------------------
459
+
460
def safe_dataframe_operation(func, df, *args, **kwargs):
    """Execute ``func(df, *args, **kwargs)`` and report errors gracefully.

    Returns a ``(result, error)`` pair: on success ``error`` is ``None``
    and the operation name is recorded in session state; on failure
    ``result`` is ``None`` and ``error`` is a user-friendly markdown
    message built by ``StreamlitExceptionHandler``.
    """
    try:
        outcome = func(df, *args, **kwargs)
        st.session_state.last_successful_operation = func.__name__
        return outcome, None
    except Exception as exc:
        return None, StreamlitExceptionHandler.handle_exception(exc, func.__name__)
469
+
470
def validate_dataset(df):
    """Scan *df* for common problems and return a list of warning strings.

    Flags empty datasets, memory footprints above 1 GB, and object
    columns whose values span more than one Python type. An empty list
    means no issues were detected.
    """
    warnings = []

    # Emptiness checks (an empty frame can trip all three).
    if df.empty:
        warnings.append("The dataset is empty")
    rows, cols = df.shape
    if rows == 0:
        warnings.append("No rows in the dataset")
    if cols == 0:
        warnings.append("No columns in the dataset")

    # Check for memory issues: warn above 1 GB — operations may be slow.
    gb_used = df.memory_usage(deep=True).sum() / 1024**3
    if gb_used > 1:
        warnings.append(f"Large dataset detected ({gb_used:.2f} GB). Some operations may be slow.")

    # Object columns holding values of more than one Python type.
    for name in df.columns:
        if df[name].dtype != 'object':
            continue
        kinds = df[name].apply(type).unique()
        if len(kinds) > 1:
            warnings.append(f"Column '{name}' has mixed data types: {kinds}")

    return warnings
497
+
498
def show_validation_warnings(issues):
    """Display validation warnings.

    Renders each string in *issues* (e.g. the list returned by
    ``validate_dataset``) as a bullet inside a single styled
    "warning-container" HTML banner via ``st.markdown``.
    No-op when *issues* is empty or falsy.
    """
    if issues:
        # Concatenate the warnings into one HTML block; unsafe_allow_html
        # is required for the custom <div> styling to take effect.
        st.markdown("""
<div class="warning-container">
<strong>⚠️ Data Quality Warnings:</strong><br>
""" + "<br>".join([f"• {issue}" for issue in issues]) + """
</div>
""", unsafe_allow_html=True)
507
+
508
+ # ---------------------------------------
509
+ # SIDEBAR WITH ENHANCED NAVIGATION
510
+ # ---------------------------------------
511
+
512
+ with st.sidebar:
513
+ st.markdown("### 🧭 Navigation")
514
+
515
+ # Custom radio buttons styling
516
+ page = st.radio(
517
+ "Select Module",
518
+ ["📤 Upload Dataset", "🛠️ Preprocessing", "🔍 EDA",
519
+ "📈 Visualization", "🤖 Machine Learning", "💡 Insights",
520
+ "💬 Chatbot", "📋 Data Quality", "📐 Statistical Analysis"],
521
+ label_visibility="collapsed"
522
+ )
523
+
524
+ st.markdown("---")
525
+
526
+ # Dataset info in sidebar
527
+ if st.session_state.data is not None:
528
+ st.markdown("### 📂 Current Dataset")
529
+ df = st.session_state.data
530
+ col1, col2 = st.columns(2)
531
+ with col1:
532
+ st.metric("Rows", f"{df.shape[0]:,}")
533
+ with col2:
534
+ st.metric("Columns", df.shape[1])
535
+
536
+ # Show data quality indicator
537
+ missing_pct = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
538
+ if missing_pct == 0:
539
+ st.success("✅ Data Quality: Excellent")
540
+ elif missing_pct < 5:
541
+ st.info(f"ℹ️ Data Quality: Good ({missing_pct:.1f}% missing)")
542
+ elif missing_pct < 20:
543
+ st.warning(f"⚠️ Data Quality: Fair ({missing_pct:.1f}% missing)")
544
+ else:
545
+ st.error(f"❌ Data Quality: Poor ({missing_pct:.1f}% missing)")
546
+
547
+ # Quick actions
548
+ st.markdown("### ⚡ Quick Actions")
549
+
550
+ col1, col2 = st.columns(2)
551
+ with col1:
552
+ if st.button("🔄 Reset Data", use_container_width=True):
553
+ st.session_state.data = None
554
+ st.session_state.processed_data = None
555
+ st.session_state.data_loaded = False
556
+ st.rerun()
557
+
558
+ with col2:
559
+ if st.button("📥 Download Sample", use_container_width=True):
560
+ # Create sample data download
561
+ sample_df = df.head(100)
562
+ csv = sample_df.to_csv(index=False)
563
+ st.download_button(
564
+ label="Download Sample",
565
+ data=csv,
566
+ file_name="sample_data.csv",
567
+ mime="text/csv"
568
+ )
569
+
570
+ # Show operation history
571
+ if st.session_state.operation_status:
572
+ with st.expander("📋 Operation History"):
573
+ for op, status in st.session_state.operation_status.items():
574
+ if status == "success":
575
+ st.success(f"✅ {op}")
576
+ elif status == "error":
577
+ st.error(f"❌ {op}")
578
+ else:
579
+ st.info(f"⏳ {op}")
580
+ else:
581
+ st.info("👆 Upload a dataset to get started")
582
+
583
+ # ---------------------------------------
584
+ # MAIN CONTENT AREA
585
+ # ---------------------------------------
586
+
587
# Map page names to functions
# Translates the sidebar radio labels (emoji + title) into the short
# internal keys consumed by the if/elif page dispatch below.
page_map = {
    "📤 Upload Dataset": "upload",
    "🛠️ Preprocessing": "preprocess",
    "🔍 EDA": "eda",
    "📈 Visualization": "visualization",
    "🤖 Machine Learning": "ml",
    "💡 Insights": "insights",
    "💬 Chatbot": "chatbot",
    "📋 Data Quality": "quality",
    "📐 Statistical Analysis": "statistical"
}

# `page` is the label selected in the sidebar radio; resolve it to the
# dispatch key. KeyError here would indicate the radio options and
# page_map have drifted apart.
current_page = page_map[page]
601
+
602
+ # ---------------------------------------
603
+ # UPLOAD DATASET PAGE
604
+ # ---------------------------------------
605
+
606
+ if current_page == "upload":
607
+
608
+ col1, col2, col3 = st.columns([1, 2, 1])
609
+
610
+ with col2:
611
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
612
+ st.markdown("### 📂 Upload Your Dataset")
613
+
614
+ # File uploader with size limit warning
615
+ file = st.file_uploader(
616
+ "Choose a CSV or Excel file",
617
+ type=["csv", "xlsx"],
618
+ help="Maximum recommended file size: 200MB. Larger files may cause performance issues."
619
+ )
620
+
621
+ if file:
622
+ try:
623
+ # Check file size
624
+ file_size = file.size / 1024**2 # MB
625
+ if file_size > 200:
626
+ st.warning(f"⚠️ Large file detected ({file_size:.2f} MB). Processing may be slow.")
627
+
628
+ with st.spinner("📂 Loading file..."):
629
+ # Read file based on extension
630
+ if file.name.endswith("csv"):
631
+ # Try different encodings
632
+ encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
633
+ df = None
634
+
635
+ for encoding in encodings:
636
+ try:
637
+ df = pd.read_csv(file, encoding=encoding)
638
+ break
639
+ except UnicodeDecodeError:
640
+ continue
641
+
642
+ if df is None:
643
+ st.error("❌ Could not read CSV file with any common encoding.")
644
+ st.stop()
645
+
646
+ elif file.name.endswith(("xlsx", "xls")):
647
+ try:
648
+ df = pd.read_excel(file)
649
+ except Exception as e:
650
+ st.error(f"❌ Error reading Excel file: {str(e)}")
651
+ st.info("💡 Try saving the file as CSV and uploading again.")
652
+ st.stop()
653
+
654
+ # Validate dataset
655
+ issues = validate_dataset(df)
656
+ show_validation_warnings(issues)
657
+
658
+ if not issues or all("Large dataset" not in issue for issue in issues):
659
+ # Store in session state
660
+ st.session_state.data = df
661
+ st.session_state.uploaded_file_name = file.name
662
+ st.session_state.data_loaded = True
663
+ st.session_state.upload_error = None
664
+
665
+ # Show success message
666
+ st.markdown("""
667
+ <div class="success-container">
668
+ <strong>✅ Successfully loaded:</strong> {}<br>
669
+ <strong>📊 Shape:</strong> {} rows × {} columns
670
+ </div>
671
+ """.format(file.name, df.shape[0], df.shape[1]), unsafe_allow_html=True)
672
+
673
+ # File statistics
674
+ st.markdown("### 📊 File Statistics")
675
+ col1, col2, col3 = st.columns(3)
676
+ with col1:
677
+ st.metric("Total Rows", f"{df.shape[0]:,}")
678
+ with col2:
679
+ st.metric("Total Columns", df.shape[1])
680
+ with col3:
681
+ memory = df.memory_usage(deep=True).sum() / 1024**2
682
+ st.metric("Memory Usage", f"{memory:.2f} MB")
683
+
684
+ # Data preview with scroll
685
+ st.markdown("### 👁️ Data Preview")
686
+ st.dataframe(
687
+ df.head(10),
688
+ use_container_width=True,
689
+ height=300
690
+ )
691
+
692
+ # Column info with sorting
693
+ st.markdown("### 📋 Column Information")
694
+ col_info = pd.DataFrame({
695
+ 'Column': df.columns,
696
+ 'Type': df.dtypes.astype(str),
697
+ 'Non-Null Count': df.count().values,
698
+ 'Null Count': df.isnull().sum().values,
699
+ 'Null %': (df.isnull().sum().values / len(df) * 100).round(2),
700
+ 'Unique Values': [df[col].nunique() for col in df.columns]
701
+ })
702
+
703
+ # Sort by null count
704
+ col_info = col_info.sort_values('Null %', ascending=False)
705
+
706
+ st.dataframe(
707
+ col_info.style.background_gradient(subset=['Null %'], cmap='YlOrRd'),
708
+ use_container_width=True
709
+ )
710
+
711
+ # Quick stats
712
+ st.markdown("### 📈 Quick Statistics")
713
+
714
+ numeric_cols = df.select_dtypes(include=['number']).columns
715
+ if len(numeric_cols) > 0:
716
+ st.dataframe(
717
+ df[numeric_cols].describe(),
718
+ use_container_width=True
719
+ )
720
+
721
+ # Navigation buttons
722
+ st.markdown("### 🚀 Next Steps")
723
+ col1, col2, col3 = st.columns(3)
724
+
725
+ with col1:
726
+ if st.button("🛠️ Go to Preprocessing", use_container_width=True):
727
+ st.session_state.page = "🛠️ Preprocessing"
728
+ st.rerun()
729
+
730
+ with col2:
731
+ if st.button("📊 Go to EDA", use_container_width=True):
732
+ st.session_state.page = "📊 EDA"
733
+ st.rerun()
734
+
735
+ with col3:
736
+ if st.button("📈 Go to Visualization", use_container_width=True):
737
+ st.session_state.page = "📈 Visualization"
738
+ st.rerun()
739
+
740
+ except pd.errors.EmptyDataError:
741
+ st.error("❌ The uploaded file is empty. Please upload a file with data.")
742
+ except pd.errors.ParserError as e:
743
+ st.error(f"❌ Error parsing file: {str(e)}")
744
+ st.info("💡 Check if your CSV file has consistent delimiters and quoting.")
745
+ except MemoryError:
746
+ st.error("❌ Out of memory! The file is too large to process.")
747
+ st.info("💡 Try uploading a smaller file or sampling your data first.")
748
+ except Exception as e:
749
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "file upload")
750
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
751
+
752
+ # Log error
753
+ st.session_state.error_log.append({
754
+ 'timestamp': datetime.now(),
755
+ 'error': str(e),
756
+ 'traceback': traceback.format_exc()
757
+ })
758
+
759
+ st.markdown('</div>', unsafe_allow_html=True)
760
+
761
+ # Sample data option
762
+ with st.expander("🔄 Or use sample data"):
763
+ st.markdown("Don't have a dataset? Try our sample data:")
764
+
765
+ if st.button("Load Sample Dataset", use_container_width=True):
766
+ try:
767
+ from utils import create_sample_dataset
768
+ sample_df = create_sample_dataset()
769
+ st.session_state.data = sample_df
770
+ st.session_state.uploaded_file_name = "sample_dataset.csv"
771
+ st.session_state.data_loaded = True
772
+ st.success("✅ Sample dataset loaded successfully!")
773
+ st.rerun()
774
+ except Exception as e:
775
+ st.error(f"❌ Error loading sample data: {str(e)}")
776
+
777
+ # ---------------------------------------
778
+ # PREPROCESSING PAGE
779
+ # ---------------------------------------
780
+
781
+ elif current_page == "preprocess":
782
+ try:
783
+ if st.session_state.data is not None:
784
+ df = st.session_state.data
785
+
786
+ # Validate data before preprocessing
787
+ issues = validate_dataset(df)
788
+ if issues:
789
+ show_validation_warnings(issues)
790
+
791
+ # Run preprocessing with error handling
792
+ with st.spinner("🔄 Preprocessing data..."):
793
+ processed_df, error = safe_dataframe_operation(preprocess_data, df)
794
+
795
+ if error:
796
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
797
+ st.session_state.operation_status['Preprocessing'] = 'error'
798
+ else:
799
+ st.session_state.processed_data = processed_df
800
+ st.session_state.operation_status['Preprocessing'] = 'success'
801
+
802
+ # Show success message
803
+ st.markdown("""
804
+ <div class="success-container">
805
+ <strong>✅ Preprocessing completed successfully!</strong><br>
806
+ You can now proceed to analysis or visualization.
807
+ </div>
808
+ """, unsafe_allow_html=True)
809
+ else:
810
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
811
+ except Exception as e:
812
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "preprocessing")
813
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
814
+
815
+ # ---------------------------------------
816
+ # EDA PAGE
817
+ # ---------------------------------------
818
+
819
+ elif current_page == "eda":
820
+ try:
821
+ if st.session_state.data is not None:
822
+ df = st.session_state.data
823
+
824
+ # Validate data
825
+ issues = validate_dataset(df)
826
+ if issues:
827
+ show_validation_warnings(issues)
828
+
829
+ # Run EDA with error handling
830
+ with st.spinner("🔍 Performing Exploratory Data Analysis..."):
831
+ result, error = safe_dataframe_operation(eda_analysis, df)
832
+
833
+ if error:
834
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
835
+ st.session_state.operation_status['EDA'] = 'error'
836
+ else:
837
+ st.session_state.operation_status['EDA'] = 'success'
838
+ else:
839
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
840
+ except Exception as e:
841
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "EDA")
842
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
843
+
844
+ # ---------------------------------------
845
+ # VISUALIZATION PAGE
846
+ # ---------------------------------------
847
+
848
+ elif current_page == "visualization":
849
+ try:
850
+ if st.session_state.data is not None:
851
+ df = st.session_state.data
852
+
853
+ # Validate data
854
+ issues = validate_dataset(df)
855
+ if issues:
856
+ show_validation_warnings(issues)
857
+
858
+ # Run visualization with error handling
859
+ with st.spinner("📊 Generating visualizations..."):
860
+ result, error = safe_dataframe_operation(auto_visualizations, df)
861
+
862
+ if error:
863
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
864
+ st.session_state.operation_status['Visualization'] = 'error'
865
+ else:
866
+ st.session_state.operation_status['Visualization'] = 'success'
867
+ else:
868
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
869
+ except Exception as e:
870
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "visualization")
871
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
872
+
873
+ # ---------------------------------------
874
+ # MACHINE LEARNING PAGE
875
+ # ---------------------------------------
876
+
877
+ elif current_page == "ml":
878
+ try:
879
+ if st.session_state.data is not None:
880
+ data_to_use = st.session_state.processed_data if st.session_state.processed_data is not None else st.session_state.data
881
+
882
+ # Validate data for ML
883
+ if data_to_use.shape[0] < 10:
884
+ st.warning("⚠️ Dataset too small for machine learning (need at least 10 rows)")
885
+ else:
886
+ # Run ML pipeline with error handling
887
+ with st.spinner("🤖 Running machine learning pipeline..."):
888
+ result, error = safe_dataframe_operation(run_ml_pipeline, data_to_use)
889
+
890
+ if error:
891
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
892
+ st.session_state.operation_status['ML'] = 'error'
893
+ else:
894
+ st.session_state.operation_status['ML'] = 'success'
895
+ else:
896
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
897
+ except Exception as e:
898
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "machine learning")
899
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
900
+
901
+ # ---------------------------------------
902
+ # INSIGHTS PAGE
903
+ # ---------------------------------------
904
+
905
+ elif current_page == "insights":
906
+ try:
907
+ if st.session_state.data is not None:
908
+ df = st.session_state.data
909
+
910
+ # Generate insights with error handling
911
+ with st.spinner("💡 Generating business insights..."):
912
+ result, error = safe_dataframe_operation(generate_business_insights, df)
913
+
914
+ if error:
915
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
916
+ st.session_state.operation_status['Insights'] = 'error'
917
+ else:
918
+ st.session_state.operation_status['Insights'] = 'success'
919
+ else:
920
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
921
+ except Exception as e:
922
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "insights generation")
923
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
924
+
925
+ # ---------------------------------------
926
+ # CHATBOT PAGE
927
+ # ---------------------------------------
928
+
929
+ elif current_page == "chatbot":
930
+ try:
931
+ if st.session_state.data is not None:
932
+ df = st.session_state.data
933
+
934
+ # Run chatbot with error handling
935
+ with st.spinner("🤖 Initializing chatbot..."):
936
+ result, error = safe_dataframe_operation(data_chatbot, df)
937
+
938
+ if error:
939
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
940
+ st.session_state.operation_status['Chatbot'] = 'error'
941
+ else:
942
+ st.session_state.operation_status['Chatbot'] = 'success'
943
+ else:
944
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
945
+ except Exception as e:
946
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "chatbot")
947
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
948
+
949
+ # ---------------------------------------
950
+ # DATA QUALITY PAGE
951
+ # ---------------------------------------
952
+
953
+ elif current_page == "quality":
954
+ try:
955
+ if st.session_state.data is not None:
956
+ df = st.session_state.data
957
+
958
+ # Run quality report with error handling
959
+ with st.spinner("📋 Generating quality report..."):
960
+ from data_quality import quality_report
961
+ result, error = safe_dataframe_operation(quality_report, df)
962
+
963
+ if error:
964
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
965
+ st.session_state.operation_status['Data Quality'] = 'error'
966
+ else:
967
+ st.session_state.operation_status['Data Quality'] = 'success'
968
+ else:
969
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
970
+ except Exception as e:
971
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "data quality")
972
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
973
+
974
+ # ---------------------------------------
975
+ # STATISTICAL ANALYSIS PAGE
976
+ # ---------------------------------------
977
+
978
+ elif current_page == "statistical":
979
+ try:
980
+ if st.session_state.data is not None:
981
+ df = st.session_state.data
982
+
983
+ # Validate numeric data
984
+ numeric_cols = df.select_dtypes(include=['number']).columns
985
+ if len(numeric_cols) == 0:
986
+ st.warning("⚠️ No numeric columns found. Statistical analysis requires numeric data.")
987
+ else:
988
+ # Run statistical analysis with error handling
989
+ with st.spinner("📐 Performing statistical analysis..."):
990
+ from statistical_analysis import statistical_analysis
991
+ result, error = safe_dataframe_operation(statistical_analysis, df)
992
+
993
+ if error:
994
+ st.markdown(f'<div class="error-container">{error}</div>', unsafe_allow_html=True)
995
+ st.session_state.operation_status['Statistical Analysis'] = 'error'
996
+ else:
997
+ st.session_state.operation_status['Statistical Analysis'] = 'success'
998
+ else:
999
+ st.warning("⚠️ Please upload a dataset first in the Upload section")
1000
+ except Exception as e:
1001
+ error_msg = StreamlitExceptionHandler.handle_exception(e, "statistical analysis")
1002
+ st.markdown(f'<div class="error-container">{error_msg}</div>', unsafe_allow_html=True)
1003
+
1004
+ # ---------------------------------------
1005
+ # ERROR LOG DISPLAY (Hidden by default)
1006
+ # ---------------------------------------
1007
+
1008
+ if st.session_state.error_log and st.checkbox("🔧 Show Error Log (Debug Mode)"):
1009
+ st.markdown("### 📋 Error Log")
1010
+ for i, error_entry in enumerate(st.session_state.error_log[-5:]): # Show last 5 errors
1011
+ with st.expander(f"Error {i+1}: {error_entry['timestamp'].strftime('%Y-%m-%d %H:%M:%S')}"):
1012
+ st.code(error_entry['error'])
1013
+ st.code(error_entry['traceback'])
1014
+
1015
+ # ---------------------------------------
1016
+ # FOOTER
1017
+ # ---------------------------------------
1018
+
1019
+ st.markdown("---")
1020
+ st.markdown(
1021
+ "<p style='text-align: center; color: gray;'>Made with ❤️ using Streamlit | Version 2.0 | Enhanced Error Handling</p>",
1022
+ unsafe_allow_html=True
1023
+ )
chatbot.py ADDED
@@ -0,0 +1,1051 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
+ import re
8
+ from datetime import datetime, timedelta
9
+
10
def data_chatbot(df):
    """
    Advanced chatbot that provides data access and visualizations based on user questions.

    Renders a two-column Streamlit layout: the left column holds the chat history
    and the input box, the right column shows the most recently generated
    visualization and/or data table. Conversation state is kept in
    ``st.session_state`` under the keys ``chat_messages``, ``last_viz`` and
    ``last_data`` so it survives Streamlit reruns.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset the assistant answers questions about.
    """

    # Inline CSS for the header banner, chat bubbles and result cards.
    st.markdown("""
    <style>
    .chat-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 25px;
        border-radius: 15px;
        color: white;
        text-align: center;
        margin-bottom: 25px;
        box-shadow: 0 10px 30px rgba(102, 126, 234, 0.3);
    }
    .chat-header h2 {
        font-size: 2.2rem;
        margin-bottom: 10px;
    }
    .chat-header p {
        font-size: 1.1rem;
        opacity: 0.95;
    }
    .user-message {
        background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
        padding: 15px 20px;
        border-radius: 20px 20px 5px 20px;
        margin: 10px 0;
        max-width: 80%;
        margin-left: auto;
        border-left: 4px solid #1976d2;
        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    }
    .bot-message {
        background: white;
        padding: 15px 20px;
        border-radius: 20px 20px 20px 5px;
        margin: 10px 0;
        max-width: 80%;
        border-left: 4px solid #4caf50;
        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    }
    .metric-card {
        background: white;
        padding: 15px;
        border-radius: 10px;
        text-align: center;
        box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        border-left: 4px solid #667eea;
    }
    .viz-container {
        background: white;
        padding: 20px;
        border-radius: 15px;
        margin: 20px 0;
        box-shadow: 0 5px 20px rgba(0,0,0,0.1);
    }
    .insight-badge {
        background: #4caf50;
        color: white;
        padding: 5px 10px;
        border-radius: 15px;
        font-size: 12px;
        display: inline-block;
        margin-right: 5px;
    }
    </style>

    <div class="chat-header">
        <h2>🤖 Smart Data Assistant</h2>
        <p>Ask questions and get instant visualizations - I'll show you the data!</p>
    </div>
    """, unsafe_allow_html=True)

    # Initialize session state so chat history and the last results persist
    # across Streamlit reruns.
    if "chat_messages" not in st.session_state:
        st.session_state.chat_messages = []

    if "last_viz" not in st.session_state:
        st.session_state.last_viz = None

    if "last_data" not in st.session_state:
        st.session_state.last_data = None

    # Main layout: chat on the left, generated results on the right.
    main_col, viz_col = st.columns([1, 1])

    with main_col:
        # Chat history
        chat_container = st.container()

        with chat_container:
            # Show usage hints only while the conversation is empty.
            if not st.session_state.chat_messages:
                st.info("""
                👋 **Hi! I can show you data and create visualizations. Try asking:**

                **📊 Show Data:**
                • "Show me the first 10 rows"
                • "Show me data where age > 30"
                • "Display top 5 by sales"

                **📈 Create Visualizations:**
                • "Show me a bar chart of category"
                • "Plot histogram of age"
                • "Create scatter plot of price vs quantity"
                • "Show trend of sales over time"

                **🔍 Analyze:**
                • "What's the average of salary?"
                • "Show statistics for all columns"
                • "Find outliers in price"
                """)

            # Render the transcript as styled HTML bubbles.
            for msg in st.session_state.chat_messages:
                if msg["role"] == "user":
                    st.markdown(f'<div class="user-message"><b>👤 You:</b> {msg["content"]}</div>', unsafe_allow_html=True)
                else:
                    st.markdown(f'<div class="bot-message">{msg["content"]}</div>', unsafe_allow_html=True)

        # Input area
        st.markdown("<br>", unsafe_allow_html=True)
        input_col1, input_col2 = st.columns([5, 1])

        with input_col1:
            user_query = st.text_input("", placeholder="💬 Ask a question or request a visualization...",
                                       key="chat_input", label_visibility="collapsed")

        with input_col2:
            send_button = st.button("📤 Ask", use_container_width=True)

        if send_button and user_query:
            # Add user message
            st.session_state.chat_messages.append({"role": "user", "content": user_query})

            # Process query and get response with data/viz
            with st.spinner("🔍 Processing your request..."):
                response, viz_data, table_data = process_query_with_viz(user_query, df)

            # Add bot response
            st.session_state.chat_messages.append({"role": "bot", "content": response})

            # Keep the newest figure/table so the right column can render them
            # after the rerun. ``table_data`` is compared against None because
            # an empty DataFrame is falsy but still worth displaying.
            if viz_data:
                st.session_state.last_viz = viz_data
            if table_data is not None:
                st.session_state.last_data = table_data

            st.rerun()

    with viz_col:
        # Display the most recent visualization and data table, if any.
        if st.session_state.last_viz:
            st.markdown('<div class="viz-container">', unsafe_allow_html=True)
            st.markdown("### 📊 Generated Visualization")
            display_visualization(st.session_state.last_viz)
            st.markdown('</div>', unsafe_allow_html=True)

        if st.session_state.last_data is not None:
            st.markdown('<div class="viz-container">', unsafe_allow_html=True)
            st.markdown("### 📋 Data Result")
            st.dataframe(st.session_state.last_data, use_container_width=True, height=300)
            st.markdown('</div>', unsafe_allow_html=True)

    # Quick action buttons: one-click canned queries routed through the same
    # pipeline as typed questions.
    st.markdown("---")
    st.markdown("### 🔍 Quick Actions")

    col1, col2, col3, col4, col5 = st.columns(5)

    actions = [
        ("📊 First 10 Rows", "Show me first 10 rows", col1),
        ("📈 Bar Chart", "Show bar chart of first categorical column", col2),
        ("📉 Histogram", "Plot histogram of first numeric column", col3),
        ("🔎 Filter", "Show rows where value > average", col4),
        ("📋 Statistics", "Show me statistics", col5)
    ]

    for label, query, col in actions:
        if col.button(label, use_container_width=True):
            st.session_state.chat_messages.append({"role": "user", "content": query})
            response, viz_data, table_data = process_query_with_viz(query, df)
            st.session_state.chat_messages.append({"role": "bot", "content": response})
            if viz_data:
                st.session_state.last_viz = viz_data
            if table_data is not None:
                st.session_state.last_data = table_data
            st.rerun()

    # Clear button resets the full conversation and cached results.
    col1, col2, col3 = st.columns([1, 1, 1])
    with col2:
        if st.button("🗑️ Clear Chat & Visualizations", use_container_width=True):
            st.session_state.chat_messages = []
            st.session_state.last_viz = None
            st.session_state.last_data = None
            st.rerun()
207
+
208
+
209
def process_query_with_viz(query, df):
    """Route a natural-language query to the matching handler.

    Returns a ``(response, figure, table)`` triple where ``response`` is
    markdown text, ``figure`` is a Plotly figure or None, and ``table`` is a
    DataFrame or None.

    The keyword checks below run in a fixed priority order: the FIRST branch
    whose keyword appears in the query wins, so e.g. "show top 5 bar chart"
    is routed to show_first_rows (branch 1), not the bar chart branch.
    """
    query_lower = query.lower().strip()

    # Pre-compute column groups used by the individual handlers.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
    all_cols = df.columns.tolist()

    # First integer in the query is used as a row count, default 10.
    numbers = re.findall(r'\d+', query_lower)
    n = int(numbers[0]) if numbers else 10

    # 1. SHOW DATA - First/Last/Random rows
    if any(word in query_lower for word in ['first', 'head', 'top']):
        return show_first_rows(df, n)

    elif any(word in query_lower for word in ['last', 'tail', 'bottom']):
        return show_last_rows(df, n)

    elif 'random' in query_lower or 'sample' in query_lower:
        return show_random_rows(df, n)

    # 2. FILTER DATA
    # NOTE(review): 'find' and 'with' are broad triggers — "find outliers in
    # price" is routed here, never reaching branch 15. Confirm intended.
    elif any(word in query_lower for word in ['find', 'where', 'filter', 'search', 'with']):
        return filter_data(query_lower, df)

    # 3. SORT DATA
    elif 'sort' in query_lower or 'order by' in query_lower:
        return sort_data(query_lower, df)

    # 4. BAR CHART
    elif any(word in query_lower for word in ['bar chart', 'bar plot', 'bar graph', 'count plot']):
        return create_bar_chart(query_lower, df, categorical_cols)

    # 5. HISTOGRAM
    elif any(word in query_lower for word in ['histogram', 'distribution', 'hist', 'frequency']):
        return create_histogram(query_lower, df, numeric_cols)

    # 6. SCATTER PLOT
    elif any(word in query_lower for word in ['scatter', 'scatter plot', 'scatterplot', 'relationship']):
        return create_scatter_plot(query_lower, df, numeric_cols)

    # 7. LINE CHART / TREND
    elif any(word in query_lower for word in ['line chart', 'line plot', 'trend', 'over time']):
        return create_line_chart(query_lower, df, numeric_cols, datetime_cols)

    # 8. BOX PLOT
    # NOTE(review): 'outliers' here shadows branch 15 — a query containing
    # "outliers" (without 'find') produces a box plot, not the outlier table.
    elif any(word in query_lower for word in ['box plot', 'boxplot', 'box', 'outliers']):
        return create_box_plot(query_lower, df, numeric_cols, categorical_cols)

    # 9. PIE CHART
    elif any(word in query_lower for word in ['pie chart', 'pie', 'proportion', 'percentage']):
        return create_pie_chart(query_lower, df, categorical_cols)

    # 10. HEATMAP / CORRELATION
    elif any(word in query_lower for word in ['heatmap', 'correlation', 'corr', 'heat map']):
        return create_heatmap(df, numeric_cols)

    # 11. VIOLIN PLOT
    elif 'violin' in query_lower:
        return create_violin_plot(query_lower, df, numeric_cols, categorical_cols)

    # 12. STATISTICS
    elif any(word in query_lower for word in ['statistics', 'stats', 'describe', 'summary']):
        return show_statistics(query_lower, df, numeric_cols, all_cols)

    # 13. COLUMN INFORMATION
    elif any(word in query_lower for word in ['column info', 'column details', 'info about']):
        return show_column_info(query_lower, df, all_cols)

    # 14. MISSING VALUES
    elif any(word in query_lower for word in ['missing', 'null', 'na', 'empty']):
        return show_missing_values(df)

    # 15. OUTLIERS (only reachable when branches 2 and 8 did not match)
    elif 'outlier' in query_lower:
        return detect_outliers(query_lower, df, numeric_cols)

    # 16. UNIQUE VALUES
    elif any(word in query_lower for word in ['unique', 'distinct', 'categories']):
        return show_unique_values(query_lower, df, all_cols, categorical_cols)

    # 17. COMPARE COLUMNS
    elif 'compare' in query_lower:
        return compare_columns(query_lower, df, numeric_cols, categorical_cols)

    # 18. HELP
    elif any(word in query_lower for word in ['help', 'what can you do', 'capabilities']):
        return show_help(), None, None

    # 19. DEFAULT - Try to understand if asking about a specific column
    else:
        return handle_general_query(query_lower, df, numeric_cols, categorical_cols, all_cols)
304
+
305
+
306
def show_first_rows(df, n=10):
    """Return a (message, figure, table) triple with the first *n* rows of *df*."""
    preview = df.head(n)
    message = f"### 👁️ First {n} Rows\n\nHere's the data you requested:"
    return message, None, preview
311
+
312
+
313
def show_last_rows(df, n=10):
    """Return a (message, figure, table) triple with the last *n* rows of *df*."""
    preview = df.tail(n)
    message = f"### 👁️ Last {n} Rows\n\nHere's the data you requested:"
    return message, None, preview
318
+
319
+
320
def show_random_rows(df, n=5):
    """Return a (message, figure, table) triple with a random sample of *n* rows.

    The sample size is clamped to the number of rows so short frames never raise.
    """
    sample_size = min(n, len(df))
    sampled = df.sample(sample_size)
    message = f"### 🎲 Random Sample of {n} Rows\n\nHere's a random sample from your data:"
    return message, None, sampled
325
+
326
+
327
def filter_data(query, df):
    """Filter rows of *df* from a natural-language condition in *query*.

    Supported forms: numeric comparisons ("col > 30", "<", ">=", "<=", "="),
    substring match ("col contains value"), and equality ("col is value").
    The first pattern that resolves to a real column name wins.

    Returns a ``(message, figure, table)`` triple; ``figure`` is always None
    and ``table`` holds at most the first 20 matching rows (or None when
    nothing matched / the condition could not be parsed).
    """
    # Pattern order is safe: the '>' / '<' regexes require a digit directly
    # after the operator, so they cannot accidentally match '>=' / '<='.
    patterns = [
        (r'(\w+)\s*>\s*(\d+\.?\d*)', '>'),
        (r'(\w+)\s*<\s*(\d+\.?\d*)', '<'),
        (r'(\w+)\s*>=\s*(\d+\.?\d*)', '>='),
        (r'(\w+)\s*<=\s*(\d+\.?\d*)', '<='),
        (r'(\w+)\s*=\s*(\d+\.?\d*)', '=='),
        (r'(\w+)\s*==\s*(\d+\.?\d*)', '=='),
        (r'(\w+)\s*contains\s*["\']?([^"\']+)["\']?', 'contains'),
        (r'(\w+)\s*is\s*["\']?([^"\']+)["\']?', '=='),
    ]

    for pattern, op in patterns:
        match = re.search(pattern, query.lower())
        if not match:
            continue
        col_token, raw_val = match.group(1), match.group(2)

        # Resolve the token to a real column (case-insensitive exact match).
        for c in df.columns:
            if c.lower() != col_token:
                continue
            try:
                if op in ('>', '<', '>=', '<='):
                    val = float(raw_val)
                    masks = {
                        '>': df[c] > val,
                        '<': df[c] < val,
                        '>=': df[c] >= val,
                        '<=': df[c] <= val,
                    }
                    filtered = df[masks[op]]
                    condition = f"{c} {op} {val}"
                elif op == 'contains':
                    filtered = df[df[c].astype(str).str.contains(raw_val, case=False, na=False)]
                    condition = f"{c} contains '{raw_val}'"
                else:
                    # Equality: compare numerically for numeric dtypes,
                    # case-insensitively as strings otherwise.
                    if df[c].dtype in ['int64', 'float64']:
                        filtered = df[df[c] == float(raw_val)]
                    else:
                        filtered = df[df[c].astype(str).str.lower() == raw_val.lower()]
                    condition = f"{c} = {raw_val}"
            except (ValueError, TypeError):
                # The value does not fit the column's type; try the next
                # column/pattern. (Was a bare ``except: pass`` that silently
                # swallowed every error, including genuine bugs.)
                continue

            if len(filtered) > 0:
                response = f"### 🔍 Found {len(filtered)} rows where {condition}\n\nShowing first 20 results:"
                return response, None, filtered.head(20)
            return f"❌ No rows found where {condition}", None, None

    return "❌ I couldn't understand the filter condition. Try something like: 'show rows where age > 30'", None, None
384
+
385
+
386
def sort_data(query, df):
    """Sort *df* by the first column mentioned in *query*.

    Falls back to the first dataframe column when no name is mentioned.
    Words like "desc", "highest" or "largest" select descending order; the
    first integer in the query caps the number of rows shown (default 20).
    Returns a ``(message, figure, table)`` triple with ``figure`` always None.
    """
    sort_col = next(
        (c for c in df.columns if c.lower() in query),
        df.columns[0] if len(df.columns) > 0 else None,
    )
    if not sort_col:
        return "❌ Please specify a column to sort by", None, None

    descending = any(word in query for word in ('desc', 'highest', 'largest'))
    order = "descending" if descending else "ascending"

    matches = re.findall(r'\d+', query)
    limit = int(matches[0]) if matches else 20

    result = df.sort_values(sort_col, ascending=not descending).head(limit)

    response = f"### 📊 Sorted by {sort_col} ({order})\n\nShowing top {limit} results:"
    return response, None, result
415
+
416
+
417
def create_bar_chart(query, df, categorical_cols):
    """Build a bar chart of value counts for the categorical column named in *query*.

    Falls back to the first categorical column when none is mentioned; only
    the 20 most frequent values are plotted. Returns (message, figure, table).
    """
    col = next((c for c in categorical_cols if c.lower() in query), None)
    if col is None and categorical_cols:
        col = categorical_cols[0]
    if col is None:
        return "❌ No categorical column found for bar chart", None, None

    counts = df[col].value_counts().head(20)

    fig = px.bar(
        x=counts.index,
        y=counts.values,
        title=f"Bar Chart of {col} (Top 20)",
        labels={'x': col, 'y': 'Count'},
        color_discrete_sequence=['#667eea']
    )
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(color='#2c3e50'),
        xaxis_tickangle=-45,
        height=500
    )

    response = f"### 📊 Bar Chart of '{col}'\n\nHere's the distribution of values:"
    return response, fig, None
452
+
453
+
454
def create_histogram(query, df, numeric_cols):
    """Create a histogram (with marginal box plot) for a numeric column.

    The column is the first numeric column whose name appears in *query*,
    falling back to the first numeric column overall. Returns a
    (message, figure, table) triple with ``table`` always None.
    """
    # Find the column the user asked about.
    col = None
    for c in numeric_cols:
        if c.lower() in query:
            col = c
            break

    # Fall back to the first numeric column when none was named.
    if not col and numeric_cols:
        col = numeric_cols[0]

    if col:
        fig = px.histogram(
            df,
            x=col,
            nbins=30,
            title=f"Histogram of {col}",
            marginal="box",
            color_discrete_sequence=['#667eea']
        )

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        # Summary statistics shown in the chat response alongside the figure.
        data = df[col].dropna()
        stats = f"Mean: {data.mean():.2f} | Median: {data.median():.2f} | Std: {data.std():.2f}"

        response = f"### 📊 Histogram of '{col}'\n\n{stats}"
        return response, fig, None

    return "❌ No numeric column found for histogram", None, None
491
+
492
+
493
def create_scatter_plot(query, df, numeric_cols):
    """Create a scatter plot between two numeric columns named in *query*.

    When fewer than two columns are mentioned, the first two numeric columns
    are used. Returns a (message, figure, table) triple; the message includes
    the Pearson correlation of the two columns.
    """
    # Collect every numeric column whose name appears in the query,
    # in numeric_cols order (first two mentioned become x and y).
    cols = []
    for col in numeric_cols:
        if col.lower() in query:
            cols.append(col)

    if len(cols) >= 2:
        x_col, y_col = cols[0], cols[1]
    elif len(numeric_cols) >= 2:
        x_col, y_col = numeric_cols[0], numeric_cols[1]
    else:
        return "❌ Need at least 2 numeric columns for scatter plot", None, None

    # NOTE(review): trendline="ols" requires the statsmodels package at
    # runtime — confirm it is available in the deployment environment.
    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=f"Scatter Plot: {y_col} vs {x_col}",
        trendline="ols",
        opacity=0.6,
        color_discrete_sequence=['#667eea']
    )

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(color='#2c3e50'),
        height=500
    )

    # Pearson correlation reported in the chat message.
    corr = df[x_col].corr(df[y_col])

    response = f"### 📊 Scatter Plot: {y_col} vs {x_col}\n\nCorrelation: {corr:.4f}"
    return response, fig, None
530
+
531
+
532
def create_line_chart(query, df, numeric_cols, datetime_cols):
    """Create a line chart of a numeric column over a datetime column.

    Both columns are resolved from *query* by name, falling back to the first
    datetime / first numeric column. Requires at least one of each; otherwise
    an error message is returned. Returns a (message, figure, table) triple.
    """
    # Resolve the date (x) column.
    date_col = None
    for col in datetime_cols:
        if col.lower() in query:
            date_col = col
            break

    if not date_col and datetime_cols:
        date_col = datetime_cols[0]

    # Resolve the value (y) column.
    val_col = None
    for col in numeric_cols:
        if col.lower() in query:
            val_col = col
            break

    if not val_col and numeric_cols:
        val_col = numeric_cols[0]

    if date_col and val_col:
        # Drop incomplete rows and sort chronologically so the line is drawn
        # left-to-right in time order.
        plot_df = df[[date_col, val_col]].dropna().sort_values(date_col)

        fig = px.line(
            plot_df,
            x=date_col,
            y=val_col,
            title=f"Trend of {val_col} over Time",
            color_discrete_sequence=['#667eea']
        )

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        response = f"### 📈 Line Chart: {val_col} over Time"
        return response, fig, None

    return "❌ Need a datetime column and numeric column for line chart", None, None
577
+
578
+
579
def create_box_plot(query, df, numeric_cols, categorical_cols):
    """Create a box plot of a numeric column, optionally grouped by a category.

    The numeric column is resolved from *query* (fallback: first numeric
    column). If a categorical column is also named, it becomes the x-axis
    grouping; otherwise a single ungrouped box is drawn.
    Returns a (message, figure, table) triple.
    """
    # Resolve the numeric (y) column.
    num_col = None
    for col in numeric_cols:
        if col.lower() in query:
            num_col = col
            break

    if not num_col and numeric_cols:
        num_col = numeric_cols[0]

    # Optional categorical grouping column — no fallback: grouping is only
    # applied when explicitly named in the query.
    cat_col = None
    for col in categorical_cols:
        if col.lower() in query:
            cat_col = col
            break

    if num_col:
        if cat_col:
            fig = px.box(
                df,
                x=cat_col,
                y=num_col,
                title=f"Box Plot of {num_col} by {cat_col}",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 📊 Box Plot: {num_col} grouped by {cat_col}"
        else:
            fig = px.box(
                df,
                y=num_col,
                title=f"Box Plot of {num_col}",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 📊 Box Plot of {num_col}"

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        return response, fig, None

    return "❌ No numeric column found for box plot", None, None
627
+
628
+
629
def create_pie_chart(query, df, categorical_cols):
    """Build a donut-style pie chart for the categorical column named in *query*.

    Falls back to the first categorical column when none is mentioned; only
    the ten most frequent categories are plotted. Returns (message, figure, table).
    """
    col = next((c for c in categorical_cols if c.lower() in query), None)
    if col is None and categorical_cols:
        col = categorical_cols[0]
    if col is None:
        return "❌ No categorical column found for pie chart", None, None

    counts = df[col].value_counts().head(10)

    fig = px.pie(
        values=counts.values,
        names=counts.index,
        title=f"Pie Chart of {col} (Top 10)",
        hole=0.3,
        color_discrete_sequence=px.colors.qualitative.Set3
    )
    fig.update_layout(height=500, showlegend=True)

    response = f"### 🥧 Pie Chart of '{col}'\n\nProportion of values:"
    return response, fig, None
661
+
662
+
663
def create_heatmap(df, numeric_cols):
    """Render a correlation heatmap over all numeric columns of *df*.

    Requires at least two numeric columns; the color scale is fixed to
    [-1, 1] so correlations are comparable across datasets.
    Returns a (message, figure, table) triple with ``table`` always None.
    """
    if len(numeric_cols) < 2:
        return "❌ Need at least 2 numeric columns for correlation heatmap", None, None

    correlations = df[numeric_cols].corr()

    fig = px.imshow(
        correlations,
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title="Correlation Heatmap",
        zmin=-1, zmax=1
    )
    fig.update_layout(
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    message = "### 🔥 Correlation Heatmap\n\nStrong correlations are shown in dark red/blue:"
    return message, fig, None
687
+
688
+
689
def create_violin_plot(query, df, numeric_cols, categorical_cols):
    """Create a violin plot (with inner box and all points) of a numeric column.

    The numeric column is resolved from *query* (fallback: first numeric
    column). A categorical column named in the query becomes the x-axis
    grouping; otherwise a single violin is drawn.
    Returns a (message, figure, table) triple.
    """
    # Resolve the numeric (y) column.
    num_col = None
    for col in numeric_cols:
        if col.lower() in query:
            num_col = col
            break

    if not num_col and numeric_cols:
        num_col = numeric_cols[0]

    # Optional categorical grouping — applied only when explicitly named.
    cat_col = None
    for col in categorical_cols:
        if col.lower() in query:
            cat_col = col
            break

    if num_col:
        if cat_col:
            fig = px.violin(
                df,
                x=cat_col,
                y=num_col,
                title=f"Violin Plot of {num_col} by {cat_col}",
                box=True,
                points="all",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 🎻 Violin Plot: {num_col} grouped by {cat_col}"
        else:
            fig = px.violin(
                df,
                y=num_col,
                title=f"Violin Plot of {num_col}",
                box=True,
                points="all",
                color_discrete_sequence=['#667eea']
            )
            response = f"### 🎻 Violin Plot of {num_col}"

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(color='#2c3e50'),
            height=500
        )

        return response, fig, None

    return "❌ No numeric column found for violin plot", None, None
741
+
742
+
743
def show_statistics(query, df, numeric_cols, all_cols):
    """Return summary statistics as a (message, figure, table) triple.

    If a numeric column is named in *query*, a 10-row detail table (count,
    mean, spread, quartiles, skew, kurtosis) for that column is returned.
    Otherwise ``describe()`` over all numeric columns, extended with skew
    and kurtosis, is returned.
    """
    # Per-column detail when a numeric column is named in the query.
    for col in all_cols:
        if col.lower() in query and col in numeric_cols:
            values = df[col].dropna()
            labels = ['Count', 'Mean', 'Std Dev', 'Min', '25%', '50%', '75%', 'Max', 'Skewness', 'Kurtosis']
            figures = [
                len(values),
                f"{values.mean():.4f}",
                f"{values.std():.4f}",
                f"{values.min():.4f}",
                f"{values.quantile(0.25):.4f}",
                f"{values.median():.4f}",
                f"{values.quantile(0.75):.4f}",
                f"{values.max():.4f}",
                f"{values.skew():.4f}",
                f"{values.kurtosis():.4f}",
            ]
            stats_data = pd.DataFrame({'Statistic': labels, 'Value': figures})
            return f"### 📊 Statistics for '{col}'", None, stats_data

    # Otherwise: one summary row per numeric column.
    if numeric_cols:
        summary = df[numeric_cols].describe().T
        summary['skew'] = df[numeric_cols].skew()
        summary['kurtosis'] = df[numeric_cols].kurtosis()
        return "### 📈 Summary Statistics for Numeric Columns", None, summary

    return "❌ No numeric columns found for statistics", None, None
779
+
780
+
781
def show_column_info(query, df, all_cols):
    """Describe one column named in *query*, or all columns when none matches.

    Returns a (message, figure, table) triple: a 5-row property table for a
    single column, or a per-column overview (dtype, unique count, missing
    count and percentage) for the whole frame.
    """
    for col in all_cols:
        if col.lower() in query:
            properties = ['Data Type', 'Unique Values', 'Missing Values', 'Missing %', 'Sample Values']
            values = [
                str(df[col].dtype),
                df[col].nunique(),
                df[col].isnull().sum(),
                f"{(df[col].isnull().sum()/len(df)*100):.2f}%",
                str(df[col].dropna().iloc[:3].tolist()),
            ]
            detail = pd.DataFrame({'Property': properties, 'Value': values})
            return f"### 📋 Column Information: '{col}'", None, detail

    overview = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes.astype(str),
        'Unique Values': [df[c].nunique() for c in df.columns],
        'Missing Values': df.isnull().sum().values,
        'Missing %': (df.isnull().sum().values / len(df) * 100).round(2),
    })
    return "### 📋 All Columns Information", None, overview
811
+
812
+
813
def show_missing_values(df):
    """Summarize missing values per column.

    Returns a (message, figure, table) triple: the table lists only columns
    that actually contain nulls, sorted by missing percentage descending, or
    is None when the dataset is complete.
    """
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]

    if null_counts.empty:
        return "✅ **Good news!** No missing values found in the dataset.", None, None

    table = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df) * 100).round(2),
    }).sort_values('Missing %', ascending=False)

    total_missing = null_counts.sum()
    total_cells = df.shape[0] * df.shape[1]

    response = f"### 🔍 Missing Values Analysis\n\n**Total Missing:** {total_missing} out of {total_cells} cells ({total_missing/total_cells*100:.2f}%)"
    return response, None, table
832
+
833
+
834
def detect_outliers(query, df, numeric_cols):
    """Detect outliers in numeric columns using the 1.5*IQR rule.

    Columns explicitly named in *query* are analyzed; otherwise the first
    three numeric columns. Columns with no non-null values are skipped —
    previously they caused a ZeroDivisionError when computing the outlier
    percentage.

    Returns a (message, figure, table) triple; the table has one row per
    analyzed column with count, percentage, normal range and a severity flag.
    """
    target_cols = [col for col in numeric_cols if col.lower() in query]
    if not target_cols:
        target_cols = numeric_cols[:3]  # default: check first 3 numeric columns

    rows = []
    for col in target_cols:
        data = df[col].dropna()
        if data.empty:
            # No quartiles for an all-null column; skip instead of dividing by zero.
            continue

        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        outliers = data[(data < lower) | (data > upper)]
        pct = len(outliers) / len(data) * 100

        rows.append({
            'Column': col,
            'Outliers Count': len(outliers),
            'Outliers %': f"{pct:.2f}%",
            'Normal Range': f"[{lower:.4f}, {upper:.4f}]",
            'Severity': 'High' if pct > 10 else 'Medium' if pct > 5 else 'Low',
        })

    outlier_df = pd.DataFrame(rows)

    return "### ⚠️ Outlier Detection Results", None, outlier_df
866
+
867
+
868
def show_unique_values(query, df, all_cols, categorical_cols):
    """List unique values for a column named in *query*, or summarize all
    categorical columns when no column name matches.

    Returns a (message, figure, table) triple; the per-column table shows up
    to 20 values with counts and percentages.
    """
    for col in all_cols:
        if col.lower() in query:
            counts = df[col].value_counts().reset_index()
            counts.columns = [col, 'Count']
            counts['Percentage'] = (counts['Count'] / len(df) * 100).round(2)
            header = f"### 🎯 Unique Values in '{col}'\n\n**Total Unique:** {df[col].nunique()}"
            return header, None, counts.head(20)

    # No specific column mentioned: one summary row per categorical column
    # (capped at 10 columns).
    if categorical_cols:
        summary_rows = []
        for col in categorical_cols[:10]:
            freq = df[col].value_counts()
            summary_rows.append({
                'Column': col,
                'Unique Values': df[col].nunique(),
                'Most Common': freq.index[0] if len(freq) > 0 else 'N/A',
                'Most Common Count': freq.values[0] if len(freq) > 0 else 0,
            })
        return "### 🎯 Unique Values in Categorical Columns", None, pd.DataFrame(summary_rows)

    return "❌ No categorical columns found", None, None
896
+
897
+
898
def compare_columns(query, df, numeric_cols, categorical_cols):
    """Compare two columns named in *query*.

    Behavior by column types:
    - numeric vs numeric  -> side-by-side summary statistics table
    - categorical vs categorical -> cross-tabulation
    - categorical vs numeric -> per-category summary of the numeric column
      (previously this mixed case fell through to the misleading
      "Please specify two columns" error even though both were named)

    Returns a (message, figure, table) triple with ``figure`` always None.
    """
    # Columns are matched by case-insensitive substring, in df.columns order.
    mentioned = [col for col in df.columns if col.lower() in query]
    if len(mentioned) < 2:
        return "❌ Please specify two columns to compare", None, None

    col1, col2 = mentioned[0], mentioned[1]

    if col1 in numeric_cols and col2 in numeric_cols:
        comparison = pd.DataFrame({
            'Metric': ['Mean', 'Median', 'Std Dev', 'Min', 'Max'],
            col1: [
                df[col1].mean(),
                df[col1].median(),
                df[col1].std(),
                df[col1].min(),
                df[col1].max(),
            ],
            col2: [
                df[col2].mean(),
                df[col2].median(),
                df[col2].std(),
                df[col2].min(),
                df[col2].max(),
            ],
        })
        return f"### 🔄 Comparison: {col1} vs {col2}", None, comparison

    if col1 in categorical_cols and col2 in categorical_cols:
        cross_tab = pd.crosstab(df[col1], df[col2])
        return f"### 🔄 Cross-tabulation: {col1} vs {col2}", None, cross_tab

    # Mixed pair: summarize the numeric column within each category.
    cat_col, num_col = (col1, col2) if col1 in categorical_cols else (col2, col1)
    if cat_col in categorical_cols and num_col in numeric_cols:
        grouped = df.groupby(cat_col)[num_col].agg(
            ['count', 'mean', 'median', 'std', 'min', 'max']
        ).reset_index()
        return f"### 🔄 {num_col} by {cat_col}", None, grouped

    return "❌ Please specify two columns to compare", None, None
939
+
940
+
941
def show_help():
    """Return the static markdown text describing the assistant's capabilities."""
    return """
    ### 🤖 I Can Help You With:

    **📊 Show Data:**
    • "Show me first 10 rows"
    • "Show me last 5 rows"
    • "Show random sample of 10 rows"
    • "Find rows where age > 30"
    • "Sort by price descending"
    • "Top 5 by sales"

    **📈 Create Visualizations:**
    • "Show bar chart of category"
    • "Plot histogram of age"
    • "Create scatter plot of price vs quantity"
    • "Show line chart of sales over time"
    • "Create box plot of salary"
    • "Show pie chart of region"
    • "Display correlation heatmap"
    • "Create violin plot of price"

    **🔍 Analyze Data:**
    • "Show statistics for all columns"
    • "Tell me about [column name]"
    • "Any missing values?"
    • "Find outliers in price"
    • "Show unique values in category"
    • "Compare age and income"

    **Just ask naturally and I'll show you the data and visualizations!**
    """
975
+
976
+
977
def handle_general_query(query, df, numeric_cols, categorical_cols, all_cols):
    """Fallback handler for queries no dispatcher branch recognized.

    Answers quick per-column lookups (a one-line summary for any column
    mentioned by name) and dataset-size questions; otherwise returns a
    generic "didn't understand" message. Returns (message, None, None).
    """
    # Column lookup: first column whose name appears in the query wins.
    for col in all_cols:
        if col.lower() not in query:
            continue
        if col in numeric_cols:
            values = df[col].dropna()
            return f"**{col}** - Mean: {values.mean():.2f}, Min: {values.min():.2f}, Max: {values.max():.2f}", None, None
        freq = df[col].value_counts()
        most_common = freq.index[0] if len(freq) > 0 else 'N/A'
        return f"**{col}** - Unique values: {df[col].nunique()}, Most common: {most_common}", None, None

    # Dataset size question.
    if 'size' in query or 'large' in query or 'big' in query:
        size_mb = df.memory_usage(deep=True).sum() / 1024**2
        return f"Dataset size: {size_mb:.2f} MB ({df.shape[0]:,} rows × {df.shape[1]} columns)", None, None

    return "❌ I didn't understand. Try asking for data, visualizations, or type 'help'", None, None
996
+
997
+
998
def display_visualization(fig):
    """Render a Plotly figure full-width inside the current Streamlit container."""
    st.plotly_chart(fig, use_container_width=True)
1001
+
1002
+
1003
+ # Simple version for quick integration
1004
def run_simple_chatbot(df):
    """Simplified chatbot version.

    A minimal keyword-matching chat over *df* for quick integration: answers
    row/column counts, missing-value totals, one mean statistic, and can draw
    a histogram of the first numeric column. History lives in
    ``st.session_state.simple_msgs``.
    """
    st.markdown("### 💬 Simple Data Chat")

    # Persist the transcript across Streamlit reruns.
    if "simple_msgs" not in st.session_state:
        st.session_state.simple_msgs = []

    # Chat display
    for msg in st.session_state.simple_msgs:
        if msg["role"] == "user":
            st.info(f"👤 {msg['content']}")
        else:
            st.success(f"🤖 {msg['content']}")

    # Input
    user_input = st.text_input("Ask:", key="simple_chat_input")

    if st.button("Send") and user_input:
        st.session_state.simple_msgs.append({"role": "user", "content": user_input})

        # Default reply; overwritten when a keyword below matches.
        response = "I don't understand. Try: rows, columns, missing, stats, chart"

        if "row" in user_input.lower():
            response = f"Dataset has {df.shape[0]} rows"
        elif "column" in user_input.lower():
            response = f"Dataset has {df.shape[1]} columns: {', '.join(df.columns[:5])}"
        elif "missing" in user_input.lower():
            missing = df.isnull().sum().sum()
            response = f"Found {missing} missing values" if missing > 0 else "No missing values"
        elif "stat" in user_input.lower():
            numeric = df.select_dtypes(include=[np.number]).columns
            if len(numeric) > 0:
                response = f"Mean of {numeric[0]}: {df[numeric[0]].mean():.2f}"
        elif "chart" in user_input.lower() or "plot" in user_input.lower():
            response = "📊 Creating visualization... (check the plot above)"
            # Simple histogram of the first numeric column, rendered inline.
            numeric = df.select_dtypes(include=[np.number]).columns
            if len(numeric) > 0:
                fig = px.histogram(df, x=numeric[0], title=f"Distribution of {numeric[0]}")
                st.plotly_chart(fig, use_container_width=True)

        st.session_state.simple_msgs.append({"role": "bot", "content": response})
        st.rerun()

    if st.button("Clear Chat"):
        st.session_state.simple_msgs = []
        st.rerun()
data_preprocessing.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
5
+ from sklearn.impute import SimpleImputer, KNNImputer
6
+ from sklearn.ensemble import IsolationForest
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+
10
+ def preprocess_data(df):
11
+
12
+ st.markdown("""
13
+ <div style='text-align: center; margin-bottom: 2rem;'>
14
+ <h2>⚙️ Data Preprocessing Pipeline</h2>
15
+ <p style='color: gray;'>Clean, transform, and prepare your data for analysis</p>
16
+ </div>
17
+ """, unsafe_allow_html=True)
18
+
19
+ # Create tabs for different preprocessing steps
20
+ tab1, tab2, tab3, tab4, tab5 = st.tabs([
21
+ "📊 Overview", "🧹 Clean Data", "🔄 Transform",
22
+ "📏 Scale & Encode", "📈 Feature Engineering"
23
+ ])
24
+
25
+ with tab1:
26
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
27
+
28
+ col1, col2, col3 = st.columns(3)
29
+
30
+ with col1:
31
+ st.metric("Original Rows", df.shape[0])
32
+ with col2:
33
+ st.metric("Original Columns", df.shape[1])
34
+ with col3:
35
+ missing_pct = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
36
+ st.metric("Missing Data", f"{missing_pct:.1f}%")
37
+
38
+ # Data quality before preprocessing
39
+ st.subheader("Data Quality Check")
40
+
41
+ quality_df = pd.DataFrame({
42
+ 'Column': df.columns,
43
+ 'Data Type': df.dtypes,
44
+ 'Missing Values': df.isnull().sum(),
45
+ 'Missing %': (df.isnull().sum() / len(df) * 100).round(2),
46
+ 'Unique Values': [df[col].nunique() for col in df.columns]
47
+ })
48
+
49
+ st.dataframe(quality_df, use_container_width=True)
50
+
51
+ # Visualize missing values
52
+ if df.isnull().sum().sum() > 0:
53
+ st.subheader("Missing Value Heatmap")
54
+ missing_df = df.isnull().astype(int)
55
+ fig = px.imshow(missing_df.T,
56
+ color_continuous_scale='reds',
57
+ aspect="auto",
58
+ title="Missing Values Pattern")
59
+ st.plotly_chart(fig, use_container_width=True)
60
+
61
+ st.markdown('</div>', unsafe_allow_html=True)
62
+
63
+ with tab2:
64
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
65
+ st.subheader("🧹 Data Cleaning Options")
66
+
67
+ # Create a copy for processing
68
+ processed_df = df.copy()
69
+
70
+ # Remove duplicates
71
+ st.markdown("### Duplicate Removal")
72
+ duplicates = processed_df.duplicated().sum()
73
+ st.write(f"Duplicate rows found: **{duplicates}**")
74
+
75
+ if duplicates > 0:
76
+ if st.button("Remove Duplicates", use_container_width=True):
77
+ processed_df = processed_df.drop_duplicates()
78
+ st.success(f"✅ Removed {duplicates} duplicate rows")
79
+
80
+ # Handle missing values
81
+ st.markdown("### Missing Value Handling")
82
+
83
+ missing_cols = processed_df.columns[processed_df.isnull().any()].tolist()
84
+
85
+ if missing_cols:
86
+ selected_col = st.selectbox("Select column to handle missing values", missing_cols)
87
+
88
+ col_type = processed_df[selected_col].dtype
89
+
90
+ if pd.api.types.is_numeric_dtype(processed_df[selected_col]):
91
+ method = st.radio(
92
+ "Choose imputation method",
93
+ ["Mean", "Median", "Mode", "KNN Imputer", "Drop rows", "Fill with value"]
94
+ )
95
+
96
+ if method == "Mean":
97
+ processed_df[selected_col].fillna(processed_df[selected_col].mean(), inplace=True)
98
+ elif method == "Median":
99
+ processed_df[selected_col].fillna(processed_df[selected_col].median(), inplace=True)
100
+ elif method == "Mode":
101
+ processed_df[selected_col].fillna(processed_df[selected_col].mode()[0], inplace=True)
102
+ elif method == "KNN Imputer":
103
+ st.info("KNN Imputer will be applied to all numeric columns")
104
+ if st.button("Apply KNN Imputer"):
105
+ numeric_cols = processed_df.select_dtypes(include=[np.number]).columns
106
+ imputer = KNNImputer(n_neighbors=5)
107
+ processed_df[numeric_cols] = imputer.fit_transform(processed_df[numeric_cols])
108
+ elif method == "Drop rows":
109
+ if st.button(f"Drop rows with missing values in {selected_col}"):
110
+ processed_df = processed_df.dropna(subset=[selected_col])
111
+ else:
112
+ fill_value = st.text_input("Enter fill value")
113
+ if fill_value:
114
+ if pd.api.types.is_numeric_dtype(processed_df[selected_col]):
115
+ processed_df[selected_col].fillna(float(fill_value), inplace=True)
116
+ else:
117
+ processed_df[selected_col].fillna(fill_value, inplace=True)
118
+
119
+ else: # Categorical column
120
+ method = st.radio(
121
+ "Choose imputation method",
122
+ ["Mode", "Drop rows", "Fill with value"]
123
+ )
124
+
125
+ if method == "Mode":
126
+ processed_df[selected_col].fillna(processed_df[selected_col].mode()[0], inplace=True)
127
+ elif method == "Drop rows":
128
+ if st.button(f"Drop rows with missing values in {selected_col}"):
129
+ processed_df = processed_df.dropna(subset=[selected_col])
130
+ else:
131
+ fill_value = st.text_input("Enter fill value")
132
+ if fill_value:
133
+ processed_df[selected_col].fillna(fill_value, inplace=True)
134
+ else:
135
+ st.success("✅ No missing values found!")
136
+
137
+ # Outlier detection
138
+ st.markdown("### Outlier Detection")
139
+ numeric_cols = processed_df.select_dtypes(include=[np.number]).columns
140
+
141
+ if len(numeric_cols) > 0:
142
+ selected_num = st.selectbox("Select numeric column for outlier detection", numeric_cols)
143
+
144
+ # Calculate IQR
145
+ Q1 = processed_df[selected_num].quantile(0.25)
146
+ Q3 = processed_df[selected_num].quantile(0.75)
147
+ IQR = Q3 - Q1
148
+
149
+ outliers = processed_df[
150
+ (processed_df[selected_num] < Q1 - 1.5 * IQR) |
151
+ (processed_df[selected_num] > Q3 + 1.5 * IQR)
152
+ ]
153
+
154
+ st.write(f"Outliers detected: **{len(outliers)}** rows")
155
+
156
+ if len(outliers) > 0:
157
+ if st.button(f"Remove outliers from {selected_num}"):
158
+ processed_df = processed_df[
159
+ (processed_df[selected_num] >= Q1 - 1.5 * IQR) &
160
+ (processed_df[selected_num] <= Q3 + 1.5 * IQR)
161
+ ]
162
+ st.success(f"✅ Removed {len(outliers)} outliers")
163
+
164
+ st.markdown('</div>', unsafe_allow_html=True)
165
+
166
+ # Update session state
167
+ st.session_state.data = processed_df
168
+
169
+ with tab3:
170
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
171
+ st.subheader("🔄 Data Transformations")
172
+
173
+ processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df
174
+
175
+ # Column operations
176
+ st.markdown("### Column Operations")
177
+
178
+ operation = st.selectbox(
179
+ "Choose operation",
180
+ ["Create new column", "Rename column", "Drop column", "Change data type"]
181
+ )
182
+
183
+ if operation == "Create new column":
184
+ col1, col2, col3 = st.columns(3)
185
+ with col1:
186
+ new_col_name = st.text_input("New column name")
187
+ with col2:
188
+ col_to_use = st.selectbox("Based on column", processed_df.columns)
189
+ with col3:
190
+ operation_type = st.selectbox(
191
+ "Operation",
192
+ ["Square", "Square Root", "Log", "Absolute", "Round", "Binary encode"]
193
+ )
194
+
195
+ if st.button("Create column") and new_col_name:
196
+ if operation_type == "Square":
197
+ processed_df[new_col_name] = processed_df[col_to_use] ** 2
198
+ elif operation_type == "Square Root":
199
+ processed_df[new_col_name] = np.sqrt(processed_df[col_to_use])
200
+ elif operation_type == "Log":
201
+ processed_df[new_col_name] = np.log1p(processed_df[col_to_use])
202
+ elif operation_type == "Absolute":
203
+ processed_df[new_col_name] = np.abs(processed_df[col_to_use])
204
+ elif operation_type == "Round":
205
+ processed_df[new_col_name] = np.round(processed_df[col_to_use])
206
+ elif operation_type == "Binary encode":
207
+ threshold = st.number_input("Threshold for binary encoding")
208
+ processed_df[new_col_name] = (processed_df[col_to_use] > threshold).astype(int)
209
+
210
+ st.success(f"✅ Created column: {new_col_name}")
211
+
212
+ elif operation == "Rename column":
213
+ col_to_rename = st.selectbox("Select column to rename", processed_df.columns)
214
+ new_name = st.text_input("New column name")
215
+
216
+ if st.button("Rename") and new_name:
217
+ processed_df.rename(columns={col_to_rename: new_name}, inplace=True)
218
+ st.success(f"✅ Renamed {col_to_rename} to {new_name}")
219
+
220
+ elif operation == "Drop column":
221
+ cols_to_drop = st.multiselect("Select columns to drop", processed_df.columns)
222
+
223
+ if st.button("Drop columns") and cols_to_drop:
224
+ processed_df = processed_df.drop(columns=cols_to_drop)
225
+ st.success(f"✅ Dropped columns: {', '.join(cols_to_drop)}")
226
+
227
+ elif operation == "Change data type":
228
+ col_to_change = st.selectbox("Select column", processed_df.columns)
229
+ new_type = st.selectbox(
230
+ "New data type",
231
+ ["int", "float", "str", "datetime", "category"]
232
+ )
233
+
234
+ if st.button("Change type"):
235
+ try:
236
+ if new_type == "int":
237
+ processed_df[col_to_change] = processed_df[col_to_change].astype(int)
238
+ elif new_type == "float":
239
+ processed_df[col_to_change] = processed_df[col_to_change].astype(float)
240
+ elif new_type == "str":
241
+ processed_df[col_to_change] = processed_df[col_to_change].astype(str)
242
+ elif new_type == "datetime":
243
+ processed_df[col_to_change] = pd.to_datetime(processed_df[col_to_change])
244
+ elif new_type == "category":
245
+ processed_df[col_to_change] = processed_df[col_to_change].astype('category')
246
+
247
+ st.success(f"✅ Changed {col_to_change} to {new_type}")
248
+ except Exception as e:
249
+ st.error(f"Error: {str(e)}")
250
+
251
+ st.markdown('</div>', unsafe_allow_html=True)
252
+
253
+ # Update session state
254
+ st.session_state.data = processed_df
255
+
256
+ with tab4:
257
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
258
+ st.subheader("📏 Feature Scaling & Encoding")
259
+
260
+ processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df
261
+
262
+ col1, col2 = st.columns(2)
263
+
264
+ with col1:
265
+ st.markdown("### Feature Scaling")
266
+ numeric_cols = processed_df.select_dtypes(include=[np.number]).columns.tolist()
267
+
268
+ if numeric_cols:
269
+ scale_cols = st.multiselect("Select columns to scale", numeric_cols)
270
+ scale_method = st.radio("Scaling method", ["StandardScaler", "MinMaxScaler"])
271
+
272
+ if st.button("Apply Scaling") and scale_cols:
273
+ if scale_method == "StandardScaler":
274
+ scaler = StandardScaler()
275
+ else:
276
+ scaler = MinMaxScaler()
277
+
278
+ processed_df[scale_cols] = scaler.fit_transform(processed_df[scale_cols])
279
+ st.success(f"✅ Applied {scale_method} to {len(scale_cols)} columns")
280
+
281
+ with col2:
282
+ st.markdown("### Categorical Encoding")
283
+ cat_cols = processed_df.select_dtypes(include=['object', 'category']).columns.tolist()
284
+
285
+ if cat_cols:
286
+ encode_cols = st.multiselect("Select columns to encode", cat_cols)
287
+ encode_method = st.radio("Encoding method", ["Label Encoding", "One-Hot Encoding"])
288
+
289
+ if st.button("Apply Encoding") and encode_cols:
290
+ if encode_method == "Label Encoding":
291
+ for col in encode_cols:
292
+ le = LabelEncoder()
293
+ processed_df[col + '_encoded'] = le.fit_transform(processed_df[col])
294
+ st.success(f"✅ Applied Label Encoding to {len(encode_cols)} columns")
295
+ else:
296
+ processed_df = pd.get_dummies(processed_df, columns=encode_cols)
297
+ st.success(f"✅ Applied One-Hot Encoding to {len(encode_cols)} columns")
298
+
299
+ st.markdown('</div>', unsafe_allow_html=True)
300
+
301
+ # Update session state
302
+ st.session_state.data = processed_df
303
+
304
+ with tab5:
305
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
306
+ st.subheader("📈 Feature Engineering")
307
+
308
+ processed_df = st.session_state.data.copy() if 'processed_df' not in locals() else processed_df
309
+
310
+ # Feature interactions
311
+ st.markdown("### Feature Interactions")
312
+ numeric_cols = processed_df.select_dtypes(include=[np.number]).columns.tolist()
313
+
314
+ if len(numeric_cols) >= 2:
315
+ col1, col2 = st.columns(2)
316
+ with col1:
317
+ feat1 = st.selectbox("First feature", numeric_cols)
318
+ with col2:
319
+ feat2 = st.selectbox("Second feature", [c for c in numeric_cols if c != feat1])
320
+
321
+ interaction_type = st.selectbox(
322
+ "Interaction type",
323
+ ["Multiplication", "Addition", "Subtraction", "Division", "Ratio"]
324
+ )
325
+
326
+ new_col_name = st.text_input("New column name", f"{feat1}_{interaction_type}_{feat2}")
327
+
328
+ if st.button("Create Interaction Feature"):
329
+ if interaction_type == "Multiplication":
330
+ processed_df[new_col_name] = processed_df[feat1] * processed_df[feat2]
331
+ elif interaction_type == "Addition":
332
+ processed_df[new_col_name] = processed_df[feat1] + processed_df[feat2]
333
+ elif interaction_type == "Subtraction":
334
+ processed_df[new_col_name] = processed_df[feat1] - processed_df[feat2]
335
+ elif interaction_type == "Division":
336
+ processed_df[new_col_name] = processed_df[feat1] / (processed_df[feat2] + 1e-8)
337
+ elif interaction_type == "Ratio":
338
+ processed_df[new_col_name] = processed_df[feat1] / (processed_df[feat2].sum() + 1e-8)
339
+
340
+ st.success(f"✅ Created feature: {new_col_name}")
341
+
342
+ # Binning
343
+ st.markdown("### Feature Binning")
344
+ if numeric_cols:
345
+ bin_col = st.selectbox("Select column for binning", numeric_cols)
346
+ n_bins = st.slider("Number of bins", 2, 20, 5)
347
+ bin_labels = [f"Bin_{i}" for i in range(n_bins)]
348
+
349
+ if st.button("Create Binned Feature"):
350
+ processed_df[bin_col + '_binned'] = pd.cut(processed_df[bin_col],
351
+ bins=n_bins,
352
+ labels=bin_labels)
353
+ st.success(f"✅ Created binned feature: {bin_col}_binned")
354
+
355
+ st.markdown('</div>', unsafe_allow_html=True)
356
+
357
+ # Update session state
358
+ st.session_state.data = processed_df
359
+
360
+ # Preview processed data
361
+ st.markdown("---")
362
+ st.subheader("📋 Processed Data Preview")
363
+
364
+ data_to_show = st.session_state.data
365
+
366
+ col1, col2, col3 = st.columns(3)
367
+ with col1:
368
+ st.metric("Final Rows", data_to_show.shape[0])
369
+ with col2:
370
+ st.metric("Final Columns", data_to_show.shape[1])
371
+ with col3:
372
+ final_missing = data_to_show.isnull().sum().sum()
373
+ st.metric("Remaining Missing", final_missing)
374
+
375
+ st.dataframe(data_to_show.head(10), use_container_width=True)
376
+
377
+ # Download processed data
378
+ csv = data_to_show.to_csv(index=False)
379
+ st.download_button(
380
+ label="📥 Download Processed Data",
381
+ data=csv,
382
+ file_name="processed_data.csv",
383
+ mime="text/csv",
384
+ use_container_width=True
385
+ )
386
+
387
+ return data_to_show
data_quality.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from sklearn.ensemble import IsolationForest
7
+
8
+ def quality_report(df):
9
+
10
+ st.markdown("""
11
+ <div style='text-align: center; margin-bottom: 2rem;'>
12
+ <h2>📋 Data Quality Report</h2>
13
+ <p style='color: gray;'>Comprehensive data quality assessment</p>
14
+ </div>
15
+ """, unsafe_allow_html=True)
16
+
17
+ # Overall quality score
18
+ st.subheader("📊 Overall Data Quality Score")
19
+
20
+ # Calculate various quality metrics
21
+ completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
22
+ uniqueness = (1 - df.duplicated().sum() / df.shape[0]) * 100
23
+
24
+ # Data type consistency
25
+ type_consistency = 100
26
+ for col in df.columns:
27
+ if df[col].dtype == 'object':
28
+ # Check if column has consistent types
29
+ try:
30
+ pd.to_numeric(df[col], errors='raise')
31
+ # If convertible to numeric, it might be inconsistent
32
+ type_consistency -= 5
33
+ except:
34
+ pass
35
+
36
+ # Outlier impact
37
+ outlier_impact = 100
38
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
39
+ if len(numeric_cols) > 0:
40
+ for col in numeric_cols:
41
+ Q1 = df[col].quantile(0.25)
42
+ Q3 = df[col].quantile(0.75)
43
+ IQR = Q3 - Q1
44
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
45
+ outlier_pct = len(outliers) / len(df) * 100
46
+ if outlier_pct > 10:
47
+ outlier_impact -= 10
48
+
49
+ quality_score = (completeness + uniqueness + type_consistency + outlier_impact) / 4
50
+
51
+ # Display gauge
52
+ fig = go.Figure(go.Indicator(
53
+ mode="gauge+number",
54
+ value=quality_score,
55
+ domain={'x': [0, 1], 'y': [0, 1]},
56
+ title={'text': "Quality Score"},
57
+ gauge={
58
+ 'axis': {'range': [None, 100]},
59
+ 'bar': {'color': "#2E86AB"},
60
+ 'steps': [
61
+ {'range': [0, 50], 'color': "#FF6B6B"},
62
+ {'range': [50, 70], 'color': "#FFD93D"},
63
+ {'range': [70, 85], 'color': "#6BCB77"},
64
+ {'range': [85, 100], 'color': "#4CAF50"}
65
+ ],
66
+ 'threshold': {
67
+ 'line': {'color': "red", 'width': 4},
68
+ 'thickness': 0.75,
69
+ 'value': 90
70
+ }
71
+ }))
72
+
73
+ st.plotly_chart(fig, use_container_width=True)
74
+
75
+ # Quality metrics cards
76
+ col1, col2, col3, col4 = st.columns(4)
77
+
78
+ with col1:
79
+ st.metric("Completeness", f"{completeness:.1f}%",
80
+ delta=None, delta_color="normal")
81
+
82
+ with col2:
83
+ st.metric("Uniqueness", f"{uniqueness:.1f}%",
84
+ delta=None, delta_color="normal")
85
+
86
+ with col3:
87
+ st.metric("Type Consistency", f"{type_consistency:.1f}%",
88
+ delta=None, delta_color="normal")
89
+
90
+ with col4:
91
+ st.metric("Outlier Impact", f"{outlier_impact:.1f}%",
92
+ delta=None, delta_color="inverse")
93
+
94
+ # Detailed quality report
95
+ st.subheader("🔍 Detailed Quality Report")
96
+
97
+ quality_df = pd.DataFrame({
98
+ 'Column': df.columns,
99
+ 'Data Type': df.dtypes,
100
+ 'Missing Count': df.isnull().sum().values,
101
+ 'Missing %': (df.isnull().sum().values / len(df) * 100).round(2),
102
+ 'Unique Values': [df[col].nunique() for col in df.columns],
103
+ 'Unique %': [round((df[col].nunique() / len(df) * 100),2) for col in df.columns],
104
+ 'Duplicate Values?': [df[col].duplicated().any() for col in df.columns]
105
+ })
106
+
107
+ # Add outlier info for numeric columns
108
+ outlier_info = []
109
+ for col in df.columns:
110
+ if df[col].dtype in ['int64', 'float64']:
111
+ Q1 = df[col].quantile(0.25)
112
+ Q3 = df[col].quantile(0.75)
113
+ IQR = Q3 - Q1
114
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
115
+ outlier_info.append(len(outliers))
116
+ else:
117
+ outlier_info.append(0)
118
+
119
+ quality_df['Outliers'] = outlier_info
120
+
121
+ st.dataframe(quality_df.style.background_gradient(subset=['Missing %', 'Outliers'], cmap='YlOrRd'),
122
+ use_container_width=True)
123
+
124
+ # Visualizations
125
+ st.subheader("📊 Quality Visualizations")
126
+
127
+ col1, col2 = st.columns(2)
128
+
129
+ with col1:
130
+ # Missing values bar chart
131
+ missing_cols = df.isnull().sum()[df.isnull().sum() > 0]
132
+ if len(missing_cols) > 0:
133
+ fig = px.bar(x=missing_cols.index, y=missing_cols.values,
134
+ title="Missing Values by Column",
135
+ labels={'x': 'Column', 'y': 'Missing Count'})
136
+ st.plotly_chart(fig, use_container_width=True)
137
+ else:
138
+ st.success("No missing values found!")
139
+
140
+ with col2:
141
+ # Data type distribution
142
+ dtype_counts = df.dtypes.value_counts()
143
+ fig = px.pie(values=dtype_counts.values, names=dtype_counts.index.astype(str),
144
+ title="Data Type Distribution")
145
+ st.plotly_chart(fig, use_container_width=True)
146
+
147
+ # Outlier detection with Isolation Forest
148
+ st.subheader("🕵️ Anomaly Detection")
149
+
150
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
151
+
152
+ if len(numeric_cols) > 0:
153
+ contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)
154
+
155
+ iso_forest = IsolationForest(contamination=contamination, random_state=42)
156
+ outliers = iso_forest.fit_predict(df[numeric_cols].fillna(0))
157
+
158
+ n_outliers = (outliers == -1).sum()
159
+ st.write(f"**Detected Anomalies:** {n_outliers} rows ({n_outliers/len(df)*100:.2f}%)")
160
+
161
+ # Visualize outliers (if 2 or 3 numeric columns)
162
+ if len(numeric_cols) >= 2:
163
+ df_with_outliers = df[numeric_cols[:3]].copy()
164
+ df_with_outliers['is_outlier'] = outliers
165
+
166
+ if len(numeric_cols) == 2:
167
+ fig = px.scatter(df_with_outliers, x=numeric_cols[0], y=numeric_cols[1],
168
+ color='is_outlier', title="Anomaly Detection Results",
169
+ color_continuous_scale=['blue', 'red'])
170
+ st.plotly_chart(fig, use_container_width=True)
171
+ elif len(numeric_cols) >= 3:
172
+ fig = px.scatter_3d(df_with_outliers, x=numeric_cols[0],
173
+ y=numeric_cols[1], z=numeric_cols[2],
174
+ color='is_outlier', title="Anomaly Detection Results (3D)",
175
+ color_continuous_scale=['blue', 'red'])
176
+ st.plotly_chart(fig, use_container_width=True)
177
+ else:
178
+ st.info("No numeric columns available for anomaly detection")
179
+
180
+ # Recommendations
181
+ st.subheader("💡 Quality Improvement Recommendations")
182
+
183
+ recommendations = []
184
+
185
+ # Missing value recommendations
186
+ missing_cols = df.columns[df.isnull().any()].tolist()
187
+ if missing_cols:
188
+ recommendations.append(f"• Handle missing values in {len(missing_cols)} columns: {', '.join(missing_cols[:5])}")
189
+
190
+ # Duplicate recommendations
191
+ if df.duplicated().sum() > 0:
192
+ recommendations.append(f"• Remove {df.duplicated().sum()} duplicate rows")
193
+
194
+ # Outlier recommendations
195
+ outlier_cols = []
196
+ for col in numeric_cols:
197
+ Q1 = df[col].quantile(0.25)
198
+ Q3 = df[col].quantile(0.75)
199
+ IQR = Q3 - Q1
200
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
201
+ if len(outliers) > len(df) * 0.1: # More than 10% outliers
202
+ outlier_cols.append(col)
203
+
204
+ if outlier_cols:
205
+ recommendations.append(f"• Investigate outliers in: {', '.join(outlier_cols[:3])}")
206
+
207
+ # Data type recommendations
208
+ for col in df.columns:
209
+ if df[col].dtype == 'object':
210
+ # Check if column should be numeric
211
+ try:
212
+ pd.to_numeric(df[col].dropna().iloc[:100])
213
+ recommendations.append(f"• Convert '{col}' to numeric type")
214
+ except:
215
+ pass
216
+
217
+ if recommendations:
218
+ for rec in recommendations:
219
+ st.markdown(rec)
220
+ else:
221
+ st.success("✅ Dataset quality looks good! No major issues detected.")
222
+
223
+ # Download quality report
224
+ report_text = f"""
225
+ DATA QUALITY REPORT
226
+ ===================
227
+
228
+ Overall Quality Score: {quality_score:.1f}/100
229
+
230
+ Metrics:
231
+ • Completeness: {completeness:.1f}%
232
+ • Uniqueness: {uniqueness:.1f}%
233
+ • Type Consistency: {type_consistency:.1f}%
234
+ • Outlier Impact: {outlier_impact:.1f}%
235
+
236
+ Dataset Statistics:
237
+ • Rows: {df.shape[0]:,}
238
+ • Columns: {df.shape[1]}
239
+ • Missing Values: {df.isnull().sum().sum():,}
240
+ • Duplicate Rows: {df.duplicated().sum():,}
241
+
242
+ Recommendations:
243
+ {chr(10).join(recommendations)}
244
+ """
245
+
246
+ st.download_button(
247
+ label="📥 Download Quality Report",
248
+ data=report_text,
249
+ file_name="data_quality_report.txt",
250
+ mime="text/plain",
251
+ use_container_width=True
252
+ )
dataset_overview.py ADDED
@@ -0,0 +1,1159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
+
8
+ def eda_analysis(df):
9
+ """
10
+ Comprehensive Exploratory Data Analysis (EDA) with visual insights
11
+ """
12
+ st.markdown("""
13
+ <div style='text-align: center; margin-bottom: 2rem;'>
14
+ <h2>🔍 Exploratory Data Analysis (EDA)</h2>
15
+ <p style='color: gray;'>Discover patterns, relationships, and insights through visual exploration</p>
16
+ </div>
17
+ """, unsafe_allow_html=True)
18
+
19
+ # Error handling
20
+ if df.empty:
21
+ st.error("❌ The dataset is empty. Please upload a valid dataset.")
22
+ return
23
+
24
+ try:
25
+ # Create tabs for different EDA aspects
26
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
27
+ "📋 Data Overview",
28
+ "🔍 Missing Data Analysis",
29
+ "📊 Univariate Analysis",
30
+ "🔄 Bivariate Analysis",
31
+ "📈 Multivariate Analysis",
32
+ "🎯 Pattern Discovery"
33
+ ])
34
+
35
+ with tab1:
36
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
37
+ st.subheader("📋 Dataset Overview")
38
+
39
+ try:
40
+ # Key metrics in cards
41
+ col1, col2, col3, col4 = st.columns(4)
42
+
43
+ with col1:
44
+ st.metric("Total Rows", f"{df.shape[0]:,}")
45
+ with col2:
46
+ st.metric("Total Columns", df.shape[1])
47
+ with col3:
48
+ memory_usage = df.memory_usage(deep=True).sum() / 1024**2
49
+ st.metric("Memory Usage", f"{memory_usage:.2f} MB")
50
+ with col4:
51
+ missing_total = df.isnull().sum().sum()
52
+ st.metric("Missing Values", f"{missing_total:,}")
53
+
54
+ # Data preview with interactive controls
55
+ st.subheader("🔍 Data Preview")
56
+ col1, col2 = st.columns(2)
57
+ with col1:
58
+ preview_rows = st.slider("Number of rows to display", 5, 50, 10, key="preview_rows")
59
+ with col2:
60
+ preview_type = st.radio("Preview type", ["Head", "Tail", "Random Sample"],
61
+ horizontal=True, key="preview_type")
62
+
63
+ if preview_type == "Head":
64
+ st.dataframe(df.head(preview_rows), use_container_width=True)
65
+ elif preview_type == "Tail":
66
+ st.dataframe(df.tail(preview_rows), use_container_width=True)
67
+ else:
68
+ if len(df) > preview_rows:
69
+ st.dataframe(df.sample(preview_rows), use_container_width=True)
70
+ else:
71
+ st.warning("⚠️ Sample size larger than dataset. Showing all rows.")
72
+ st.dataframe(df, use_container_width=True)
73
+
74
+ # Column information with visual indicators
75
+ st.subheader("📋 Column Information")
76
+
77
+ col_info = pd.DataFrame({
78
+ 'Column': df.columns,
79
+ 'Data Type': df.dtypes.astype(str),
80
+ 'Non-Null Count': df.count().values,
81
+ 'Null Count': df.isnull().sum().values,
82
+ 'Null %': (df.isnull().sum().values / len(df) * 100).round(2),
83
+ 'Unique Values': [df[col].nunique() for col in df.columns],
84
+ 'Sample Values': [str(df[col].dropna().iloc[:3].tolist()) if len(df[col].dropna()) > 0 else "All null" for col in df.columns]
85
+ })
86
+
87
+ # Add color coding for data types
88
+ def color_data_type(val):
89
+ if 'int' in val or 'float' in val:
90
+ return 'background-color: #e3f2fd'
91
+ elif 'object' in val:
92
+ return 'background-color: #f1f8e9'
93
+ elif 'datetime' in val:
94
+ return 'background-color: #fff3e0'
95
+ return ''
96
+
97
+ st.dataframe(col_info.style.applymap(color_data_type, subset=['Data Type']),
98
+ use_container_width=True)
99
+
100
+ # Data type distribution
101
+ st.subheader("📊 Data Type Distribution")
102
+
103
+ dtype_counts = df.dtypes.value_counts()
104
+ if len(dtype_counts) > 0:
105
+ fig = make_subplots(rows=1, cols=2,
106
+ specs=[[{"type": "pie"}, {"type": "bar"}]],
107
+ subplot_titles=("Pie Chart", "Bar Chart"))
108
+
109
+ fig.add_trace(go.Pie(labels=dtype_counts.index.astype(str),
110
+ values=dtype_counts.values,
111
+ hole=0.3), row=1, col=1)
112
+
113
+ fig.add_trace(go.Bar(x=dtype_counts.index.astype(str),
114
+ y=dtype_counts.values,
115
+ marker_color=['#42a5f5', '#66bb6a', '#ffa726'][:len(dtype_counts)]),
116
+ row=1, col=2)
117
+
118
+ fig.update_layout(height=400, title_text="Column Types Distribution")
119
+ st.plotly_chart(fig, use_container_width=True)
120
+ else:
121
+ st.warning("⚠️ No data type information available")
122
+
123
+ # Dataset statistics
124
+ st.subheader("📈 Dataset Statistics")
125
+
126
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
127
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
128
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
129
+ bool_cols = df.select_dtypes(include=['bool']).columns.tolist()
130
+
131
+ col1, col2, col3, col4 = st.columns(4)
132
+ with col1:
133
+ st.info(f"**Numeric:** {len(numeric_cols)} columns")
134
+ with col2:
135
+ st.info(f"**Categorical:** {len(categorical_cols)} columns")
136
+ with col3:
137
+ st.info(f"**Datetime:** {len(datetime_cols)} columns")
138
+ with col4:
139
+ st.info(f"**Boolean:** {len(bool_cols)} columns")
140
+
141
+ except Exception as e:
142
+ st.error(f"❌ Error in data overview: {str(e)}")
143
+ st.info("💡 Tip: Check if your dataset contains valid data types")
144
+
145
+ st.markdown('</div>', unsafe_allow_html=True)
146
+
147
+ with tab2:
148
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
149
+ st.subheader("🔍 Missing Data Analysis")
150
+
151
+ try:
152
+ if df.isnull().sum().sum() > 0:
153
+ # Missing data overview
154
+ missing_df = pd.DataFrame({
155
+ 'Column': df.columns,
156
+ 'Missing Count': df.isnull().sum().values,
157
+ 'Missing %': (df.isnull().sum().values / len(df) * 100).round(2)
158
+ }).sort_values('Missing %', ascending=False)
159
+
160
+ missing_df = missing_df[missing_df['Missing Count'] > 0]
161
+
162
+ if len(missing_df) > 0:
163
+ # Visualize missing data
164
+ fig = make_subplots(rows=2, cols=2,
165
+ subplot_titles=("Missing Values Heatmap",
166
+ "Missing Values by Column",
167
+ "Missing Data Patterns",
168
+ "Missing Data Matrix"),
169
+ specs=[[{"type": "heatmap"}, {"type": "bar"}],
170
+ [{"type": "scatter"}, {"type": "heatmap"}]])
171
+
172
+ # Heatmap of missing values
173
+ missing_matrix = df.isnull().astype(int).T
174
+ fig.add_trace(go.Heatmap(z=missing_matrix.values,
175
+ y=missing_matrix.index,
176
+ colorscale='Reds',
177
+ showscale=False), row=1, col=1)
178
+
179
+ # Bar chart of missing values
180
+ fig.add_trace(go.Bar(x=missing_df['Column'].head(20),
181
+ y=missing_df['Missing Count'].head(20),
182
+ marker_color='#ef5350',
183
+ name="Missing Count"), row=1, col=2)
184
+
185
+ # Missing data patterns (rows with missing data)
186
+ missing_rows = df[df.isnull().any(axis=1)]
187
+ if len(missing_rows) > 0:
188
+ pattern_df = missing_rows.isnull().sum(axis=1).value_counts().reset_index()
189
+ pattern_df.columns = ['Missing Count per Row', 'Number of Rows']
190
+ pattern_df = pattern_df.sort_values('Missing Count per Row')
191
+
192
+ fig.add_trace(go.Scatter(x=pattern_df['Missing Count per Row'],
193
+ y=pattern_df['Number of Rows'],
194
+ mode='lines+markers',
195
+ name="Patterns"), row=2, col=1)
196
+
197
+ # Missing data matrix for first 50 rows
198
+ sample_missing = df.head(min(50, len(df))).isnull().astype(int).T
199
+ fig.add_trace(go.Heatmap(z=sample_missing.values,
200
+ y=sample_missing.index,
201
+ colorscale='Reds',
202
+ showscale=False,
203
+ name="Matrix"), row=2, col=2)
204
+
205
+ fig.update_layout(height=800, title_text="Missing Data Analysis",
206
+ showlegend=False)
207
+ st.plotly_chart(fig, use_container_width=True)
208
+
209
+ # Detailed missing data table
210
+ st.subheader("📋 Missing Data Details")
211
+
212
+ # Add severity classification
213
+ def classify_severity(pct):
214
+ if pct == 0:
215
+ return "✅ None"
216
+ elif pct < 5:
217
+ return "🟢 Low"
218
+ elif pct < 20:
219
+ return "🟡 Medium"
220
+ else:
221
+ return "🔴 High"
222
+
223
+ missing_df['Severity'] = missing_df['Missing %'].apply(classify_severity)
224
+ missing_df['Recommendation'] = missing_df['Missing %'].apply(
225
+ lambda x: "No action needed" if x == 0 else
226
+ "Consider imputation" if x < 5 else
227
+ "Imputation recommended" if x < 20 else
228
+ "Consider dropping column"
229
+ )
230
+
231
+ st.dataframe(missing_df, use_container_width=True)
232
+
233
+ # Missing data patterns
234
+ if len(missing_df) > 1:
235
+ st.subheader("🔄 Missing Data Patterns")
236
+
237
+ # Find columns with similar missing patterns
238
+ missing_corr = df[missing_df['Column'].tolist()].isnull().corr()
239
+
240
+ if len(missing_corr) > 1:
241
+ fig = px.imshow(missing_corr,
242
+ text_auto=True,
243
+ aspect="auto",
244
+ color_continuous_scale='RdBu_r',
245
+ title="Missing Value Correlation Matrix")
246
+ st.plotly_chart(fig, use_container_width=True)
247
+
248
+ # Find highly correlated missing patterns
249
+ high_corr = []
250
+ for i in range(len(missing_corr.columns)):
251
+ for j in range(i+1, len(missing_corr.columns)):
252
+ if abs(missing_corr.iloc[i, j]) > 0.7:
253
+ high_corr.append({
254
+ 'Column 1': missing_corr.columns[i],
255
+ 'Column 2': missing_corr.columns[j],
256
+ 'Correlation': missing_corr.iloc[i, j]
257
+ })
258
+
259
+ if high_corr:
260
+ st.info("🔍 **Columns with similar missing patterns:**")
261
+ for item in high_corr[:5]: # Show top 5
262
+ st.write(f"• {item['Column 1']} & {item['Column 2']}: {item['Correlation']:.2f}")
263
+ else:
264
+ st.success("✅ No missing values found in the dataset!")
265
+ else:
266
+ st.success("✅ No missing values found in the dataset!")
267
+
268
+ # Show complete data visualization
269
+ fig = go.Figure()
270
+ fig.add_trace(go.Indicator(
271
+ mode="number+gauge",
272
+ value=100,
273
+ title={'text': "Data Completeness"},
274
+ gauge={'axis': {'range': [0, 100]},
275
+ 'bar': {'color': "green"},
276
+ 'steps': [{'range': [0, 100], 'color': "lightgreen"}]}
277
+ ))
278
+ st.plotly_chart(fig, use_container_width=True)
279
+
280
+ except Exception as e:
281
+ st.error(f"❌ Error in missing data analysis: {str(e)}")
282
+ st.info("💡 Tip: Ensure your dataset has valid data for missing value analysis")
283
+
284
+ st.markdown('</div>', unsafe_allow_html=True)
285
+
286
+ with tab3:
287
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
288
+ st.subheader("📊 Univariate Analysis")
289
+
290
+ try:
291
+ col_type = st.radio("Select column type", ["Numeric", "Categorical", "Datetime"],
292
+ horizontal=True, key="univariate_type")
293
+
294
+ if col_type == "Numeric":
295
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
296
+ if numeric_cols:
297
+ selected_col = st.selectbox("Select numeric column", numeric_cols, key="univariate_num")
298
+
299
+ data = df[selected_col].dropna()
300
+
301
+ if len(data) > 0:
302
+ # Create comprehensive visualization
303
+ fig = make_subplots(rows=2, cols=3,
304
+ subplot_titles=("Histogram", "Box Plot", "Violin Plot",
305
+ "ECDF", "QQ Plot", "Summary Stats"),
306
+ specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
307
+ [{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]])
308
+
309
+ # Histogram
310
+ fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Histogram",
311
+ marker_color='#42a5f5'), row=1, col=1)
312
+
313
+ # Box plot
314
+ fig.add_trace(go.Box(y=data, name="Box Plot", boxpoints='outliers',
315
+ marker_color='#66bb6a'), row=1, col=2)
316
+
317
+ # Violin plot
318
+ fig.add_trace(go.Violin(y=data, name="Violin Plot", box_visible=True,
319
+ line_color='black', fillcolor='#ffa726',
320
+ opacity=0.6), row=1, col=3)
321
+
322
+ # ECDF
323
+ sorted_data = np.sort(data)
324
+ ecdf = np.arange(1, len(sorted_data)+1) / len(sorted_data)
325
+ fig.add_trace(go.Scatter(x=sorted_data, y=ecdf, mode='lines',
326
+ name="ECDF", line=dict(color='#ab47bc')),
327
+ row=2, col=1)
328
+
329
+ # QQ plot
330
+ theoretical_q = np.random.normal(data.mean(), data.std(), len(data))
331
+ theoretical_q.sort()
332
+ fig.add_trace(go.Scatter(x=theoretical_q, y=sorted_data,
333
+ mode='markers', name="QQ Plot",
334
+ marker=dict(color='#7e57c2', size=3)),
335
+ row=2, col=2)
336
+
337
+ # Add reference line to QQ plot
338
+ min_val = min(theoretical_q.min(), sorted_data.min())
339
+ max_val = max(theoretical_q.max(), sorted_data.max())
340
+ fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
341
+ mode='lines', line=dict(color='red', dash='dash'),
342
+ showlegend=False), row=2, col=2)
343
+
344
+ # Summary statistics as table
345
+ stats_text = f"""
346
+ <b>Summary Statistics</b><br>
347
+ Count: {len(data):,}<br>
348
+ Mean: {data.mean():.4f}<br>
349
+ Std: {data.std():.4f}<br>
350
+ Min: {data.min():.4f}<br>
351
+ Q1: {data.quantile(0.25):.4f}<br>
352
+ Median: {data.median():.4f}<br>
353
+ Q3: {data.quantile(0.75):.4f}<br>
354
+ Max: {data.max():.4f}<br>
355
+ IQR: {data.quantile(0.75) - data.quantile(0.25):.4f}<br>
356
+ Skewness: {data.skew():.4f}<br>
357
+ Kurtosis: {data.kurtosis():.4f}
358
+ """
359
+
360
+ fig.add_annotation(x=0.5, y=0.5, text=stats_text,
361
+ showarrow=False, font=dict(size=10),
362
+ row=2, col=3, align='left')
363
+
364
+ fig.update_layout(height=800, title_text=f"Univariate Analysis: {selected_col}")
365
+ st.plotly_chart(fig, use_container_width=True)
366
+
367
+ # Outlier detection
368
+ Q1 = data.quantile(0.25)
369
+ Q3 = data.quantile(0.75)
370
+ IQR = Q3 - Q1
371
+ outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]
372
+
373
+ col1, col2 = st.columns(2)
374
+ with col1:
375
+ st.metric("Outliers Count", len(outliers))
376
+ with col2:
377
+ st.metric("Outliers %", f"{len(outliers)/len(data)*100:.2f}%")
378
+
379
+ if len(outliers) > 0:
380
+ with st.expander("View outlier values"):
381
+ st.write(outliers.tolist()[:20]) # Show first 20 outliers
382
+ if len(outliers) > 20:
383
+ st.info(f"... and {len(outliers) - 20} more outliers")
384
+ else:
385
+ st.warning("⚠️ No numeric columns available for analysis")
386
+
387
+ elif col_type == "Categorical":
388
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
389
+ if categorical_cols:
390
+ selected_col = st.selectbox("Select categorical column", categorical_cols,
391
+ key="univariate_cat")
392
+
393
+ # Get value counts
394
+ value_counts = df[selected_col].value_counts().reset_index()
395
+ value_counts.columns = [selected_col, 'count']
396
+ value_counts['percentage'] = (value_counts['count'] / len(df) * 100).round(2)
397
+
398
+ if len(value_counts) > 0:
399
+ # Create visualizations
400
+ fig = make_subplots(rows=2, cols=2,
401
+ subplot_titles=("Bar Chart (Top 20)", "Pie Chart (Top 10)",
402
+ "Treemap (Top 10)", "Frequency Table"),
403
+ specs=[[{"type": "xy"}, {"type": "domain"}],
404
+ [{"type": "domain"}, {"type": "table"}]])
405
+
406
+ # Bar chart (top 20)
407
+ top20 = value_counts.head(20)
408
+ fig.add_trace(go.Bar(x=top20[selected_col],
409
+ y=top20['count'],
410
+ marker_color='#42a5f5',
411
+ name="Count"), row=1, col=1)
412
+
413
+ # Pie chart (top 10)
414
+ top10 = value_counts.head(10)
415
+ fig.add_trace(go.Pie(labels=top10[selected_col],
416
+ values=top10['count'],
417
+ hole=0.3,
418
+ textinfo='percent+label',
419
+ name="Proportion"), row=1, col=2)
420
+
421
+ # Treemap (top 10)
422
+ fig.add_trace(go.Treemap(labels=top10[selected_col],
423
+ parents=['']*len(top10),
424
+ values=top10['count'],
425
+ textinfo='label+value',
426
+ name="Treemap"), row=2, col=1)
427
+
428
+ # Frequency table (top 10)
429
+ fig.add_trace(go.Table(header=dict(values=[selected_col, 'Count', 'Percentage']),
430
+ cells=dict(values=[top10[selected_col].tolist(),
431
+ top10['count'].tolist(),
432
+ top10['percentage'].tolist()]),
433
+ name="Table"), row=2, col=2)
434
+
435
+ fig.update_layout(height=800, title_text=f"Categorical Analysis: {selected_col}")
436
+ st.plotly_chart(fig, use_container_width=True)
437
+
438
+ # Summary statistics for categorical
439
+ col1, col2, col3 = st.columns(3)
440
+ with col1:
441
+ st.metric("Unique Values", f"{value_counts.shape[0]:,}")
442
+ with col2:
443
+ st.metric("Most Frequent", f"{value_counts.iloc[0, 0]}")
444
+ with col3:
445
+ st.metric("Frequency", f"{value_counts.iloc[0, 1]:,} ({value_counts.iloc[0, 2]}%)")
446
+
447
+ # Cardinality warning
448
+ if value_counts.shape[0] > 50:
449
+ st.warning(f"⚠️ High cardinality detected: {value_counts.shape[0]} unique values. Consider grouping rare categories.")
450
+ else:
451
+ st.warning("⚠️ No categorical columns available for analysis")
452
+
453
+ elif col_type == "Datetime":
454
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
455
+ if datetime_cols:
456
+ selected_col = st.selectbox("Select datetime column", datetime_cols,
457
+ key="univariate_datetime")
458
+
459
+ # Extract temporal features
460
+ df_temp = df[selected_col].dropna()
461
+
462
+ if len(df_temp) > 0:
463
+ # Create temporal distributions
464
+ fig = make_subplots(rows=2, cols=2,
465
+ subplot_titles=("Year Distribution", "Month Distribution",
466
+ "Day of Week Distribution", "Hour Distribution"),
467
+ specs=[[{"type": "xy"}, {"type": "xy"}],
468
+ [{"type": "xy"}, {"type": "xy"}]])
469
+
470
+ # Year distribution
471
+ years = df_temp.dt.year.value_counts().sort_index()
472
+ if len(years) > 0:
473
+ fig.add_trace(go.Bar(x=years.index.astype(str), y=years.values,
474
+ marker_color='#42a5f5', name="Year"), row=1, col=1)
475
+
476
+ # Month distribution
477
+ months = df_temp.dt.month.value_counts().sort_index()
478
+ month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
479
+ 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
480
+ if len(months) > 0:
481
+ fig.add_trace(go.Bar(x=[month_names[i-1] for i in months.index],
482
+ y=months.values, marker_color='#66bb6a',
483
+ name="Month"), row=1, col=2)
484
+
485
+ # Day of week distribution
486
+ days = df_temp.dt.dayofweek.value_counts().sort_index()
487
+ day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
488
+ if len(days) > 0:
489
+ fig.add_trace(go.Bar(x=[day_names[i] for i in days.index],
490
+ y=days.values, marker_color='#ffa726',
491
+ name="Day of Week"), row=2, col=1)
492
+
493
+ # Hour distribution (if time component exists)
494
+ if df_temp.dt.hour.nunique() > 1:
495
+ hours = df_temp.dt.hour.value_counts().sort_index()
496
+ fig.add_trace(go.Bar(x=hours.index.astype(str), y=hours.values,
497
+ marker_color='#ab47bc', name="Hour"), row=2, col=2)
498
+
499
+ fig.update_layout(height=800, title_text=f"Temporal Analysis: {selected_col}")
500
+ st.plotly_chart(fig, use_container_width=True)
501
+
502
+ # Date range information
503
+ col1, col2, col3 = st.columns(3)
504
+ with col1:
505
+ st.metric("Start Date", df_temp.min().strftime('%Y-%m-%d'))
506
+ with col2:
507
+ st.metric("End Date", df_temp.max().strftime('%Y-%m-%d'))
508
+ with col3:
509
+ date_range = (df_temp.max() - df_temp.min()).days
510
+ st.metric("Date Range", f"{date_range} days")
511
+ else:
512
+ st.warning("⚠️ No datetime columns available for analysis")
513
+
514
+ except Exception as e:
515
+ st.error(f"❌ Error in univariate analysis: {str(e)}")
516
+ st.info("💡 Tip: Ensure the selected column contains valid data for analysis")
517
+
518
+ st.markdown('</div>', unsafe_allow_html=True)
519
+
520
+ with tab4:
521
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
522
+ st.subheader("🔄 Bivariate Analysis")
523
+
524
+ try:
525
+ analysis_type = st.radio("Select analysis type",
526
+ ["Numeric vs Numeric", "Numeric vs Categorical",
527
+ "Categorical vs Categorical"],
528
+ horizontal=True, key="bivariate_type")
529
+
530
+ if analysis_type == "Numeric vs Numeric":
531
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
532
+ if len(numeric_cols) >= 2:
533
+ col1, col2 = st.columns(2)
534
+ with col1:
535
+ x_col = st.selectbox("Select X axis", numeric_cols, key="bi_x")
536
+ with col2:
537
+ y_col = st.selectbox("Select Y axis", [c for c in numeric_cols if c != x_col],
538
+ key="bi_y")
539
+
540
+ # Clean data for analysis
541
+ plot_df = df[[x_col, y_col]].dropna()
542
+
543
+ if len(plot_df) > 0:
544
+ # Create comprehensive visualization
545
+ fig = make_subplots(rows=2, cols=3,
546
+ subplot_titles=("Scatter Plot", "Hexbin Plot", "Density Contour",
547
+ "Marginal Distributions", "Residuals", "Statistics"),
548
+ specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
549
+ [{"type": "xy"}, {"type": "xy"}, {"type": "domain"}]])
550
+
551
+ # Scatter plot with trendline
552
+ fig.add_trace(go.Scatter(x=plot_df[x_col], y=plot_df[y_col],
553
+ mode='markers', name="Scatter",
554
+ marker=dict(size=5, opacity=0.6, color='#42a5f5')),
555
+ row=1, col=1)
556
+
557
+ # Add trendline
558
+ try:
559
+ z = np.polyfit(plot_df[x_col], plot_df[y_col], 1)
560
+ p = np.poly1d(z)
561
+ x_range = np.linspace(plot_df[x_col].min(), plot_df[x_col].max(), 100)
562
+ fig.add_trace(go.Scatter(x=x_range, y=p(x_range),
563
+ mode='lines', name="Trend",
564
+ line=dict(color='red', width=2)), row=1, col=1)
565
+ except:
566
+ pass
567
+
568
+ # Hexbin plot
569
+ fig.add_trace(go.Histogram2d(x=plot_df[x_col], y=plot_df[y_col],
570
+ colorscale='Viridis',
571
+ name="Hexbin"), row=1, col=2)
572
+
573
+ # Density contour
574
+ fig.add_trace(go.Histogram2dContour(x=plot_df[x_col], y=plot_df[y_col],
575
+ colorscale='Viridis',
576
+ name="Contour"), row=1, col=3)
577
+
578
+ # Marginal distributions
579
+ fig.add_trace(go.Histogram(x=plot_df[x_col], name=f"{x_col}",
580
+ marker_color='#66bb6a'), row=2, col=1)
581
+ fig.add_trace(go.Histogram(y=plot_df[y_col], name=f"{y_col}",
582
+ marker_color='#ffa726', orientation='h'),
583
+ row=2, col=1)
584
+
585
+ # Residuals
586
+ try:
587
+ residuals = plot_df[y_col] - p(plot_df[x_col])
588
+ fig.add_trace(go.Scatter(x=plot_df[x_col], y=residuals,
589
+ mode='markers', name="Residuals",
590
+ marker=dict(size=3, opacity=0.5, color='#ab47bc')),
591
+ row=2, col=2)
592
+ fig.add_hline(y=0, line_dash="dash", line_color="red", row=2, col=2)
593
+ except:
594
+ pass
595
+
596
+ # Statistics
597
+ corr = plot_df[x_col].corr(plot_df[y_col])
598
+ stats_text = f"""
599
+ <b>Statistics</b><br>
600
+ Correlation: {corr:.4f}<br>
601
+ R²: {corr**2:.4f}<br>
602
+ Covariance: {plot_df[x_col].cov(plot_df[y_col]):.4f}<br>
603
+ Sample Size: {len(plot_df)}<br>
604
+ """
605
+
606
+ fig.add_annotation(x=0.5, y=0.5, text=stats_text,
607
+ showarrow=False, font=dict(size=10),
608
+ row=2, col=3, align='left')
609
+
610
+ fig.update_layout(height=800, title_text=f"Bivariate Analysis: {x_col} vs {y_col}")
611
+ st.plotly_chart(fig, use_container_width=True)
612
+
613
+ # Correlation interpretation
614
+ if abs(corr) > 0.7:
615
+ st.success(f"✅ Strong {'positive' if corr > 0 else 'negative'} correlation detected")
616
+ elif abs(corr) > 0.3:
617
+ st.info(f"ℹ️ Moderate {'positive' if corr > 0 else 'negative'} correlation detected")
618
+ else:
619
+ st.warning(f"⚠️ Weak or no correlation detected")
620
+ else:
621
+ st.warning("⚠️ Need at least 2 numeric columns for this analysis")
622
+
623
+ elif analysis_type == "Numeric vs Categorical":
624
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
625
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
626
+
627
+ if numeric_cols and categorical_cols:
628
+ col1, col2 = st.columns(2)
629
+ with col1:
630
+ num_col = st.selectbox("Select numeric column", numeric_cols, key="bi_num")
631
+ with col2:
632
+ cat_col = st.selectbox("Select categorical column", categorical_cols, key="bi_cat")
633
+
634
+ # Clean data
635
+ plot_df = df[[num_col, cat_col]].dropna()
636
+
637
+ if len(plot_df) > 0 and plot_df[cat_col].nunique() <= 30:
638
+ # Create visualizations
639
+ fig = make_subplots(rows=2, cols=2,
640
+ subplot_titles=("Box Plot", "Violin Plot",
641
+ "Strip Plot", "Bar Chart (Means ± SD)"),
642
+ specs=[[{"type": "xy"}, {"type": "xy"}],
643
+ [{"type": "xy"}, {"type": "xy"}]])
644
+
645
+ # Box plot
646
+ fig.add_trace(go.Box(x=plot_df[cat_col], y=plot_df[num_col],
647
+ name="Box Plot", marker_color='#42a5f5'), row=1, col=1)
648
+
649
+ # Violin plot
650
+ fig.add_trace(go.Violin(x=plot_df[cat_col], y=plot_df[num_col],
651
+ box_visible=True, line_color='black',
652
+ fillcolor='#66bb6a', opacity=0.6,
653
+ name="Violin Plot"), row=1, col=2)
654
+
655
+ # Strip plot
656
+ fig.add_trace(go.Scatter(x=plot_df[cat_col], y=plot_df[num_col],
657
+ mode='markers', name="Strip Plot",
658
+ marker=dict(size=3, opacity=0.3, color='#ffa726')),
659
+ row=2, col=1)
660
+
661
+ # Bar chart with error bars
662
+ stats_by_cat = plot_df.groupby(cat_col)[num_col].agg(['mean', 'std', 'count']).reset_index()
663
+ stats_by_cat = stats_by_cat.sort_values('mean', ascending=False).head(15)
664
+
665
+ fig.add_trace(go.Bar(x=stats_by_cat[cat_col], y=stats_by_cat['mean'],
666
+ error_y=dict(type='data', array=stats_by_cat['std']),
667
+ name="Mean ± SD", marker_color='#ab47bc'),
668
+ row=2, col=2)
669
+
670
+ fig.update_layout(height=800, title_text=f"{num_col} by {cat_col}")
671
+ st.plotly_chart(fig, use_container_width=True)
672
+
673
+ # ANOVA test for groups with >2 categories
674
+ if plot_df[cat_col].nunique() >= 2:
675
+ groups = [group[num_col].values for name, group in plot_df.groupby(cat_col)]
676
+ if all(len(g) > 0 for g in groups):
677
+ f_stat, p_val = stats.f_oneway(*groups)
678
+ st.write(f"**One-way ANOVA Results:** F-statistic = {f_stat:.4f}, p-value = {p_val:.4f}")
679
+ if p_val < 0.05:
680
+ st.success("✅ Significant differences exist between groups")
681
+ else:
682
+ st.info("ℹ️ No significant differences found between groups")
683
+ elif plot_df[cat_col].nunique() > 30:
684
+ st.warning(f"⚠️ Categorical column has {plot_df[cat_col].nunique()} unique values. Consider grouping or selecting another column.")
685
+ else:
686
+ st.warning("⚠️ Need both numeric and categorical columns for this analysis")
687
+
688
+ elif analysis_type == "Categorical vs Categorical":
689
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
690
+
691
+ if len(categorical_cols) >= 2:
692
+ col1, col2 = st.columns(2)
693
+ with col1:
694
+ cat1 = st.selectbox("Select first categorical column", categorical_cols, key="bi_cat1")
695
+ with col2:
696
+ cat2 = st.selectbox("Select second categorical column",
697
+ [c for c in categorical_cols if c != cat1], key="bi_cat2")
698
+
699
+ # Create contingency table
700
+ contingency = pd.crosstab(df[cat1], df[cat2])
701
+
702
+ if contingency.size > 0:
703
+ fig = make_subplots(rows=1, cols=2,
704
+ subplot_titles=("Stacked Bar Chart", "Heatmap"),
705
+ specs=[[{"type": "xy"}, {"type": "heatmap"}]])
706
+
707
+ # Stacked bar chart
708
+ for col in contingency.columns[:10]: # Limit to 10 categories
709
+ fig.add_trace(go.Bar(x=contingency.index[:10], y=contingency[col][:10],
710
+ name=str(col)), row=1, col=1)
711
+
712
+ # Heatmap
713
+ fig.add_trace(go.Heatmap(z=contingency.values[:10, :10],
714
+ x=contingency.columns[:10].astype(str),
715
+ y=contingency.index[:10].astype(str),
716
+ colorscale='Viridis',
717
+ text=contingency.values[:10, :10],
718
+ texttemplate="%{text}"), row=1, col=2)
719
+
720
+ fig.update_layout(height=600, title_text=f"Relationship: {cat1} vs {cat2}",
721
+ barmode='stack')
722
+ st.plotly_chart(fig, use_container_width=True)
723
+
724
+ # Chi-square test
725
+ from scipy.stats import chi2_contingency
726
+ chi2, p_val, dof, expected = chi2_contingency(contingency)
727
+
728
+ st.write(f"**Chi-square Test Results:**")
729
+ st.write(f"χ² = {chi2:.4f}, df = {dof}, p-value = {p_val:.4f}")
730
+
731
+ if p_val < 0.05:
732
+ st.success("✅ Significant association found between variables")
733
+
734
+ # Cramer's V for effect size
735
+ n = contingency.sum().sum()
736
+ cramer_v = np.sqrt(chi2 / (n * (min(contingency.shape) - 1)))
737
+ st.write(f"**Cramer's V (effect size):** {cramer_v:.4f}")
738
+ else:
739
+ st.info("ℹ️ No significant association found")
740
+ else:
741
+ st.warning("⚠️ Need at least 2 categorical columns for this analysis")
742
+
743
+ except Exception as e:
744
+ st.error(f"❌ Error in bivariate analysis: {str(e)}")
745
+ st.info("💡 Tip: Check if selected columns have sufficient data for analysis")
746
+
747
+ st.markdown('</div>', unsafe_allow_html=True)
748
+
749
+ with tab5:
750
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
751
+ st.subheader("📈 Multivariate Analysis")
752
+
753
+ try:
754
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
755
+
756
+ if len(numeric_cols) >= 3:
757
+ analysis_type = st.radio("Select analysis type",
758
+ ["Correlation Matrix", "Parallel Coordinates",
759
+ "3D Scatter", "Radar Chart"],
760
+ horizontal=True, key="multivariate_type")
761
+
762
+ if analysis_type == "Correlation Matrix":
763
+ corr_matrix = df[numeric_cols].corr()
764
+
765
+ fig = px.imshow(corr_matrix,
766
+ text_auto=True,
767
+ aspect="auto",
768
+ color_continuous_scale='RdBu_r',
769
+ title="Correlation Matrix Heatmap",
770
+ zmin=-1, zmax=1)
771
+
772
+ fig.update_layout(height=700)
773
+ st.plotly_chart(fig, use_container_width=True)
774
+
775
+ # Find highly correlated pairs
776
+ high_corr = []
777
+ for i in range(len(numeric_cols)):
778
+ for j in range(i+1, len(numeric_cols)):
779
+ if abs(corr_matrix.iloc[i, j]) > 0.7:
780
+ high_corr.append({
781
+ 'Feature 1': numeric_cols[i],
782
+ 'Feature 2': numeric_cols[j],
783
+ 'Correlation': corr_matrix.iloc[i, j]
784
+ })
785
+
786
+ if high_corr:
787
+ st.subheader("🔍 Highly Correlated Pairs (|r| > 0.7)")
788
+ for item in high_corr:
789
+ st.write(f"• **{item['Feature 1']}** & **{item['Feature 2']}**: {item['Correlation']:.4f}")
790
+
791
+ elif analysis_type == "Parallel Coordinates":
792
+ # Select dimensions
793
+ selected_dims = st.multiselect("Select dimensions (columns)",
794
+ numeric_cols,
795
+ default=numeric_cols[:min(4, len(numeric_cols))])
796
+
797
+ if len(selected_dims) >= 2:
798
+ # Optional color dimension
799
+ color_dim = st.selectbox("Color by", ["None"] + numeric_cols +
800
+ df.select_dtypes(include=['object', 'category']).columns.tolist())
801
+
802
+ plot_df = df[selected_dims].dropna()
803
+
804
+ if len(plot_df) > 0:
805
+ if color_dim == "None":
806
+ fig = px.parallel_coordinates(plot_df,
807
+ dimensions=selected_dims,
808
+ title="Parallel Coordinates Plot")
809
+ else:
810
+ if color_dim in numeric_cols:
811
+ fig = px.parallel_coordinates(plot_df,
812
+ dimensions=selected_dims,
813
+ color=color_dim,
814
+ color_continuous_scale=px.colors.diverging.RdBu,
815
+ title=f"Parallel Coordinates colored by {color_dim}")
816
+ else:
817
+ # Categorical color
818
+ temp_df = df[selected_dims + [color_dim]].dropna()
819
+ fig = px.parallel_coordinates(temp_df,
820
+ dimensions=selected_dims,
821
+ color=color_dim,
822
+ title=f"Parallel Coordinates colored by {color_dim}")
823
+
824
+ fig.update_layout(height=600)
825
+ st.plotly_chart(fig, use_container_width=True)
826
+
827
+ elif analysis_type == "3D Scatter":
828
+ if len(numeric_cols) >= 3:
829
+ col1, col2, col3 = st.columns(3)
830
+ with col1:
831
+ x_3d = st.selectbox("X axis", numeric_cols, key="3d_x")
832
+ with col2:
833
+ y_3d = st.selectbox("Y axis", [c for c in numeric_cols if c != x_3d], key="3d_y")
834
+ with col3:
835
+ z_3d = st.selectbox("Z axis", [c for c in numeric_cols if c not in [x_3d, y_3d]],
836
+ key="3d_z")
837
+
838
+ color_3d = st.selectbox("Color by", ["None"] +
839
+ df.select_dtypes(include=['object', 'category']).columns.tolist())
840
+
841
+ plot_df = df[[x_3d, y_3d, z_3d]].dropna()
842
+
843
+ if len(plot_df) > 0:
844
+ if color_3d == "None":
845
+ fig = px.scatter_3d(plot_df, x=x_3d, y=y_3d, z=z_3d,
846
+ title=f"3D Scatter Plot",
847
+ opacity=0.7)
848
+ else:
849
+ temp_df = df[[x_3d, y_3d, z_3d, color_3d]].dropna()
850
+ fig = px.scatter_3d(temp_df, x=x_3d, y=y_3d, z=z_3d,
851
+ color=color_3d,
852
+ title=f"3D Scatter colored by {color_3d}",
853
+ opacity=0.7)
854
+
855
+ fig.update_layout(height=700)
856
+ st.plotly_chart(fig, use_container_width=True)
857
+
858
+ elif analysis_type == "Radar Chart":
859
+ # Select features for radar
860
+ radar_features = st.multiselect("Select features for radar chart",
861
+ numeric_cols,
862
+ default=numeric_cols[:min(5, len(numeric_cols))])
863
+
864
+ if len(radar_features) >= 3:
865
+ # Select how many samples to show
866
+ n_samples = st.slider("Number of samples to show", 1, min(10, len(df)), 3)
867
+
868
+ fig = go.Figure()
869
+
870
+ for i in range(n_samples):
871
+ sample = df.iloc[i][radar_features].values
872
+ fig.add_trace(go.Scatterpolar(
873
+ r=sample,
874
+ theta=radar_features,
875
+ fill='toself',
876
+ name=f'Sample {i}'
877
+ ))
878
+
879
+ fig.update_layout(
880
+ polar=dict(
881
+ radialaxis=dict(
882
+ visible=True,
883
+ range=[df[radar_features].min().min(), df[radar_features].max().max()]
884
+ )),
885
+ title=f"Radar Chart - First {n_samples} Samples",
886
+ height=600
887
+ )
888
+
889
+ st.plotly_chart(fig, use_container_width=True)
890
+ else:
891
+ st.warning("⚠️ Need at least 3 numeric columns for multivariate analysis")
892
+
893
+ except Exception as e:
894
+ st.error(f"❌ Error in multivariate analysis: {str(e)}")
895
+ st.info("💡 Tip: Ensure you have enough numeric columns for multivariate analysis")
896
+
897
+ st.markdown('</div>', unsafe_allow_html=True)
898
+
899
+ with tab6:
900
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
901
+ st.subheader("🎯 Pattern Discovery")
902
+
903
+ try:
904
+ analysis_type = st.radio("Select pattern discovery method",
905
+ ["Clustering Visualization", "Outlier Detection",
906
+ "Trend Detection", "Seasonal Patterns"],
907
+ horizontal=True, key="pattern_type")
908
+
909
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
910
+
911
+ if analysis_type == "Clustering Visualization":
912
+ if len(numeric_cols) >= 2:
913
+ from sklearn.cluster import KMeans
914
+ from sklearn.preprocessing import StandardScaler
915
+
916
+ # Select features for clustering
917
+ cluster_features = st.multiselect("Select features for clustering",
918
+ numeric_cols,
919
+ default=numeric_cols[:min(3, len(numeric_cols))])
920
+
921
+ if len(cluster_features) >= 2:
922
+ n_clusters = st.slider("Number of clusters", 2, 8, 3)
923
+
924
+ # Prepare data
925
+ X = df[cluster_features].dropna()
926
+
927
+ if len(X) > 0:
928
+ # Scale data
929
+ scaler = StandardScaler()
930
+ X_scaled = scaler.fit_transform(X)
931
+
932
+ # Perform clustering
933
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
934
+ clusters = kmeans.fit_predict(X_scaled)
935
+
936
+ # Create visualization
937
+ if len(cluster_features) == 2:
938
+ fig = px.scatter(x=X[cluster_features[0]], y=X[cluster_features[1]],
939
+ color=clusters.astype(str),
940
+ title=f"K-Means Clustering (k={n_clusters})",
941
+ labels={'x': cluster_features[0], 'y': cluster_features[1],
942
+ 'color': 'Cluster'})
943
+ elif len(cluster_features) >= 3:
944
+ fig = px.scatter_3d(x=X[cluster_features[0]], y=X[cluster_features[1]],
945
+ z=X[cluster_features[2]], color=clusters.astype(str),
946
+ title=f"K-Means Clustering (k={n_clusters})",
947
+ labels={cluster_features[0]: cluster_features[0],
948
+ cluster_features[1]: cluster_features[1],
949
+ cluster_features[2]: cluster_features[2],
950
+ 'color': 'Cluster'})
951
+
952
+ fig.update_layout(height=600)
953
+ st.plotly_chart(fig, use_container_width=True)
954
+
955
+ # Cluster statistics
956
+ st.subheader("📊 Cluster Statistics")
957
+ X['Cluster'] = clusters
958
+ cluster_stats = X.groupby('Cluster')[cluster_features].mean()
959
+ st.dataframe(cluster_stats.style.format("{:.4f}"))
960
+
961
+ elif analysis_type == "Outlier Detection":
962
+ if len(numeric_cols) >= 2:
963
+ from sklearn.ensemble import IsolationForest
964
+
965
+ # Select features for outlier detection
966
+ outlier_features = st.multiselect("Select features for outlier detection",
967
+ numeric_cols,
968
+ default=numeric_cols[:min(3, len(numeric_cols))])
969
+
970
+ if len(outlier_features) >= 2:
971
+ contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)
972
+
973
+ # Prepare data
974
+ X = df[outlier_features].dropna()
975
+
976
+ if len(X) > 0:
977
+ # Detect outliers
978
+ iso_forest = IsolationForest(contamination=contamination, random_state=42)
979
+ outliers = iso_forest.fit_predict(X)
980
+
981
+ # Create visualization
982
+ if len(outlier_features) == 2:
983
+ fig = px.scatter(x=X[outlier_features[0]], y=X[outlier_features[1]],
984
+ color=outliers,
985
+ color_continuous_scale=['blue', 'red'],
986
+ title=f"Outlier Detection (contamination={contamination})",
987
+ labels={'x': outlier_features[0], 'y': outlier_features[1],
988
+ 'color': 'Outlier'})
989
+ elif len(outlier_features) >= 3:
990
+ fig = px.scatter_3d(x=X[outlier_features[0]], y=X[outlier_features[1]],
991
+ z=X[outlier_features[2]], color=outliers,
992
+ color_continuous_scale=['blue', 'red'],
993
+ title=f"Outlier Detection (contamination={contamination})",
994
+ labels={outlier_features[0]: outlier_features[0],
995
+ outlier_features[1]: outlier_features[1],
996
+ outlier_features[2]: outlier_features[2],
997
+ 'color': 'Outlier'})
998
+
999
+ fig.update_layout(height=600)
1000
+ st.plotly_chart(fig, use_container_width=True)
1001
+
1002
+ # Outlier statistics
1003
+ n_outliers = (outliers == -1).sum()
1004
+ st.write(f"**Outliers detected:** {n_outliers} ({n_outliers/len(X)*100:.2f}%)")
1005
+
1006
+ elif analysis_type == "Trend Detection":
1007
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
1008
+
1009
+ if datetime_cols and numeric_cols:
1010
+ date_col = st.selectbox("Select date column", datetime_cols)
1011
+ value_col = st.selectbox("Select value column", numeric_cols)
1012
+
1013
+ # Prepare time series data
1014
+ ts_df = df[[date_col, value_col]].dropna().sort_values(date_col)
1015
+
1016
+ if len(ts_df) > 10:
1017
+ # Calculate moving averages
1018
+ window = st.slider("Moving average window", 2, 30, 7)
1019
+ ts_df['MA'] = ts_df[value_col].rolling(window=window).mean()
1020
+
1021
+ # Detect trend using linear regression
1022
+ from sklearn.linear_model import LinearRegression
1023
+
1024
+ X = np.arange(len(ts_df)).reshape(-1, 1)
1025
+ y = ts_df[value_col].values
1026
+
1027
+ model = LinearRegression()
1028
+ model.fit(X, y)
1029
+ trend = model.predict(X)
1030
+
1031
+ # Create visualization
1032
+ fig = go.Figure()
1033
+ fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df[value_col],
1034
+ mode='lines', name='Original'))
1035
+ fig.add_trace(go.Scatter(x=ts_df[date_col], y=ts_df['MA'],
1036
+ mode='lines', name=f'{window}-period MA',
1037
+ line=dict(color='orange')))
1038
+ fig.add_trace(go.Scatter(x=ts_df[date_col], y=trend,
1039
+ mode='lines', name='Linear Trend',
1040
+ line=dict(color='red', dash='dash')))
1041
+
1042
+ fig.update_layout(title="Trend Detection",
1043
+ xaxis_title="Date",
1044
+ yaxis_title=value_col,
1045
+ height=500)
1046
+ st.plotly_chart(fig, use_container_width=True)
1047
+
1048
+ # Trend statistics
1049
+ slope = model.coef_[0]
1050
+ st.write(f"**Trend slope:** {slope:.4f} units per time step")
1051
+ if slope > 0:
1052
+ st.success("✅ Upward trend detected")
1053
+ elif slope < 0:
1054
+ st.warning("⚠�� Downward trend detected")
1055
+ else:
1056
+ st.info("ℹ️ No clear trend detected")
1057
+
1058
+ elif analysis_type == "Seasonal Patterns":
1059
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
1060
+
1061
+ if datetime_cols and numeric_cols:
1062
+ date_col = st.selectbox("Select date column", datetime_cols, key="seasonal_date")
1063
+ value_col = st.selectbox("Select value column", numeric_cols, key="seasonal_value")
1064
+
1065
+ # Extract seasonal components
1066
+ df_temp = df[[date_col, value_col]].dropna()
1067
+ df_temp['year'] = pd.DatetimeIndex(df_temp[date_col]).year
1068
+ df_temp['month'] = pd.DatetimeIndex(df_temp[date_col]).month
1069
+ df_temp['quarter'] = pd.DatetimeIndex(df_temp[date_col]).quarter
1070
+ df_temp['dayofweek'] = pd.DatetimeIndex(df_temp[date_col]).dayofweek
1071
+
1072
+ # Create seasonal visualizations
1073
+ fig = make_subplots(rows=2, cols=2,
1074
+ subplot_titles=("Year-over-Year", "Monthly Pattern",
1075
+ "Quarterly Pattern", "Day of Week Pattern"),
1076
+ specs=[[{"type": "xy"}, {"type": "xy"}],
1077
+ [{"type": "xy"}, {"type": "xy"}]])
1078
+
1079
+ # Year-over-Year
1080
+ yearly_avg = df_temp.groupby('year')[value_col].mean().reset_index()
1081
+ fig.add_trace(go.Scatter(x=yearly_avg['year'], y=yearly_avg[value_col],
1082
+ mode='lines+markers', name="Yearly Avg"), row=1, col=1)
1083
+
1084
+ # Monthly pattern
1085
+ monthly_avg = df_temp.groupby('month')[value_col].mean().reset_index()
1086
+ month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
1087
+ 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
1088
+ fig.add_trace(go.Bar(x=[month_names[m-1] for m in monthly_avg['month']],
1089
+ y=monthly_avg[value_col], name="Monthly Avg"), row=1, col=2)
1090
+
1091
+ # Quarterly pattern
1092
+ quarterly_avg = df_temp.groupby('quarter')[value_col].mean().reset_index()
1093
+ quarter_names = ['Q1', 'Q2', 'Q3', 'Q4']
1094
+ fig.add_trace(go.Bar(x=[quarter_names[q-1] for q in quarterly_avg['quarter']],
1095
+ y=quarterly_avg[value_col], name="Quarterly Avg"), row=2, col=1)
1096
+
1097
+ # Day of week pattern
1098
+ dow_avg = df_temp.groupby('dayofweek')[value_col].mean().reset_index()
1099
+ day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
1100
+ fig.add_trace(go.Bar(x=[day_names[d] for d in dow_avg['dayofweek']],
1101
+ y=dow_avg[value_col], name="Day of Week Avg"), row=2, col=2)
1102
+
1103
+ fig.update_layout(height=800, title_text="Seasonal Pattern Analysis")
1104
+ st.plotly_chart(fig, use_container_width=True)
1105
+
1106
+ except Exception as e:
1107
+ st.error(f"❌ Error in pattern discovery: {str(e)}")
1108
+ st.info("💡 Tip: Ensure you have sufficient data for pattern detection")
1109
+
1110
+ st.markdown('</div>', unsafe_allow_html=True)
1111
+
1112
+ except Exception as e:
1113
+ st.error(f"❌ Critical error in EDA: {str(e)}")
1114
+ st.info("💡 Please check your dataset and try again")
1115
+
1116
+ # Export options
1117
+ st.markdown("---")
1118
+ st.markdown("### 📥 Export EDA Report")
1119
+
1120
+ try:
1121
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
1122
+
1123
+ report_text = f"""
1124
+ EXPLORATORY DATA ANALYSIS REPORT
1125
+ =================================
1126
+
1127
+ Dataset Information:
1128
+ • Total Rows: {df.shape[0]:,}
1129
+ • Total Columns: {df.shape[1]}
1130
+ • Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB
1131
+
1132
+ Column Types:
1133
+ • Numeric: {len(numeric_cols)}
1134
+ • Categorical: {len(df.select_dtypes(include=['object', 'category']).columns)}
1135
+ • Datetime: {len(df.select_dtypes(include=['datetime64']).columns)}
1136
+
1137
+ Data Quality:
1138
+ • Missing Values: {df.isnull().sum().sum():,}
1139
+ • Complete Cases: {df.dropna().shape[0]:,}
1140
+ • Duplicate Rows: {df.duplicated().sum():,}
1141
+
1142
+ Analysis Performed:
1143
+ • Data Overview
1144
+ • Missing Data Analysis
1145
+ • Univariate Analysis
1146
+ • Bivariate Analysis
1147
+ • Multivariate Analysis
1148
+ • Pattern Discovery
1149
+ """
1150
+
1151
+ st.download_button(
1152
+ label="📥 Download EDA Report",
1153
+ data=report_text,
1154
+ file_name="eda_report.txt",
1155
+ mime="text/plain",
1156
+ use_container_width=True
1157
+ )
1158
+ except Exception as e:
1159
+ st.error(f"❌ Error generating report: {str(e)}")
explainability.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from sklearn.inspection import permutation_importance
7
+ import matplotlib.pyplot as plt
8
+ import shap
9
+
10
def explain_model(model, X, y=None, feature_names=None):
    """
    Render a Streamlit UI that explains a fitted model's predictions.

    Three tabs are shown:
      1. Feature Importance - built-in ``feature_importances_`` or
         permutation importance (the latter requires ``y``).
      2. SHAP Values - summary plot plus a per-sample waterfall plot,
         computed on at most 100 rows for speed.
      3. Partial Dependence - PDP and optional ICE curves for one feature.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` (and optionally
        ``feature_importances_``).
    X : pandas DataFrame (or 2-D array) of the model's input features.
    y : array-like, optional
        Target values; needed only for permutation importance.
    feature_names : sequence of str, optional
        Defaults to ``X.columns`` when available, otherwise generic labels.
    """
    st.subheader("🔍 Model Explainability")

    if feature_names is None:
        feature_names = X.columns if hasattr(X, 'columns') else [f"Feature {i}" for i in range(X.shape[1])]

    # Create tabs for different explanation methods
    tab1, tab2, tab3 = st.tabs(["Feature Importance", "SHAP Values", "Partial Dependence"])

    with tab1:
        st.markdown("### 📊 Feature Importance")

        # Method selection
        method = st.radio(
            "Importance method",
            ["Built-in", "Permutation"],
            horizontal=True
        )

        if method == "Built-in":
            if hasattr(model, 'feature_importances_'):
                importance = model.feature_importances_
                importance_df = pd.DataFrame({
                    'feature': feature_names,
                    'importance': importance
                }).sort_values('importance', ascending=False)

                fig = px.bar(importance_df.head(20), x='importance', y='feature',
                             orientation='h', title="Feature Importance (Built-in)")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Model doesn't have built-in feature importance")

        else:  # Permutation importance
            if y is not None:
                with st.spinner("Calculating permutation importance..."):
                    # random_state makes the shuffles (and thus the chart)
                    # reproducible across Streamlit reruns.
                    perm_importance = permutation_importance(model, X, y, n_repeats=10,
                                                             random_state=42)

                importance_df = pd.DataFrame({
                    'feature': feature_names,
                    'importance': perm_importance.importances_mean,
                    'std': perm_importance.importances_std
                }).sort_values('importance', ascending=False)

                fig = go.Figure()
                fig.add_trace(go.Bar(
                    x=importance_df['importance'].head(20),
                    y=importance_df['feature'].head(20),
                    orientation='h',
                    error_x=dict(
                        type='data',
                        array=importance_df['std'].head(20),
                        visible=True
                    )
                ))
                fig.update_layout(title="Permutation Importance (with error bars)",
                                  xaxis_title="Importance")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Need target values for permutation importance")

    with tab2:
        st.markdown("### 📈 SHAP Values")

        if hasattr(model, 'predict'):
            with st.spinner("Calculating SHAP values (this may take a moment)..."):
                try:
                    # Work on a subset of rows throughout for speed.
                    X_sample = X[:100]

                    # Create explainer based on model type
                    if str(type(model)).find('sklearn') != -1:
                        explainer = shap.Explainer(model, X_sample)
                    else:
                        explainer = shap.TreeExplainer(model) if hasattr(model, 'feature_importances_') else shap.Explainer(model, X_sample)

                    # Calculate SHAP values
                    shap_values = explainer(X_sample)

                    # Summary plot
                    st.markdown("#### SHAP Summary Plot")
                    fig, ax = plt.subplots()
                    shap.summary_plot(shap_values, X_sample, feature_names=feature_names, show=False)
                    st.pyplot(fig)
                    # Close this figure specifically; plt.close() only closes
                    # the "current" figure, which SHAP may have replaced.
                    plt.close(fig)

                    # Waterfall plot for a single prediction
                    st.markdown("#### Single Prediction Explanation")
                    sample_idx = st.slider("Select sample index", 0, min(99, len(X)-1), 0)

                    fig, ax = plt.subplots()
                    shap.waterfall_plot(shap_values[sample_idx], show=False)
                    st.pyplot(fig)
                    plt.close(fig)

                except Exception as e:
                    st.error(f"Error calculating SHAP values: {str(e)}")
                    st.info("Try using a smaller sample or a different model type")
        else:
            st.warning("Model doesn't support prediction")

    with tab3:
        st.markdown("### 📉 Partial Dependence Plots")

        if hasattr(model, 'predict') and len(feature_names) > 0:
            from sklearn.inspection import partial_dependence

            selected_feature = st.selectbox("Select feature for PDP", feature_names)

            if selected_feature:
                feature_idx = list(feature_names).index(selected_feature)

                # Calculate partial dependence
                pdp = partial_dependence(model, X, [feature_idx], grid_resolution=50)
                # sklearn 1.1 renamed the grid key from 'values' to
                # 'grid_values' (and removed 'values' in 1.3) - support both.
                grid = pdp['grid_values'][0] if 'grid_values' in pdp else pdp['values'][0]

                # Create plot
                fig = go.Figure()
                fig.add_trace(go.Scatter(
                    x=grid,
                    y=pdp['average'][0],
                    mode='lines+markers',
                    name='Partial Dependence'
                ))

                fig.update_layout(
                    title=f"Partial Dependence Plot for {selected_feature}",
                    xaxis_title=selected_feature,
                    yaxis_title="Prediction"
                )

                st.plotly_chart(fig, use_container_width=True)

                # Individual conditional expectation (ICE) plots
                if st.checkbox("Show ICE plots"):
                    # kind='individual' evaluates every sample on one shared
                    # grid. (Calling partial_dependence per single row, as the
                    # original did, builds a 1-point grid from that row alone,
                    # so the curves could not line up with the PDP x-axis.)
                    n_ice = min(10, X.shape[0])  # show up to 10 lines
                    X_ice = X.iloc[:n_ice] if hasattr(X, 'iloc') else X[:n_ice]
                    ice = partial_dependence(model, X_ice, [feature_idx],
                                             grid_resolution=50, kind='individual')
                    ice_grid = ice['grid_values'][0] if 'grid_values' in ice else ice['values'][0]

                    fig = go.Figure()
                    for i, curve in enumerate(ice['individual'][0]):
                        fig.add_trace(go.Scatter(
                            x=ice_grid,
                            y=curve,
                            mode='lines',
                            name=f'Sample {i}',
                            line=dict(width=1, color='lightgray')
                        ))

                    # Add average line
                    fig.add_trace(go.Scatter(
                        x=grid,
                        y=pdp['average'][0],
                        mode='lines',
                        name='Average',
                        line=dict(width=3, color='red')
                    ))

                    fig.update_layout(
                        title=f"ICE Plots for {selected_feature}",
                        xaxis_title=selected_feature,
                        yaxis_title="Prediction"
                    )

                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("Need more features for partial dependence plots")
insights.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from scipy import stats
7
+
8
def generate_business_insights(df):
    """Render the "Automated Business Insights" Streamlit page for *df*.

    Builds five tabs - Overview, Trends & Patterns, Key Drivers, Anomalies
    and Recommendations - each computing lightweight statistics on the
    DataFrame and displaying Plotly charts and Streamlit widgets. The last
    tab also offers the collected recommendations as a downloadable report.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to analyze.

    NOTE(review): assumes ``df`` is non-empty; several expressions divide by
    ``df.shape[0]`` and would raise ZeroDivisionError on an empty frame.
    """

    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>💡 Automated Business Insights</h2>
        <p style='color: gray;'>AI-powered analysis to uncover hidden patterns and opportunities</p>
    </div>
    """, unsafe_allow_html=True)

    # Get column types (these lists drive every tab below)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    # Create tabs for different insight categories
    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "📊 Overview", "📈 Trends & Patterns", "🎯 Key Drivers",
        "⚠️ Anomalies", "💡 Recommendations"
    ])

    with tab1:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Dataset Overview")

        # Key metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Records", f"{df.shape[0]:,}")
        with col2:
            st.metric("Total Features", df.shape[1])
        with col3:
            # `completeness` is reused by the export report in tab5;
            # Streamlit `with` blocks always execute, so it is defined there.
            completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
            st.metric("Data Completeness", f"{completeness:.1f}%")
        with col4:
            if numeric_cols:
                total_value = df[numeric_cols].sum().sum()
                st.metric("Total Value", f"{total_value:,.0f}" if total_value < 1e6 else f"{total_value/1e6:,.1f}M")

        # Column composition
        st.markdown("### 📋 Column Composition")

        comp_data = {
            'Type': ['Numeric', 'Categorical', 'Datetime'],
            'Count': [len(numeric_cols), len(categorical_cols), len(datetime_cols)]
        }

        fig = px.pie(comp_data, values='Count', names='Type',
                     title="Column Type Distribution",
                     color_discrete_sequence=px.colors.qualitative.Set3)
        st.plotly_chart(fig, use_container_width=True)

        # Data quality score
        st.markdown("### 📊 Data Quality Score")

        # NOTE(review): quality_score is never read afterwards (dead variable);
        # the gauge uses avg_quality computed from quality_metrics instead.
        quality_score = 0
        quality_metrics = []

        # Completeness score
        completeness_score = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
        quality_metrics.append(completeness_score)

        # Uniqueness score (avoid duplicates)
        duplicate_pct = (df.duplicated().sum() / df.shape[0]) * 100
        uniqueness_score = 100 - duplicate_pct
        quality_metrics.append(uniqueness_score)

        # Consistency score (data type consistency)
        # NOTE(review): this loop never changes type_consistency - every
        # branch is a no-op, so the consistency score is always 100.
        type_consistency = 100  # Default high
        for col in df.columns:
            if df[col].dtype == 'object':
                # Check if column has mixed types
                try:
                    pd.to_numeric(df[col], errors='raise')
                    # If convertible to numeric, it's consistent
                except:
                    pass  # Object type is fine
            else:
                # Numeric columns are consistent
                pass
        quality_metrics.append(type_consistency)

        # Average quality score
        avg_quality = np.mean(quality_metrics)

        # Display gauge chart
        fig = go.Figure(go.Indicator(
            mode = "gauge+number",
            value = avg_quality,
            domain = {'x': [0, 1], 'y': [0, 1]},
            title = {'text': "Overall Data Quality"},
            gauge = {
                'axis': {'range': [None, 100]},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [0, 50], 'color': "lightgray"},
                    {'range': [50, 80], 'color': "gray"},
                    {'range': [80, 100], 'color': "lightgreen"}],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 90}}))

        st.plotly_chart(fig, use_container_width=True)

        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Trends & Patterns")

        if len(numeric_cols) > 0:
            # Correlation analysis
            if len(numeric_cols) >= 2:
                st.markdown("### 🔗 Key Relationships")

                corr_matrix = df[numeric_cols].corr()

                # Find strongest correlations (upper triangle only, no self-pairs)
                corr_pairs = []
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        corr_pairs.append({
                            'feature1': numeric_cols[i],
                            'feature2': numeric_cols[j],
                            'correlation': corr_matrix.iloc[i, j]
                        })

                corr_pairs.sort(key=lambda x: abs(x['correlation']), reverse=True)

                # Display top 5 correlations
                for pair in corr_pairs[:5]:
                    strength = abs(pair['correlation'])
                    if strength > 0.7:
                        emoji = "🟢"
                        desc = "Strong"
                    elif strength > 0.3:
                        emoji = "🟡"
                        desc = "Moderate"
                    else:
                        emoji = "🔴"
                        desc = "Weak"

                    direction = "positive" if pair['correlation'] > 0 else "negative"

                    st.markdown(
                        f"{emoji} **{pair['feature1']}** & **{pair['feature2']}**: "
                        f"{pair['correlation']:.3f} ({desc} {direction} correlation)"
                    )

                # Insight
                if corr_pairs:
                    st.info(f"💡 **Insight**: {corr_pairs[0]['feature1']} and {corr_pairs[0]['feature2']} "
                            f"have the strongest {'positive' if corr_pairs[0]['correlation'] > 0 else 'negative'} "
                            f"relationship in the dataset.")

            # Distribution insights
            st.markdown("### 📊 Distribution Analysis")

            # |skew| > 1 is the conventional threshold for "highly skewed"
            skewness = df[numeric_cols].skew()
            skewed_cols = skewness[abs(skewness) > 1].index.tolist()

            if skewed_cols:
                st.warning(f"⚠️ **Skewed Features**: {', '.join(skewed_cols[:3])}" +
                           (" and more" if len(skewed_cols) > 3 else ""))
                st.markdown("💡 These features might benefit from transformation for better model performance.")

                # Show distribution of most skewed feature
                if skewed_cols:
                    col_to_show = skewed_cols[0]
                    fig = px.histogram(df, x=col_to_show, nbins=30,
                                       title=f"Distribution of {col_to_show} (Most Skewed)",
                                       marginal="box")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("No numeric columns available for trend analysis")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab3:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🎯 Key Business Drivers")

        if len(numeric_cols) > 0:
            # Find features with highest variance (potential impact)
            variances = df[numeric_cols].var().sort_values(ascending=False)

            st.markdown("### 📊 High Variance Features")
            st.markdown("Features with high variance often indicate key business drivers")

            fig = px.bar(x=variances.index[:10], y=variances.values[:10],
                         title="Top 10 Features by Variance",
                         labels={'x': 'Feature', 'y': 'Variance'})
            st.plotly_chart(fig, use_container_width=True)

            # Feature importance based on mutual information
            if len(numeric_cols) >= 2:
                st.markdown("### 🔍 Predictive Power")

                # Use last numeric column as potential target
                # NOTE(review): heuristic only - the last column is not
                # necessarily the business target.
                target = numeric_cols[-1]
                features = numeric_cols[:-1]

                if len(features) > 0:
                    from sklearn.feature_selection import mutual_info_regression

                    # fillna(0) keeps mutual_info_regression from failing on NaNs
                    mi_scores = mutual_info_regression(df[features].fillna(0), df[target].fillna(0))
                    mi_df = pd.DataFrame({
                        'feature': features,
                        'importance': mi_scores
                    }).sort_values('importance', ascending=False)

                    fig = px.bar(mi_df.head(10), x='importance', y='feature',
                                 orientation='h',
                                 title=f"Feature Importance for Predicting {target}")
                    st.plotly_chart(fig, use_container_width=True)

                    st.info(f"💡 **Key Driver**: {mi_df.iloc[0]['feature']} appears to be the most "
                            f"important factor for predicting {target}")
        else:
            st.info("No numeric columns available for driver analysis")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab4:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("⚠️ Anomaly Detection")

        if len(numeric_cols) > 0:
            # Outlier detection using IQR (Tukey's 1.5*IQR fences)
            outlier_report = []

            for col in numeric_cols:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1

                outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
                outlier_pct = (len(outliers) / len(df)) * 100

                # Only flag columns where more than 5% of rows are outliers
                if outlier_pct > 5:
                    outlier_report.append({
                        'column': col,
                        'outlier_pct': outlier_pct,
                        'lower_bound': Q1 - 1.5 * IQR,
                        'upper_bound': Q3 + 1.5 * IQR
                    })

            if outlier_report:
                st.warning(f"⚠️ Found {len(outlier_report)} columns with significant outliers")

                for item in outlier_report[:5]:
                    st.markdown(f"**{item['column']}**: {item['outlier_pct']:.1f}% outliers "
                                f"(outside [{item['lower_bound']:.2f}, {item['upper_bound']:.2f}])")

                # Visualize outliers for first column
                col_to_show = outlier_report[0]['column']
                fig = px.box(df, y=col_to_show, title=f"Outliers in {col_to_show}")
                st.plotly_chart(fig, use_container_width=True)

                st.markdown("💡 **Recommendation**: Investigate these outliers - they may represent "
                            "unusual but important business events or data quality issues.")
            else:
                st.success("✅ No significant outliers detected in numeric columns")
        else:
            st.info("No numeric columns available for outlier detection")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab5:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("💡 Strategic Recommendations")

        # Generate business recommendations based on data insights
        recommendations = []

        if len(numeric_cols) > 0:
            # Check for growth opportunities (non-negative columns whose max
            # exceeds 10x their min are treated as "high variability")
            growth_cols = []
            for col in numeric_cols:
                if df[col].min() >= 0 and df[col].max() > df[col].min() * 10:
                    growth_cols.append(col)

            if growth_cols:
                recommendations.append({
                    'area': 'Growth Opportunity',
                    'recommendation': f"Focus on {growth_cols[0]} which shows high variability "
                                      f"(range: {df[growth_cols[0]].min():.2f} to {df[growth_cols[0]].max():.2f})",
                    'priority': 'High'
                })

            # Check for efficiency opportunities
            if len(numeric_cols) >= 2:
                # Find features with high correlation - potential redundancy
                corr_matrix = df[numeric_cols].corr()
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        if abs(corr_matrix.iloc[i, j]) > 0.9:
                            recommendations.append({
                                'area': 'Efficiency',
                                'recommendation': f"Consider consolidating {numeric_cols[i]} and {numeric_cols[j]} "
                                                  f"as they are highly correlated ({corr_matrix.iloc[i, j]:.2f})",
                                'priority': 'Medium'
                            })
                            # at most one pairing per row of the matrix
                            break
                    # cap the number of efficiency recommendations
                    if len(recommendations) > 3:
                        break

        if categorical_cols:
            # Check for customer/market segments (only low-cardinality columns)
            for col in categorical_cols[:2]:
                if df[col].nunique() > 1 and df[col].nunique() <= 10:
                    top_segment = df[col].value_counts().index[0]
                    recommendations.append({
                        'area': 'Segmentation',
                        'recommendation': f"Target the dominant segment in {col}: '{top_segment}' "
                                          f"({df[col].value_counts().iloc[0]:,} records)",
                        'priority': 'Medium'
                    })

        # Display recommendations
        if recommendations:
            for rec in recommendations:
                priority_color = "🔴" if rec['priority'] == 'High' else "🟡" if rec['priority'] == 'Medium' else "🟢"
                st.markdown(f"{priority_color} **{rec['area']}**: {rec['recommendation']}")
        else:
            st.info("No specific recommendations generated. Try uploading a dataset with more variety.")

        # Add download insights option
        st.markdown("---")
        st.markdown("### 📥 Export Insights")

        # Plain-text report; `completeness` comes from tab1 above.
        insight_text = f"""
BUSINESS INSIGHTS REPORT
=======================

Dataset: {df.shape[0]} rows × {df.shape[1]} columns

KEY METRICS:
• Total Records: {df.shape[0]:,}
• Total Features: {df.shape[1]}
• Data Completeness: {completeness:.1f}%

COLUMN COMPOSITION:
• Numeric: {len(numeric_cols)}
• Categorical: {len(categorical_cols)}
• Datetime: {len(datetime_cols)}

RECOMMENDATIONS:
"""

        for rec in recommendations:
            insight_text += f"\n• {rec['area']}: {rec['recommendation']} (Priority: {rec['priority']})"

        st.download_button(
            label="📥 Download Insights Report",
            data=insight_text,
            file_name="business_insights.txt",
            mime="text/plain",
            use_container_width=True
        )

        st.markdown('</div>', unsafe_allow_html=True)
ml_pipeline.py ADDED
@@ -0,0 +1,940 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
5
+ from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
6
+ from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
7
+ confusion_matrix, classification_report, roc_curve, auc,
8
+ mean_squared_error, r2_score, mean_absolute_error)
9
+ from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
10
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
11
+ from sklearn.svm import SVC, SVR
12
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
13
+ from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
14
+ GradientBoostingClassifier, GradientBoostingRegressor,
15
+ AdaBoostClassifier, AdaBoostRegressor,
16
+ VotingClassifier, VotingRegressor)
17
+ from xgboost import XGBClassifier, XGBRegressor
18
+ from lightgbm import LGBMClassifier, LGBMRegressor
19
+ import plotly.express as px
20
+ import plotly.graph_objects as go
21
+ from plotly.subplots import make_subplots
22
+ import time
23
+ import warnings
24
+ warnings.filterwarnings('ignore')
25
+
26
class MLPipelineError(Exception):
    """Custom exception for ML pipeline errors.

    NOTE(review): declared for use by this module's callers; it is not
    raised anywhere in the code visible here.
    """
    pass
29
+
30
def validate_ml_data(df, target, features):
    """Validate a dataset for machine learning.

    Checks for an empty frame, a missing target column, missing feature
    columns, too few rows, constant features, and a degenerate or
    high-cardinality classification target.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to validate.
    target : str
        Name of the intended target column.
    features : list of str
        Names of the intended feature columns.

    Returns
    -------
    list of str
        Human-readable issue descriptions; empty when the data looks usable.
    """
    issues = []

    if df.empty:
        issues.append("Dataset is empty")
        return issues

    if target not in df.columns:
        issues.append(f"Target column '{target}' not found in dataset")

    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        issues.append(f"Features not found: {missing_features}")

    # Check for sufficient data
    if df.shape[0] < 10:
        issues.append("Dataset too small (minimum 10 rows required)")

    # Check for constant columns.
    # Bug fix: only inspect features that actually exist - the original
    # indexed df[col] for every requested feature and raised KeyError on a
    # missing feature instead of returning the issue list it had just built.
    for col in features:
        if col in df.columns and df[col].nunique() == 1:
            issues.append(f"Feature '{col}' is constant")

    # Check target for classification (object/category dtype, or numeric
    # with <= 20 distinct values, is treated as a classification target)
    if target in df.columns:
        if df[target].dtype in ['object', 'category'] or df[target].nunique() <= 20:
            if df[target].nunique() == 1:
                issues.append("Target has only one class")
            elif df[target].nunique() > 50:
                issues.append(f"Target has {df[target].nunique()} classes, which may cause issues")

    return issues
63
+
64
def safe_ml_operation(func, *args, **kwargs):
    """Run *func* with the given arguments, converting exceptions to messages.

    Returns a ``(result, error)`` tuple: on success ``error`` is ``None``;
    on failure ``result`` is ``None`` and ``error`` is a human-readable
    string describing what went wrong.
    """
    try:
        return func(*args, **kwargs), None
    except ValueError as exc:
        # Bad data types / values are the most common ML failure mode.
        return None, f"Value Error: {exc}. Check your data types and values."
    except MemoryError:
        return None, "Memory Error: Dataset too large. Try reducing the number of features or using a sample."
    except Exception as exc:
        # Catch-all boundary: surface anything else as a generic ML error.
        return None, f"ML Error: {exc}"
78
+
79
+ def run_ml_pipeline(df):
80
+ """
81
+ Enhanced machine learning pipeline with comprehensive error handling
82
+ """
83
+ st.markdown("""
84
+ <div style='text-align: center; margin-bottom: 2rem;'>
85
+ <h2>🤖 Advanced Machine Learning Pipeline</h2>
86
+ <p style='color: gray;'>Train, evaluate, and compare multiple ML models with automatic error handling</p>
87
+ </div>
88
+ """, unsafe_allow_html=True)
89
+
90
+ try:
91
+ # Check if dataset is suitable for ML
92
+ if df.shape[0] < 10:
93
+ st.error("❌ Dataset too small for machine learning (need at least 10 rows)")
94
+ return
95
+
96
+ # Create tabs for different ML stages
97
+ tab1, tab2, tab3, tab4, tab5 = st.tabs([
98
+ "⚙️ Configuration",
99
+ "📊 Model Training",
100
+ "📈 Model Evaluation",
101
+ "🔮 Predictions",
102
+ "📋 ML Report"
103
+ ])
104
+
105
+ with tab1:
106
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
107
+ st.subheader("⚙️ Model Configuration")
108
+
109
+ try:
110
+ # Target selection with validation
111
+ st.markdown("### 🎯 Target Variable")
112
+
113
+ # Auto-detect potential target columns
114
+ potential_targets = []
115
+ target_types = {}
116
+
117
+ for col in df.columns:
118
+ try:
119
+ if df[col].dtype in ['int64', 'float64']:
120
+ if df[col].nunique() <= 20:
121
+ potential_targets.append(col)
122
+ target_types[col] = "Classification (low cardinality)"
123
+ else:
124
+ potential_targets.append(col)
125
+ target_types[col] = "Regression"
126
+ elif df[col].dtype in ['object', 'category']:
127
+ if df[col].nunique() <= 50:
128
+ potential_targets.append(col)
129
+ target_types[col] = f"Classification ({df[col].nunique()} classes)"
130
+ except Exception as e:
131
+ st.warning(f"⚠️ Couldn't analyze column {col}: {str(e)}")
132
+
133
+ if not potential_targets:
134
+ st.error("❌ No suitable target columns found. Need numeric or categorical columns with reasonable cardinality.")
135
+ return
136
+
137
+ target = st.selectbox(
138
+ "Select target column",
139
+ potential_targets,
140
+ help=f"Column types: {target_types}"
141
+ )
142
+
143
+ # Task type detection
144
+ if df[target].dtype in ['object', 'category'] or df[target].nunique() <= 20:
145
+ task_type = "Classification"
146
+ unique_values = df[target].nunique()
147
+
148
+ if unique_values == 2:
149
+ st.success("✅ **Binary Classification** problem detected")
150
+ elif unique_values <= 10:
151
+ st.info(f"📊 **Multi-class Classification** with {unique_values} classes")
152
+ else:
153
+ st.warning(f"⚠️ **Multi-class Classification** with {unique_values} classes - may be challenging")
154
+
155
+ # Check class balance
156
+ class_dist = df[target].value_counts(normalize=True)
157
+ if class_dist.min() < 0.1:
158
+ st.warning("⚠️ Class imbalance detected. Consider using class weights or resampling.")
159
+ else:
160
+ task_type = "Regression"
161
+ st.info("📈 **Regression** task detected")
162
+
163
+ # Check target distribution
164
+ target_skew = df[target].skew()
165
+ if abs(target_skew) > 1:
166
+ st.warning(f"⚠️ Target variable is highly skewed (skewness: {target_skew:.2f}). Consider log transformation.")
167
+
168
+ # Feature selection
169
+ st.markdown("### 🔍 Feature Selection")
170
+
171
+ # Auto-select features (exclude target)
172
+ all_features = [col for col in df.columns if col != target]
173
+
174
+ # Remove problematic columns
175
+ problematic_cols = []
176
+ for col in all_features:
177
+ try:
178
+ if df[col].nunique() == 1:
179
+ problematic_cols.append(col)
180
+ elif df[col].isnull().sum() > len(df) * 0.5:
181
+ problematic_cols.append(col)
182
+ except:
183
+ problematic_cols.append(col)
184
+
185
+ if problematic_cols:
186
+ st.warning(f"⚠️ Problematic columns detected (will be excluded): {problematic_cols}")
187
+ all_features = [f for f in all_features if f not in problematic_cols]
188
+
189
+ if not all_features:
190
+ st.error("❌ No valid features remaining after filtering.")
191
+ return
192
+
193
+ # Select features
194
+ selected_features = st.multiselect(
195
+ "Choose features for modeling",
196
+ all_features,
197
+ default=all_features[:min(10, len(all_features))],
198
+ help="Select the columns to use as features. Using too many features may cause overfitting."
199
+ )
200
+
201
+ if not selected_features:
202
+ st.warning("⚠️ Please select at least one feature")
203
+ return
204
+
205
+ # Validate selected features
206
+ validation_issues = validate_ml_data(df, target, selected_features)
207
+ if validation_issues:
208
+ for issue in validation_issues:
209
+ st.warning(f"⚠️ {issue}")
210
+
211
+ # Data preprocessing options
212
+ st.markdown("### 🛠️ Preprocessing Options")
213
+
214
+ col1, col2 = st.columns(2)
215
+ with col1:
216
+ test_size = st.slider("Test set size (%)", 10, 40, 20, 5) / 100
217
+ scaler_option = st.selectbox("Feature scaling", ["None", "StandardScaler", "MinMaxScaler"])
218
+
219
+ with col2:
220
+ cv_folds = st.slider("Cross-validation folds", 2, 10, 5)
221
+ if task_type == "Classification":
222
+ handle_imbalance = st.checkbox("Handle class imbalance", value=False,
223
+ help="Use class weights or sampling techniques")
224
+ else:
225
+ handle_imbalance = False
226
+
227
+ # Model selection based on task type
228
+ st.markdown("### 🤖 Model Selection")
229
+
230
+ if task_type == "Classification":
231
+ models = {
232
+ "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
233
+ "K-Nearest Neighbors": KNeighborsClassifier(),
234
+ "Decision Tree": DecisionTreeClassifier(random_state=42),
235
+ "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
236
+ "Gradient Boosting": GradientBoostingClassifier(random_state=42),
237
+ "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
238
+ "LightGBM": LGBMClassifier(verbose=-1, random_state=42),
239
+ "AdaBoost": AdaBoostClassifier(random_state=42),
240
+ "SVM": SVC(probability=True, random_state=42)
241
+ }
242
+
243
+ # Default models for quick selection
244
+ default_models = ["Logistic Regression", "Random Forest", "XGBoost"]
245
+ else: # Regression
246
+ models = {
247
+ "Linear Regression": LinearRegression(),
248
+ "Ridge Regression": Ridge(random_state=42),
249
+ "Lasso Regression": Lasso(random_state=42),
250
+ "Decision Tree": DecisionTreeRegressor(random_state=42),
251
+ "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
252
+ "Gradient Boosting": GradientBoostingRegressor(random_state=42),
253
+ "XGBoost": XGBRegressor(random_state=42),
254
+ "LightGBM": LGBMRegressor(verbose=-1, random_state=42),
255
+ "AdaBoost": AdaBoostRegressor(random_state=42),
256
+ "SVR": SVR()
257
+ }
258
+
259
+ default_models = ["Linear Regression", "Random Forest", "XGBoost"]
260
+
261
+ selected_models = st.multiselect(
262
+ "Choose models to train",
263
+ list(models.keys()),
264
+ default=default_models,
265
+ help="Select multiple models to compare performance"
266
+ )
267
+
268
+ if not selected_models:
269
+ st.warning("⚠️ Please select at least one model")
270
+ return
271
+
272
+ # Advanced options
273
+ with st.expander("⚡ Advanced Options"):
274
+ do_tuning = st.checkbox("Perform hyperparameter tuning", value=False,
275
+ help="Grid search for best parameters (may be slow)")
276
+
277
+ if do_tuning:
278
+ tuning_folds = st.slider("Tuning CV folds", 2, 5, 3)
279
+ max_tuning_iter = st.slider("Max tuning iterations per model", 5, 50, 20)
280
+
281
+ use_sampling = st.checkbox("Use data sampling (for large datasets)", value=False,
282
+ help="Use a sample for faster experimentation")
283
+
284
+ if use_sampling:
285
+ sample_size = st.slider("Sample size (%)", 10, 100, 100, 10) / 100
286
+
287
+ random_state = st.number_input("Random seed", value=42, min_value=0, max_value=999)
288
+
289
+ st.markdown('</div>', unsafe_allow_html=True)
290
+
291
+ # Store configuration in session state
292
+ st.session_state['ml_config'] = {
293
+ 'target': target,
294
+ 'features': selected_features,
295
+ 'task_type': task_type,
296
+ 'test_size': test_size,
297
+ 'scaler': scaler_option,
298
+ 'cv_folds': cv_folds,
299
+ 'handle_imbalance': handle_imbalance,
300
+ 'models': {name: models[name] for name in selected_models},
301
+ 'do_tuning': do_tuning,
302
+ 'random_state': random_state
303
+ }
304
+
305
+ except Exception as e:
306
+ st.error(f"❌ Error in configuration: {str(e)}")
307
+ st.info("💡 Tip: Check your data types and ensure all columns are valid")
308
+ return
309
+
310
+ with tab2:
311
+ if 'ml_config' not in st.session_state:
312
+ st.info("ℹ️ Please configure your model in the 'Configuration' tab first")
313
+ return
314
+
315
+ if st.button("🚀 Start Training", use_container_width=True, type="primary"):
316
+ try:
317
+ config = st.session_state['ml_config']
318
+
319
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
320
+
321
+ # Prepare data with error handling
322
+ with st.spinner("📊 Preparing data..."):
323
+ try:
324
+ X = df[config['features']].copy()
325
+ y = df[config['target']].copy()
326
+
327
+ # Handle missing values
328
+ if X.isnull().sum().sum() > 0:
329
+ st.info(f"⚠️ Handling {X.isnull().sum().sum()} missing values in features...")
330
+ X = X.fillna(X.mean(numeric_only=True)).fillna(X.mode().iloc[0])
331
+
332
+ # Handle categorical features
333
+ cat_features = X.select_dtypes(include=['object', 'category']).columns
334
+ if len(cat_features) > 0:
335
+ st.info(f"🔄 Encoding categorical features: {list(cat_features)}")
336
+ X = pd.get_dummies(X, columns=cat_features)
337
+
338
+ # Handle target encoding for classification
339
+ le = None
340
+ if config['task_type'] == "Classification" and y.dtype == 'object':
341
+ le = LabelEncoder()
342
+ y = le.fit_transform(y)
343
+ st.info(f"📊 Target classes: {dict(zip(le.classes_, le.transform(le.classes_)))}")
344
+
345
+ # Handle class imbalance
346
+ if config['task_type'] == "Classification" and config['handle_imbalance']:
347
+ from sklearn.utils.class_weight import compute_class_weight
348
+ class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
349
+ st.info(f"⚖️ Using class weights: {dict(zip(np.unique(y), class_weights))}")
350
+
351
+ # Scale features
352
+ scaler = None
353
+ if config['scaler'] != "None":
354
+ if config['scaler'] == "StandardScaler":
355
+ scaler = StandardScaler()
356
+ else:
357
+ scaler = MinMaxScaler()
358
+ X_scaled = scaler.fit_transform(X)
359
+ X = pd.DataFrame(X_scaled, columns=X.columns)
360
+
361
+ # Split data
362
+ stratify = y if config['task_type'] == "Classification" else None
363
+ X_train, X_test, y_train, y_test = train_test_split(
364
+ X, y, test_size=config['test_size'],
365
+ random_state=config['random_state'],
366
+ stratify=stratify
367
+ )
368
+
369
+ st.success(f"✅ Data prepared: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")
370
+
371
+ except Exception as e:
372
+ st.error(f"❌ Error in data preparation: {str(e)}")
373
+ return
374
+
375
+ # Train models
376
+ results = []
377
+ trained_models = {}
378
+ progress_bar = st.progress(0)
379
+ status_text = st.empty()
380
+
381
+ for i, (model_name, model) in enumerate(config['models'].items()):
382
+ status_text.text(f"🔄 Training {model_name}...")
383
+
384
+ try:
385
+ # Apply class weights if needed
386
+ if config['task_type'] == "Classification" and config['handle_imbalance']:
387
+ if hasattr(model, 'class_weight'):
388
+ model.set_params(class_weight='balanced')
389
+
390
+ # Train
391
+ start_time = time.time()
392
+ model.fit(X_train, y_train)
393
+ training_time = time.time() - start_time
394
+
395
+ # Store trained model
396
+ trained_models[model_name] = {
397
+ 'model': model,
398
+ 'scaler': scaler,
399
+ 'label_encoder': le,
400
+ 'features': X.columns.tolist()
401
+ }
402
+
403
+ # Predict
404
+ y_pred = model.predict(X_test)
405
+
406
+ # Calculate metrics
407
+ if config['task_type'] == "Classification":
408
+ try:
409
+ accuracy = accuracy_score(y_test, y_pred)
410
+ precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
411
+ recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
412
+ f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
413
+
414
+ # Cross-validation
415
+ cv_scores = cross_val_score(model, X_train, y_train, cv=config['cv_folds'])
416
+
417
+ results.append({
418
+ "Model": model_name,
419
+ "Accuracy": f"{accuracy:.4f}",
420
+ "Precision": f"{precision:.4f}",
421
+ "Recall": f"{recall:.4f}",
422
+ "F1 Score": f"{f1:.4f}",
423
+ "CV Score": f"{cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})",
424
+ "Time (s)": f"{training_time:.2f}"
425
+ })
426
+ except Exception as e:
427
+ st.warning(f"⚠️ Could not calculate all metrics for {model_name}: {str(e)}")
428
+
429
+ else: # Regression
430
+ try:
431
+ mse = mean_squared_error(y_test, y_pred)
432
+ rmse = np.sqrt(mse)
433
+ mae = mean_absolute_error(y_test, y_pred)
434
+ r2 = r2_score(y_test, y_pred)
435
+
436
+ # Cross-validation
437
+ cv_scores = cross_val_score(model, X_train, y_train, cv=config['cv_folds'], scoring='r2')
438
+
439
+ results.append({
440
+ "Model": model_name,
441
+ "R² Score": f"{r2:.4f}",
442
+ "RMSE": f"{rmse:.4f}",
443
+ "MAE": f"{mae:.4f}",
444
+ "CV R²": f"{cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})",
445
+ "Time (s)": f"{training_time:.2f}"
446
+ })
447
+ except Exception as e:
448
+ st.warning(f"⚠️ Could not calculate all metrics for {model_name}: {str(e)}")
449
+
450
+ except MemoryError:
451
+ st.error(f"❌ Out of memory training {model_name}. Try using fewer features or a sample.")
452
+ except Exception as e:
453
+ st.warning(f"⚠️ Error training {model_name}: {str(e)}")
454
+
455
+ progress_bar.progress((i + 1) / len(config['models']))
456
+
457
+ status_text.text("✅ Training complete!")
458
+
459
+ if not results:
460
+ st.error("❌ No models were successfully trained")
461
+ return
462
+
463
+ # Display results
464
+ st.subheader("📊 Model Performance Comparison")
465
+ results_df = pd.DataFrame(results)
466
+
467
+ # Highlight best model
468
+ if config['task_type'] == "Classification":
469
+ best_idx = results_df['F1 Score'].astype(float).idxmax()
470
+ else:
471
+ best_idx = results_df['R² Score'].astype(float).idxmax()
472
+
473
+ # Style dataframe
474
+ def highlight_best(s):
475
+ is_best = s.index == best_idx
476
+ return ['background-color: #90EE90' if v else '' for v in is_best]
477
+
478
+ st.dataframe(results_df.style.apply(highlight_best), use_container_width=True)
479
+
480
+ # Store results
481
+ st.session_state['trained_models'] = trained_models
482
+ st.session_state['X_train'] = X_train
483
+ st.session_state['X_test'] = X_test
484
+ st.session_state['y_train'] = y_train
485
+ st.session_state['y_test'] = y_test
486
+ st.session_state['task_type'] = config['task_type']
487
+ st.session_state['results_df'] = results_df
488
+
489
+ # Best model info
490
+ best_model_name = results_df.iloc[best_idx]['Model']
491
+ st.success(f"🏆 **Best Model:** {best_model_name}")
492
+
493
+ st.markdown('</div>', unsafe_allow_html=True)
494
+
495
+ except Exception as e:
496
+ st.error(f"❌ Critical error in training: {str(e)}")
497
+ st.info("💡 Try reducing the number of features or models")
498
+
499
+ with tab3:
500
+ if 'trained_models' not in st.session_state:
501
+ st.info("ℹ️ Train some models first in the 'Model Training' tab")
502
+ return
503
+
504
+ try:
505
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
506
+ st.subheader("📈 Detailed Model Evaluation")
507
+
508
+ # Model selection for detailed evaluation
509
+ selected_eval_model = st.selectbox(
510
+ "Select model for detailed evaluation",
511
+ list(st.session_state['trained_models'].keys())
512
+ )
513
+
514
+ model_info = st.session_state['trained_models'][selected_eval_model]
515
+ model = model_info['model']
516
+ X_test = st.session_state['X_test']
517
+ y_test = st.session_state['y_test']
518
+ task_type = st.session_state['task_type']
519
+
520
+ try:
521
+ y_pred = model.predict(X_test)
522
+
523
+ if task_type == "Classification":
524
+ # Confusion Matrix
525
+ st.markdown("### Confusion Matrix")
526
+ cm = confusion_matrix(y_test, y_pred)
527
+
528
+ fig = px.imshow(cm,
529
+ text_auto=True,
530
+ aspect="auto",
531
+ color_continuous_scale='Blues',
532
+ title=f"Confusion Matrix - {selected_eval_model}")
533
+
534
+ fig.update_layout(xaxis_title="Predicted", yaxis_title="Actual")
535
+ st.plotly_chart(fig, use_container_width=True)
536
+
537
+ # Classification Report
538
+ st.markdown("### Classification Report")
539
+ report = classification_report(y_test, y_pred, output_dict=True)
540
+ report_df = pd.DataFrame(report).transpose()
541
+ st.dataframe(report_df.style.format("{:.4f}"), use_container_width=True)
542
+
543
+ # ROC Curve (for binary classification)
544
+ if len(np.unique(y_test)) == 2 and hasattr(model, "predict_proba"):
545
+ st.markdown("### ROC Curve")
546
+ y_pred_proba = model.predict_proba(X_test)[:, 1]
547
+ fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
548
+ roc_auc = auc(fpr, tpr)
549
+
550
+ fig = go.Figure()
551
+ fig.add_trace(go.Scatter(x=fpr, y=tpr,
552
+ mode='lines',
553
+ name=f'ROC (AUC = {roc_auc:.3f})',
554
+ line=dict(color='blue', width=2)))
555
+ fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
556
+ mode='lines',
557
+ name='Random',
558
+ line=dict(color='gray', dash='dash')))
559
+
560
+ fig.update_layout(xaxis_title="False Positive Rate",
561
+ yaxis_title="True Positive Rate",
562
+ title=f"ROC Curve - {selected_eval_model}")
563
+
564
+ st.plotly_chart(fig, use_container_width=True)
565
+
566
+ else: # Regression
567
+ # Actual vs Predicted plot
568
+ st.markdown("### Actual vs Predicted")
569
+
570
+ fig = px.scatter(x=y_test, y=y_pred,
571
+ labels={'x': 'Actual', 'y': 'Predicted'},
572
+ title=f"Actual vs Predicted - {selected_eval_model}",
573
+ trendline="ols")
574
+
575
+ # Add perfect prediction line
576
+ min_val = min(y_test.min(), y_pred.min())
577
+ max_val = max(y_test.max(), y_pred.max())
578
+ fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
579
+ mode='lines', name='Perfect Prediction',
580
+ line=dict(color='red', dash='dash')))
581
+
582
+ st.plotly_chart(fig, use_container_width=True)
583
+
584
+ # Residuals plot
585
+ st.markdown("### Residuals Analysis")
586
+ residuals = y_test - y_pred
587
+
588
+ fig = make_subplots(rows=1, cols=2,
589
+ subplot_titles=("Residuals vs Predicted", "Residuals Distribution"))
590
+
591
+ fig.add_trace(go.Scatter(x=y_pred, y=residuals,
592
+ mode='markers',
593
+ name='Residuals',
594
+ marker=dict(color='blue', opacity=0.5)), row=1, col=1)
595
+
596
+ fig.add_hline(y=0, line_dash="dash", line_color="red", row=1, col=1)
597
+
598
+ fig.add_trace(go.Histogram(x=residuals, nbinsx=30,
599
+ name='Distribution',
600
+ marker_color='green'), row=1, col=2)
601
+
602
+ fig.update_layout(title=f"Residual Analysis - {selected_eval_model}")
603
+ st.plotly_chart(fig, use_container_width=True)
604
+
605
+ # Residual statistics
606
+ col1, col2, col3 = st.columns(3)
607
+ with col1:
608
+ st.metric("Mean Residual", f"{residuals.mean():.4f}")
609
+ with col2:
610
+ st.metric("Std Residual", f"{residuals.std():.4f}")
611
+ with col3:
612
+ st.metric("Residual Range", f"{residuals.max() - residuals.min():.4f}")
613
+
614
+ # Feature Importance (if available)
615
+ if hasattr(model, 'feature_importances_'):
616
+ st.markdown("### Feature Importance")
617
+ feature_importance = pd.DataFrame({
618
+ 'feature': X_test.columns,
619
+ 'importance': model.feature_importances_
620
+ }).sort_values('importance', ascending=True)
621
+
622
+ fig = px.bar(feature_importance.tail(10),
623
+ x='importance', y='feature',
624
+ orientation='h',
625
+ title="Top 10 Feature Importances",
626
+ color='importance',
627
+ color_continuous_scale='Viridis')
628
+ st.plotly_chart(fig, use_container_width=True)
629
+
630
+ except Exception as e:
631
+ st.error(f"❌ Error in evaluation: {str(e)}")
632
+
633
+ st.markdown('</div>', unsafe_allow_html=True)
634
+
635
+ except Exception as e:
636
+ st.error(f"❌ Error loading evaluation: {str(e)}")
637
+
638
+ with tab4:
639
+ if 'trained_models' not in st.session_state:
640
+ st.info("ℹ️ Train some models first in the 'Model Training' tab")
641
+ return
642
+
643
+ try:
644
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
645
+ st.subheader("🔮 Make Predictions")
646
+
647
+ # Model selection for predictions
648
+ selected_pred_model = st.selectbox(
649
+ "Select model for predictions",
650
+ list(st.session_state['trained_models'].keys()),
651
+ key="pred_model"
652
+ )
653
+
654
+ model_info = st.session_state['trained_models'][selected_pred_model]
655
+ model = model_info['model']
656
+ scaler = model_info['scaler']
657
+ le = model_info.get('label_encoder')
658
+ feature_names = model_info['features']
659
+
660
+ # Input method
661
+ input_method = st.radio(
662
+ "Input method",
663
+ ["Manual input", "Upload new data", "Batch prediction"],
664
+ horizontal=True
665
+ )
666
+
667
+ if input_method == "Manual input":
668
+ st.markdown("### Enter feature values")
669
+
670
+ input_data = {}
671
+ cols = st.columns(3)
672
+
673
+ for i, feature in enumerate(feature_names):
674
+ with cols[i % 3]:
675
+ try:
676
+ # Get feature range from training data
677
+ if feature in st.session_state['X_train'].columns:
678
+ min_val = float(st.session_state['X_train'][feature].min())
679
+ max_val = float(st.session_state['X_train'][feature].max())
680
+ mean_val = float(st.session_state['X_train'][feature].mean())
681
+
682
+ input_data[feature] = st.slider(
683
+ f"{feature}",
684
+ min_val, max_val, mean_val,
685
+ format="%.4f",
686
+ key=f"manual_{feature}"
687
+ )
688
+ else:
689
+ input_data[feature] = st.number_input(
690
+ f"{feature}",
691
+ value=0.0,
692
+ key=f"manual_{feature}"
693
+ )
694
+ except Exception as e:
695
+ st.warning(f"⚠️ Error with {feature}: {str(e)}")
696
+ input_data[feature] = 0.0
697
+
698
+ if st.button("🔮 Predict", use_container_width=True):
699
+ try:
700
+ # Convert input to DataFrame
701
+ input_df = pd.DataFrame([input_data])
702
+
703
+ # Ensure all features are present
704
+ for col in feature_names:
705
+ if col not in input_df.columns:
706
+ input_df[col] = 0
707
+
708
+ input_df = input_df[feature_names]
709
+
710
+ # Scale if needed
711
+ if scaler is not None:
712
+ input_scaled = scaler.transform(input_df)
713
+ input_df = pd.DataFrame(input_scaled, columns=feature_names)
714
+
715
+ # Make prediction
716
+ prediction = model.predict(input_df)[0]
717
+
718
+ # Decode if needed
719
+ if le is not None:
720
+ prediction = le.inverse_transform([int(prediction)])[0]
721
+
722
+ # Display prediction with styling
723
+ st.markdown("""
724
+ <div class="success-container" style="text-align: center; padding: 2rem;">
725
+ <h3>🎯 Prediction Result</h3>
726
+ <h1 style="font-size: 3rem;">{}</h1>
727
+ </div>
728
+ """.format(prediction), unsafe_allow_html=True)
729
+
730
+ except Exception as e:
731
+ st.error(f"❌ Prediction error: {str(e)}")
732
+
733
+ elif input_method == "Upload new data":
734
+ pred_file = st.file_uploader("Upload data for predictions",
735
+ type=["csv", "xlsx"],
736
+ key="pred_file")
737
+
738
+ if pred_file:
739
+ try:
740
+ if pred_file.name.endswith("csv"):
741
+ pred_df = pd.read_csv(pred_file)
742
+ else:
743
+ pred_df = pd.read_excel(pred_file)
744
+
745
+ st.subheader("📋 Uploaded Data Preview")
746
+ st.dataframe(pred_df.head())
747
+
748
+ if st.button("🔮 Predict for all rows", use_container_width=True):
749
+ with st.spinner("Making predictions..."):
750
+ try:
751
+ # Prepare data
752
+ pred_processed = pred_df.copy()
753
+
754
+ # Handle categorical features if needed
755
+ for col in pred_processed.columns:
756
+ if pred_processed[col].dtype == 'object':
757
+ pred_processed = pd.get_dummies(pred_processed, columns=[col])
758
+
759
+ # Align columns with training data
760
+ for col in feature_names:
761
+ if col not in pred_processed.columns:
762
+ pred_processed[col] = 0
763
+
764
+ pred_processed = pred_processed[feature_names]
765
+
766
+ # Scale if needed
767
+ if scaler is not None:
768
+ pred_scaled = scaler.transform(pred_processed)
769
+ pred_processed = pd.DataFrame(pred_scaled, columns=feature_names)
770
+
771
+ # Make predictions
772
+ predictions = model.predict(pred_processed)
773
+
774
+ # Decode if needed
775
+ if le is not None:
776
+ predictions = le.inverse_transform(predictions.astype(int))
777
+
778
+ # Add predictions to dataframe
779
+ pred_df['Prediction'] = predictions
780
+
781
+ st.subheader("📊 Predictions Result")
782
+ st.dataframe(pred_df)
783
+
784
+ # Download predictions
785
+ csv = pred_df.to_csv(index=False)
786
+ st.download_button(
787
+ label="📥 Download Predictions",
788
+ data=csv,
789
+ file_name="predictions.csv",
790
+ mime="text/csv",
791
+ use_container_width=True
792
+ )
793
+
794
+ except Exception as e:
795
+ st.error(f"❌ Prediction error: {str(e)}")
796
+
797
+ except Exception as e:
798
+ st.error(f"❌ Error reading file: {str(e)}")
799
+
800
+ elif input_method == "Batch prediction":
801
+ st.markdown("### Batch Prediction Settings")
802
+
803
+ n_samples = st.number_input("Number of samples to generate",
804
+ min_value=1, max_value=1000, value=10)
805
+
806
+ if st.button("🎲 Generate Random Samples & Predict", use_container_width=True):
807
+ try:
808
+ # Generate random samples based on training data distribution
809
+ random_samples = {}
810
+ for feature in feature_names:
811
+ if feature in st.session_state['X_train'].columns:
812
+ mean = st.session_state['X_train'][feature].mean()
813
+ std = st.session_state['X_train'][feature].std()
814
+ random_samples[feature] = np.random.normal(mean, std, n_samples)
815
+ else:
816
+ random_samples[feature] = np.zeros(n_samples)
817
+
818
+ batch_df = pd.DataFrame(random_samples)
819
+
820
+ # Scale if needed
821
+ if scaler is not None:
822
+ batch_scaled = scaler.transform(batch_df)
823
+ batch_df = pd.DataFrame(batch_scaled, columns=feature_names)
824
+
825
+ # Make predictions
826
+ predictions = model.predict(batch_df)
827
+
828
+ # Decode if needed
829
+ if le is not None:
830
+ predictions = le.inverse_transform(predictions.astype(int))
831
+
832
+ # Add predictions to dataframe
833
+ batch_df['Prediction'] = predictions
834
+
835
+ st.subheader("📊 Batch Predictions")
836
+ st.dataframe(batch_df)
837
+
838
+ # Statistics
839
+ if le is None: # Numerical predictions
840
+ st.subheader("📈 Prediction Statistics")
841
+ col1, col2, col3 = st.columns(3)
842
+ with col1:
843
+ st.metric("Mean", f"{predictions.mean():.4f}")
844
+ with col2:
845
+ st.metric("Std", f"{predictions.std():.4f}")
846
+ with col3:
847
+ st.metric("Range", f"{predictions.max() - predictions.min():.4f}")
848
+
849
+ # Download predictions
850
+ csv = batch_df.to_csv(index=False)
851
+ st.download_button(
852
+ label="📥 Download Batch Predictions",
853
+ data=csv,
854
+ file_name="batch_predictions.csv",
855
+ mime="text/csv",
856
+ use_container_width=True
857
+ )
858
+
859
+ except Exception as e:
860
+ st.error(f"❌ Batch prediction error: {str(e)}")
861
+
862
+ st.markdown('</div>', unsafe_allow_html=True)
863
+
864
+ except Exception as e:
865
+ st.error(f"❌ Error in prediction: {str(e)}")
866
+
867
+ with tab5:
868
+ if 'results_df' not in st.session_state:
869
+ st.info("ℹ️ Train some models first in the 'Model Training' tab")
870
+ return
871
+
872
+ try:
873
+ st.markdown('<div class="custom-card">', unsafe_allow_html=True)
874
+ st.subheader("📋 Machine Learning Report")
875
+
876
+ results_df = st.session_state['results_df']
877
+ config = st.session_state.get('ml_config', {})
878
+
879
+ # Generate report
880
+ report = f"""
881
+ # Machine Learning Pipeline Report
882
+
883
+ ## Configuration Summary
884
+ - **Task Type:** {config.get('task_type', 'N/A')}
885
+ - **Target Variable:** {config.get('target', 'N/A')}
886
+ - **Number of Features:** {len(config.get('features', []))}
887
+ - **Test Size:** {config.get('test_size', 0.2)*100:.0f}%
888
+ - **Cross-Validation Folds:** {config.get('cv_folds', 5)}
889
+ - **Feature Scaling:** {config.get('scaler', 'None')}
890
+
891
+ ## Dataset Information
892
+ - **Total Samples:** {st.session_state.get('X_train', pd.DataFrame()).shape[0] + st.session_state.get('X_test', pd.DataFrame()).shape[0]}
893
+ - **Training Samples:** {st.session_state.get('X_train', pd.DataFrame()).shape[0]}
894
+ - **Test Samples:** {st.session_state.get('X_test', pd.DataFrame()).shape[0]}
895
+
896
+ ## Model Performance Summary
897
+
898
+ {results_df.to_string()}
899
+
900
+ ## Best Model
901
+ **{results_df.iloc[0]['Model']}** performed best based on {'F1 Score' if config.get('task_type') == 'Classification' else 'R² Score'}.
902
+
903
+ ## Recommendations
904
+ """
905
+
906
+ # Add recommendations based on results
907
+ if config.get('task_type') == 'Classification':
908
+ if float(results_df['Accuracy'].iloc[0]) > 0.9:
909
+ report += "\n- ✓ Excellent model performance achieved"
910
+ elif float(results_df['Accuracy'].iloc[0]) > 0.7:
911
+ report += "\n- ✓ Good model performance"
912
+ else:
913
+ report += "\n- ⚠️ Model performance could be improved. Consider feature engineering or trying different algorithms"
914
+ else:
915
+ if float(results_df['R² Score'].iloc[0]) > 0.8:
916
+ report += "\n- ✓ Excellent model performance achieved"
917
+ elif float(results_df['R² Score'].iloc[0]) > 0.6:
918
+ report += "\n- ✓ Good model performance"
919
+ else:
920
+ report += "\n- ⚠️ Model performance could be improved. Consider feature engineering or trying different algorithms"
921
+
922
+ st.markdown(report)
923
+
924
+ # Download report
925
+ st.download_button(
926
+ label="📥 Download ML Report",
927
+ data=report,
928
+ file_name="ml_report.txt",
929
+ mime="text/plain",
930
+ use_container_width=True
931
+ )
932
+
933
+ st.markdown('</div>', unsafe_allow_html=True)
934
+
935
+ except Exception as e:
936
+ st.error(f"❌ Error generating report: {str(e)}")
937
+
938
+ except Exception as e:
939
+ st.error(f"❌ Critical error in ML pipeline: {str(e)}")
940
+ st.info("💡 Please check your data and try again. If the problem persists, try with a smaller dataset.")
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ scikit-learn>=1.3.0
5
+ plotly>=5.17.0
6
+ matplotlib>=3.7.0
7
+ xgboost>=1.7.0
8
+ lightgbm>=4.0.0
9
+ openpyxl>=3.1.0
10
+ scipy>=1.10.0
11
+ shap>=0.42.0
12
+ imbalanced-learn>=0.11.0
13
+ category-encoders>=2.6.0
14
+ statsmodels>=0.14.0
15
+ seaborn>=0.12.0
16
+ joblib>=1.3.0
statistical_analysis.py ADDED
@@ -0,0 +1,928 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import scipy.stats as stats
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ from plotly.subplots import make_subplots
8
+ import statsmodels.api as sm
9
+ from statsmodels.formula.api import ols
10
+ from statsmodels.stats.multicomp import pairwise_tukeyhsd
11
+ from statsmodels.tsa.stattools import adfuller, kpss
12
+ from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
13
+ import matplotlib.pyplot as plt
14
+ import warnings
15
+ warnings.filterwarnings('ignore')
16
+
17
def statistical_analysis(df):
    """Render the "Advanced Statistical Analysis" Streamlit page.

    Builds six tabs (descriptive statistics, correlation, hypothesis
    testing, distribution analysis, time series, probability/sampling)
    plus a downloadable plain-text report for the supplied DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to analyse. If it is empty, or contains no numeric
        columns, an error/warning is shown and the function returns early.

    Returns
    -------
    None
        All output is produced through Streamlit side effects.
    """
    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>📐 Advanced Statistical Analysis</h2>
        <p style='color: gray;'>Comprehensive statistical tests, hypothesis testing, and probability analysis</p>
    </div>
    """, unsafe_allow_html=True)

    # Guard clauses: nothing to analyse without data / numeric columns.
    if df.empty:
        st.error("❌ The dataset is empty. Please upload a valid dataset.")
        return

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    if not numeric_cols:
        st.warning("⚠️ No numeric columns found. Statistical analysis requires numeric data.")
        return

    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
        "📊 Descriptive Stats",
        "📈 Correlation Analysis",
        "🔬 Hypothesis Testing",
        "📊 Distribution Analysis",
        "📉 Time Series Analysis",
        "🎲 Probability & Sampling"
    ])

    with tab1:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Descriptive Statistics")

        try:
            # Build one summary record per numeric column, then construct the
            # DataFrame in a single call. (Assigning dicts column-by-column
            # into an empty DataFrame is fragile across pandas versions.)
            summary = {}
            for col in numeric_cols:
                data = df[col].dropna()
                if len(data) > 0:
                    # 95% t-based confidence interval for the mean.
                    ci = stats.t.interval(0.95, len(data) - 1, loc=data.mean(), scale=stats.sem(data))
                    summary[col] = {
                        'Count': len(data),
                        'Mean': data.mean(),
                        'Std Dev': data.std(),
                        'Variance': data.var(),
                        'Min': data.min(),
                        'Q1 (25%)': data.quantile(0.25),
                        'Median (50%)': data.median(),
                        'Q3 (75%)': data.quantile(0.75),
                        'Max': data.max(),
                        'Range': data.max() - data.min(),
                        'IQR': data.quantile(0.75) - data.quantile(0.25),
                        'Skewness': data.skew(),
                        'Kurtosis': data.kurtosis(),
                        'Coefficient of Variation (%)': (data.std() / data.mean() * 100) if data.mean() != 0 else np.nan,
                        '95% CI Lower': ci[0],
                        '95% CI Upper': ci[1]
                    }

            # Transpose so each numeric column becomes one row of stats.
            stats_df = pd.DataFrame(summary).T
            st.dataframe(stats_df.style.format("{:.4f}"), use_container_width=True)

            # Summary cards
            st.subheader("📊 Summary Cards")
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Total Numeric Columns", len(numeric_cols))
            with col2:
                st.metric("Total Observations", f"{df.shape[0]:,}")
            with col3:
                st.metric("Complete Cases", f"{df.dropna().shape[0]:,}")
            with col4:
                completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
                st.metric("Data Completeness", f"{completeness:.1f}%")

            # Four-panel distribution view for one selected column.
            st.subheader("Distribution Analysis")
            selected_col = st.selectbox("Select column for detailed distribution analysis", numeric_cols)

            data = df[selected_col].dropna()

            fig = make_subplots(rows=2, cols=2,
                                subplot_titles=("Histogram with KDE", "Box Plot",
                                                "Violin Plot", "Q-Q Plot"),
                                specs=[[{"type": "xy"}, {"type": "xy"}],
                                       [{"type": "xy"}, {"type": "xy"}]])

            fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Histogram", opacity=0.7), row=1, col=1)
            fig.add_trace(go.Box(y=data, name="Box Plot", boxpoints='outliers'), row=1, col=2)
            fig.add_trace(go.Violin(y=data, name="Violin Plot", box_visible=True, meanline_visible=True),
                          row=2, col=1)

            # Q-Q plot: use deterministic theoretical normal quantiles
            # (norm.ppf of plotting positions) rather than random draws,
            # which made the plot nondeterministic and statistically wrong.
            n_obs = len(data)
            probs = (np.arange(1, n_obs + 1) - 0.5) / n_obs
            theoretical_q = stats.norm.ppf(probs, loc=data.mean(), scale=data.std())
            data_sorted = np.sort(data)
            fig.add_trace(go.Scatter(x=theoretical_q, y=data_sorted, mode='markers', name='Q-Q'),
                          row=2, col=2)

            # 45-degree reference line for the Q-Q panel.
            min_val = min(theoretical_q.min(), data_sorted.min())
            max_val = max(theoretical_q.max(), data_sorted.max())
            fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                                     mode='lines', name='Reference',
                                     line=dict(color='red', dash='dash')),
                          row=2, col=2)

            fig.update_layout(height=800, title_text=f"Distribution Analysis of {selected_col}")
            st.plotly_chart(fig, use_container_width=True)

            # Tukey-fence outlier detection (1.5 * IQR).
            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)
            IQR = Q3 - Q1
            outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]

            if len(outliers) > 0:
                st.warning(f"⚠️ **Outliers detected**: {len(outliers)} outliers found ({len(outliers)/len(data)*100:.2f}%)")
                with st.expander("View outlier values"):
                    st.write(outliers.tolist())
            else:
                st.success("✅ No outliers detected in this column")

        except Exception as e:
            st.error(f"❌ Error in descriptive statistics: {str(e)}")
            st.info("💡 Tip: Check if your data contains non-numeric values or extreme outliers")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Advanced Correlation Analysis")

        try:
            if len(numeric_cols) >= 2:
                corr_method = st.radio(
                    "Select correlation method",
                    ["Pearson (linear)", "Spearman (rank)", "Kendall (ordinal)"],
                    horizontal=True
                )

                method_map = {
                    "Pearson (linear)": "pearson",
                    "Spearman (rank)": "spearman",
                    "Kendall (ordinal)": "kendall"
                }

                corr_matrix = df[numeric_cols].corr(method=method_map[corr_method])

                fig = px.imshow(corr_matrix,
                                text_auto=True,
                                aspect="auto",
                                color_continuous_scale='RdBu_r',
                                title=f"{corr_method} Correlation Matrix",
                                zmin=-1, zmax=1)
                fig.update_layout(height=600)
                st.plotly_chart(fig, use_container_width=True)

                # Pairwise significance test for a user-chosen feature pair.
                st.subheader("📊 Correlation Significance Testing")

                col1, col2 = st.columns(2)
                with col1:
                    feat1 = st.selectbox("Select first feature", numeric_cols, key="corr_feat1")
                with col2:
                    feat2 = st.selectbox("Select second feature", [c for c in numeric_cols if c != feat1], key="corr_feat2")

                data1 = df[feat1].dropna()
                data2 = df[feat2].dropna()

                # Concatenation aligns on the index; dropna keeps only rows
                # where both features are present.
                combined = pd.concat([data1, data2], axis=1).dropna()
                if len(combined) > 0:
                    corr_coef, p_value = stats.pearsonr(combined.iloc[:, 0], combined.iloc[:, 1])

                    st.write(f"**Pearson correlation coefficient:** {corr_coef:.4f}")
                    st.write(f"**P-value:** {p_value:.4f}")

                    if p_value < 0.05:
                        st.success(f"✅ Statistically significant correlation (p < 0.05)")
                    else:
                        st.info(f"ℹ️ No statistically significant correlation (p >= 0.05)")

                    # Fisher z-transform CI for the correlation coefficient.
                    n = len(combined)
                    r = corr_coef
                    z = np.arctanh(r)
                    se = 1 / np.sqrt(n - 3)
                    ci_z = stats.norm.interval(0.95, loc=z, scale=se)
                    ci_r = np.tanh(ci_z)

                    st.write(f"**95% Confidence Interval:** [{ci_r[0]:.4f}, {ci_r[1]:.4f}]")

                    fig = px.scatter(combined, x=combined.columns[0], y=combined.columns[1],
                                     trendline="ols", title=f"Relationship: {feat1} vs {feat2}")
                    st.plotly_chart(fig, use_container_width=True)

                # Partial correlation: correlation of the two features after
                # removing the linear effect of a chosen control variable.
                st.subheader("🔍 Partial Correlation Analysis")
                if len(numeric_cols) >= 3:
                    from sklearn.linear_model import LinearRegression

                    control_var = st.selectbox("Select control variable",
                                               [c for c in numeric_cols if c not in [feat1, feat2]])

                    X_control = df[[control_var]].dropna()
                    y1 = df[feat1].dropna()
                    y2 = df[feat2].dropna()

                    aligned_data = pd.concat([X_control, y1, y2], axis=1).dropna()

                    if len(aligned_data) > 0:
                        # Residualize both features against the control, then
                        # correlate the residuals.
                        model1 = LinearRegression().fit(aligned_data[[control_var]], aligned_data[feat1])
                        res1 = aligned_data[feat1] - model1.predict(aligned_data[[control_var]])

                        model2 = LinearRegression().fit(aligned_data[[control_var]], aligned_data[feat2])
                        res2 = aligned_data[feat2] - model2.predict(aligned_data[[control_var]])

                        partial_corr, partial_p = stats.pearsonr(res1, res2)

                        st.write(f"**Partial correlation (controlling for {control_var}):** {partial_corr:.4f}")
                        st.write(f"**P-value:** {partial_p:.4f}")

                        if abs(partial_corr) < abs(corr_coef):
                            st.info(f"ℹ️ The correlation decreases when controlling for {control_var}, suggesting it may be a confounding variable")
            else:
                st.warning("⚠️ Need at least 2 numeric columns for correlation analysis")

        except Exception as e:
            st.error(f"❌ Error in correlation analysis: {str(e)}")
            st.info("💡 Tip: Ensure your data has sufficient non-null values for correlation calculation")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab3:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🔬 Statistical Hypothesis Testing")

        try:
            test_category = st.selectbox(
                "Select test category",
                ["Parametric Tests", "Non-parametric Tests", "ANOVA & Post-hoc", "Goodness of Fit"]
            )

            if test_category == "Parametric Tests":
                param_test = st.selectbox(
                    "Select parametric test",
                    ["One-Sample t-test", "Independent t-test", "Paired t-test", "Z-test"]
                )

                if param_test == "One-Sample t-test":
                    if numeric_cols:
                        col = st.selectbox("Select variable", numeric_cols)
                        test_value = st.number_input("Test value (population mean)", value=0.0)

                        data = df[col].dropna()
                        if len(data) > 0:
                            t_stat, p_value = stats.ttest_1samp(data, test_value)

                            st.write(f"**t-statistic:** {t_stat:.4f}")
                            st.write(f"**p-value:** {p_value:.4f}")
                            st.write(f"**Degrees of freedom:** {len(data)-1}")

                            # Effect size (Cohen's d)
                            cohens_d = (data.mean() - test_value) / data.std()
                            st.write(f"**Cohen's d (effect size):** {cohens_d:.4f}")

                            if p_value < 0.05:
                                st.success(f"✅ Reject null hypothesis: Mean is significantly different from {test_value}")
                            else:
                                st.info(f"ℹ️ Fail to reject null hypothesis: Mean is not significantly different from {test_value}")

                            fig = go.Figure()
                            fig.add_trace(go.Histogram(x=data, name="Sample", opacity=0.7))
                            fig.add_vline(x=test_value, line_dash="dash", line_color="red",
                                          annotation_text=f"Test value: {test_value}")
                            fig.add_vline(x=data.mean(), line_color="green",
                                          annotation_text=f"Sample mean: {data.mean():.2f}")
                            fig.update_layout(title=f"One-Sample t-test: {col}")
                            st.plotly_chart(fig, use_container_width=True)

                elif param_test == "Independent t-test":
                    if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
                        num_col = st.selectbox("Select numeric variable", numeric_cols, key="ind_num")
                        cat_col = st.selectbox("Select grouping variable", categorical_cols, key="ind_cat")

                        groups = df[cat_col].dropna().unique()
                        if len(groups) == 2:
                            group1 = df[df[cat_col] == groups[0]][num_col].dropna()
                            group2 = df[df[cat_col] == groups[1]][num_col].dropna()

                            # Levene's test decides the equal-variance flag for the t-test.
                            levene_stat, levene_p = stats.levene(group1, group2)
                            equal_var = levene_p > 0.05

                            t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=equal_var)

                            st.write(f"**Groups:** {groups[0]} (n={len(group1)}) vs {groups[1]} (n={len(group2)})")
                            st.write(f"**Levene's test for equal variances:** p={levene_p:.4f}")
                            # Bug fix: the bold marker was previously unterminated.
                            st.write(f"**Assuming {'equal' if equal_var else 'unequal'} variances**")
                            st.write(f"**t-statistic:** {t_stat:.4f}")
                            st.write(f"**p-value:** {p_value:.4f}")

                            # Effect size (Cohen's d) using the pooled standard deviation.
                            pooled_std = np.sqrt(((len(group1)-1)*group1.std()**2 + (len(group2)-1)*group2.std()**2) /
                                                 (len(group1)+len(group2)-2))
                            cohens_d = (group1.mean() - group2.mean()) / pooled_std
                            st.write(f"**Cohen's d (effect size):** {cohens_d:.4f}")

                            if p_value < 0.05:
                                st.success(f"✅ Significant difference found between groups")
                            else:
                                st.info(f"ℹ️ No significant difference found between groups")

                            fig = px.box(df, x=cat_col, y=num_col, title=f"Comparison: {num_col} by {cat_col}")
                            st.plotly_chart(fig, use_container_width=True)
                        else:
                            st.warning(f"⚠️ Independent t-test requires exactly 2 groups. Found {len(groups)} groups.")

                elif param_test == "Paired t-test":
                    if len(numeric_cols) >= 2:
                        col1 = st.selectbox("Select first measurement", numeric_cols, key="paired1")
                        col2 = st.selectbox("Select second measurement", numeric_cols, key="paired2")

                        paired_data = df[[col1, col2]].dropna()
                        if len(paired_data) > 0:
                            t_stat, p_value = stats.ttest_rel(paired_data[col1], paired_data[col2])

                            st.write(f"**Sample size:** {len(paired_data)}")
                            st.write(f"**Mean difference:** {(paired_data[col1] - paired_data[col2]).mean():.4f}")
                            st.write(f"**t-statistic:** {t_stat:.4f}")
                            st.write(f"**p-value:** {p_value:.4f}")

                            if p_value < 0.05:
                                st.success(f"✅ Significant difference found between measurements")
                            else:
                                st.info(f"ℹ️ No significant difference found between measurements")

                            fig = go.Figure()
                            fig.add_trace(go.Scatter(x=paired_data[col1], y=paired_data[col2],
                                                     mode='markers', text=paired_data.index))
                            fig.add_trace(go.Scatter(x=[paired_data[col1].min(), paired_data[col1].max()],
                                                     y=[paired_data[col1].min(), paired_data[col1].max()],
                                                     mode='lines', name='y=x', line=dict(dash='dash')))
                            fig.update_layout(title=f"Paired Comparison: {col1} vs {col2}")
                            st.plotly_chart(fig, use_container_width=True)

            elif test_category == "Non-parametric Tests":
                nonparam_test = st.selectbox(
                    "Select non-parametric test",
                    ["Mann-Whitney U", "Wilcoxon Signed-Rank", "Kruskal-Wallis H", "Friedman Test"]
                )

                if nonparam_test == "Mann-Whitney U":
                    if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
                        num_col = st.selectbox("Select numeric variable", numeric_cols, key="mw_num")
                        cat_col = st.selectbox("Select grouping variable", categorical_cols, key="mw_cat")

                        groups = df[cat_col].dropna().unique()
                        if len(groups) == 2:
                            group1 = df[df[cat_col] == groups[0]][num_col].dropna()
                            group2 = df[df[cat_col] == groups[1]][num_col].dropna()

                            u_stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')

                            st.write(f"**U-statistic:** {u_stat:.4f}")
                            st.write(f"**p-value:** {p_value:.4f}")

                            # Approximate effect size r = |Z| / sqrt(N), with Z
                            # recovered from the two-sided p-value.
                            z_score = stats.norm.ppf(p_value / 2) if p_value < 1 else 0
                            effect_size = abs(z_score) / np.sqrt(len(group1) + len(group2))
                            st.write(f"**Effect size (r):** {effect_size:.4f}")

                            if p_value < 0.05:
                                st.success(f"✅ Significant difference found between groups")
                            else:
                                st.info(f"ℹ️ No significant difference found between groups")

                            fig = px.violin(df, x=cat_col, y=num_col, box=True, points="all",
                                            title=f"Mann-Whitney U Test: {num_col} by {cat_col}")
                            st.plotly_chart(fig, use_container_width=True)

            elif test_category == "ANOVA & Post-hoc":
                if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
                    num_col = st.selectbox("Select numeric variable", numeric_cols, key="anova_num")
                    cat_col = st.selectbox("Select grouping variable", categorical_cols, key="anova_cat")

                    groups = [df[df[cat_col] == group][num_col].dropna()
                              for group in df[cat_col].unique() if len(df[df[cat_col] == group]) > 0]

                    if len(groups) >= 2:
                        f_stat, p_value = stats.f_oneway(*groups)

                        st.write("**One-way ANOVA Results:**")
                        st.write(f"**F-statistic:** {f_stat:.4f}")
                        st.write(f"**p-value:** {p_value:.4f}")

                        if p_value < 0.05:
                            st.success("✅ Significant differences found between groups")

                            if st.button("Run Tukey HSD Post-hoc Test"):
                                # Bug fix: drop NaNs on aligned (value, group)
                                # pairs. Dropping each column separately could
                                # yield arrays of different lengths.
                                valid = df[[num_col, cat_col]].dropna()
                                tukey = pairwise_tukeyhsd(valid[num_col], valid[cat_col])
                                tukey_df = pd.DataFrame(data=tukey.summary().data[1:],
                                                        columns=tukey.summary().data[0])
                                st.dataframe(tukey_df)

                                # Confidence-interval plot for each pairwise
                                # comparison. Bug fix: the summary column is
                                # named 'p-adj' (not a valid identifier), so
                                # itertuples attribute access failed; use
                                # label-based access instead.
                                fig = go.Figure()
                                for i, (_, row) in enumerate(tukey_df.iterrows()):
                                    color = 'green' if row['p-adj'] < 0.05 else 'red'
                                    fig.add_trace(go.Scatter(x=[row['lower'], row['upper']], y=[i, i],
                                                             mode='lines', line=dict(color=color, width=3),
                                                             name=f"{row['group1']} vs {row['group2']}"))
                                fig.update_layout(title="Tukey HSD Confidence Intervals",
                                                  xaxis_title="Mean Difference",
                                                  yaxis_title="Comparison")
                                st.plotly_chart(fig, use_container_width=True)
                        else:
                            st.info("ℹ️ No significant differences found between groups")

                        fig = px.box(df, x=cat_col, y=num_col, title=f"ANOVA: {num_col} by {cat_col}")
                        st.plotly_chart(fig, use_container_width=True)

        except Exception as e:
            st.error(f"❌ Error in hypothesis testing: {str(e)}")
            st.info("💡 Tip: Ensure you have sufficient data and appropriate variable types for the selected test")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab4:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Distribution Analysis & Normality Tests")

        try:
            if numeric_cols:
                col = st.selectbox("Select column for distribution analysis", numeric_cols, key="dist_col")
                data = df[col].dropna()

                if len(data) > 0:
                    st.markdown("### 🔍 Normality Tests")

                    col1, col2 = st.columns(2)

                    with col1:
                        # Shapiro-Wilk is only reliable (and fast) for n <= 5000.
                        if len(data) <= 5000:
                            shapiro_stat, shapiro_p = stats.shapiro(data)
                            st.write("**Shapiro-Wilk Test**")
                            st.write(f"Statistic: {shapiro_stat:.4f}")
                            st.write(f"P-value: {shapiro_p:.4f}")
                            if shapiro_p < 0.05:
                                st.error("❌ Not normally distributed")
                            else:
                                st.success("✅ Normally distributed")

                    with col2:
                        ks_stat, ks_p = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
                        st.write("**Kolmogorov-Smirnov Test**")
                        st.write(f"Statistic: {ks_stat:.4f}")
                        st.write(f"P-value: {ks_p:.4f}")
                        if ks_p < 0.05:
                            st.error("❌ Not normally distributed")
                        else:
                            st.success("✅ Normally distributed")

                    # Anderson-Darling reports critical values, not a p-value.
                    anderson_stat, anderson_crit, anderson_sig = stats.anderson(data, dist='norm')
                    st.write("**Anderson-Darling Test**")
                    st.write(f"Statistic: {anderson_stat:.4f}")
                    for i in range(len(anderson_crit)):
                        st.write(f"Critical value at {anderson_sig[i]}%: {anderson_crit[i]:.4f}")

                    skew_stat, skew_p = stats.skewtest(data)
                    kurt_stat, kurt_p = stats.kurtosistest(data)

                    st.write("**D'Agostino's Tests**")
                    st.write(f"Skewness test p-value: {skew_p:.4f}")
                    st.write(f"Kurtosis test p-value: {kurt_p:.4f}")

                    st.markdown("### 📈 Distribution Fitting")

                    distributions = ['norm', 'expon', 'gamma', 'beta', 'lognorm', 'uniform']
                    selected_dist = st.selectbox("Select distribution to fit", distributions)

                    # Fit the chosen distribution via a name -> scipy dist map.
                    pdf = None
                    if selected_dist == 'beta':
                        # The beta distribution lives on [0, 1]; rescale and
                        # drop the endpoints before fitting.
                        scaled_data = (data - data.min()) / (data.max() - data.min())
                        scaled_data = scaled_data[(scaled_data > 0) & (scaled_data < 1)]
                        if len(scaled_data) > 0:
                            params = stats.beta.fit(scaled_data)
                            pdf = stats.beta.pdf(np.sort(scaled_data), *params)
                    else:
                        dist_obj = getattr(stats, selected_dist)
                        params = dist_obj.fit(data)
                        pdf = dist_obj.pdf(np.sort(data), *params)

                    fig = go.Figure()
                    fig.add_trace(go.Histogram(x=data, nbinsx=30, name="Data", opacity=0.7))

                    # Overlay the fitted pdf, scaled to histogram counts.
                    # (The beta fit is on rescaled data, so it is not overlaid.)
                    if selected_dist != 'beta' and pdf is not None:
                        fig.add_trace(go.Scatter(x=np.sort(data), y=pdf * len(data) * (data.max() - data.min()) / 30,
                                                 mode='lines', name=f"Fitted {selected_dist}",
                                                 line=dict(color='red', width=2)))

                    fig.update_layout(title=f"Histogram with Fitted {selected_dist} Distribution")
                    st.plotly_chart(fig, use_container_width=True)

                    st.markdown("### 📊 Enhanced Q-Q Plot")

                    # Deterministic theoretical normal quantiles (bug fix:
                    # previously random draws were used, which vary per rerun).
                    n_obs = len(data)
                    probs = (np.arange(1, n_obs + 1) - 0.5) / n_obs
                    theoretical_q = stats.norm.ppf(probs, loc=data.mean(), scale=data.std())
                    data_sorted = np.sort(data)

                    # Bootstrap 95% confidence band around the sample quantiles.
                    n_bootstrap = 100
                    bootstrap_lines = []
                    for i in range(n_bootstrap):
                        bootstrap_sample = np.random.choice(data, len(data), replace=True)
                        bootstrap_sample.sort()
                        bootstrap_lines.append(bootstrap_sample)

                    bootstrap_lines = np.array(bootstrap_lines)
                    lower_band = np.percentile(bootstrap_lines, 2.5, axis=0)
                    upper_band = np.percentile(bootstrap_lines, 97.5, axis=0)

                    fig = go.Figure()

                    fig.add_trace(go.Scatter(x=np.concatenate([theoretical_q, theoretical_q[::-1]]),
                                             y=np.concatenate([lower_band, upper_band[::-1]]),
                                             fill='toself', fillcolor='rgba(0,100,80,0.2)',
                                             line=dict(color='rgba(255,255,255,0)'),
                                             name='95% CI'))

                    fig.add_trace(go.Scatter(x=theoretical_q, y=data_sorted,
                                             mode='markers', name='Data'))

                    fig.add_trace(go.Scatter(x=[data_sorted.min(), data_sorted.max()],
                                             y=[data_sorted.min(), data_sorted.max()],
                                             mode='lines', name='Reference',
                                             line=dict(color='red', dash='dash')))

                    fig.update_layout(title=f"Enhanced Q-Q Plot with 95% Confidence Band")
                    st.plotly_chart(fig, use_container_width=True)

        except Exception as e:
            st.error(f"❌ Error in distribution analysis: {str(e)}")
            st.info("💡 Tip: Ensure you have sufficient data points for distribution fitting")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab5:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📉 Advanced Time Series Analysis")

        try:
            if datetime_cols and numeric_cols:
                date_col = st.selectbox("Select date column", datetime_cols)
                value_col = st.selectbox("Select value column", numeric_cols, key="ts_value_adv")

                # Index the series by the date column, sorted chronologically.
                ts_df = df[[date_col, value_col]].dropna().sort_values(date_col)
                ts_df.set_index(date_col, inplace=True)

                if len(ts_df) >= 10:
                    st.markdown("### 🔄 Time Series Decomposition")

                    from statsmodels.tsa.seasonal import seasonal_decompose

                    freq_options = {
                        'Auto-detect': None,
                        'Daily (7)': 7,
                        'Weekly (52)': 52,
                        'Monthly (12)': 12,
                        'Quarterly (4)': 4
                    }

                    selected_freq = st.selectbox("Select seasonal period", list(freq_options.keys()))
                    period = freq_options[selected_freq]

                    if period is None:
                        # Infer the pandas frequency and map it to a seasonal
                        # period; fall back to weekly seasonality.
                        try:
                            freq = pd.infer_freq(ts_df.index)
                            if freq:
                                period_map = {'D': 7, 'W': 52, 'M': 12, 'Q': 4}
                                period = period_map.get(freq[0], 7)
                        except Exception:
                            period = 7
                        if period is None:
                            period = 7

                    # Decomposition needs at least two full seasonal cycles.
                    if len(ts_df) >= 2 * period:
                        decomposition = seasonal_decompose(ts_df[value_col], model='additive', period=period)

                        fig = make_subplots(rows=4, cols=1,
                                            subplot_titles=('Original', 'Trend', 'Seasonal', 'Residual'))

                        fig.add_trace(go.Scatter(x=ts_df.index, y=ts_df[value_col],
                                                 mode='lines', name='Original'), row=1, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.trend,
                                                 mode='lines', name='Trend'), row=2, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.seasonal,
                                                 mode='lines', name='Seasonal'), row=3, col=1)
                        fig.add_trace(go.Scatter(x=ts_df.index, y=decomposition.resid,
                                                 mode='lines', name='Residual'), row=4, col=1)

                        fig.update_layout(height=800, title="Time Series Decomposition")
                        st.plotly_chart(fig, use_container_width=True)

                    # ADF (null: unit root) and KPSS (null: stationary) together
                    # give a cross-check on stationarity.
                    st.markdown("### 📊 Stationarity Tests")

                    col1, col2 = st.columns(2)

                    with col1:
                        adf_result = adfuller(ts_df[value_col].dropna())
                        st.write("**Augmented Dickey-Fuller Test**")
                        st.write(f"ADF Statistic: {adf_result[0]:.4f}")
                        st.write(f"p-value: {adf_result[1]:.4f}")
                        st.write(f"Critical values:")
                        for key, value in adf_result[4].items():
                            st.write(f"  {key}: {value:.4f}")

                        if adf_result[1] < 0.05:
                            st.success("✅ Series is stationary")
                        else:
                            st.warning("⚠️ Series is non-stationary")

                    with col2:
                        kpss_result = kpss(ts_df[value_col].dropna(), regression='c')
                        st.write("**KPSS Test**")
                        st.write(f"KPSS Statistic: {kpss_result[0]:.4f}")
                        st.write(f"p-value: {kpss_result[1]:.4f}")
                        st.write(f"Critical values:")
                        for key, value in kpss_result[3].items():
                            st.write(f"  {key}: {value:.4f}")

                        if kpss_result[1] < 0.05:
                            st.warning("⚠️ Series is non-stationary")
                        else:
                            st.success("✅ Series is stationary")

                    st.markdown("### 📈 ACF and PACF Plots")

                    lags = st.slider("Number of lags", 10, 50, 20)

                    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
                    plot_acf(ts_df[value_col].dropna(), lags=lags, ax=ax1)
                    plot_pacf(ts_df[value_col].dropna(), lags=lags, ax=ax2)
                    plt.tight_layout()
                    st.pyplot(fig)

                    st.markdown("### 🔮 Simple Forecasting")

                    forecast_periods = st.slider("Forecast periods", 1, 30, 10)

                    from statsmodels.tsa.holtwinters import ExponentialSmoothing

                    # Holt-Winters additive trend + seasonality forecast.
                    model = ExponentialSmoothing(ts_df[value_col],
                                                 seasonal_periods=period,
                                                 trend='add', seasonal='add')
                    fitted_model = model.fit()
                    forecast = fitted_model.forecast(forecast_periods)

                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=ts_df.index, y=ts_df[value_col],
                                             mode='lines', name='Historical'))
                    fig.add_trace(go.Scatter(x=forecast.index, y=forecast,
                                             mode='lines+markers', name='Forecast',
                                             line=dict(color='red')))
                    fig.update_layout(title=f"Exponential Smoothing Forecast ({forecast_periods} periods)")
                    st.plotly_chart(fig, use_container_width=True)

            else:
                st.info("ℹ️ Need both datetime and numeric columns for time series analysis")

        except Exception as e:
            st.error(f"❌ Error in time series analysis: {str(e)}")
            st.info("💡 Tip: Ensure your date column is properly formatted as datetime")

        st.markdown('</div>', unsafe_allow_html=True)

    with tab6:
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🎲 Probability & Sampling Analysis")

        try:
            if numeric_cols:
                col = st.selectbox("Select column for probability analysis", numeric_cols, key="prob_col")
                data = df[col].dropna()

                if len(data) > 0:
                    st.markdown("### 📊 Probability Distribution Fitting")

                    # Empirical CDF: step function over the sorted sample.
                    sorted_data = np.sort(data)
                    ecdf = np.arange(1, len(sorted_data)+1) / len(sorted_data)

                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=sorted_data, y=ecdf,
                                             mode='lines', name='Empirical CDF'))

                    # Overlay fitted theoretical CDFs for comparison.
                    dist_options = ['Normal', 'Exponential', 'Gamma', 'Log-normal']
                    selected_dist = st.multiselect("Select distributions to compare", dist_options, default=['Normal'])

                    # Display name -> scipy distribution object.
                    dist_map = {
                        'Normal': stats.norm,
                        'Exponential': stats.expon,
                        'Gamma': stats.gamma,
                        'Log-normal': stats.lognorm
                    }

                    colors = ['red', 'green', 'blue', 'orange']
                    for i, dist_name in enumerate(selected_dist):
                        dist_obj = dist_map[dist_name]
                        params = dist_obj.fit(data)
                        theoretical_cdf = dist_obj.cdf(sorted_data, *params)

                        fig.add_trace(go.Scatter(x=sorted_data, y=theoretical_cdf,
                                                 mode='lines', name=f'{dist_name} CDF',
                                                 line=dict(color=colors[i], dash='dash')))

                    fig.update_layout(title="CDF Comparison: Empirical vs Theoretical",
                                      xaxis_title=col, yaxis_title="Cumulative Probability")
                    st.plotly_chart(fig, use_container_width=True)

                    st.markdown("### 📈 Goodness of Fit Tests")

                    # Scipy name of each distribution for kstest.
                    ks_names = {'Normal': 'norm', 'Exponential': 'expon',
                                'Gamma': 'gamma', 'Log-normal': 'lognorm'}
                    for dist_name in selected_dist:
                        dist_obj = dist_map[dist_name]
                        ks_stat, ks_p = stats.kstest(data, ks_names[dist_name], args=dist_obj.fit(data))

                        st.write(f"**{dist_name} Distribution**")
                        st.write(f"KS Statistic: {ks_stat:.4f}")
                        st.write(f"P-value: {ks_p:.4f}")

                        if ks_p < 0.05:
                            st.error(f"❌ Data does NOT follow {dist_name} distribution")
                        else:
                            st.success(f"✅ Data may follow {dist_name} distribution")

                    st.markdown("### 🎯 Sampling Analysis")

                    sample_size = st.slider("Sample size", 10, min(500, len(data)), 100)
                    n_samples = st.slider("Number of samples", 10, 1000, 100)

                    # Bootstrap the sampling distribution of the mean.
                    bootstrap_means = []
                    for i in range(n_samples):
                        sample = np.random.choice(data, sample_size, replace=True)
                        bootstrap_means.append(sample.mean())

                    bootstrap_means = np.array(bootstrap_means)

                    fig = make_subplots(rows=1, cols=2,
                                        subplot_titles=("Sampling Distribution of Mean",
                                                        "Confidence Intervals"))

                    fig.add_trace(go.Histogram(x=bootstrap_means, nbinsx=30,
                                               name="Sample Means"), row=1, col=1)

                    # Percentile-based 95% bootstrap confidence interval.
                    ci_lower = np.percentile(bootstrap_means, 2.5)
                    ci_upper = np.percentile(bootstrap_means, 97.5)

                    fig.add_trace(go.Scatter(x=[ci_lower, ci_lower], y=[0, 10],
                                             mode='lines', name='95% CI Lower',
                                             line=dict(color='red', dash='dash')), row=1, col=1)
                    fig.add_trace(go.Scatter(x=[ci_upper, ci_upper], y=[0, 10],
                                             mode='lines', name='95% CI Upper',
                                             line=dict(color='red', dash='dash')), row=1, col=1)

                    # Show up to 20 per-sample standard-error bars.
                    for i in range(min(20, n_samples)):
                        sample_mean = bootstrap_means[i]
                        fig.add_trace(go.Scatter(x=[i, i], y=[sample_mean - data.std()/np.sqrt(sample_size),
                                                              sample_mean + data.std()/np.sqrt(sample_size)],
                                                 mode='lines', line=dict(color='blue', width=1),
                                                 showlegend=False), row=1, col=2)
                        fig.add_trace(go.Scatter(x=[i], y=[sample_mean],
                                                 mode='markers', marker=dict(color='red', size=5),
                                                 showlegend=False), row=1, col=2)

                    fig.update_layout(height=500, title="Bootstrap Sampling Analysis")
                    st.plotly_chart(fig, use_container_width=True)

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Population Mean", f"{data.mean():.4f}")
                    with col2:
                        st.metric("Mean of Sample Means", f"{bootstrap_means.mean():.4f}")
                    with col3:
                        st.metric("Standard Error", f"{bootstrap_means.std():.4f}")

                    st.write(f"**95% Confidence Interval:** [{ci_lower:.4f}, {ci_upper:.4f}]")

        except Exception as e:
            st.error(f"❌ Error in probability analysis: {str(e)}")
            st.info("💡 Tip: Ensure you have sufficient data for probability analysis")

        st.markdown('</div>', unsafe_allow_html=True)

    # Export options
    st.markdown("---")
    st.markdown("### 📥 Export Statistical Report")

    try:
        report_text = f"""
STATISTICAL ANALYSIS REPORT
===========================

Dataset Information:
• Total Rows: {df.shape[0]:,}
• Total Columns: {df.shape[1]}
• Numeric Columns: {len(numeric_cols)}
• Categorical Columns: {len(categorical_cols)}
• Datetime Columns: {len(datetime_cols)}

Summary Statistics:
{df[numeric_cols].describe().to_string()}

Analysis Performed:
• Descriptive Statistics
• Correlation Analysis
• Hypothesis Testing
• Distribution Analysis
• Time Series Analysis (if applicable)
• Probability & Sampling Analysis
"""

        st.download_button(
            label="📥 Download Complete Statistical Report",
            data=report_text,
            file_name="statistical_analysis_report.txt",
            mime="text/plain",
            use_container_width=True
        )
    except Exception as e:
        st.error(f"❌ Error generating report: {str(e)}")
utils.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+
5
def detect_column_types(df):
    """
    Detect and return the dataset's column names grouped by type.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.

    Returns
    -------
    tuple[list, list, list, list]
        (numeric, categorical, datetime, boolean) lists of column names,
        each in the DataFrame's column order.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # 'datetime64' alone only matches timezone-naive columns; include
    # 'datetimetz' as well so tz-aware columns (datetime64[ns, tz]) are
    # detected too. Backward-compatible for naive-only datasets.
    datetime = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
    boolean = df.select_dtypes(include=['bool']).columns.tolist()

    return numeric, categorical, datetime, boolean
15
+
16
def get_basic_stats(df):
    """
    Return basic statistics about the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarize.

    Returns
    -------
    dict
        Keys: 'rows', 'columns', 'missing_values', 'missing_percentage',
        'duplicates', 'memory_usage' (in MB).
    """
    total_cells = df.shape[0] * df.shape[1]
    missing = int(df.isnull().sum().sum())
    stats = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': missing,
        # Guard against ZeroDivisionError for an empty DataFrame
        # (0 rows and/or 0 columns -> 0 total cells).
        'missing_percentage': (missing / total_cells) * 100 if total_cells else 0.0,
        'duplicates': int(df.duplicated().sum()),
        'memory_usage': df.memory_usage(deep=True).sum() / 1024**2  # MB
    }
    return stats
29
+
30
def suggest_visualizations(df):
    """
    Suggest appropriate visualizations based on the dataset's column types.

    Returns a list of dicts, each with a 'type', a human-readable
    'description', and (for most suggestions) the 'columns' to plot.
    """
    # Classify columns by dtype (boolean columns are not used here).
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime = df.select_dtypes(include=['datetime64']).columns.tolist()

    suggestions = []

    if numeric:
        suggestions.append({
            'type': 'histogram',
            'description': 'Distribution of numeric columns',
            'columns': numeric[:3],
        })

    if categorical:
        suggestions.append({
            'type': 'bar_chart',
            'description': 'Category distributions',
            'columns': categorical[:3],
        })

    if len(numeric) >= 2:
        suggestions.append({
            'type': 'scatter_plot',
            'description': 'Relationship between numeric variables',
            'columns': numeric[:2],
        })

    if datetime and numeric:
        suggestions.append({
            'type': 'line_chart',
            'description': 'Time series trends',
            'columns': [datetime[0], numeric[0]],
        })

    if len(numeric) > 1:
        # Heatmap has no fixed column subset: it uses all numeric columns.
        suggestions.append({
            'type': 'correlation_heatmap',
            'description': 'Correlations between numeric variables',
        })

    return suggestions
73
+
74
def format_number(num):
    """Format a number with thousands separators; NaN/None renders as "N/A"."""
    return "N/A" if pd.isna(num) else f"{num:,.0f}"
81
+
82
def format_percentage(num):
    """Format a number as a percentage with one decimal; NaN/None renders as "N/A"."""
    if pd.isna(num):
        return "N/A"
    return "{:.1f}%".format(num)
89
+
90
def get_data_quality_issues(df):
    """
    Identify data quality issues in the dataset.

    Returns a list of issue dicts; each has at least 'type', 'severity'
    and 'description'. Extra keys ('columns', 'count', 'column',
    'outlier_count') depend on the issue type.
    """
    issues = []
    n_rows = len(df)

    # Missing values: one summary issue covering all affected columns.
    cols_with_nulls = df.columns[df.isnull().any()].tolist()
    if cols_with_nulls:
        total_missing = df.isnull().sum().sum()
        issues.append({
            'type': 'missing_values',
            'severity': 'high' if total_missing > n_rows * 0.1 else 'medium',
            'description': f'Missing values in {len(cols_with_nulls)} columns',
            'columns': cols_with_nulls,
        })

    # Exact duplicate rows.
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        issues.append({
            'type': 'duplicates',
            'severity': 'medium' if dup_count > n_rows * 0.05 else 'low',
            'description': f'{dup_count} duplicate rows found',
            'count': dup_count,
        })

    # Columns holding a single distinct value carry no information.
    single_valued = [c for c in df.columns if df[c].nunique() == 1]
    if single_valued:
        issues.append({
            'type': 'constant_columns',
            'severity': 'low',
            'description': f'{len(single_valued)} constant columns found',
            'columns': single_valued,
        })

    # IQR-based outlier scan over numeric columns; only the first column
    # with >10% outliers is reported (matches the original behavior).
    for col in df.select_dtypes(include=[np.number]).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        spread = q3 - q1
        flagged = df[(df[col] < q1 - 1.5 * spread) | (df[col] > q3 + 1.5 * spread)]
        if len(flagged) > n_rows * 0.1:
            issues.append({
                'type': 'outliers',
                'severity': 'medium',
                'description': f'Significant outliers in {col}',
                'column': col,
                'outlier_count': len(flagged),
            })
            break  # Just report first outlier issue

    return issues
144
+
145
def get_recommendations(df):
    """
    Generate data analysis recommendations for the dataset.

    Returns a list of recommendation strings driven by the dataset's
    column-type mix, missing-data presence, and size.
    """
    # Classify columns by dtype (boolean columns are not used here).
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime = df.select_dtypes(include=['datetime64']).columns.tolist()

    # (condition, message) pairs evaluated eagerly; order defines output order.
    checks = [
        (df.isnull().sum().sum() > 0,
         "Consider handling missing values using imputation or removal"),
        (len(numeric) >= 2,
         "Create interaction features between highly correlated variables"),
        (bool(datetime),
         "Extract time-based features (hour, day, month, year) from datetime columns"),
        (len(numeric) > 5,
         "Consider dimensionality reduction techniques (PCA, t-SNE)"),
        (df.shape[0] > 10000,
         "Dataset is large - consider sampling for faster exploration"),
        (len(numeric) > 2,
         "Use pair plots to visualize relationships between multiple variables"),
        (len(categorical) > 1,
         "Create contingency tables to analyze categorical relationships"),
    ]

    return [message for hit, message in checks if hit]
179
+
180
def create_sample_dataset():
    """
    Create a reproducible synthetic dataset for testing the app.

    The result has 1000 base rows plus 10 appended duplicates, with ~5%
    of cells blanked out so missing-value handling can be exercised.
    The RNG is seeded, so every call returns the identical dataset.
    """
    np.random.seed(42)  # fixed seed -> deterministic output
    n_rows = 1000

    # NOTE: the dict values below draw from np.random in this exact order,
    # which must be preserved to keep the seeded output stable.
    frame = pd.DataFrame({
        'id': range(n_rows),
        'age': np.random.normal(40, 15, n_rows).clip(18, 90).astype(int),
        'income': np.random.normal(50000, 20000, n_rows).clip(20000, 150000).astype(int),
        'score': np.random.uniform(0, 100, n_rows).round(2),
        'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_rows),
        'purchased': np.random.choice([0, 1], n_rows, p=[0.7, 0.3]),
        'signup_date': pd.date_range('2023-01-01', periods=n_rows, freq='D'),
        'satisfaction': np.random.choice([1, 2, 3, 4, 5], n_rows, p=[0.1, 0.15, 0.3, 0.25, 0.2]),
    })

    # Blank out roughly 5% of all cells (any column may be affected).
    frame = frame.mask(np.random.random(frame.shape) < 0.05)

    # Append 10 duplicated rows so duplicate detection has work to do.
    dup_idx = np.random.choice(n_rows, 10, replace=False)
    frame = pd.concat([frame, frame.iloc[dup_idx]]).reset_index(drop=True)

    return frame
visualization.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ from plotly.subplots import make_subplots
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
def auto_visualizations(df):
    """Render the interactive visualization page for the given DataFrame.

    Streamlit UI: the user picks a visualization family from a selectbox,
    then the matching branch builds Plotly charts from the dataset's
    numeric / categorical / datetime columns. Renders widgets and charts
    as a side effect; returns None.

    NOTE(review): indentation reconstructed from a diff view that stripped
    leading whitespace — branch nesting inferred from message semantics;
    confirm against the original file.
    """
    st.markdown("""
    <div style='text-align: center; margin-bottom: 2rem;'>
        <h2>📊 Interactive Data Visualization</h2>
        <p style='color: gray;'>Create beautiful, interactive visualizations with just a few clicks</p>
    </div>
    """, unsafe_allow_html=True)

    # Get column types (tz-aware datetime columns are not matched by 'datetime64')
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    date_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    # Visualization type selector — drives which branch below renders
    viz_type = st.selectbox(
        "🎨 Select Visualization Type",
        ["Distribution Plots", "Categorical Plots", "Relationship Plots",
         "Time Series Plots", "Statistical Plots", "Advanced Plots"]
    )

    if viz_type == "Distribution Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📈 Distribution Plots")

        if num_cols:
            # Create tabs for different distribution plots
            dist_tab1, dist_tab2, dist_tab3 = st.tabs(["Histogram", "Box Plot", "Violin Plot"])

            with dist_tab1:
                col1, col2 = st.columns(2)
                with col1:
                    hist_col = st.selectbox("Select column", num_cols, key="hist")
                with col2:
                    bins = st.slider("Number of bins", 5, 100, 30)

                # Histogram with a marginal box plot on top
                fig = px.histogram(df, x=hist_col, nbins=bins,
                                   title=f"Distribution of {hist_col}",
                                   marginal="box", opacity=0.7)
                fig.update_layout(showlegend=False)
                st.plotly_chart(fig, use_container_width=True)

            with dist_tab2:
                if cat_cols:
                    box_col = st.selectbox("Numeric column", num_cols, key="box_num")
                    box_cat = st.selectbox("Category column (optional)", ["None"] + cat_cols, key="box_cat")

                    # Ungrouped box plot unless a category was chosen
                    if box_cat == "None":
                        fig = px.box(df, y=box_col, title=f"Box Plot of {box_col}")
                    else:
                        fig = px.box(df, x=box_cat, y=box_col, title=f"{box_col} by {box_cat}")

                    st.plotly_chart(fig, use_container_width=True)
                else:
                    st.info("Add categorical columns to create grouped box plots")

            with dist_tab3:
                # Violin plot needs both a numeric and a categorical column
                if cat_cols:
                    violin_col = st.selectbox("Numeric column", num_cols, key="violin_num")
                    violin_cat = st.selectbox("Category column", cat_cols, key="violin_cat")

                    fig = px.violin(df, x=violin_cat, y=violin_col,
                                    box=True, points="all",
                                    title=f"Violin Plot of {violin_col} by {violin_cat}")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns available for distribution plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Categorical Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📊 Categorical Plots")

        if cat_cols:
            # Create tabs for categorical plots
            cat_tab1, cat_tab2, cat_tab3 = st.tabs(["Bar Chart", "Pie Chart", "Sunburst Chart"])

            with cat_tab1:
                bar_col = st.selectbox("Select categorical column", cat_cols, key="bar")

                # Get value counts
                value_counts = df[bar_col].value_counts().reset_index()
                value_counts.columns = [bar_col, 'count']

                # Color option (only offered when numeric columns exist)
                if num_cols:
                    color_by = st.selectbox("Color by (optional)", ["None"] + num_cols, key="bar_color")
                else:
                    color_by = "None"

                if color_by == "None":
                    fig = px.bar(value_counts, x=bar_col, y='count',
                                 title=f"Distribution of {bar_col}",
                                 color_discrete_sequence=['#636EFA'])
                else:
                    # Aggregate numeric column by category (mean per group)
                    agg_data = df.groupby(bar_col)[color_by].mean().reset_index()
                    fig = px.bar(agg_data, x=bar_col, y=color_by,
                                 title=f"Average {color_by} by {bar_col}",
                                 color=bar_col)

                fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(fig, use_container_width=True)

            with cat_tab2:
                pie_col = st.selectbox("Select column for pie chart", cat_cols, key="pie")

                # Limit to top N categories for readability
                top_n = st.slider("Show top N categories", 3, 20, 10)
                value_counts = df[pie_col].value_counts().head(top_n)

                fig = px.pie(values=value_counts.values, names=value_counts.index,
                             title=f"Proportion of {pie_col} (Top {top_n})",
                             hole=0.3)
                fig.update_traces(textposition='inside', textinfo='percent+label')
                st.plotly_chart(fig, use_container_width=True)

            with cat_tab3:
                if len(cat_cols) >= 2:
                    st.markdown("**Hierarchical View**")
                    path = st.multiselect("Select hierarchy (order matters)",
                                          cat_cols, default=cat_cols[:2])

                    if len(path) >= 2:
                        fig = px.sunburst(df, path=path,
                                          title="Hierarchical Distribution")
                        st.plotly_chart(fig, use_container_width=True)
                else:
                    st.info("Need at least 2 categorical columns for sunburst chart")
        else:
            st.warning("No categorical columns available")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Relationship Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🔄 Relationship Plots")

        if len(num_cols) >= 2:
            rel_tab1, rel_tab2, rel_tab3 = st.tabs(["Scatter Plot", "Line Plot", "Heatmap"])

            with rel_tab1:
                col1, col2, col3 = st.columns(3)
                with col1:
                    x_col = st.selectbox("X axis", num_cols, key="scatter_x")
                with col2:
                    y_col = st.selectbox("Y axis", [c for c in num_cols if c != x_col], key="scatter_y")
                with col3:
                    color_col = st.selectbox("Color by", ["None"] + cat_cols + num_cols, key="scatter_color")

                size_col = st.selectbox("Size by (optional)", ["None"] + num_cols, key="scatter_size")

                # Create scatter plot — one branch per color/size combination.
                # NOTE(review): trendline="ols" requires statsmodels at runtime — confirm it is in requirements.
                if color_col == "None" and size_col == "None":
                    fig = px.scatter(df, x=x_col, y=y_col,
                                     title=f"{y_col} vs {x_col}",
                                     trendline="ols")
                elif color_col != "None" and size_col == "None":
                    fig = px.scatter(df, x=x_col, y=y_col, color=color_col,
                                     title=f"{y_col} vs {x_col} colored by {color_col}",
                                     trendline="ols")
                elif color_col == "None" and size_col != "None":
                    fig = px.scatter(df, x=x_col, y=y_col, size=size_col,
                                     title=f"{y_col} vs {x_col} sized by {size_col}",
                                     trendline="ols")
                else:
                    fig = px.scatter(df, x=x_col, y=y_col, color=color_col, size=size_col,
                                     title=f"{y_col} vs {x_col}",
                                     trendline="ols")

                st.plotly_chart(fig, use_container_width=True)

            with rel_tab2:
                col1, col2 = st.columns(2)
                with col1:
                    line_x = st.selectbox("X axis (usually time)", num_cols + date_cols, key="line_x")
                with col2:
                    line_y = st.selectbox("Y axis", num_cols, key="line_y")

                line_color = st.selectbox("Color by", ["None"] + cat_cols, key="line_color")

                if line_color == "None":
                    fig = px.line(df, x=line_x, y=line_y,
                                  title=f"{line_y} over {line_x}")
                else:
                    fig = px.line(df, x=line_x, y=line_y, color=line_color,
                                  title=f"{line_y} over {line_x} by {line_color}")

                st.plotly_chart(fig, use_container_width=True)

            with rel_tab3:
                # Correlation heatmap
                corr_matrix = df[num_cols].corr()

                # Mask for upper triangle: zeroes out cells above the
                # diagonal so only the lower triangle is displayed
                mask = np.triu(np.ones_like(corr_matrix), k=1)
                masked_corr = corr_matrix * (1 - mask)

                fig = px.imshow(masked_corr,
                                text_auto=True,
                                aspect="auto",
                                color_continuous_scale='RdBu_r',
                                title="Correlation Heatmap",
                                zmin=-1, zmax=1)

                st.plotly_chart(fig, use_container_width=True)

                # Show strongest correlations (top 5 by absolute value)
                st.markdown("**Strongest Correlations:**")
                corr_pairs = []
                for i in range(len(num_cols)):
                    for j in range(i+1, len(num_cols)):
                        corr_pairs.append((num_cols[i], num_cols[j],
                                           corr_matrix.iloc[i, j]))

                corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

                for col1, col2, corr in corr_pairs[:5]:
                    strength = "🟢" if abs(corr) > 0.7 else "🟡" if abs(corr) > 0.3 else "🔴"
                    st.write(f"{strength} **{col1}** & **{col2}**: {corr:.3f}")
        else:
            st.warning("Need at least 2 numeric columns for relationship plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Time Series Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📅 Time Series Plots")

        if date_cols:
            ts_tab1, ts_tab2 = st.tabs(["Time Series", "Resampling"])

            with ts_tab1:
                date_col = st.selectbox("Date column", date_cols, key="ts_date")
                value_col = st.selectbox("Value column", num_cols if num_cols else [], key="ts_value")

                if num_cols and date_col:
                    # Sort by date so the line renders chronologically
                    df_sorted = df.sort_values(date_col)

                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted[value_col],
                                             mode='lines+markers', name=value_col))

                    fig.update_layout(title=f"{value_col} over Time",
                                      xaxis_title="Date",
                                      yaxis_title=value_col)

                    st.plotly_chart(fig, use_container_width=True)

            with ts_tab2:
                if num_cols and date_cols:
                    date_col = st.selectbox("Select date column", date_cols, key="resample_date")
                    resample_col = st.selectbox("Select column to resample", num_cols, key="resample_col")

                    freq = st.selectbox("Resampling frequency",
                                        ["Daily", "Weekly", "Monthly", "Quarterly", "Yearly"])

                    # Map UI labels to pandas resample offset aliases
                    freq_map = {
                        "Daily": "D",
                        "Weekly": "W",
                        "Monthly": "M",
                        "Quarterly": "Q",
                        "Yearly": "Y"
                    }

                    # Set date as index (resample requires a datetime index)
                    df_date = df.set_index(date_col)

                    # Resample: mean per period
                    resampled = df_date[resample_col].resample(freq_map[freq]).mean().reset_index()

                    fig = px.line(resampled, x=date_col, y=resample_col,
                                  title=f"{resample_col} ({freq} Aggregated)")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No datetime columns found. Convert a column to datetime first.")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Statistical Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("📐 Statistical Plots")

        if num_cols:
            stat_tab1, stat_tab2, stat_tab3 = st.tabs(["QQ Plot", "ECDF", "Density Heatmap"])

            with stat_tab1:
                qq_col = st.selectbox("Select column for QQ plot", num_cols, key="qq")

                # Calculate quantiles — theoretical side is drawn from a
                # freshly sampled standard normal (unseeded, so the QQ plot
                # varies slightly between reruns)
                data = df[qq_col].dropna()
                theoretical_quantiles = np.percentile(np.random.normal(0, 1, len(data)),
                                                      np.linspace(0, 100, len(data)))
                sample_quantiles = np.percentile(data, np.linspace(0, 100, len(data)))

                fig = go.Figure()
                fig.add_trace(go.Scatter(x=theoretical_quantiles, y=sample_quantiles,
                                         mode='markers', name='Data'))

                # Add diagonal line (perfect-normality reference)
                min_val = min(theoretical_quantiles.min(), sample_quantiles.min())
                max_val = max(theoretical_quantiles.max(), sample_quantiles.max())
                fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                                         mode='lines', name='Normal',
                                         line=dict(color='red', dash='dash')))

                fig.update_layout(title=f"QQ Plot - {qq_col}",
                                  xaxis_title="Theoretical Quantiles",
                                  yaxis_title="Sample Quantiles")

                st.plotly_chart(fig, use_container_width=True)

            with stat_tab2:
                ecdf_col = st.selectbox("Select column for ECDF", num_cols, key="ecdf")

                fig = px.ecdf(df, x=ecdf_col,
                              title=f"Empirical Cumulative Distribution - {ecdf_col}")
                st.plotly_chart(fig, use_container_width=True)

            with stat_tab3:
                if len(num_cols) >= 2:
                    x_col = st.selectbox("X axis", num_cols, key="density_x")
                    y_col = st.selectbox("Y axis", [c for c in num_cols if c != x_col], key="density_y")

                    fig = px.density_heatmap(df, x=x_col, y=y_col,
                                             title=f"Density Heatmap: {y_col} vs {x_col}",
                                             marginal_x="histogram",
                                             marginal_y="histogram")
                    st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("No numeric columns available for statistical plots")

        st.markdown('</div>', unsafe_allow_html=True)

    elif viz_type == "Advanced Plots":
        st.markdown('<div class="custom-card">', unsafe_allow_html=True)
        st.subheader("🚀 Advanced Visualizations")

        adv_tab1, adv_tab2, adv_tab3 = st.tabs(["3D Scatter", "Parallel Coordinates", "Radar Chart"])

        with adv_tab1:
            if len(num_cols) >= 3:
                col1, col2, col3 = st.columns(3)
                with col1:
                    x_3d = st.selectbox("X axis", num_cols, key="3d_x")
                with col2:
                    y_3d = st.selectbox("Y axis", [c for c in num_cols if c != x_3d], key="3d_y")
                with col3:
                    z_3d = st.selectbox("Z axis", [c for c in num_cols if c not in [x_3d, y_3d]], key="3d_z")

                color_3d = st.selectbox("Color by", ["None"] + cat_cols + num_cols, key="3d_color")

                if color_3d == "None":
                    fig = px.scatter_3d(df, x=x_3d, y=y_3d, z=z_3d,
                                        title=f"3D Scatter: {x_3d}, {y_3d}, {z_3d}")
                else:
                    fig = px.scatter_3d(df, x=x_3d, y=y_3d, z=z_3d, color=color_3d,
                                        title=f"3D Scatter colored by {color_3d}")

                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("Need at least 3 numeric columns for 3D scatter plot")

        with adv_tab2:
            if num_cols:
                selected_dims = st.multiselect("Select dimensions", num_cols, default=num_cols[:4])

                if selected_dims and len(selected_dims) >= 2:
                    color_dim = st.selectbox("Color dimension", ["None"] + cat_cols + num_cols)

                    if color_dim == "None":
                        fig = px.parallel_coordinates(df, dimensions=selected_dims,
                                                      title="Parallel Coordinates Plot")
                    else:
                        fig = px.parallel_coordinates(df, dimensions=selected_dims,
                                                      color=color_dim,
                                                      title=f"Parallel Coordinates colored by {color_dim}")

                    st.plotly_chart(fig, use_container_width=True)

        with adv_tab3:
            if num_cols:
                st.markdown("**Radar Chart** (requires at least 3 numeric columns)")
                selected_radar = st.multiselect("Select metrics for radar chart",
                                                num_cols, default=num_cols[:3])

                if len(selected_radar) >= 3:
                    # Get first row as sample — the chart shows row 0 only
                    sample = df[selected_radar].iloc[0]

                    fig = go.Figure(data=go.Scatterpolar(
                        r=sample.values,
                        theta=selected_radar,
                        fill='toself'
                    ))

                    fig.update_layout(
                        polar=dict(
                            radialaxis=dict(
                                visible=True,
                                range=[sample.min(), sample.max()]
                            )),
                        showlegend=False,
                        title="Radar Chart (First Row)"
                    )

                    st.plotly_chart(fig, use_container_width=True)

        st.markdown('</div>', unsafe_allow_html=True)

    # Download plot data option — rendered for every visualization type
    st.markdown("---")
    st.markdown("### 💾 Export Options")

    col1, col2 = st.columns(2)
    with col1:
        st.info("To save any plot, hover over it and click the camera icon 📷")
    with col2:
        csv = df.to_csv(index=False)
        st.download_button(
            label="📥 Download Data as CSV",
            data=csv,
            file_name="visualization_data.csv",
            mime="text/csv",
            use_container_width=True
        )