HaryaniAnjali committed
Commit 06fe39c · verified · 1 parent: 3f73c0a

Create app.py

Files changed (1)
  1. app.py +788 -0
app.py ADDED
@@ -0,0 +1,788 @@
+ import gradio as gr
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import plotly.express as px
+ import os
+ import json
+ import re
+ import tempfile
+ from sklearn.decomposition import PCA
+ from sklearn.preprocessing import StandardScaler
+ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
+ import torch  # backend required by the transformers pipeline
+ import openai
+
+ # Set plot styling
+ sns.set(style="whitegrid")
+ plt.rcParams["figure.figsize"] = (10, 6)
+
+ # Initialize AI models
+ def initialize_ai_models():
+     """Initialize the AI models for data analysis."""
+     # The OpenAI API key is read from the environment at call time.
+     # Note: users need to set OPENAI_API_KEY in their Hugging Face Space secrets.
+
+     # Initialize the Hugging Face model used for data recommendations.
+     # flan-t5 is a sequence-to-sequence model, so it needs
+     # AutoModelForSeq2SeqLM and the text2text-generation pipeline.
+     try:
+         tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
+         model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
+         data_assistant = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+     except Exception:
+         # Fall back to a smaller causal model if the main one fails to load
+         data_assistant = pipeline("text-generation", model="distilgpt2")
+
+     return data_assistant
+
+ # Global variable for the AI model
+ data_assistant = None
+
+ def read_file(file):
+     """Read different file formats into a pandas DataFrame."""
+     if file is None:
+         return None
+
+     # Gradio file inputs expose the uploaded temp file's path via .name
+     file_name = file.name if hasattr(file, 'name') else str(file)
+
+     try:
+         # Handle different file types
+         if file_name.endswith('.csv'):
+             return pd.read_csv(file_name)
+         elif file_name.endswith(('.xls', '.xlsx')):
+             return pd.read_excel(file_name)
+         elif file_name.endswith('.json'):
+             return pd.read_json(file_name)
+         elif file_name.endswith('.txt'):
+             return pd.read_csv(file_name, delimiter='\t')
+         else:
+             return "Unsupported file format. Please upload .csv, .xlsx, .xls, .json, or .txt files."
+     except Exception as e:
+         return f"Error reading file: {str(e)}"
+
+ def analyze_data(df):
+     """Generate basic statistics and information about the dataset."""
+     if not isinstance(df, pd.DataFrame):
+         return df  # Return the error message if df is not a DataFrame
+
+     # Basic info
+     info = {}
+     info['Shape'] = df.shape
+     info['Columns'] = df.columns.tolist()
+     info['Data Types'] = df.dtypes.astype(str).to_dict()
+
+     # Check for missing values
+     missing_values = df.isnull().sum()
+     if missing_values.sum() > 0:
+         info['Missing Values'] = missing_values[missing_values > 0].to_dict()
+     else:
+         info['Missing Values'] = "No missing values found"
+
+     # Data quality issues
+     info['Data Quality Issues'] = identify_data_quality_issues(df)
+
+     # Basic statistics for numerical columns
+     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+     if numeric_cols:
+         info['Numeric Columns'] = numeric_cols
+         info['Statistics'] = df[numeric_cols].describe().to_html()
+
+         # Check for outliers
+         outliers = detect_outliers(df, numeric_cols)
+         if outliers:
+             info['Outliers'] = outliers
+
+     # Identify categorical columns
+     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+     if categorical_cols:
+         info['Categorical Columns'] = categorical_cols
+         # Unique value counts for the first 5 categorical columns (top 10 values each)
+         cat_counts = {}
+         for col in categorical_cols[:5]:
+             cat_counts[col] = df[col].value_counts().head(10).to_dict()
+         info['Category Counts'] = cat_counts
+
+     return info
+
+ def identify_data_quality_issues(df):
+     """Identify common data quality issues."""
+     issues = {}
+
+     # Check for duplicate rows
+     duplicate_count = df.duplicated().sum()
+     if duplicate_count > 0:
+         issues['Duplicate Rows'] = duplicate_count
+
+     # Check for high cardinality in categorical columns
+     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+     high_cardinality = {}
+     for col in categorical_cols:
+         unique_count = df[col].nunique()
+         if unique_count > 50:  # Arbitrary threshold
+             high_cardinality[col] = unique_count
+
+     if high_cardinality:
+         issues['High Cardinality Columns'] = high_cardinality
+
+     # Check for potential date columns not properly formatted
+     potential_date_cols = []
+     for col in df.select_dtypes(include=['object']).columns:
+         # Sample the first 10 non-null values
+         sample = df[col].dropna().head(10).tolist()
+         if all(isinstance(x, str) for x in sample):
+             # Simple date pattern check
+             date_pattern = re.compile(r'\d{1,4}[-/\.]\d{1,2}[-/\.]\d{1,4}')
+             if any(date_pattern.search(str(x)) for x in sample):
+                 potential_date_cols.append(col)
+
+     if potential_date_cols:
+         issues['Potential Date Columns'] = potential_date_cols
+
+     # Check for columns with mostly missing values
+     high_missing = {}
+     for col in df.columns:
+         missing_pct = df[col].isnull().mean() * 100
+         if missing_pct > 50:  # More than 50% missing
+             high_missing[col] = f"{missing_pct:.2f}%"
+
+     if high_missing:
+         issues['Columns with >50% Missing'] = high_missing
+
+     return issues
+
+ def detect_outliers(df, numeric_cols):
+     """Detect outliers in numeric columns using the IQR method."""
+     outliers = {}
+
+     for col in numeric_cols:
+         # Skip columns with too many unique values (potentially ID columns)
+         if df[col].nunique() > df.shape[0] * 0.9:
+             continue
+
+         # Calculate IQR
+         Q1 = df[col].quantile(0.25)
+         Q3 = df[col].quantile(0.75)
+         IQR = Q3 - Q1
+
+         # Define outlier bounds
+         lower_bound = Q1 - 1.5 * IQR
+         upper_bound = Q3 + 1.5 * IQR
+
+         # Count outliers
+         outlier_count = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
+
+         if outlier_count > 0:
+             outlier_pct = (outlier_count / df.shape[0]) * 100
+             if outlier_pct > 1:  # Only report if more than 1% are outliers
+                 outliers[col] = {
+                     'count': outlier_count,
+                     'percentage': f"{outlier_pct:.2f}%",
+                     'lower_bound': lower_bound,
+                     'upper_bound': upper_bound
+                 }
+
+     return outliers
+
+ def generate_visualizations(df):
+     """Generate appropriate visualizations based on the data types."""
+     if not isinstance(df, pd.DataFrame):
+         return df  # Return the error message if df is not a DataFrame
+
+     visualizations = {}
+
+     # Identify column types
+     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+     date_cols = [col for col in df.columns if df[col].dtype == 'datetime64[ns]' or
+                  (df[col].dtype == 'object' and pd.to_datetime(df[col], errors='coerce').notna().all())]
+
+     # 1. Distribution plots for numeric columns (first 5)
+     for col in numeric_cols[:5]:
+         fig = px.histogram(df, x=col, marginal="box", title=f"Distribution of {col}")
+         visualizations[f'dist_{col}'] = fig
+
+     # 2. Bar charts for categorical columns (first 5)
+     for col in categorical_cols[:5]:
+         value_counts = df[col].value_counts().nlargest(10)  # Top 10 categories
+         fig = px.bar(x=value_counts.index, y=value_counts.values,
+                      title=f"Top 10 categories in {col}")
+         fig.update_xaxes(title=col)
+         fig.update_yaxes(title="Count")
+         visualizations[f'bar_{col}'] = fig
+
+     # 3. Correlation heatmap for numeric columns
+     if len(numeric_cols) > 1:
+         corr_matrix = df[numeric_cols].corr()
+         fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
+                         title="Correlation Heatmap")
+         visualizations['correlation'] = fig
+
+     # 4. Scatter plot matrix (first 4 numeric columns)
+     if len(numeric_cols) >= 2:
+         plot_cols = numeric_cols[:4]
+         fig = px.scatter_matrix(df, dimensions=plot_cols, title="Scatter Plot Matrix")
+         visualizations['scatter_matrix'] = fig
+
+     # 5. Time series plot if a date column exists
+     if date_cols and numeric_cols:
+         date_col = date_cols[0]  # Use the first date column
+         # Convert to datetime if not already
+         if df[date_col].dtype != 'datetime64[ns]':
+             df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
+
+         # Sort by date
+         df_sorted = df.sort_values(by=date_col)
+
+         # Create a time series for the first numeric column
+         num_col = numeric_cols[0]
+         fig = px.line(df_sorted, x=date_col, y=num_col,
+                       title=f"{num_col} over Time")
+         visualizations['time_series'] = fig
+
+     # 6. PCA visualization if there are enough numeric columns
+     if len(numeric_cols) >= 3:
+         # Fill NaN values with the column means before PCA
+         numeric_data = df[numeric_cols].fillna(df[numeric_cols].mean())
+
+         # Standardize the data
+         scaler = StandardScaler()
+         scaled_data = scaler.fit_transform(numeric_data)
+
+         # Apply PCA with 2 components
+         pca = PCA(n_components=2)
+         pca_result = pca.fit_transform(scaled_data)
+
+         # Create a DataFrame with the PCA results
+         pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
+
+         # If a categorical column exists, use it for color
+         if categorical_cols:
+             cat_col = categorical_cols[0]
+             pca_df[cat_col] = df[cat_col].values
+             fig = px.scatter(pca_df, x='PC1', y='PC2', color=cat_col,
+                              title="PCA Visualization")
+         else:
+             fig = px.scatter(pca_df, x='PC1', y='PC2',
+                              title="PCA Visualization")
+
+         # Annotate the explained variance of each component
+         variance_ratio = pca.explained_variance_ratio_
+         fig.update_layout(
+             annotations=[
+                 dict(
+                     text=f"PC1 explained variance: {variance_ratio[0]:.2f}",
+                     showarrow=False, x=0.5, y=1.05, xref="paper", yref="paper"
+                 ),
+                 dict(
+                     text=f"PC2 explained variance: {variance_ratio[1]:.2f}",
+                     showarrow=False, x=0.5, y=1.02, xref="paper", yref="paper"
+                 )
+             ]
+         )
+
+         visualizations['pca'] = fig
+
+     return visualizations
+
+ def get_ai_cleaning_recommendations(df):
+     """Get AI-powered recommendations for data cleaning using OpenAI."""
+     try:
+         # Prepare the dataset summary
+         summary = {
+             "shape": df.shape,
+             "columns": df.columns.tolist(),
+             "dtypes": df.dtypes.astype(str).to_dict(),
+             "missing_values": df.isnull().sum().to_dict(),
+             "duplicates": int(df.duplicated().sum()),
+             "sample_data": df.head(5).to_dict()
+         }
+
+         # Create the prompt for OpenAI (default=str keeps timestamps/NaN serializable)
+         prompt = f"""
+ I have a dataset with the following properties:
+ - Shape: {summary['shape']}
+ - Columns: {', '.join(summary['columns'])}
+ - Missing values: {summary['missing_values']}
+ - Duplicate rows: {summary['duplicates']}
+
+ Here's a sample of the data:
+ {json.dumps(summary['sample_data'], indent=2, default=str)}
+
+ Based on this information, provide specific data cleaning recommendations in a bulleted list.
+ Include suggestions for handling missing values, outliers, data types, and duplicate rows.
+ Format your response as markdown and ONLY include the cleaning recommendations.
+ """
+
+         # Check whether an OpenAI API key is available
+         api_key = os.environ.get("OPENAI_API_KEY")
+         if api_key:
+             openai.api_key = api_key
+             # Legacy (pre-1.0) OpenAI API; requires openai<1.0
+             response = openai.ChatCompletion.create(
+                 model="gpt-3.5-turbo",
+                 messages=[
+                     {"role": "system", "content": "You are a data science assistant focused on data cleaning recommendations."},
+                     {"role": "user", "content": prompt}
+                 ],
+                 max_tokens=700
+             )
+             return response.choices[0].message.content
+         else:
+             # Fall back to the Hugging Face model if no OpenAI key is available
+             global data_assistant
+             if data_assistant is None:
+                 data_assistant = initialize_ai_models()
+
+             # Shorten the prompt for the smaller model
+             short_prompt = (
+                 f"Data cleaning recommendations for a dataset with {df.shape[0]} rows, "
+                 f"{df.shape[1]} columns, and columns: {', '.join(df.columns[:5])}..."
+             )
+
+             # Generate recommendations
+             recommendations = data_assistant(
+                 short_prompt,
+                 max_length=500,
+                 num_return_sequences=1
+             )[0]['generated_text']
+
+             # Small local models often return unusable text, so fall back
+             # to a generic checklist if the output looks too short
+             if not recommendations or len(recommendations.strip()) < 50:
+                 recommendations = """
+ ## Data Cleaning Recommendations
+
+ * Handle missing values in columns with appropriate imputation techniques
+ * Check for and remove duplicate records
+ * Standardize text fields and correct spelling errors
+ * Convert columns to appropriate data types
+ * Check for and handle outliers in numerical columns
+
+ Note: These are generic recommendations as AI model access is limited.
+ """
+             return recommendations
+     except Exception as e:
+         return f"""
+ ## Data Cleaning Recommendations
+
+ * Handle missing values by either removing rows or imputing with mean/median/mode
+ * Remove duplicate rows if present
+ * Convert date-like string columns to proper datetime format
+ * Standardize text data by removing extra spaces and converting to lowercase
+ * Check for and handle outliers in numerical columns
+
+ Note: Could not access AI models for customized recommendations. Error: {str(e)}
+ """
+
+ def get_hf_model_insights(df):
+     """Get dataset insights using the Hugging Face model."""
+     try:
+         global data_assistant
+         if data_assistant is None:
+             data_assistant = initialize_ai_models()
+
+         # Prepare a brief summary of the dataset
+         numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+         categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+
+         dataset_summary = f"""
+ Dataset with {df.shape[0]} rows and {df.shape[1]} columns.
+ Numeric columns: {', '.join(numeric_cols[:5])}
+ Categorical columns: {', '.join(categorical_cols[:5])}
+ """
+
+         # Generate analysis insights
+         prompt = f"Based on this dataset summary, suggest data analysis approaches: {dataset_summary}"
+
+         response = data_assistant(
+             prompt,
+             max_length=300,
+             num_return_sequences=1
+         )[0]['generated_text']
+
+         # Strip an echoed prompt (causal models repeat it) and whitespace
+         analysis_insights = response.replace(prompt, "").strip()
+
+         if not analysis_insights or len(analysis_insights) < 50:
+             # Fallback if the model doesn't produce useful results
+             analysis_insights = """
+ ## Data Analysis Suggestions
+
+ 1. For numeric columns, calculate correlation matrices to identify relationships
+ 2. For categorical columns, analyze frequency distributions
+ 3. Consider creating pivot tables to understand how categories relate
+ 4. Look for time-based patterns if datetime columns are present
+ 5. Consider dimensionality reduction techniques like PCA for visualization
+ """
+
+         return analysis_insights
+
+     except Exception as e:
+         return f"""
+ ## Data Analysis Suggestions
+
+ 1. Examine the distribution of each numeric column
+ 2. Analyze correlations between numeric features
+ 3. Look for patterns in categorical data
+ 4. Consider creating visualizations like histograms and scatter plots
+ 5. Explore relationships between different variables
+
+ Note: Could not access AI models for customized recommendations. Error: {str(e)}
+ """
+
+ def process_file(file):
+     """Main function to process an uploaded file and generate the analysis."""
+     # Read the file
+     df = read_file(file)
+
+     if df is None:
+         return "Please upload a file to begin analysis.", None, None, None
+     if isinstance(df, str):  # Error message from read_file
+         return df, None, None, None
+
+     # Convert date-like object columns to datetime
+     for col in df.columns:
+         if df[col].dtype == 'object':
+             try:
+                 if pd.to_datetime(df[col], errors='coerce').notna().all():
+                     df[col] = pd.to_datetime(df[col])
+             except Exception:
+                 pass
+
+     # Analyze data
+     analysis = analyze_data(df)
+
+     # Generate visualizations
+     visualizations = generate_visualizations(df)
+
+     # Get AI cleaning recommendations
+     cleaning_recommendations = get_ai_cleaning_recommendations(df)
+
+     # Get insights from the Hugging Face model
+     analysis_insights = get_hf_model_insights(df)
+
+     return analysis, visualizations, cleaning_recommendations, analysis_insights
+
+ def display_analysis(analysis):
+     """Format the analysis results for display."""
+     if analysis is None:
+         return "No analysis available."
+
+     if isinstance(analysis, str):  # Error message
+         return analysis
+
+     # Format the analysis as HTML
+     html = "<h2>Data Analysis</h2>"
+
+     # Basic info
+     html += f"<p><strong>Shape:</strong> {analysis['Shape'][0]} rows, {analysis['Shape'][1]} columns</p>"
+     html += f"<p><strong>Columns:</strong> {', '.join(analysis['Columns'])}</p>"
+
+     # Missing values
+     html += "<h3>Missing Values</h3>"
+     if isinstance(analysis['Missing Values'], str):
+         html += f"<p>{analysis['Missing Values']}</p>"
+     else:
+         html += "<ul>"
+         for col, count in analysis['Missing Values'].items():
+             html += f"<li>{col}: {count}</li>"
+         html += "</ul>"
+
+     # Data quality issues
+     if 'Data Quality Issues' in analysis and analysis['Data Quality Issues']:
+         html += "<h3>Data Quality Issues</h3>"
+         for issue_type, issue_details in analysis['Data Quality Issues'].items():
+             html += f"<h4>{issue_type}</h4>"
+             if isinstance(issue_details, dict):
+                 html += "<ul>"
+                 for key, value in issue_details.items():
+                     html += f"<li>{key}: {value}</li>"
+                 html += "</ul>"
+             else:
+                 html += f"<p>{issue_details}</p>"
+
+     # Outliers
+     if 'Outliers' in analysis and analysis['Outliers']:
+         html += "<h3>Outliers Detected</h3>"
+         html += "<ul>"
+         for col, details in analysis['Outliers'].items():
+             html += f"<li><strong>{col}:</strong> {details['count']} outliers ({details['percentage']})<br>"
+             html += f"Values outside range: [{details['lower_bound']:.2f}, {details['upper_bound']:.2f}]</li>"
+         html += "</ul>"
+
+     # Statistics for numeric columns
+     if 'Statistics' in analysis:
+         html += "<h3>Numeric Statistics</h3>"
+         html += analysis['Statistics']
+
+     # Categorical columns info
+     if 'Category Counts' in analysis:
+         html += "<h3>Categorical Data (Top Values)</h3>"
+         for col, counts in analysis['Category Counts'].items():
+             html += f"<h4>{col}</h4><ul>"
+             for val, count in counts.items():
+                 html += f"<li>{val}: {count}</li>"
+             html += "</ul>"
+
+     return html
+
+ def apply_data_cleaning(df, cleaning_options):
+     """Apply the selected data cleaning operations to the DataFrame."""
+     cleaned_df = df.copy()
+     cleaning_log = []
+
+     # Handle missing values
+     if cleaning_options.get("handle_missing"):
+         method = cleaning_options.get("missing_method", "drop")
+         for col in cleaned_df.columns:
+             missing_count_before = cleaned_df[col].isnull().sum()
+             if missing_count_before > 0:
+                 # Covers all numeric dtypes, not just float64/int64
+                 is_numeric = pd.api.types.is_numeric_dtype(cleaned_df[col])
+                 if method == "drop":
+                     # Drop rows with missing values in this column
+                     cleaned_df = cleaned_df.dropna(subset=[col])
+                     cleaning_log.append(f"Dropped {missing_count_before} rows with missing values in column '{col}'")
+                 elif method == "mean" and is_numeric:
+                     # Fill with the mean for numeric columns
+                     mean_val = cleaned_df[col].mean()
+                     cleaned_df[col] = cleaned_df[col].fillna(mean_val)
+                     cleaning_log.append(f"Filled {missing_count_before} missing values in column '{col}' with mean ({mean_val:.2f})")
+                 elif method == "median" and is_numeric:
+                     # Fill with the median for numeric columns
+                     median_val = cleaned_df[col].median()
+                     cleaned_df[col] = cleaned_df[col].fillna(median_val)
+                     cleaning_log.append(f"Filled {missing_count_before} missing values in column '{col}' with median ({median_val:.2f})")
+                 elif method == "mode":
+                     # Fill with the mode for any column type (guard against all-NaN columns)
+                     mode_vals = cleaned_df[col].mode()
+                     if not mode_vals.empty:
+                         mode_val = mode_vals[0]
+                         cleaned_df[col] = cleaned_df[col].fillna(mode_val)
+                         cleaning_log.append(f"Filled {missing_count_before} missing values in column '{col}' with mode ({mode_val})")
+                 elif method == "zero" and is_numeric:
+                     # Fill with zeros for numeric columns
+                     cleaned_df[col] = cleaned_df[col].fillna(0)
+                     cleaning_log.append(f"Filled {missing_count_before} missing values in column '{col}' with 0")
+
+     # Remove duplicates
+     if cleaning_options.get("remove_duplicates"):
+         dupe_count_before = cleaned_df.duplicated().sum()
+         if dupe_count_before > 0:
+             cleaned_df = cleaned_df.drop_duplicates()
+             cleaning_log.append(f"Removed {dupe_count_before} duplicate rows")
+
+     # Handle outliers in numeric columns
+     if cleaning_options.get("handle_outliers"):
+         method = cleaning_options.get("outlier_method", "remove")
+         numeric_cols = cleaned_df.select_dtypes(include=[np.number]).columns
+
+         for col in numeric_cols:
+             # Calculate IQR
+             Q1 = cleaned_df[col].quantile(0.25)
+             Q3 = cleaned_df[col].quantile(0.75)
+             IQR = Q3 - Q1
+
+             # Define outlier bounds
+             lower_bound = Q1 - 1.5 * IQR
+             upper_bound = Q3 + 1.5 * IQR
+
+             # Identify outliers
+             outliers = (cleaned_df[col] < lower_bound) | (cleaned_df[col] > upper_bound)
+             outlier_count = outliers.sum()
+
+             if outlier_count > 0:
+                 if method == "remove":
+                     # Remove rows with outliers
+                     cleaned_df = cleaned_df[~outliers]
+                     cleaning_log.append(f"Removed {outlier_count} rows with outliers in column '{col}'")
+                 elif method == "cap":
+                     # Cap outliers at the bounds
+                     cleaned_df.loc[cleaned_df[col] < lower_bound, col] = lower_bound
+                     cleaned_df.loc[cleaned_df[col] > upper_bound, col] = upper_bound
+                     cleaning_log.append(f"Capped {outlier_count} outliers in column '{col}' to range [{lower_bound:.2f}, {upper_bound:.2f}]")
+
+     # Convert date columns
+     if cleaning_options.get("convert_dates"):
+         for col in cleaning_options.get("date_columns", []):
+             if col in cleaned_df.columns:
+                 try:
+                     cleaned_df[col] = pd.to_datetime(cleaned_df[col])
+                     cleaning_log.append(f"Converted column '{col}' to datetime format")
+                 except Exception:
+                     cleaning_log.append(f"Failed to convert column '{col}' to datetime format")
+
+     # Normalize numeric columns
+     if cleaning_options.get("normalize_columns"):
+         for col in cleaning_options.get("normalize_columns_list", []):
+             if col in cleaned_df.columns and pd.api.types.is_numeric_dtype(cleaned_df[col]):
+                 # Min-max normalization
+                 min_val = cleaned_df[col].min()
+                 max_val = cleaned_df[col].max()
+                 if max_val > min_val:  # Avoid division by zero
+                     cleaned_df[col] = (cleaned_df[col] - min_val) / (max_val - min_val)
+                     cleaning_log.append(f"Normalized column '{col}' to range [0, 1]")
+
+     return cleaned_df, cleaning_log
+
+ def app_ui(file):
+     """Main function for the Gradio interface."""
+     if file is None:
+         return "Please upload a file to begin analysis.", None, None, None
+
+     # Process the file
+     analysis, visualizations, cleaning_recommendations, analysis_insights = process_file(file)
+
+     if isinstance(analysis, str):  # If error message
+         return analysis, None, None, None
+
+     # Format the analysis for display
+     analysis_html = display_analysis(analysis)
+
+     # Prepare the visualizations for display
+     viz_html = ""
+     viz_json = None
+     if visualizations and not isinstance(visualizations, str):
+         for viz_name, fig in visualizations.items():
+             # Convert each plotly figure to an embeddable HTML snippet
+             viz_html += f'<div style="margin-bottom: 30px;">{fig.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
+         # Plotly figures are not JSON-serializable as-is, so serialize them
+         # for the gr.JSON output
+         viz_json = {name: json.loads(fig.to_json()) for name, fig in visualizations.items()}
+
+     # Combine the analysis and visualizations
+     result_html = f"""
+     <div style="display: flex; flex-direction: column;">
+         <div>{analysis_html}</div>
+         <h2>Data Visualizations</h2>
+         <div>{viz_html}</div>
+     </div>
+     """
+
+     return result_html, viz_json, cleaning_recommendations, analysis_insights
+
+ def apply_cleaning_ui(file, handle_missing, missing_method, remove_duplicates,
+                       handle_outliers, outlier_method, convert_dates, date_columns,
+                       normalize_numeric):
+     """UI function for the data cleaning workflow."""
+     if file is None:
+         return "Please upload a file before attempting to clean data.", None
+
+     # Read the file
+     df = read_file(file)
+
+     if isinstance(df, str):  # If error message
+         return df, None
+
+     # Configure cleaning options
+     cleaning_options = {
+         "handle_missing": handle_missing,
+         "missing_method": missing_method,
+         "remove_duplicates": remove_duplicates,
+         "handle_outliers": handle_outliers,
+         "outlier_method": outlier_method,
+         "convert_dates": convert_dates,
+         "date_columns": [c.strip() for c in date_columns.split(",")] if date_columns else [],
+         "normalize_columns": normalize_numeric,
+         "normalize_columns_list": df.select_dtypes(include=[np.number]).columns.tolist() if normalize_numeric else []
+     }
+
+     # Apply cleaning
+     cleaned_df, cleaning_log = apply_data_cleaning(df, cleaning_options)
+
+     # Generate a summary of the cleaning
+     result_summary = f"""
+     <h2>Data Cleaning Results</h2>
+     <p>Original data: {df.shape[0]} rows, {df.shape[1]} columns</p>
+     <p>Cleaned data: {cleaned_df.shape[0]} rows, {cleaned_df.shape[1]} columns</p>
+
+     <h3>Cleaning Operations Applied:</h3>
+     <ul>
+     """
+
+     for log_item in cleaning_log:
+         result_summary += f"<li>{log_item}</li>"
+
+     result_summary += "</ul>"
+
+     # Save the cleaned data to a temp file for download
+     # (gr.File expects a file path rather than an in-memory buffer)
+     output_path = os.path.join(tempfile.gettempdir(), "cleaned_data.csv")
+     cleaned_df.to_csv(output_path, index=False)
+
+     return result_summary, output_path
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Data Visualization & Cleaning AI") as demo:
+     gr.Markdown("# Data Visualization & Cleaning AI")
+     gr.Markdown("Upload your data file (CSV, Excel, JSON, or TXT) and get automatic analysis, visualizations, and AI-powered insights.")
+
+     with gr.Row():
+         file_input = gr.File(label="Upload Data File")
+
+     with gr.Tabs():
+         with gr.TabItem("Data Analysis"):
+             with gr.Row():
+                 analyze_button = gr.Button("Analyze Data")
+
+             with gr.Tabs():
+                 with gr.TabItem("Analysis & Visualizations"):
+                     output = gr.HTML(label="Results")
+                 with gr.TabItem("AI Cleaning Recommendations"):
+                     cleaning_recommendations_output = gr.Markdown(label="AI Recommendations")
+                 with gr.TabItem("AI Analysis Insights"):
+                     analysis_insights_output = gr.Markdown(label="Analysis Insights")
+                 with gr.TabItem("Raw Visualization Objects"):
+                     viz_output = gr.JSON(label="Visualization Objects")
+
+         with gr.TabItem("Data Cleaning"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     gr.Markdown("### Cleaning Options")
+                     handle_missing = gr.Checkbox(label="Handle Missing Values", value=True)
+                     missing_method = gr.Radio(
+                         label="Missing Values Method",
+                         choices=["drop", "mean", "median", "mode", "zero"],
+                         value="mean"
+                     )
+                     remove_duplicates = gr.Checkbox(label="Remove Duplicate Rows", value=True)
+                     handle_outliers = gr.Checkbox(label="Handle Outliers", value=False)
+                     outlier_method = gr.Radio(
+                         label="Outlier Method",
+                         choices=["remove", "cap"],
+                         value="cap"
+                     )
+                     convert_dates = gr.Checkbox(label="Convert Date Columns", value=False)
+                     date_columns = gr.Textbox(
+                         label="Date Columns (comma-separated)",
+                         placeholder="e.g., date,created_at,timestamp"
+                     )
+                     normalize_numeric = gr.Checkbox(label="Normalize Numeric Columns", value=False)
+
+                 with gr.Column(scale=2):
+                     clean_button = gr.Button("Clean Data")
+                     cleaning_output = gr.HTML(label="Cleaning Results")
+                     cleaned_file_output = gr.File(label="Download Cleaned Data")
+
+     # Connect the buttons to their functions (inside the Blocks context)
+     analyze_button.click(
+         fn=app_ui,
+         inputs=[file_input],
+         outputs=[output, viz_output, cleaning_recommendations_output, analysis_insights_output]
+     )
+
+     clean_button.click(
+         fn=apply_cleaning_ui,
+         inputs=[
+             file_input, handle_missing, missing_method, remove_duplicates,
+             handle_outliers, outlier_method, convert_dates, date_columns,
+             normalize_numeric
+         ],
+         outputs=[cleaning_output, cleaned_file_output]
+     )
+
+ # Initialize the AI models at startup
+ try:
+     data_assistant = initialize_ai_models()
+ except Exception as e:
+     print(f"Error initializing AI models: {e}")
+     data_assistant = None
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
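For reviewers, a minimal sketch of exercising the cleaning helper above outside the Gradio UI. The toy DataFrame and the `app` import path are illustrative assumptions, not part of this commit; note that importing the module eagerly loads the Hugging Face model and builds (but does not launch) the Blocks UI.

import numpy as np
import pandas as pd

from app import apply_data_cleaning  # hypothetical import path, assuming this file is saved as app.py

# Toy frame with one missing value and duplicate rows
df = pd.DataFrame({
    "age": [25, np.nan, 40, 40, 40],
    "city": ["Pune", "Delhi", "Delhi", "Delhi", "Delhi"],
})

options = {"handle_missing": True, "missing_method": "median", "remove_duplicates": True}
cleaned, log = apply_data_cleaning(df, options)
print(cleaned)         # 2 unique rows; the age gap is filled with the median (40.0)
print("\n".join(log))  # human-readable record of each operation applied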