import pandas as pd
import numpy as np
import streamlit as st
def detect_column_types(df):
    """
    Classify the DataFrame's columns by dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.

    Returns
    -------
    tuple[list, list, list, list]
        Column-name lists in original column order:
        (numeric, categorical, datetime, boolean).
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # 'datetime64' alone does NOT match timezone-aware columns;
    # 'datetimetz' is needed to pick those up as well.
    datetime = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
    boolean = df.select_dtypes(include=['bool']).columns.tolist()
    return numeric, categorical, datetime, boolean
def get_basic_stats(df):
    """
    Return basic statistics about the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarize.

    Returns
    -------
    dict
        Keys: 'rows', 'columns', 'missing_values', 'missing_percentage',
        'duplicates', 'memory_usage' (in MB).
    """
    missing = df.isnull().sum().sum()
    total_cells = df.shape[0] * df.shape[1]
    stats = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'missing_values': missing,
        # Guard against ZeroDivisionError on an empty frame (0 rows or 0 columns).
        'missing_percentage': (missing / total_cells) * 100 if total_cells else 0.0,
        'duplicates': df.duplicated().sum(),
        'memory_usage': df.memory_usage(deep=True).sum() / 1024**2  # MB
    }
    return stats
def suggest_visualizations(df):
    """
    Build a list of chart suggestions appropriate to the dataset's dtypes.

    Each suggestion is a dict carrying a 'type', a human-readable
    'description', and (for most chart types) a 'columns' hint limited
    to the first few matching columns.
    """
    numeric_cols, cat_cols, dt_cols, _bool_cols = detect_column_types(df)
    picks = []

    if numeric_cols:
        picks.append({
            'type': 'histogram',
            'description': 'Distribution of numeric columns',
            'columns': numeric_cols[:3],
        })
    if cat_cols:
        picks.append({
            'type': 'bar_chart',
            'description': 'Category distributions',
            'columns': cat_cols[:3],
        })
    if len(numeric_cols) >= 2:
        picks.append({
            'type': 'scatter_plot',
            'description': 'Relationship between numeric variables',
            'columns': numeric_cols[:2],
        })
    # Time series only make sense with both a time axis and a value column.
    if dt_cols and numeric_cols:
        picks.append({
            'type': 'line_chart',
            'description': 'Time series trends',
            'columns': [dt_cols[0], numeric_cols[0]],
        })
    if len(numeric_cols) > 1:
        picks.append({
            'type': 'correlation_heatmap',
            'description': 'Correlations between numeric variables',
        })
    return picks
def format_number(num):
    """Render *num* with thousands separators and no decimals; NaN/None -> "N/A"."""
    return "N/A" if pd.isna(num) else f"{num:,.0f}"
def format_percentage(num):
    """Render *num* as a one-decimal percentage string; NaN/None -> "N/A"."""
    return "N/A" if pd.isna(num) else f"{num:.1f}%"
def get_data_quality_issues(df):
    """
    Scan a DataFrame for common quality problems.

    Returns a list of issue dicts, each with 'type', 'severity',
    'description' and issue-specific payload keys. Issues are reported
    in a fixed order: missing values, duplicate rows, constant columns,
    then at most one outlier-heavy numeric column.
    """
    found = []
    row_count = len(df)

    # Missing values: a single aggregate issue listing every affected column.
    cols_with_nulls = [c for c in df.columns if df[c].isnull().any()]
    if cols_with_nulls:
        total_missing = df.isnull().sum().sum()
        found.append({
            'type': 'missing_values',
            'severity': 'high' if total_missing > row_count * 0.1 else 'medium',
            'description': f'Missing values in {len(cols_with_nulls)} columns',
            'columns': cols_with_nulls,
        })

    # Fully duplicated rows.
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        found.append({
            'type': 'duplicates',
            'severity': 'medium' if dup_count > row_count * 0.05 else 'low',
            'description': f'{dup_count} duplicate rows found',
            'count': dup_count,
        })

    # Columns carrying a single distinct (non-null) value.
    single_valued = [c for c in df.columns if df[c].nunique() == 1]
    if single_valued:
        found.append({
            'type': 'constant_columns',
            'severity': 'low',
            'description': f'{len(single_valued)} constant columns found',
            'columns': single_valued,
        })

    # IQR-based outlier check; only the first outlier-heavy column is reported.
    for col in df.select_dtypes(include=[np.number]).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        spread = q3 - q1
        heavy = df[(df[col] < q1 - 1.5 * spread) | (df[col] > q3 + 1.5 * spread)]
        if len(heavy) > row_count * 0.1:
            found.append({
                'type': 'outliers',
                'severity': 'medium',
                'description': f'Significant outliers in {col}',
                'column': col,
                'outlier_count': len(heavy),
            })
            break  # Just report first outlier issue
    return found
def get_recommendations(df):
    """
    Produce human-readable next-step suggestions for exploring *df*.

    Suggestions cover missing-data handling, feature engineering,
    modeling, and visualization, keyed off column counts and dtypes.
    """
    num_cols, cat_cols, dt_cols, _ = detect_column_types(df)
    tips = []

    # Missing data
    if df.isnull().sum().sum() > 0:
        tips.append("Consider handling missing values using imputation or removal")

    # Feature engineering
    if len(num_cols) >= 2:
        tips.append("Create interaction features between highly correlated variables")
    if dt_cols:
        tips.append("Extract time-based features (hour, day, month, year) from datetime columns")

    # Modeling
    if len(num_cols) > 5:
        tips.append("Consider dimensionality reduction techniques (PCA, t-SNE)")
    if df.shape[0] > 10000:
        tips.append("Dataset is large - consider sampling for faster exploration")

    # Visualization
    if len(num_cols) > 2:
        tips.append("Use pair plots to visualize relationships between multiple variables")
    if len(cat_cols) > 1:
        tips.append("Create contingency tables to analyze categorical relationships")
    return tips
def create_sample_dataset():
    """
    Build a deterministic 1,010-row demo DataFrame for testing.

    Generates 1,000 synthetic customer rows, blanks out ~5% of cells to
    simulate missing data, then appends 10 randomly chosen rows as exact
    duplicates. Seeded with 42, so repeated calls return the same frame.
    """
    np.random.seed(42)
    n = 1000

    # NOTE: the sequence of np.random calls below fixes the seeded
    # stream; keep the order stable to preserve reproducibility.
    ages = np.random.normal(40, 15, n).clip(18, 90).astype(int)
    incomes = np.random.normal(50000, 20000, n).clip(20000, 150000).astype(int)
    scores = np.random.uniform(0, 100, n).round(2)
    categories = np.random.choice(['A', 'B', 'C', 'D'], n)
    regions = np.random.choice(['North', 'South', 'East', 'West'], n)
    purchases = np.random.choice([0, 1], n, p=[0.7, 0.3])
    signup_dates = pd.date_range('2023-01-01', periods=n, freq='D')
    ratings = np.random.choice([1, 2, 3, 4, 5], n, p=[0.1, 0.15, 0.3, 0.25, 0.2])

    df = pd.DataFrame({
        'id': range(n),
        'age': ages,
        'income': incomes,
        'score': scores,
        'category': categories,
        'region': regions,
        'purchased': purchases,
        'signup_date': signup_dates,
        'satisfaction': ratings,
    })

    # Blank out roughly 5% of all cells.
    df = df.mask(np.random.random(df.shape) < 0.05)

    # Append 10 randomly selected rows as duplicates.
    picked = np.random.choice(n, 10, replace=False)
    return pd.concat([df, df.iloc[picked]]).reset_index(drop=True)