# dataanalyst/utils.py
import pandas as pd
import numpy as np
import streamlit as st
def detect_column_types(df):
"""
Detect and return column types
"""
numeric = df.select_dtypes(include=[np.number]).columns.tolist()
categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
datetime = df.select_dtypes(include=['datetime64']).columns.tolist()
boolean = df.select_dtypes(include=['bool']).columns.tolist()
return numeric, categorical, datetime, boolean
def get_basic_stats(df):
"""
Return basic statistics about the dataset
"""
stats = {
'rows': df.shape[0],
'columns': df.shape[1],
'missing_values': df.isnull().sum().sum(),
'missing_percentage': (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100,
'duplicates': df.duplicated().sum(),
'memory_usage': df.memory_usage(deep=True).sum() / 1024**2 # MB
}
return stats
def suggest_visualizations(df):
"""
Suggest appropriate visualizations based on data types
"""
numeric, categorical, datetime, boolean = detect_column_types(df)
suggestions = []
if len(numeric) > 0:
suggestions.append({
'type': 'histogram',
            'description': 'Distribution of numeric columns',
'columns': numeric[:3]
})
if len(categorical) > 0:
suggestions.append({
'type': 'bar_chart',
            'description': 'Category distributions',
'columns': categorical[:3]
})
if len(numeric) >= 2:
suggestions.append({
'type': 'scatter_plot',
'description': 'Relationship between numeric variables',
'columns': numeric[:2]
})
if len(datetime) > 0 and len(numeric) > 0:
suggestions.append({
'type': 'line_chart',
'description': 'Time series trends',
'columns': [datetime[0], numeric[0]]
})
if len(numeric) > 1:
suggestions.append({
'type': 'correlation_heatmap',
'description': 'Correlations between numeric variables'
})
return suggestions
def format_number(num):
"""
Format large numbers with commas
"""
if pd.isna(num):
return "N/A"
return f"{num:,.0f}"
def format_percentage(num):
"""
Format as percentage
"""
if pd.isna(num):
return "N/A"
return f"{num:.1f}%"
def get_data_quality_issues(df):
"""
Identify data quality issues
"""
issues = []
# Check for missing values
missing_cols = df.columns[df.isnull().any()].tolist()
if missing_cols:
issues.append({
'type': 'missing_values',
            # High severity when total missing cells exceed 10% of the row count
            'severity': 'high' if df.isnull().sum().sum() > len(df) * 0.1 else 'medium',
'description': f'Missing values in {len(missing_cols)} columns',
'columns': missing_cols
})
# Check for duplicates
duplicates = df.duplicated().sum()
if duplicates > 0:
issues.append({
'type': 'duplicates',
            # Medium severity when more than 5% of rows are duplicated
            'severity': 'medium' if duplicates > len(df) * 0.05 else 'low',
'description': f'{duplicates} duplicate rows found',
'count': duplicates
})
# Check for constant columns
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
if constant_cols:
issues.append({
'type': 'constant_columns',
'severity': 'low',
'description': f'{len(constant_cols)} constant columns found',
'columns': constant_cols
})
    # Check for outliers in numeric columns using Tukey's 1.5 * IQR rule
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        # Rows falling outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers
        outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
        # Only flag columns where more than 10% of rows are outliers
        if len(outliers) > len(df) * 0.1:
issues.append({
'type': 'outliers',
'severity': 'medium',
'description': f'Significant outliers in {col}',
'column': col,
'outlier_count': len(outliers)
})
break # Just report first outlier issue
return issues
def get_recommendations(df):
"""
Generate data analysis recommendations
"""
numeric, categorical, datetime, boolean = detect_column_types(df)
recommendations = []
# Missing data recommendations
if df.isnull().sum().sum() > 0:
recommendations.append("Consider handling missing values using imputation or removal")
# Feature engineering suggestions
if len(numeric) >= 2:
recommendations.append("Create interaction features between highly correlated variables")
if datetime:
recommendations.append("Extract time-based features (hour, day, month, year) from datetime columns")
# Modeling suggestions
if len(numeric) > 5:
recommendations.append("Consider dimensionality reduction techniques (PCA, t-SNE)")
if df.shape[0] > 10000:
recommendations.append("Dataset is large - consider sampling for faster exploration")
# Visualization suggestions
if len(numeric) > 2:
recommendations.append("Use pair plots to visualize relationships between multiple variables")
if len(categorical) > 1:
recommendations.append("Create contingency tables to analyze categorical relationships")
return recommendations
def create_sample_dataset():
"""
Create a sample dataset for testing
"""
np.random.seed(42)
n_rows = 1000
data = {
'id': range(n_rows),
'age': np.random.normal(40, 15, n_rows).clip(18, 90).astype(int),
'income': np.random.normal(50000, 20000, n_rows).clip(20000, 150000).astype(int),
'score': np.random.uniform(0, 100, n_rows).round(2),
'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),
'region': np.random.choice(['North', 'South', 'East', 'West'], n_rows),
'purchased': np.random.choice([0, 1], n_rows, p=[0.7, 0.3]),
'signup_date': pd.date_range('2023-01-01', periods=n_rows, freq='D'),
'satisfaction': np.random.choice([1, 2, 3, 4, 5], n_rows, p=[0.1, 0.15, 0.3, 0.25, 0.2])
}
    df = pd.DataFrame(data)
    # Add some missing values (~5% of cells, chosen at random)
    mask = np.random.random(df.shape) < 0.05
    df = df.mask(mask)
# Add some duplicates
duplicate_rows = np.random.choice(n_rows, 10, replace=False)
df = pd.concat([df, df.iloc[duplicate_rows]]).reset_index(drop=True)
return df
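
# Minimal usage sketch (not part of the original module's public API): build the
# sample dataset and run the helpers above, printing a quick report. Intended
# only as an illustration of how these utilities fit together.
if __name__ == "__main__":
    sample_df = create_sample_dataset()

    # Headline statistics
    stats = get_basic_stats(sample_df)
    print(f"Rows: {format_number(stats['rows'])}, Columns: {format_number(stats['columns'])}")
    print(f"Missing: {format_percentage(stats['missing_percentage'])}, "
          f"Duplicates: {format_number(stats['duplicates'])}")

    # Data quality issues
    for issue in get_data_quality_issues(sample_df):
        print(f"[{issue['severity']}] {issue['description']}")

    # Suggested visualizations
    for suggestion in suggest_visualizations(sample_df):
        print(f"Suggested chart: {suggestion['type']} - {suggestion['description']}")

    # Analysis recommendations
    for rec in get_recommendations(sample_df):
        print(f"Recommendation: {rec}")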