# Author: Saumith devarsetty
# Initial deployment
# Commit: 3a1de6f
"""
Utility functions I use across my dashboard.
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Any
def format_number(num: float, decimals: int = 2) -> str:
    """
    Render a number with comma thousand separators and a fixed number
    of decimal places.

    Args:
        num: Value to render. NaN/None-like values are handled.
        decimals: How many digits to keep after the decimal point.

    Returns:
        The formatted string, or "N/A" when the value is missing.
    """
    if pd.isna(num):
        return "N/A"
    # format() builtin with a dynamic spec — equivalent to an f-string.
    return format(num, f",.{decimals}f")
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """
    Categorize DataFrame columns by their data types.

    Args:
        df: Input DataFrame

    Returns:
        Dictionary with keys 'numerical', 'categorical', and 'datetime'
        containing lists of column names
    """
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # 'datetime' matches naive datetime64 columns; 'datetimetz' is needed as
    # well, because tz-aware columns (datetime64[ns, tz]) are NOT matched by
    # the plain datetime selector and would otherwise fall in no category.
    datetime_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns.tolist()
    return {
        'numerical': numerical_cols,
        'categorical': categorical_cols,
        'datetime': datetime_cols
    }
def detect_datetime_columns(df: pd.DataFrame) -> List[str]:
    """
    Detect columns that might contain datetime data.

    Object-dtype columns are probed by parsing up to 100 non-null values;
    columns that are already datetime64 (naive or tz-aware) are included
    directly.

    Args:
        df: Input DataFrame

    Returns:
        List of column names that appear to contain datetime data
    """
    import warnings  # hoisted: previously re-imported inside the per-column loop

    datetime_cols = []
    for col in df.columns:
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            datetime_cols.append(col)
        elif df[col].dtype == 'object':
            # Probe only a bounded sample so wide/long frames stay cheap.
            sample = df[col].dropna().head(100)
            if sample.empty:
                continue
            try:
                # Suppress parser warnings; errors='raise' makes failure explicit.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    pd.to_datetime(sample, errors='raise')
            except (ValueError, TypeError):
                continue
            datetime_cols.append(col)
    return datetime_cols
def safe_division(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Divide two numbers, falling back to a default when the denominator
    is zero or missing.

    Args:
        numerator: Dividend.
        denominator: Divisor; zero or NaN triggers the fallback.
        default: Value returned when division is not possible.

    Returns:
        The quotient, or ``default`` when the denominator is unusable.
    """
    unusable = pd.isna(denominator) or denominator == 0
    return default if unusable else numerator / denominator
def get_missing_value_summary(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate a summary of missing values in the DataFrame.

    Args:
        df: Input DataFrame

    Returns:
        DataFrame with columns: Column, Missing_Count, Missing_Percentage,
        containing only columns that have at least one missing value,
        sorted by missing count descending.
    """
    # Compute the null counts once (the original recomputed this Series
    # for both the count and the percentage columns).
    missing_counts = df.isnull().sum()
    # Guard against a zero-row frame: with 0 rows every count is 0, so the
    # percentage value is filtered out anyway — the clamp only avoids a
    # divide-by-zero RuntimeWarning.
    row_count = max(len(df), 1)
    summary = pd.DataFrame({
        'Column': df.columns,
        'Missing_Count': missing_counts.values,
        'Missing_Percentage': (missing_counts / row_count * 100).values,
    })
    # Keep only columns that actually have missing values.
    summary = summary[summary['Missing_Count'] > 0]
    summary = summary.sort_values('Missing_Count', ascending=False)
    return summary.reset_index(drop=True)
def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Validate that a DataFrame is suitable for analysis.

    Args:
        df: DataFrame to validate

    Returns:
        Tuple of (is_valid, error_message); the message is "" when valid.
    """
    # Ordered (lazy predicate, message) pairs — the lambdas defer evaluation
    # so later checks never run against a None/empty frame.
    checks = (
        (lambda: df is None, "DataFrame is None"),
        (lambda: df.empty, "DataFrame is empty"),
        (lambda: len(df.columns) == 0, "DataFrame has no columns"),
        (lambda: len(df) < 2, "DataFrame must have at least 2 rows"),
    )
    for failed, message in checks:
        if failed():
            return False, message
    return True, ""
def truncate_string(text: str, max_length: int = 50) -> str:
    """
    Truncate a string to a maximum length, adding an ellipsis if needed.

    Args:
        text: String to truncate (None/NaN yields "")
        max_length: Maximum length of the returned string

    Returns:
        Truncated string of at most ``max_length`` characters
    """
    if pd.isna(text):
        return ""
    text = str(text)
    if len(text) <= max_length:
        return text
    # When the limit leaves no room for "...", a negative slice bound would
    # mis-truncate (e.g. max_length=2 produced a string LONGER than the
    # limit); just hard-cut instead.
    if max_length <= 3:
        return text[:max_length]
    return text[:max_length - 3] + "..."
def get_dataframe_info(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Get comprehensive information about a DataFrame.

    Args:
        df: Input DataFrame

    Returns:
        Dictionary with row/column counts, per-category column counts,
        deep memory usage in MB, and missing-value totals/percentage.
    """
    col_types = get_column_types(df)
    # Compute the (potentially expensive) null total once instead of twice.
    total_missing = int(df.isnull().sum().sum())
    total_cells = len(df) * len(df.columns)
    return {
        'rows': len(df),
        'columns': len(df.columns),
        'numerical_columns': len(col_types['numerical']),
        'categorical_columns': len(col_types['categorical']),
        'datetime_columns': len(col_types['datetime']),
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024,
        'total_missing': total_missing,
        # Guard the zero-cell frame: the original emitted NaN (with a
        # RuntimeWarning) for 0/0; report 0.0 instead.
        'missing_percentage': (total_missing / total_cells * 100) if total_cells else 0.0,
    }