""" Utility functions I use across my dashboard. """ import pandas as pd import numpy as np from typing import Dict, List, Tuple, Any def format_number(num: float, decimals: int = 2) -> str: """ Format a number with thousand separators and specified decimal places. Args: num: Number to format decimals: Number of decimal places Returns: Formatted string representation of the number """ if pd.isna(num): return "N/A" return f"{num:,.{decimals}f}" def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]: """ Categorize DataFrame columns by their data types. Args: df: Input DataFrame Returns: Dictionary with keys 'numerical', 'categorical', and 'datetime' containing lists of column names """ numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist() categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist() return { 'numerical': numerical_cols, 'categorical': categorical_cols, 'datetime': datetime_cols } def detect_datetime_columns(df: pd.DataFrame) -> List[str]: """ Detect columns that might contain datetime data. Args: df: Input DataFrame Returns: List of column names that appear to contain datetime data """ datetime_cols = [] for col in df.columns: if df[col].dtype == 'object': # Try to parse a sample of the column sample = df[col].dropna().head(100) if len(sample) > 0: try: # Suppress warnings for datetime parsing import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") pd.to_datetime(sample, errors='raise') datetime_cols.append(col) except (ValueError, TypeError): pass elif pd.api.types.is_datetime64_any_dtype(df[col]): datetime_cols.append(col) return datetime_cols def safe_division(numerator: float, denominator: float, default: float = 0.0) -> float: """ Safely divide two numbers, returning a default value if division by zero. Args: numerator: The numerator denominator: The denominator default: Value to return if denominator is zero Returns: Result of division or default value """ if denominator == 0 or pd.isna(denominator): return default return numerator / denominator def get_missing_value_summary(df: pd.DataFrame) -> pd.DataFrame: """ Generate a summary of missing values in the DataFrame. Args: df: Input DataFrame Returns: DataFrame with columns: Column, Missing_Count, Missing_Percentage """ missing_data = pd.DataFrame({ 'Column': df.columns, 'Missing_Count': df.isnull().sum().values, 'Missing_Percentage': (df.isnull().sum() / len(df) * 100).values }) # Filter to only show columns with missing values missing_data = missing_data[missing_data['Missing_Count'] > 0] missing_data = missing_data.sort_values('Missing_Count', ascending=False) missing_data = missing_data.reset_index(drop=True) return missing_data def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]: """ Validate that a DataFrame is suitable for analysis. Args: df: DataFrame to validate Returns: Tuple of (is_valid, error_message) """ if df is None: return False, "DataFrame is None" if df.empty: return False, "DataFrame is empty" if len(df.columns) == 0: return False, "DataFrame has no columns" if len(df) < 2: return False, "DataFrame must have at least 2 rows" return True, "" def truncate_string(text: str, max_length: int = 50) -> str: """ Truncate a string to a maximum length, adding ellipsis if needed. Args: text: String to truncate max_length: Maximum length Returns: Truncated string """ if pd.isna(text): return "" text = str(text) if len(text) <= max_length: return text return text[:max_length-3] + "..." def get_dataframe_info(df: pd.DataFrame) -> Dict[str, Any]: """ Get comprehensive information about a DataFrame. Args: df: Input DataFrame Returns: Dictionary containing various DataFrame statistics """ col_types = get_column_types(df) return { 'rows': len(df), 'columns': len(df.columns), 'numerical_columns': len(col_types['numerical']), 'categorical_columns': len(col_types['categorical']), 'datetime_columns': len(col_types['datetime']), 'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024, 'total_missing': df.isnull().sum().sum(), 'missing_percentage': (df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100) }