| | """ |
| | Utility functions I use across my dashboard. |
| | """ |
| |
|
| | import pandas as pd |
| | import numpy as np |
| | from typing import Dict, List, Tuple, Any |
| |
|
| |
|
| | def format_number(num: float, decimals: int = 2) -> str: |
| | """ |
| | Format a number with thousand separators and specified decimal places. |
| | |
| | Args: |
| | num: Number to format |
| | decimals: Number of decimal places |
| | |
| | Returns: |
| | Formatted string representation of the number |
| | """ |
| | if pd.isna(num): |
| | return "N/A" |
| | return f"{num:,.{decimals}f}" |
| |
|
| |
|
| | def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]: |
| | """ |
| | Categorize DataFrame columns by their data types. |
| | |
| | Args: |
| | df: Input DataFrame |
| | |
| | Returns: |
| | Dictionary with keys 'numerical', 'categorical', and 'datetime' |
| | containing lists of column names |
| | """ |
| | numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist() |
| | categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() |
| | datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist() |
| | |
| | return { |
| | 'numerical': numerical_cols, |
| | 'categorical': categorical_cols, |
| | 'datetime': datetime_cols |
| | } |
| |
|
| |
|
| | def detect_datetime_columns(df: pd.DataFrame) -> List[str]: |
| | """ |
| | Detect columns that might contain datetime data. |
| | |
| | Args: |
| | df: Input DataFrame |
| | |
| | Returns: |
| | List of column names that appear to contain datetime data |
| | """ |
| | datetime_cols = [] |
| | |
| | for col in df.columns: |
| | if df[col].dtype == 'object': |
| | |
| | sample = df[col].dropna().head(100) |
| | if len(sample) > 0: |
| | try: |
| | |
| | import warnings |
| | with warnings.catch_warnings(): |
| | warnings.simplefilter("ignore") |
| | pd.to_datetime(sample, errors='raise') |
| | datetime_cols.append(col) |
| | except (ValueError, TypeError): |
| | pass |
| | elif pd.api.types.is_datetime64_any_dtype(df[col]): |
| | datetime_cols.append(col) |
| | |
| | return datetime_cols |
| |
|
| |
|
| | def safe_division(numerator: float, denominator: float, default: float = 0.0) -> float: |
| | """ |
| | Safely divide two numbers, returning a default value if division by zero. |
| | |
| | Args: |
| | numerator: The numerator |
| | denominator: The denominator |
| | default: Value to return if denominator is zero |
| | |
| | Returns: |
| | Result of division or default value |
| | """ |
| | if denominator == 0 or pd.isna(denominator): |
| | return default |
| | return numerator / denominator |
| |
|
| |
|
| | def get_missing_value_summary(df: pd.DataFrame) -> pd.DataFrame: |
| | """ |
| | Generate a summary of missing values in the DataFrame. |
| | |
| | Args: |
| | df: Input DataFrame |
| | |
| | Returns: |
| | DataFrame with columns: Column, Missing_Count, Missing_Percentage |
| | """ |
| | missing_data = pd.DataFrame({ |
| | 'Column': df.columns, |
| | 'Missing_Count': df.isnull().sum().values, |
| | 'Missing_Percentage': (df.isnull().sum() / len(df) * 100).values |
| | }) |
| | |
| | |
| | missing_data = missing_data[missing_data['Missing_Count'] > 0] |
| | missing_data = missing_data.sort_values('Missing_Count', ascending=False) |
| | missing_data = missing_data.reset_index(drop=True) |
| | |
| | return missing_data |
| |
|
| |
|
| | def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]: |
| | """ |
| | Validate that a DataFrame is suitable for analysis. |
| | |
| | Args: |
| | df: DataFrame to validate |
| | |
| | Returns: |
| | Tuple of (is_valid, error_message) |
| | """ |
| | if df is None: |
| | return False, "DataFrame is None" |
| | |
| | if df.empty: |
| | return False, "DataFrame is empty" |
| | |
| | if len(df.columns) == 0: |
| | return False, "DataFrame has no columns" |
| | |
| | if len(df) < 2: |
| | return False, "DataFrame must have at least 2 rows" |
| | |
| | return True, "" |
| |
|
| |
|
| | def truncate_string(text: str, max_length: int = 50) -> str: |
| | """ |
| | Truncate a string to a maximum length, adding ellipsis if needed. |
| | |
| | Args: |
| | text: String to truncate |
| | max_length: Maximum length |
| | |
| | Returns: |
| | Truncated string |
| | """ |
| | if pd.isna(text): |
| | return "" |
| | |
| | text = str(text) |
| | if len(text) <= max_length: |
| | return text |
| | |
| | return text[:max_length-3] + "..." |
| |
|
| |
|
| | def get_dataframe_info(df: pd.DataFrame) -> Dict[str, Any]: |
| | """ |
| | Get comprehensive information about a DataFrame. |
| | |
| | Args: |
| | df: Input DataFrame |
| | |
| | Returns: |
| | Dictionary containing various DataFrame statistics |
| | """ |
| | col_types = get_column_types(df) |
| | |
| | return { |
| | 'rows': len(df), |
| | 'columns': len(df.columns), |
| | 'numerical_columns': len(col_types['numerical']), |
| | 'categorical_columns': len(col_types['categorical']), |
| | 'datetime_columns': len(col_types['datetime']), |
| | 'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024, |
| | 'total_missing': df.isnull().sum().sum(), |
| | 'missing_percentage': (df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100) |
| | } |
| |
|