""" Utility functions and constants for the Business Intelligence Dashboard. Contains helper functions for data validation, formatting, and common operations. Works with ANY dataset - no hardcoded column names. """ import pandas as pd import numpy as np from typing import List, Dict, Any, Optional, Tuple # Constants SUPPORTED_FILE_EXTENSIONS = ['.csv', '.xlsx', '.xls'] MAX_PREVIEW_ROWS = 100 DEFAULT_PREVIEW_ROWS = 10 DATE_FORMATS = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y'] NUMERIC_DTYPES = ['int64', 'int32', 'float64', 'float32'] AGGREGATION_METHODS = ['sum', 'mean', 'median', 'count', 'min', 'max'] def validate_file_extension(filename: str) -> Tuple[bool, str]: """ Validate if the uploaded file has a supported extension. Args: filename: Name of the uploaded file Returns: Tuple of (is_valid, message) """ if filename is None: return False, "No file uploaded. Please upload a CSV or Excel file." ext = '.' + filename.split('.')[-1].lower() if '.' in filename else '' if ext not in SUPPORTED_FILE_EXTENSIONS: return False, f"Unsupported file format '{ext}'. Please upload CSV or Excel files." return True, "File format is valid." def format_number(value: float, decimals: int = 2) -> str: """ Format a number with thousands separator and decimal places. Args: value: Number to format decimals: Number of decimal places Returns: Formatted string representation """ if pd.isna(value): return "N/A" if abs(value) >= 1_000_000: return f"{value/1_000_000:,.{decimals}f}M" elif abs(value) >= 1_000: return f"{value/1_000:,.{decimals}f}K" else: return f"{value:,.{decimals}f}" def get_column_type(dtype) -> str: """ Determine the general type of a column based on its dtype. Args: dtype: pandas dtype of the column Returns: String indicating 'numeric', 'categorical', or 'datetime' """ dtype_str = str(dtype) if 'int' in dtype_str or 'float' in dtype_str: return 'numeric' elif 'datetime' in dtype_str: return 'datetime' else: return 'categorical' def detect_date_columns(df: pd.DataFrame) -> List[str]: """ Detect columns that likely contain date information. Args: df: pandas DataFrame to analyze Returns: List of column names that appear to be dates """ date_columns = [] for col in df.columns: if df[col].dtype == 'datetime64[ns]': date_columns.append(col) elif df[col].dtype == 'object': sample = df[col].dropna().head(100) if len(sample) > 0: try: pd.to_datetime(sample) date_columns.append(col) except (ValueError, TypeError): pass return date_columns def calculate_percentage(part: float, whole: float) -> float: """ Calculate percentage safely, handling division by zero. Args: part: Numerator value whole: Denominator value Returns: Percentage value or 0 if whole is 0 """ if whole == 0 or pd.isna(whole): return 0.0 return (part / whole) * 100 def truncate_string(text: str, max_length: int = 50) -> str: """ Truncate a string to a maximum length with ellipsis. Args: text: String to truncate max_length: Maximum allowed length Returns: Truncated string with ellipsis if needed """ if pd.isna(text): return "" text = str(text) if len(text) <= max_length: return text return text[:max_length-3] + "..." def get_dataframe_memory_usage(df: pd.DataFrame) -> str: """ Get human-readable memory usage of a DataFrame. Args: df: pandas DataFrame Returns: Formatted string of memory usage """ bytes_used = df.memory_usage(deep=True).sum() if bytes_used >= 1_073_741_824: # 1 GB return f"{bytes_used / 1_073_741_824:.2f} GB" elif bytes_used >= 1_048_576: # 1 MB return f"{bytes_used / 1_048_576:.2f} MB" elif bytes_used >= 1024: # 1 KB return f"{bytes_used / 1024:.2f} KB" else: return f"{bytes_used} bytes" def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float: """ Perform division safely, returning default if denominator is zero. Args: numerator: The dividend denominator: The divisor default: Value to return if division is not possible Returns: Result of division or default value """ if denominator == 0 or pd.isna(denominator) or pd.isna(numerator): return default return numerator / denominator def create_info_message(title: str, content: str, msg_type: str = "info") -> str: """ Create a formatted information message. Args: title: Message title content: Message content msg_type: Type of message ('info', 'warning', 'error', 'success') Returns: Formatted markdown string """ icons = { "info": "ℹ️", "warning": "⚠️", "error": "❌", "success": "✅" } icon = icons.get(msg_type, "ℹ️") return f"### {icon} {title}\n\n{content}" def get_numeric_columns(df: pd.DataFrame) -> List[str]: """ Get list of numeric columns in a DataFrame. Args: df: pandas DataFrame Returns: List of numeric column names """ return df.select_dtypes(include=[np.number]).columns.tolist() def get_categorical_columns(df: pd.DataFrame) -> List[str]: """ Get list of categorical columns in a DataFrame. Args: df: pandas DataFrame Returns: List of categorical column names """ return df.select_dtypes(include=['object', 'category']).columns.tolist() def get_datetime_columns(df: pd.DataFrame) -> List[str]: """ Get list of datetime columns in a DataFrame. Args: df: pandas DataFrame Returns: List of datetime column names """ return df.select_dtypes(include=['datetime64']).columns.tolist()