|
|
""" |
|
|
Utility functions and constants for the Business Intelligence Dashboard. |
|
|
Contains helper functions for data validation, formatting, and common operations. |
|
|
Works with ANY dataset - no hardcoded column names. |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from typing import List, Dict, Any, Optional, Tuple |
|
|
|
|
|
|
|
|
SUPPORTED_FILE_EXTENSIONS = ['.csv', '.xlsx', '.xls'] |
|
|
MAX_PREVIEW_ROWS = 100 |
|
|
DEFAULT_PREVIEW_ROWS = 10 |
|
|
DATE_FORMATS = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y'] |
|
|
NUMERIC_DTYPES = ['int64', 'int32', 'float64', 'float32'] |
|
|
AGGREGATION_METHODS = ['sum', 'mean', 'median', 'count', 'min', 'max'] |
|
|
|
|
|
|
|
|
def validate_file_extension(filename: str) -> Tuple[bool, str]: |
|
|
""" |
|
|
Validate if the uploaded file has a supported extension. |
|
|
|
|
|
Args: |
|
|
filename: Name of the uploaded file |
|
|
|
|
|
Returns: |
|
|
Tuple of (is_valid, message) |
|
|
""" |
|
|
if filename is None: |
|
|
return False, "No file uploaded. Please upload a CSV or Excel file." |
|
|
|
|
|
ext = '.' + filename.split('.')[-1].lower() if '.' in filename else '' |
|
|
|
|
|
if ext not in SUPPORTED_FILE_EXTENSIONS: |
|
|
return False, f"Unsupported file format '{ext}'. Please upload CSV or Excel files." |
|
|
|
|
|
return True, "File format is valid." |
|
|
|
|
|
|
|
|
def format_number(value: float, decimals: int = 2) -> str: |
|
|
""" |
|
|
Format a number with thousands separator and decimal places. |
|
|
|
|
|
Args: |
|
|
value: Number to format |
|
|
decimals: Number of decimal places |
|
|
|
|
|
Returns: |
|
|
Formatted string representation |
|
|
""" |
|
|
if pd.isna(value): |
|
|
return "N/A" |
|
|
|
|
|
if abs(value) >= 1_000_000: |
|
|
return f"{value/1_000_000:,.{decimals}f}M" |
|
|
elif abs(value) >= 1_000: |
|
|
return f"{value/1_000:,.{decimals}f}K" |
|
|
else: |
|
|
return f"{value:,.{decimals}f}" |
|
|
|
|
|
|
|
|
def get_column_type(dtype) -> str: |
|
|
""" |
|
|
Determine the general type of a column based on its dtype. |
|
|
|
|
|
Args: |
|
|
dtype: pandas dtype of the column |
|
|
|
|
|
Returns: |
|
|
String indicating 'numeric', 'categorical', or 'datetime' |
|
|
""" |
|
|
dtype_str = str(dtype) |
|
|
|
|
|
if 'int' in dtype_str or 'float' in dtype_str: |
|
|
return 'numeric' |
|
|
elif 'datetime' in dtype_str: |
|
|
return 'datetime' |
|
|
else: |
|
|
return 'categorical' |
|
|
|
|
|
|
|
|
def detect_date_columns(df: pd.DataFrame) -> List[str]: |
|
|
""" |
|
|
Detect columns that likely contain date information. |
|
|
|
|
|
Args: |
|
|
df: pandas DataFrame to analyze |
|
|
|
|
|
Returns: |
|
|
List of column names that appear to be dates |
|
|
""" |
|
|
date_columns = [] |
|
|
|
|
|
for col in df.columns: |
|
|
if df[col].dtype == 'datetime64[ns]': |
|
|
date_columns.append(col) |
|
|
elif df[col].dtype == 'object': |
|
|
sample = df[col].dropna().head(100) |
|
|
if len(sample) > 0: |
|
|
try: |
|
|
pd.to_datetime(sample) |
|
|
date_columns.append(col) |
|
|
except (ValueError, TypeError): |
|
|
pass |
|
|
|
|
|
return date_columns |
|
|
|
|
|
|
|
|
def calculate_percentage(part: float, whole: float) -> float: |
|
|
""" |
|
|
Calculate percentage safely, handling division by zero. |
|
|
|
|
|
Args: |
|
|
part: Numerator value |
|
|
whole: Denominator value |
|
|
|
|
|
Returns: |
|
|
Percentage value or 0 if whole is 0 |
|
|
""" |
|
|
if whole == 0 or pd.isna(whole): |
|
|
return 0.0 |
|
|
return (part / whole) * 100 |
|
|
|
|
|
|
|
|
def truncate_string(text: str, max_length: int = 50) -> str: |
|
|
""" |
|
|
Truncate a string to a maximum length with ellipsis. |
|
|
|
|
|
Args: |
|
|
text: String to truncate |
|
|
max_length: Maximum allowed length |
|
|
|
|
|
Returns: |
|
|
Truncated string with ellipsis if needed |
|
|
""" |
|
|
if pd.isna(text): |
|
|
return "" |
|
|
text = str(text) |
|
|
if len(text) <= max_length: |
|
|
return text |
|
|
return text[:max_length-3] + "..." |
|
|
|
|
|
|
|
|
def get_dataframe_memory_usage(df: pd.DataFrame) -> str: |
|
|
""" |
|
|
Get human-readable memory usage of a DataFrame. |
|
|
|
|
|
Args: |
|
|
df: pandas DataFrame |
|
|
|
|
|
Returns: |
|
|
Formatted string of memory usage |
|
|
""" |
|
|
bytes_used = df.memory_usage(deep=True).sum() |
|
|
|
|
|
if bytes_used >= 1_073_741_824: |
|
|
return f"{bytes_used / 1_073_741_824:.2f} GB" |
|
|
elif bytes_used >= 1_048_576: |
|
|
return f"{bytes_used / 1_048_576:.2f} MB" |
|
|
elif bytes_used >= 1024: |
|
|
return f"{bytes_used / 1024:.2f} KB" |
|
|
else: |
|
|
return f"{bytes_used} bytes" |
|
|
|
|
|
|
|
|
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float: |
|
|
""" |
|
|
Perform division safely, returning default if denominator is zero. |
|
|
|
|
|
Args: |
|
|
numerator: The dividend |
|
|
denominator: The divisor |
|
|
default: Value to return if division is not possible |
|
|
|
|
|
Returns: |
|
|
Result of division or default value |
|
|
""" |
|
|
if denominator == 0 or pd.isna(denominator) or pd.isna(numerator): |
|
|
return default |
|
|
return numerator / denominator |
|
|
|
|
|
|
|
|
def create_info_message(title: str, content: str, msg_type: str = "info") -> str: |
|
|
""" |
|
|
Create a formatted information message. |
|
|
|
|
|
Args: |
|
|
title: Message title |
|
|
content: Message content |
|
|
msg_type: Type of message ('info', 'warning', 'error', 'success') |
|
|
|
|
|
Returns: |
|
|
Formatted markdown string |
|
|
""" |
|
|
icons = { |
|
|
"info": "ℹ️", |
|
|
"warning": "⚠️", |
|
|
"error": "❌", |
|
|
"success": "✅" |
|
|
} |
|
|
icon = icons.get(msg_type, "ℹ️") |
|
|
return f"### {icon} {title}\n\n{content}" |
|
|
|
|
|
|
|
|
def get_numeric_columns(df: pd.DataFrame) -> List[str]: |
|
|
""" |
|
|
Get list of numeric columns in a DataFrame. |
|
|
|
|
|
Args: |
|
|
df: pandas DataFrame |
|
|
|
|
|
Returns: |
|
|
List of numeric column names |
|
|
""" |
|
|
return df.select_dtypes(include=[np.number]).columns.tolist() |
|
|
|
|
|
|
|
|
def get_categorical_columns(df: pd.DataFrame) -> List[str]: |
|
|
""" |
|
|
Get list of categorical columns in a DataFrame. |
|
|
|
|
|
Args: |
|
|
df: pandas DataFrame |
|
|
|
|
|
Returns: |
|
|
List of categorical column names |
|
|
""" |
|
|
return df.select_dtypes(include=['object', 'category']).columns.tolist() |
|
|
|
|
|
|
|
|
def get_datetime_columns(df: pd.DataFrame) -> List[str]: |
|
|
""" |
|
|
Get list of datetime columns in a DataFrame. |
|
|
|
|
|
Args: |
|
|
df: pandas DataFrame |
|
|
|
|
|
Returns: |
|
|
List of datetime column names |
|
|
""" |
|
|
return df.select_dtypes(include=['datetime64']).columns.tolist() |