# Uploaded by yogesh882 ("Upload 6 files")
# Commit df86d3a (verified)
"""
Utility functions and constants for the Business Intelligence Dashboard.
Contains helper functions for data validation, formatting, and common operations.
Works with ANY dataset - no hardcoded column names.
"""
import os
import warnings
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
import pandas as pd
# Constants
# File extensions accepted by validate_file_extension (lowercase, with the leading dot).
SUPPORTED_FILE_EXTENSIONS = ['.csv', '.xlsx', '.xls']
# Row-count settings for data previews.
# NOTE(review): not referenced by any function in this section - presumably used by the UI layer; confirm.
MAX_PREVIEW_ROWS = 100
DEFAULT_PREVIEW_ROWS = 10
# strftime/strptime patterns for common date layouts.
# NOTE(review): detect_date_columns below does not use this list; confirm where it is consumed.
DATE_FORMATS = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y']
# Names of numpy dtypes treated as numeric.
# NOTE(review): not referenced in this section; get_column_type and get_numeric_columns use their own checks.
NUMERIC_DTYPES = ['int64', 'int32', 'float64', 'float32']
# Aggregation method names (these match pandas aggregation function names).
AGGREGATION_METHODS = ['sum', 'mean', 'median', 'count', 'min', 'max']
def validate_file_extension(filename: str) -> Tuple[bool, str]:
    """
    Validate if the uploaded file has a supported extension.

    Only the final path component is inspected, so a dot inside a directory
    name (e.g. ``/tmp/upload.1/data``) is never mistaken for a file
    extension. The comparison against SUPPORTED_FILE_EXTENSIONS is
    case-insensitive.

    Args:
        filename: Name or path of the uploaded file. ``None`` and the empty
            string are both treated as "nothing uploaded".

    Returns:
        Tuple of (is_valid, message)
    """
    if not filename:
        return False, "No file uploaded. Please upload a CSV or Excel file."

    # Strip any directory part first; fspath also lets path-like objects through.
    base_name = os.path.basename(os.fspath(filename))

    # The extension is whatever follows the last dot of the base name.
    _, dot, suffix = base_name.rpartition('.')
    ext = f".{suffix.lower()}" if dot else ''

    if ext not in SUPPORTED_FILE_EXTENSIONS:
        return False, f"Unsupported file format '{ext}'. Please upload CSV or Excel files."
    return True, "File format is valid."
def format_number(value: float, decimals: int = 2) -> str:
    """
    Format a number with thousands separator and decimal places.

    Values of at least one thousand are abbreviated with a ``K`` suffix and
    values of at least one million with an ``M`` suffix. The suffix is chosen
    from the value *after* rounding to ``decimals`` places, so a number that
    rounds up to the next tier is promoted (999,999 -> ``1.00M`` rather than
    ``1,000.00K``).

    Args:
        value: Number to format
        decimals: Number of decimal places

    Returns:
        Formatted string representation, or "N/A" for missing values
    """
    if pd.isna(value):
        return "N/A"

    magnitude = abs(value)

    # Would the K-scaled value print as 1,000 or more? Then it belongs in M.
    if round(magnitude / 1_000, decimals) >= 1_000:
        return f"{value / 1_000_000:,.{decimals}f}M"
    # Would the plain value print as 1,000 or more? Then it belongs in K.
    if round(magnitude, decimals) >= 1_000:
        return f"{value / 1_000:,.{decimals}f}K"
    return f"{value:,.{decimals}f}"
def get_column_type(dtype) -> str:
    """
    Determine the general type of a column based on its dtype.

    Classification uses pandas' dtype predicates rather than substring
    matching on the dtype name, so nullable extension dtypes such as
    ``Int64``/``Float64`` count as numeric and dtypes whose name merely
    contains "int" (e.g. ``interval[int64]``) do not.

    Args:
        dtype: pandas dtype of the column

    Returns:
        String indicating 'numeric', 'categorical', or 'datetime'
    """
    # pandas treats bool as numeric; this helper reports it as categorical.
    if pd.api.types.is_bool_dtype(dtype):
        return 'categorical'
    # Covers both timezone-naive and timezone-aware datetime dtypes.
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return 'datetime'
    if pd.api.types.is_numeric_dtype(dtype):
        return 'numeric'
    return 'categorical'
def detect_date_columns(df: pd.DataFrame) -> List[str]:
    """
    Detect columns that likely contain date information.

    A column qualifies when it already has a datetime dtype (any unit,
    timezone-naive or timezone-aware), or when it is an object/string column
    whose first 100 non-null values can all be parsed by ``pd.to_datetime``.
    Object columns holding plain numbers are skipped: ``pd.to_datetime``
    would accept them as epoch offsets, which says nothing about the column
    being a date.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        List of column names that appear to be dates, in column order
    """
    # infer_dtype labels for samples that to_datetime would read as epoch offsets.
    numeric_kinds = {'integer', 'floating', 'mixed-integer-float'}

    date_columns = []
    for col in df.columns:
        series = df[col]

        if pd.api.types.is_datetime64_any_dtype(series):
            date_columns.append(col)
            continue

        is_text_like = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text_like:
            continue

        sample = series.dropna().head(100)
        if sample.empty:
            continue
        if pd.api.types.infer_dtype(sample, skipna=True) in numeric_kinds:
            continue

        try:
            # Parsing is only a probe; silence pandas' format-inference warnings.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                pd.to_datetime(sample)
        except (ValueError, TypeError, OverflowError):
            continue
        date_columns.append(col)

    return date_columns
def calculate_percentage(part: float, whole: float) -> float:
    """
    Calculate percentage safely, handling division by zero and missing values.

    Args:
        part: Numerator value
        whole: Denominator value

    Returns:
        Percentage value, or 0.0 if whole is 0 or either argument is
        missing (None/NaN)
    """
    # Check for missing values first so None never reaches the arithmetic below.
    if pd.isna(part) or pd.isna(whole) or whole == 0:
        return 0.0
    return (part / whole) * 100
def truncate_string(text: str, max_length: int = 50) -> str:
    """
    Truncate a string to a maximum length with ellipsis.

    The result is never longer than ``max_length``. When ``max_length`` is
    too small to hold any text plus the three-character ellipsis, the text is
    simply cut (or replaced by the bare ellipsis when ``max_length`` is
    exactly 3).

    Args:
        text: String to truncate; non-string values are converted with str()
        max_length: Maximum allowed length

    Returns:
        Truncated string with ellipsis if needed, or "" for missing values
    """
    if pd.isna(text):
        return ""
    text = str(text)
    if len(text) <= max_length:
        return text
    if max_length < 3:
        # No room for "..."; a negative slice bound here would return too much.
        return text[:max(max_length, 0)]
    return text[:max_length - 3] + "..."
def get_dataframe_memory_usage(df: pd.DataFrame) -> str:
    """
    Report how much memory a DataFrame occupies, in human-readable form.

    The deep memory usage of all columns and the index is summed and rendered
    in the largest binary unit (GB, MB, KB) that it reaches, with two decimal
    places; totals below 1 KB are reported as a raw byte count.

    Args:
        df: pandas DataFrame

    Returns:
        Formatted string of memory usage
    """
    total = df.memory_usage(deep=True).sum()

    # Largest unit first so the first match wins.
    units = (
        (1_073_741_824, "GB"),
        (1_048_576, "MB"),
        (1024, "KB"),
    )
    for size, label in units:
        if total >= size:
            return f"{total / size:.2f} {label}"
    return f"{total} bytes"
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Divide two numbers, falling back to a default when division is not possible.

    Args:
        numerator: The dividend
        denominator: The divisor
        default: Value to return when the divisor is zero or either operand
            is missing (None/NaN)

    Returns:
        Result of division or default value
    """
    divisor_unusable = denominator == 0 or pd.isna(denominator)
    if divisor_unusable or pd.isna(numerator):
        return default
    return numerator / denominator
def create_info_message(title: str, content: str, msg_type: str = "info") -> str:
    """
    Build a markdown message made of an icon-prefixed heading and a body.

    Args:
        title: Message title
        content: Message content
        msg_type: Type of message ('info', 'warning', 'error', 'success');
            any other value falls back to the info icon

    Returns:
        Formatted markdown string
    """
    fallback_icon = "ℹ️"
    icon_by_type = {
        "info": "ℹ️",
        "warning": "⚠️",
        "error": "❌",
        "success": "✅",
    }
    heading = f"### {icon_by_type.get(msg_type, fallback_icon)} {title}"
    return f"{heading}\n\n{content}"
def get_numeric_columns(df: pd.DataFrame) -> List[str]:
    """
    List the names of the numeric columns of a DataFrame.

    Args:
        df: pandas DataFrame

    Returns:
        List of numeric column names
    """
    numeric_part = df.select_dtypes(include=[np.number])
    column_names = numeric_part.columns
    return column_names.tolist()
def get_categorical_columns(df: pd.DataFrame) -> List[str]:
    """
    List the names of the object- and category-typed columns of a DataFrame.

    Args:
        df: pandas DataFrame

    Returns:
        List of categorical column names
    """
    wanted_dtypes = ['object', 'category']
    categorical_part = df.select_dtypes(include=wanted_dtypes)
    return categorical_part.columns.tolist()
def get_datetime_columns(df: pd.DataFrame) -> List[str]:
    """
    Get list of datetime columns in a DataFrame.

    Both timezone-naive and timezone-aware datetime columns are returned;
    ``select_dtypes`` needs the separate 'datetimetz' selector for the
    latter.

    Args:
        df: pandas DataFrame

    Returns:
        List of datetime column names, in column order
    """
    return df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()