Spaces:

yogesh882
/

Business_Intelligence_Dashboard

Sleeping

App Files Files Community

Business_Intelligence_Dashboard / utils.py

yogesh882

Upload 6 files

df86d3a verified about 1 month ago

raw

history blame contribute delete

6.42 kB

	"""
	Utility functions and constants for the Business Intelligence Dashboard.
	Contains helper functions for data validation, formatting, and common operations.
	Works with ANY dataset - no hardcoded column names.
	"""

	import pandas as pd
	import numpy as np
	from typing import List, Dict, Any, Optional, Tuple

	# Constants
	# Lower-case extensions (leading dot included) accepted by validate_file_extension().
	SUPPORTED_FILE_EXTENSIONS = ['.csv', '.xlsx', '.xls']
	# Row limits for data previews; not referenced in this part of the file
	# (presumably consumed by the UI layer -- confirm against callers).
	MAX_PREVIEW_ROWS = 100
	DEFAULT_PREVIEW_ROWS = 10
	# strptime-style date patterns. NOTE(review): detect_date_columns() below does not
	# use this list; it relies on pd.to_datetime's own inference.
	DATE_FORMATS = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y']
	# dtype names treated as numeric; not referenced in this part of the file.
	NUMERIC_DTYPES = ['int64', 'int32', 'float64', 'float32']
	# Aggregation names; these look like pandas aggregation method names -- confirm at the call site.
	AGGREGATION_METHODS = ['sum', 'mean', 'median', 'count', 'min', 'max']


	def validate_file_extension(filename: str) -> Tuple[bool, str]:
	    """
	    Check whether an uploaded file's name ends in a supported extension.

	    The extension is everything after the final dot, lower-cased and compared
	    against SUPPORTED_FILE_EXTENSIONS. A name without a dot has no extension.

	    Args:
	        filename: Name of the uploaded file, or None when nothing was uploaded

	    Returns:
	        Tuple of (is_valid, message)
	    """
	    if filename is None:
	        return False, "No file uploaded. Please upload a CSV or Excel file."

	    # rpartition yields an empty separator when the name contains no dot at all.
	    _, dot, suffix = filename.rpartition('.')
	    ext = f".{suffix.lower()}" if dot else ''

	    if ext in SUPPORTED_FILE_EXTENSIONS:
	        return True, "File format is valid."
	    return False, f"Unsupported file format '{ext}'. Please upload CSV or Excel files."


	def format_number(value: float, decimals: int = 2) -> str:
	    """
	    Render a number compactly with a thousands separator.

	    Magnitudes of one million or more are scaled and suffixed with "M",
	    magnitudes of one thousand or more with "K"; smaller values are printed
	    as-is. Missing values (NaN/None) are rendered as "N/A".

	    Args:
	        value: Number to format
	        decimals: Number of decimal places

	    Returns:
	        Formatted string representation
	    """
	    if pd.isna(value):
	        return "N/A"

	    magnitude = abs(value)
	    # Largest scale first so that millions are not reported as thousands.
	    for scale, suffix in ((1_000_000, "M"), (1_000, "K")):
	        if magnitude >= scale:
	            return f"{value/scale:,.{decimals}f}{suffix}"
	    return f"{value:,.{decimals}f}"


	def get_column_type(dtype) -> str:
	    """
	    Determine the general type of a column based on its dtype.

	    Classification uses pandas' dtype predicates rather than substring
	    matching on the dtype name. Substring matching is case-sensitive and
	    therefore misses the nullable extension dtypes ("Int64", "Float64", ...),
	    and it wrongly treats any dtype whose name merely contains "int"
	    (e.g. "interval[int64, right]") as numeric.

	    Args:
	        dtype: pandas/numpy dtype of the column (or a dtype name string)

	    Returns:
	        String indicating 'numeric', 'categorical', or 'datetime'.
	        Integer and float dtypes are 'numeric'; datetime64 dtypes (naive or
	        timezone-aware) are 'datetime'; everything else, including bool,
	        object, category and timedelta, is 'categorical'.
	    """
	    dtype_checks = pd.api.types

	    if dtype_checks.is_integer_dtype(dtype) or dtype_checks.is_float_dtype(dtype):
	        return 'numeric'
	    if dtype_checks.is_datetime64_any_dtype(dtype):
	        return 'datetime'
	    return 'categorical'


	def detect_date_columns(df: pd.DataFrame) -> List[str]:
	    """
	    Detect columns that likely contain date information.

	    A column qualifies when it already has a datetime64 dtype (any
	    resolution, with or without a timezone), or when it is a text column
	    (object or string dtype) whose first 100 non-null values can all be
	    parsed by ``pd.to_datetime``.

	    Args:
	        df: pandas DataFrame to analyze

	    Returns:
	        List of column names that appear to be dates, in column order
	    """
	    date_columns = []

	    for col in df.columns:
	        dtype = df[col].dtype

	        # Comparing against the literal 'datetime64[ns]' would miss
	        # timezone-aware and non-nanosecond datetime columns.
	        if pd.api.types.is_datetime64_any_dtype(dtype):
	            date_columns.append(col)
	            continue

	        # Covers classic object columns as well as dedicated string dtypes.
	        if not pd.api.types.is_string_dtype(dtype):
	            continue

	        # Only a sample is parsed so wide text columns stay cheap to inspect.
	        sample = df[col].dropna().head(100)
	        if len(sample) == 0:
	            continue

	        try:
	            pd.to_datetime(sample)
	        except (ValueError, TypeError, OverflowError):
	            # Not parseable as dates: this is the expected outcome for ordinary text.
	            continue
	        date_columns.append(col)

	    return date_columns


	def calculate_percentage(part: float, whole: float) -> float:
	    """
	    Express ``part`` as a percentage of ``whole`` without dividing by zero.

	    Args:
	        part: Numerator value
	        whole: Denominator value

	    Returns:
	        part / whole * 100, or 0.0 when whole is zero or missing
	    """
	    no_denominator = whole == 0 or pd.isna(whole)
	    return 0.0 if no_denominator else (part / whole) * 100


	def truncate_string(text: str, max_length: int = 50) -> str:
	    """
	    Truncate a string to a maximum length with ellipsis.

	    The result is never longer than ``max_length``. When the limit is too
	    small to hold the three-character ellipsis (``max_length < 3``), the
	    text is hard-cut instead; a negative limit is treated as zero.

	    Args:
	        text: String to truncate (non-strings are converted with ``str``;
	            missing values yield an empty string)
	        max_length: Maximum allowed length

	    Returns:
	        Truncated string with ellipsis if needed
	    """
	    if pd.isna(text):
	        return ""
	    text = str(text)
	    if len(text) <= max_length:
	        return text
	    if max_length < 3:
	        # max_length - 3 would be a negative slice bound here, which keeps
	        # almost the whole text and returns more than max_length characters.
	        return text[:max(max_length, 0)]
	    return text[:max_length - 3] + "..."


	def get_dataframe_memory_usage(df: pd.DataFrame) -> str:
	    """
	    Report a DataFrame's deep memory footprint in a human-readable unit.

	    Args:
	        df: pandas DataFrame

	    Returns:
	        Size with two decimals in GB, MB or KB (binary multiples of 1024),
	        or a plain byte count when below 1 KB
	    """
	    total = df.memory_usage(deep=True).sum()

	    # Ordered from largest to smallest so the first match is the best unit.
	    units = (
	        (1_073_741_824, "GB"),
	        (1_048_576, "MB"),
	        (1024, "KB"),
	    )
	    for unit_size, label in units:
	        if total >= unit_size:
	            return f"{total / unit_size:.2f} {label}"
	    return f"{total} bytes"


	def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
	    """
	    Divide two numbers, falling back to ``default`` when that is not possible.

	    Args:
	        numerator: The dividend
	        denominator: The divisor
	        default: Value to return if division is not possible

	    Returns:
	        numerator / denominator, or ``default`` when the denominator is zero
	        or either operand is missing
	    """
	    undefined = denominator == 0 or pd.isna(denominator) or pd.isna(numerator)
	    return default if undefined else numerator / denominator


	def create_info_message(title: str, content: str, msg_type: str = "info") -> str:
	    """
	    Build a markdown message: an icon-prefixed level-3 heading, then the body.

	    Args:
	        title: Message title
	        content: Message content
	        msg_type: Type of message ('info', 'warning', 'error', 'success');
	            any other value gets the 'info' icon

	    Returns:
	        Formatted markdown string
	    """
	    icon_by_type = {
	        "info": "ℹ️",
	        "warning": "⚠️",
	        "error": "❌",
	        "success": "✅",
	    }
	    if msg_type in icon_by_type:
	        icon = icon_by_type[msg_type]
	    else:
	        icon = "ℹ️"
	    heading = f"### {icon} {title}"
	    return f"{heading}\n\n{content}"


	def get_numeric_columns(df: pd.DataFrame) -> List[str]:
	    """
	    List the names of the DataFrame's numeric columns.

	    Args:
	        df: pandas DataFrame

	    Returns:
	        Names of columns whose dtype is a numpy number type, in column order
	    """
	    numeric_only = df.select_dtypes(include=[np.number])
	    return numeric_only.columns.tolist()


	def get_categorical_columns(df: pd.DataFrame) -> List[str]:
	    """
	    List the names of the DataFrame's categorical columns.

	    Args:
	        df: pandas DataFrame

	    Returns:
	        Names of columns with 'object' or 'category' dtype, in column order
	    """
	    wanted_dtypes = ['object', 'category']
	    matching = df.select_dtypes(include=wanted_dtypes)
	    return matching.columns.tolist()


	def get_datetime_columns(df: pd.DataFrame) -> List[str]:
	    """
	    Get list of datetime columns in a DataFrame.

	    Both naive and timezone-aware datetime columns are returned. The
	    'datetime64' selector alone skips timezone-aware columns, so
	    'datetimetz' is requested as well.

	    Args:
	        df: pandas DataFrame

	    Returns:
	        List of datetime column names, in column order
	    """
	    return df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()