File size: 5,281 Bytes
3a1de6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""
Utility functions I use across my dashboard.
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Any


def format_number(num: float, decimals: int = 2) -> str:
    """
    Render a number as text with comma thousand separators.

    Values that ``pd.isna`` reports as missing (for example ``None`` or
    NaN) are rendered as the placeholder "N/A" instead of being formatted.

    Args:
        num: Value to render
        decimals: How many digits to keep after the decimal point

    Returns:
        The formatted number, or "N/A" when the value is missing
    """
    if not pd.isna(num):
        # Fixed-point spec with grouping, e.g. ",.2f" for two decimals.
        return format(num, f",.{decimals}f")
    return "N/A"


def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """
    Categorize DataFrame columns by their data types.

    Args:
        df: Input DataFrame

    Returns:
        Dictionary with keys 'numerical', 'categorical', and 'datetime'
        containing lists of column names (in the DataFrame's column order).
        Both timezone-naive and timezone-aware datetime columns are listed
        under 'datetime'. Columns matching none of the three selectors
        (e.g. boolean columns) do not appear in any list.
    """
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # 'datetime64' on its own only selects timezone-naive columns;
    # 'datetimetz' is required to also pick up timezone-aware ones.
    datetime_cols = df.select_dtypes(
        include=['datetime64', 'datetimetz']
    ).columns.tolist()

    return {
        'numerical': numerical_cols,
        'categorical': categorical_cols,
        'datetime': datetime_cols
    }


def detect_datetime_columns(df: pd.DataFrame) -> List[str]:
    """
    Find columns that look like they hold datetime values.

    A column qualifies when it already has a datetime64 dtype, or when it
    has object dtype and its first 100 non-null values can all be parsed
    by ``pd.to_datetime``. Object columns with no non-null values are
    skipped.

    Args:
        df: Input DataFrame

    Returns:
        Names of the qualifying columns, in the DataFrame's column order
    """
    import warnings

    matches = []

    for name in df.columns:
        series = df[name]
        is_object = series.dtype == 'object'

        if not is_object:
            # Non-object columns only count if they are real datetimes.
            if pd.api.types.is_datetime64_any_dtype(series):
                matches.append(name)
            continue

        # Probe a small sample rather than parsing the whole column.
        probe = series.dropna().head(100)
        if len(probe) == 0:
            continue

        try:
            # Parsing may emit format-inference warnings; silence them.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                pd.to_datetime(probe, errors='raise')
        except (ValueError, TypeError):
            continue

        matches.append(name)

    return matches


def safe_division(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Safely divide two numbers, returning a default value when the
    denominator is zero or missing.

    Args:
        numerator: The numerator
        denominator: The denominator
        default: Value to return if denominator is zero or missing
            (None, NaN, pd.NA, ...)

    Returns:
        Result of division or default value
    """
    # Test for missing values first: comparing pd.NA with 0 yields pd.NA,
    # whose truth value is ambiguous and would raise a TypeError.
    if pd.isna(denominator) or denominator == 0:
        return default
    return numerator / denominator


def get_missing_value_summary(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize how many values are missing in each column.

    Only columns with at least one missing value are reported, ordered
    from most to fewest missing values, with a fresh 0-based index.

    Args:
        df: Input DataFrame

    Returns:
        DataFrame with columns: Column, Missing_Count, Missing_Percentage
    """
    # Per-column null counts, shared by the count and percentage columns.
    null_counts = df.isnull().sum()

    summary = pd.DataFrame({
        'Column': df.columns,
        'Missing_Count': null_counts.values,
        'Missing_Percentage': (null_counts / len(df) * 100).values
    })

    # Drop fully populated columns, then rank by number of missing values.
    has_missing = summary['Missing_Count'] > 0
    return (
        summary[has_missing]
        .sort_values('Missing_Count', ascending=False)
        .reset_index(drop=True)
    )


def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Validate that a DataFrame is suitable for analysis.

    Checks run from most to least specific, and the first failure is
    reported: None, no columns, no rows, fewer than 2 rows.

    Args:
        df: DataFrame to validate

    Returns:
        Tuple of (is_valid, error_message); error_message is an empty
        string when the DataFrame is valid
    """
    if df is None:
        return False, "DataFrame is None"

    # Must precede the `empty` check: a frame without columns is also
    # `empty`, which would otherwise hide this more specific message.
    if len(df.columns) == 0:
        return False, "DataFrame has no columns"

    if df.empty:
        return False, "DataFrame is empty"

    if len(df) < 2:
        return False, "DataFrame must have at least 2 rows"

    return True, ""


def truncate_string(text: str, max_length: int = 50) -> str:
    """
    Truncate a string to a maximum length, adding ellipsis if needed.

    The result is never longer than max_length. When max_length is too
    small to fit the ellipsis (fewer than 3 characters), the text is cut
    without one; a max_length of zero or less yields an empty string.

    Args:
        text: String to truncate; non-string values are converted with
            str(), and missing values (None/NaN) produce ""
        max_length: Maximum length of the returned string

    Returns:
        Truncated string
    """
    if pd.isna(text):
        return ""

    text = str(text)
    if len(text) <= max_length:
        return text

    ellipsis = "..."
    if max_length < len(ellipsis):
        # No room for the ellipsis: a negative slice index here would
        # produce a result longer than max_length, so cut hard instead.
        return text[:max(max_length, 0)]

    return text[:max_length - len(ellipsis)] + ellipsis


def get_dataframe_info(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Get comprehensive information about a DataFrame.

    Args:
        df: Input DataFrame

    Returns:
        Dictionary containing various DataFrame statistics:
        'rows', 'columns', 'numerical_columns', 'categorical_columns',
        'datetime_columns', 'memory_usage_mb', 'total_missing' and
        'missing_percentage'. 'missing_percentage' is 0.0 when the
        DataFrame has no cells (no rows or no columns).
    """
    col_types = get_column_types(df)

    # Count nulls once; both missing-value statistics reuse the total.
    total_missing = df.isnull().sum().sum()
    total_cells = len(df) * len(df.columns)

    # An empty frame has no cells, so avoid dividing by zero.
    if total_cells:
        missing_percentage = total_missing / total_cells * 100
    else:
        missing_percentage = 0.0

    return {
        'rows': len(df),
        'columns': len(df.columns),
        'numerical_columns': len(col_types['numerical']),
        'categorical_columns': len(col_types['categorical']),
        'datetime_columns': len(col_types['datetime']),
        # Deep usage includes the contents of object columns; bytes -> MiB.
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024,
        'total_missing': total_missing,
        'missing_percentage': missing_percentage
    }