# components/statistical.py
import numpy as np
import pandas as pd
from scipy import stats
from typing import Dict, List, Union
from datetime import datetime


class StatisticalAnalyzer:
    """Statistical analysis component with datetime handling"""

    @staticmethod
    def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess a dataframe so datetime columns can be analyzed numerically."""
        df_numeric = df.copy()
        for column in df.columns:
            col = df[column]
            # Detect datetime columns, either by dtype or by a date-like first
            # value. strptime raises on non-matching strings, so the probe must
            # sit inside a try/except rather than a boolean expression.
            is_datetime = pd.api.types.is_datetime64_any_dtype(col)
            if not is_datetime and len(col) > 0 and isinstance(col.iloc[0], str):
                try:
                    datetime.strptime(col.iloc[0], '%Y-%m-%d')
                    is_datetime = True
                except (ValueError, TypeError):
                    pass
            if is_datetime:
                try:
                    # Convert to seconds since the Unix epoch for numerical analysis
                    df_numeric[column] = pd.to_datetime(col).astype(np.int64) // 10**9
                except (ValueError, TypeError):
                    # If conversion fails, exclude the column
                    df_numeric = df_numeric.drop(columns=[column])
        return df_numeric

    @staticmethod
    def analyze_distribution(values: Union[List[float], np.ndarray]) -> Dict:
        """Analyze the distribution of a numeric sample."""
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for distribution analysis")

        result = {
            "n_samples": len(values),
            "mean": float(np.mean(values)),
            "std": float(np.std(values)),
            "median": float(np.median(values)),
            "quartiles": [float(np.percentile(values, q)) for q in [25, 50, 75]],
            "skewness": float(stats.skew(values)),
            "kurtosis": float(stats.kurtosis(values)),
        }

        # Test for normality. D'Agostino's K^2 test (scipy's normaltest)
        # requires at least 8 samples; its skewtest component raises below that.
        if len(values) >= 8:
            statistic, p_value = stats.normaltest(values)
            result["normality_test"] = {
                "statistic": float(statistic),
                "p_value": float(p_value),
                "is_normal": p_value > 0.05,
            }
        return result

    @staticmethod
    def calculate_confidence_interval(
        values: Union[List[float], np.ndarray],
        confidence: float = 0.95
    ) -> Dict:
        """Calculate a t-based confidence interval for the mean."""
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for confidence interval calculation")

        mean = np.mean(values)
        std_err = stats.sem(values)
        ci = stats.t.interval(confidence, len(values) - 1, loc=mean, scale=std_err)
        return {
            "mean": float(mean),
            "ci_lower": float(ci[0]),
            "ci_upper": float(ci[1]),
            "confidence": confidence,
        }

    def forecast_probability_cone(
        self,
        values: Union[List[float], np.ndarray],
        steps: int = 10,
        confidence: float = 0.95
    ) -> Dict:
        """Generate a flat forecast with an expanding probability cone."""
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for forecasting")

        # Simple exponential smoothing to extract the level
        alpha = 0.3
        smoothed = []
        s = values[0]
        for value in values:
            s = alpha * value + (1 - alpha) * s
            smoothed.append(s)

        # In-sample smoothing errors drive the width of the cone
        errors = values - np.array(smoothed)
        std_err = np.std(errors)
        t_value = stats.t.ppf((1 + confidence) / 2, len(values) - 1)

        # Forecast: the last smoothed level, held constant over the horizon
        last_smoothed = smoothed[-1]
        time_points = list(range(steps))
        forecast = [last_smoothed] * steps

        # Interval half-widths expand with the forecast horizon
        margins = [t_value * std_err * np.sqrt(1 + i / len(values)) for i in range(steps)]

        return {
            "time": time_points,
            "mean": [float(x) for x in forecast],
            "lower": [float(f - m) for f, m in zip(forecast, margins)],
            "upper": [float(f + m) for f, m in zip(forecast, margins)],
        }

    def analyze_correlations(self, df: pd.DataFrame) -> Dict:
        """Analyze pairwise correlations between numeric variables."""
        # Preprocess to handle datetime columns
        df_numeric = self.preprocess_dataframe(df)

        # Calculate correlations only for numeric columns
        numeric_cols = df_numeric.select_dtypes(include=[np.number]).columns
        corr_matrix = df_numeric[numeric_cols].corr()

        # Collect notable correlations; |r| > 0.5 is a magnitude threshold,
        # not a statistical significance test
        significant = []
        for i in range(len(numeric_cols)):
            for j in range(i + 1, len(numeric_cols)):
                corr = corr_matrix.iloc[i, j]
                if pd.notna(corr) and abs(corr) > 0.5:
                    significant.append({
                        "var1": str(numeric_cols[i]),
                        "var2": str(numeric_cols[j]),
                        "correlation": float(corr),
                    })

        return {
            "correlation_matrix": corr_matrix.to_dict(),
            "significant_correlations": significant,
            "numeric_columns": [str(c) for c in numeric_cols],
        }
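

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the component): exercises each
# method on synthetic data. The seed, column names, and sample sizes below
# are hypothetical values chosen for the demo, not application data.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(seed=0)
    analyzer = StatisticalAnalyzer()

    # Distribution summary and confidence interval on a synthetic normal sample
    sample = rng.normal(loc=10.0, scale=2.0, size=200)
    print(analyzer.analyze_distribution(sample)["normality_test"])
    print(analyzer.calculate_confidence_interval(sample))

    # Probability cone over a noisy upward trend
    trend = np.linspace(0.0, 5.0, 50) + rng.normal(scale=0.5, size=50)
    cone = analyzer.forecast_probability_cone(trend, steps=10)
    print(cone["lower"][0], cone["upper"][-1])

    # Correlations on a frame that mixes a datetime column with numeric ones
    df = pd.DataFrame({
        "date": pd.date_range("2024-01-01", periods=50),
        "x": trend,
        "y": 2.0 * trend + rng.normal(scale=0.3, size=50),
    })
    print(analyzer.analyze_correlations(df)["significant_correlations"])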