jzou19950715's picture
Update components/statistical.py
9a8390a verified
raw
history blame
5.45 kB
# components/statistical.py
import numpy as np
import pandas as pd
from scipy import stats
from typing import Dict, List, Optional, Union
from datetime import datetime
class StatisticalAnalyzer:
    """Statistical analysis component.

    Provides distribution summaries, t-based confidence intervals, a simple
    exponential-smoothing probability-cone forecast, and pairwise correlation
    analysis. Datetime columns are converted to Unix epoch seconds so they can
    participate in numeric computations.
    """

    # Format used to sniff string columns that contain dates.
    _DATE_FORMAT = '%Y-%m-%d'

    @staticmethod
    def _is_date_string(value) -> bool:
        """Return True when *value* is a str that parses as YYYY-MM-DD."""
        if not isinstance(value, str):
            return False
        try:
            datetime.strptime(value, StatisticalAnalyzer._DATE_FORMAT)
        except ValueError:
            return False
        return True

    @staticmethod
    def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with datetime-like columns as epoch seconds.

        Columns with a datetime64 dtype, or string columns whose first value
        parses as YYYY-MM-DD, are converted to integer Unix timestamps.
        Columns that fail conversion are dropped so downstream numeric
        analysis never sees them; all other columns are left untouched.
        """
        df_numeric = df.copy()
        for column in df.columns:
            series = df[column]
            looks_datetime = pd.api.types.is_datetime64_any_dtype(series)
            # Sniff the first value only when the column is non-empty.
            # (The previous code called strptime outside any try, so any
            # non-date string column — or an empty frame — raised.)
            if not looks_datetime and len(series) > 0:
                looks_datetime = StatisticalAnalyzer._is_date_string(series.iloc[0])
            if looks_datetime:
                try:
                    df_numeric[column] = (
                        pd.to_datetime(series).astype(np.int64) // 10**9
                    )
                except (ValueError, TypeError):
                    # Mixed or unparseable values: drop rather than crash.
                    df_numeric = df_numeric.drop(columns=[column])
        return df_numeric

    @staticmethod
    def analyze_distribution(values: Union[List[float], np.ndarray]) -> Dict:
        """Summarize the distribution of *values*.

        Returns sample count, mean, std, median, quartiles, skewness and
        kurtosis; adds a D'Agostino-Pearson normality test when the sample
        is large enough for the test to be defined.

        Raises:
            ValueError: if *values* is empty or non-numeric.
        """
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for distribution analysis")
        if values.size == 0:
            raise ValueError("Cannot analyze an empty sample")
        result = {
            "n_samples": len(values),
            "mean": float(np.mean(values)),
            "std": float(np.std(values)),
            "median": float(np.median(values)),
            "quartiles": [float(np.percentile(values, q)) for q in [25, 50, 75]],
            "skewness": float(stats.skew(values)),
            "kurtosis": float(stats.kurtosis(values)),
        }
        # stats.normaltest calls skewtest, which requires n >= 8; the old
        # threshold of 3 made 3 <= n < 8 raise ValueError at runtime.
        if len(values) >= 8:
            statistic, p_value = stats.normaltest(values)
            result["normality_test"] = {
                "statistic": float(statistic),
                "p_value": float(p_value),
                "is_normal": p_value > 0.05,
            }
        return result

    @staticmethod
    def calculate_confidence_interval(
        values: Union[List[float], np.ndarray],
        confidence: float = 0.95
    ) -> Dict:
        """Compute a Student-t confidence interval for the sample mean.

        Args:
            values: numeric sample with at least two observations.
            confidence: interval coverage in (0, 1); defaults to 95%.

        Raises:
            ValueError: if *values* is non-numeric or has fewer than two
                observations (the standard error is undefined for n < 2,
                which previously produced a silent NaN interval).
        """
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for confidence interval calculation")
        if values.size < 2:
            raise ValueError("At least two observations are required for a confidence interval")
        mean = np.mean(values)
        std_err = stats.sem(values)
        # t interval with n-1 degrees of freedom around the sample mean.
        ci = stats.t.interval(confidence, len(values) - 1, loc=mean, scale=std_err)
        return {
            "mean": float(mean),
            "ci_lower": float(ci[0]),
            "ci_upper": float(ci[1]),
            "confidence": confidence,
        }

    def forecast_probability_cone(
        self,
        values: Union[List[float], np.ndarray],
        steps: int = 10,
        confidence: float = 0.95,
        alpha: float = 0.3,
    ) -> Dict:
        """Forecast a flat probability cone via exponential smoothing.

        The last smoothed level is held constant *steps* periods ahead; the
        in-sample smoothing residuals give a standard error that is widened
        with the horizon to form the cone.

        Args:
            values: numeric history (non-empty).
            steps: number of periods to forecast.
            confidence: coverage of the cone, in (0, 1).
            alpha: smoothing factor in (0, 1]. New keyword with the
                previously hard-coded default, so existing callers are
                unaffected.

        Raises:
            ValueError: if *values* is empty or non-numeric.
        """
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for forecasting")
        if values.size == 0:
            raise ValueError("Cannot forecast from an empty series")
        # Simple exponential smoothing of the history.
        level = values[0]
        smoothed = []
        for value in values:
            level = alpha * value + (1 - alpha) * level
            smoothed.append(level)
        # In-sample residuals set the spread of the cone.
        residuals = values - np.array(smoothed)
        std_err = np.std(residuals)
        t_value = stats.t.ppf((1 + confidence) / 2, len(values) - 1)
        # Flat point forecast at the last smoothed level; interval
        # half-widths grow with the horizon (~sqrt(1 + i/n)).
        point_forecast = [smoothed[-1]] * steps
        half_widths = [
            t_value * std_err * np.sqrt(1 + i / len(values)) for i in range(steps)
        ]
        return {
            "time": list(range(steps)),
            "mean": [float(x) for x in point_forecast],
            "lower": [float(f - hw) for f, hw in zip(point_forecast, half_widths)],
            "upper": [float(f + hw) for f, hw in zip(point_forecast, half_widths)],
        }

    def analyze_correlations(self, df: pd.DataFrame) -> Dict:
        """Find pairwise Pearson correlations among numeric columns.

        Datetime columns are first converted to epoch seconds so they take
        part in the analysis. Pairs with |r| > 0.5 are reported as
        significant; NaN correlations (e.g. from constant columns) are
        skipped explicitly.
        """
        # Preprocess to fold datetime columns into the numeric set.
        df_numeric = self.preprocess_dataframe(df)
        numeric_cols = df_numeric.select_dtypes(include=[np.number]).columns
        corr_matrix = df_numeric[numeric_cols].corr()
        significant = []
        for i in range(len(numeric_cols)):
            for j in range(i + 1, len(numeric_cols)):
                corr = corr_matrix.iloc[i, j]
                # NaN arises for constant columns; 0.5 is a heuristic
                # threshold for "significant", not a statistical test.
                if pd.notna(corr) and abs(corr) > 0.5:
                    significant.append({
                        "var1": numeric_cols[i],
                        "var2": numeric_cols[j],
                        "correlation": float(corr),
                    })
        return {
            "correlation_matrix": corr_matrix.to_dict(),
            "significant_correlations": significant,
            "numeric_columns": list(numeric_cols),
        }