jzou19950715 committed · verified
Commit 9a8390a · 1 Parent(s): 14b70ba

Update components/statistical.py

Files changed (1):
  1. components/statistical.py +93 -40
components/statistical.py CHANGED
@@ -1,44 +1,74 @@
 # components/statistical.py
 
 import numpy as np
+import pandas as pd
 from scipy import stats
 from typing import Dict, List, Optional, Union
-import pandas as pd
+from datetime import datetime
 
 class StatisticalAnalyzer:
-    """Statistical analysis component"""
+    """Statistical analysis component with datetime handling"""
+
+    @staticmethod
+    def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+        """Preprocess dataframe to handle datetime columns"""
+        df_numeric = df.copy()
+
+        for column in df.columns:
+            # Convert datetime columns to timestamps for numerical analysis
+            if pd.api.types.is_datetime64_any_dtype(df[column]) or (
+                df[column].dtype == object and
+                not pd.to_datetime(df[column], errors='coerce').isna().all()
+            ):
+                try:
+                    df_numeric[column] = pd.to_datetime(df[column]).astype(np.int64) // 10**9
+                except (ValueError, TypeError):
+                    # If conversion fails, exclude the column
+                    df_numeric = df_numeric.drop(columns=[column])
+
+        return df_numeric
 
     @staticmethod
-    def analyze_distribution(data: Union[List[float], np.ndarray]) -> Dict:
+    def analyze_distribution(values: Union[List[float], np.ndarray]) -> Dict:
         """Analyze data distribution"""
+        values = np.array(values)
+        if not np.issubdtype(values.dtype, np.number):
+            raise ValueError("Values must be numeric for distribution analysis")
+
         result = {
-            "n_samples": len(data),
-            "mean": float(np.mean(data)),
-            "std": float(np.std(data)),
-            "median": float(np.median(data)),
-            "skewness": float(stats.skew(data)),
-            "kurtosis": float(stats.kurtosis(data))
+            "n_samples": len(values),
+            "mean": float(np.mean(values)),
+            "std": float(np.std(values)),
+            "median": float(np.median(values)),
+            "quartiles": [float(np.percentile(values, q)) for q in [25, 50, 75]],
+            "skewness": float(stats.skew(values)),
+            "kurtosis": float(stats.kurtosis(values))
         }
 
         # Test for normality
-        statistic, p_value = stats.normaltest(data)
-        result["normality_test"] = {
-            "statistic": float(statistic),
-            "p_value": float(p_value),
-            "is_normal": p_value > 0.05
-        }
+        if len(values) >= 8:  # D'Agostino's K^2 test needs at least 8 samples
+            statistic, p_value = stats.normaltest(values)
+            result["normality_test"] = {
+                "statistic": float(statistic),
+                "p_value": float(p_value),
+                "is_normal": p_value > 0.05
+            }
 
         return result
 
     @staticmethod
     def calculate_confidence_interval(
-        data: Union[List[float], np.ndarray],
+        values: Union[List[float], np.ndarray],
         confidence: float = 0.95
     ) -> Dict:
         """Calculate confidence intervals"""
-        mean = np.mean(data)
-        std_err = stats.sem(data)
-        ci = stats.t.interval(confidence, len(data)-1, loc=mean, scale=std_err)
+        values = np.array(values)
+        if not np.issubdtype(values.dtype, np.number):
+            raise ValueError("Values must be numeric for confidence interval calculation")
+
+        mean = np.mean(values)
+        std_err = stats.sem(values)
+        ci = stats.t.interval(confidence, len(values)-1, loc=mean, scale=std_err)
 
         return {
             "mean": float(mean),
@@ -47,46 +77,69 @@ class StatisticalAnalyzer:
             "confidence": confidence
         }
 
-    @staticmethod
     def forecast_probability_cone(
-        data: Union[List[float], np.ndarray],
+        self,
+        values: Union[List[float], np.ndarray],
         steps: int = 10,
         confidence: float = 0.95
     ) -> Dict:
         """Generate probability cone forecast"""
-        mean = np.mean(data)
-        std_err = stats.sem(data)
-        t_value = stats.t.ppf((1 + confidence) / 2, len(data) - 1)
+        values = np.array(values)
+        if not np.issubdtype(values.dtype, np.number):
+            raise ValueError("Values must be numeric for forecasting")
+
+        # Use exponential smoothing for trend
+        alpha = 0.3
+        smoothed = []
+        s = values[0]
+        for value in values:
+            s = alpha * value + (1-alpha) * s
+            smoothed.append(s)
+
+        # Calculate errors for confidence intervals
+        errors = values - np.array(smoothed)
+        std_err = np.std(errors)
+        t_value = stats.t.ppf((1 + confidence) / 2, len(values) - 1)
 
+        # Generate forecast
+        last_smoothed = smoothed[-1]
         time_points = list(range(steps))
-        means = [mean] * steps
-        errors = [t_value * std_err * np.sqrt(1 + i/len(data))
+        forecast = [last_smoothed] * steps
+
+        # Expanding confidence intervals
+        errors = [t_value * std_err * np.sqrt(1 + i/len(values))
                   for i in range(steps)]
 
         return {
             "time": time_points,
-            "mean": means,
-            "lower": [m - e for m, e in zip(means, errors)],
-            "upper": [m + e for m, e in zip(means, errors)]
+            "mean": [float(x) for x in forecast],
+            "lower": [float(f - e) for f, e in zip(forecast, errors)],
+            "upper": [float(f + e) for f, e in zip(forecast, errors)]
         }
+
+    def analyze_correlations(self, df: pd.DataFrame) -> Dict:
+        """Analyze correlations between numeric variables"""
+        # Preprocess to handle datetime columns
+        df_numeric = self.preprocess_dataframe(df)
 
-    @staticmethod
-    def analyze_correlations(df: pd.DataFrame) -> Dict:
-        """Analyze correlations between variables"""
-        corr_matrix = df.corr()
+        # Calculate correlations only for numeric columns
+        numeric_cols = df_numeric.select_dtypes(include=[np.number]).columns
+        corr_matrix = df_numeric[numeric_cols].corr()
 
         # Find significant correlations
         significant = []
-        for i in range(len(corr_matrix.columns)):
-            for j in range(i+1, len(corr_matrix.columns)):
-                if abs(corr_matrix.iloc[i,j]) > 0.5:
+        for i in range(len(numeric_cols)):
+            for j in range(i+1, len(numeric_cols)):
+                corr = corr_matrix.iloc[i,j]
+                if abs(corr) > 0.5:  # Threshold for significant correlation
                     significant.append({
-                        "var1": corr_matrix.columns[i],
-                        "var2": corr_matrix.columns[j],
-                        "correlation": float(corr_matrix.iloc[i,j])
+                        "var1": numeric_cols[i],
+                        "var2": numeric_cols[j],
+                        "correlation": float(corr)
                    })
 
         return {
             "correlation_matrix": corr_matrix.to_dict(),
-            "significant_correlations": significant
+            "significant_correlations": significant,
+            "numeric_columns": list(numeric_cols)
         }
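
For reference, a minimal usage sketch of the updated class. This is illustrative only: it assumes the module is importable as components.statistical, and the sample frame below is invented.

    import numpy as np
    import pandas as pd
    from components.statistical import StatisticalAnalyzer

    analyzer = StatisticalAnalyzer()

    # Invented sample data: one datetime column plus two numeric columns
    df = pd.DataFrame({
        "date": pd.date_range("2024-01-01", periods=30),
        "price": np.linspace(100.0, 110.0, 30),
        "volume": np.linspace(2000.0, 1000.0, 30),
    })

    # The datetime column is converted to Unix seconds, then numeric columns are correlated
    report = analyzer.analyze_correlations(df)
    print(report["numeric_columns"], report["significant_correlations"])

    # The static helpers accept any numeric sequence
    print(StatisticalAnalyzer.analyze_distribution(df["price"]))
    print(StatisticalAnalyzer.calculate_confidence_interval(df["price"]))

    # Flat exponential-smoothing forecast with a widening confidence cone
    cone = analyzer.forecast_probability_cone(df["price"], steps=5)
    print(list(zip(cone["lower"], cone["mean"], cone["upper"])))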
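
As a worked check of the cone width (numbers invented): with n = 30 observations, residual spread std_err = 1.0, and confidence = 0.95, t_value = stats.t.ppf(0.975, 29) ≈ 2.045, so the half-width t_value * std_err * sqrt(1 + i/n) is about 2.05 at step i = 0 and about 2.33 at step i = 9; the band widens with the forecast horizon, as the expanding-intervals comment intends.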