jzou19950715 committed on
Commit
63f376c
·
verified ·
1 Parent(s): 962e9ea

Delete components

Browse files
components/analysis.py DELETED
@@ -1,90 +0,0 @@
1
- # components/analysis.py
2
-
3
- from typing import Dict, Optional
4
- import pandas as pd
5
-
6
- from .statistical import StatisticalAnalyzer
7
- from .visualization import D3Visualizer
8
-
9
class DataAnalyzer:
    """Route an analysis request to the statistical and visualization helpers.

    The class owns one ``StatisticalAnalyzer`` and one ``D3Visualizer`` and
    exposes a single entry point, :meth:`analyze_data`, which never raises:
    any failure is reported back to the caller as an ``{"error": ...}`` dict.
    """

    def __init__(self):
        self.statistical = StatisticalAnalyzer()
        self.visualizer = D3Visualizer()

    @staticmethod
    def _resolve_numeric_column(df: pd.DataFrame, params: Dict, purpose: str):
        """Return the requested column, or the first numeric column as fallback.

        Args:
            df: Frame the column is looked up in.
            params: Request parameters; only the ``"column"`` key is read.
            purpose: Text appended to the error message (e.g. ``"forecasting"``).

        Raises:
            ValueError: If no usable column was requested and ``df`` has no
                numeric column to fall back to.
        """
        column = params.get("column")
        # ``is not None`` (rather than truthiness) so that a legitimate falsy
        # label such as the integer column name 0 is honoured.
        if column is not None and column in df.columns:
            return column

        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) == 0:
            raise ValueError(f"No numeric columns found for {purpose}")
        return numeric_cols[0]

    def _analyze_distribution(self, df: pd.DataFrame, params: Dict) -> Dict:
        """Compute distribution statistics and the matching plot for one column."""
        column = self._resolve_numeric_column(df, params, "distribution analysis")
        values = df[column].dropna().values
        stats_result = self.statistical.analyze_distribution(values)
        viz_result = self.visualizer.create_interactive_plot(
            "distribution",
            {"values": values.tolist()}
        )
        return {
            "statistics": stats_result,
            "visualization": viz_result
        }

    def _analyze_forecast(self, df: pd.DataFrame, params: Dict) -> Dict:
        """Compute a probability-cone forecast and its plot for one column."""
        column = self._resolve_numeric_column(df, params, "forecasting")
        values = df[column].dropna().values
        forecast_result = self.statistical.forecast_probability_cone(
            values,
            steps=params.get("steps", 10)
        )
        viz_result = self.visualizer.create_interactive_plot(
            "forecast",
            forecast_result
        )
        return {
            "forecast": forecast_result,
            "visualization": viz_result
        }

    def _analyze_correlation(self, df: pd.DataFrame) -> Dict:
        """Compute the correlation summary for the frame and its plot."""
        corr_result = self.statistical.analyze_correlations(df)
        viz_result = self.visualizer.create_interactive_plot(
            "correlation",
            {"matrix": corr_result["correlation_matrix"]}
        )
        return {
            "correlations": corr_result,
            "visualization": viz_result
        }

    def analyze_data(
        self,
        df: pd.DataFrame,
        analysis_type: str,
        params: Optional[Dict] = None
    ) -> Dict:
        """Run the analysis named by ``analysis_type`` on ``df``.

        Args:
            df: Input data.
            analysis_type: ``"distribution"``, ``"forecast"`` or
                ``"correlation"``.
            params: Optional settings. ``"column"`` selects the column for the
                distribution and forecast analyses (default: first numeric
                column); ``"steps"`` sets the forecast horizon (default 10).

        Returns:
            A dict with the analysis result and a ``"visualization"`` entry.
            For an unknown ``analysis_type`` the dict is
            ``{"error": "Unsupported analysis type"}``. If anything raises, the
            dict holds the raw message under ``"error"`` and an HTML error
            fragment under ``"visualization"``.
        """
        params = params or {}

        try:
            if analysis_type == "distribution":
                return self._analyze_distribution(df, params)
            if analysis_type == "forecast":
                return self._analyze_forecast(df, params)
            if analysis_type == "correlation":
                return self._analyze_correlation(df)
            return {"error": "Unsupported analysis type"}

        except Exception as exc:
            # Deliberate catch-all: this method is the boundary towards the
            # caller and reports every failure as data instead of raising.
            import html

            message = str(exc)
            # The message may echo column names or values taken from the data,
            # so it is escaped before being embedded in markup.
            return {
                "error": message,
                "visualization": (
                    f"<div class='error'>Error in analysis: "
                    f"{html.escape(message)}</div>"
                )
            }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
components/statistical.py DELETED
@@ -1,145 +0,0 @@
1
- # components/statistical.py
2
-
3
- import numpy as np
4
- import pandas as pd
5
- from scipy import stats
6
- from typing import Dict, List, Optional, Union
7
- from datetime import datetime
8
-
9
class StatisticalAnalyzer:
    """Statistical helpers: distribution summary, confidence interval,
    probability-cone forecast and correlation analysis.

    Datetime columns are converted to epoch seconds before correlations are
    computed so that they can take part in the numeric analysis.
    """

    # scipy.stats.normaltest is built on skewtest, which needs at least 8
    # observations; below that it raises (or, in newer SciPy, returns NaN).
    _MIN_NORMALTEST_SAMPLES = 8

    @staticmethod
    def _looks_like_datetime(series: pd.Series) -> bool:
        """Tell whether ``series`` should be converted to timestamps.

        True for any datetime64 dtype, or when the first element is a string
        in ``YYYY-MM-DD`` form. Empty series and non-date strings yield False
        instead of raising.
        """
        if pd.api.types.is_datetime64_any_dtype(series):
            return True
        if series.empty:
            return False

        first = series.iloc[0]
        if not isinstance(first, str):
            return False
        try:
            datetime.strptime(first, '%Y-%m-%d')
        except ValueError:
            return False
        return True

    @staticmethod
    def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with datetime-like columns as epoch seconds.

        Columns that look like datetimes (see ``_looks_like_datetime``) are
        replaced by whole seconds since 1970-01-01 UTC; naive values are
        interpreted as UTC. A column that is detected as datetime but cannot
        be converted is dropped. All other columns are left untouched.
        """
        df_numeric = df.copy()
        epoch = pd.Timestamp("1970-01-01", tz="UTC")
        one_second = pd.Timedelta(seconds=1)

        for column in df.columns:
            series = df[column]
            if not StatisticalAnalyzer._looks_like_datetime(series):
                continue
            try:
                parsed = pd.to_datetime(series, utc=True)
                # Timedelta arithmetic keeps the result in seconds whatever
                # resolution the parsed datetimes are stored in. Missing
                # timestamps come out as NaN.
                df_numeric[column] = (parsed - epoch) // one_second
            except (ValueError, TypeError, OverflowError):
                # Conversion failed (e.g. mixed content): exclude the column.
                df_numeric = df_numeric.drop(columns=[column])

        return df_numeric

    @staticmethod
    def analyze_distribution(values: Union[List[float], np.ndarray]) -> Dict:
        """Summarise the distribution of ``values``.

        Returns:
            A dict with ``n_samples``, ``mean``, ``std``, ``median``,
            ``quartiles`` (25th/50th/75th percentile), ``skewness`` and
            ``kurtosis``. When at least 8 samples are given it also contains
            ``normality_test`` with ``statistic``, ``p_value`` and the plain
            bool ``is_normal`` (``p_value > 0.05``).

        Raises:
            ValueError: If ``values`` is not numeric.
        """
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for distribution analysis")

        result = {
            "n_samples": len(values),
            "mean": float(np.mean(values)),
            "std": float(np.std(values)),
            "median": float(np.median(values)),
            "quartiles": [float(np.percentile(values, q)) for q in [25, 50, 75]],
            "skewness": float(stats.skew(values)),
            "kurtosis": float(stats.kurtosis(values))
        }

        # Test for normality only when the sample is large enough for the test.
        if len(values) >= StatisticalAnalyzer._MIN_NORMALTEST_SAMPLES:
            statistic, p_value = stats.normaltest(values)
            result["normality_test"] = {
                "statistic": float(statistic),
                "p_value": float(p_value),
                # bool(): a NumPy bool is not JSON-serializable.
                "is_normal": bool(p_value > 0.05)
            }

        return result

    @staticmethod
    def calculate_confidence_interval(
        values: Union[List[float], np.ndarray],
        confidence: float = 0.95
    ) -> Dict:
        """Compute a Student-t confidence interval for the mean of ``values``.

        Returns:
            A dict with ``mean``, ``ci_lower``, ``ci_upper`` and ``confidence``.

        Raises:
            ValueError: If ``values`` is not numeric.
        """
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for confidence interval calculation")

        mean = np.mean(values)
        std_err = stats.sem(values)
        lower, upper = stats.t.interval(
            confidence, len(values) - 1, loc=mean, scale=std_err
        )

        return {
            "mean": float(mean),
            "ci_lower": float(lower),
            "ci_upper": float(upper),
            "confidence": confidence
        }

    def forecast_probability_cone(
        self,
        values: Union[List[float], np.ndarray],
        steps: int = 10,
        confidence: float = 0.95
    ) -> Dict:
        """Forecast a flat mean with a widening confidence band.

        The series is exponentially smoothed (alpha = 0.3); the last smoothed
        value is used as the forecast for every step, and the band half-width
        at step ``i`` is ``t * std(residuals) * sqrt(1 + i / n)``.

        Returns:
            A dict with the lists ``time`` (``0 .. steps-1``), ``mean``,
            ``lower`` and ``upper``.

        Raises:
            ValueError: If ``values`` is not numeric or is empty.
        """
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.number):
            raise ValueError("Values must be numeric for forecasting")
        if values.size == 0:
            raise ValueError("At least one value is required for forecasting")

        # Exponential smoothing, seeded with the first observation.
        alpha = 0.3
        smoothed = []
        level = values[0]
        for value in values:
            level = alpha * value + (1 - alpha) * level
            smoothed.append(level)

        # Spread of the in-sample residuals drives the band width.
        residuals = values - np.array(smoothed)
        std_err = np.std(residuals)
        t_value = stats.t.ppf((1 + confidence) / 2, len(values) - 1)

        forecast = [smoothed[-1]] * steps
        half_widths = [
            t_value * std_err * np.sqrt(1 + i / len(values))
            for i in range(steps)
        ]

        return {
            "time": list(range(steps)),
            "mean": [float(f) for f in forecast],
            "lower": [float(f - w) for f, w in zip(forecast, half_widths)],
            "upper": [float(f + w) for f, w in zip(forecast, half_widths)]
        }

    def analyze_correlations(self, df: pd.DataFrame, threshold: float = 0.5) -> Dict:
        """Compute pairwise correlations between the numeric columns of ``df``.

        Datetime-like columns are first converted by ``preprocess_dataframe``.

        Args:
            df: Input data.
            threshold: A pair is reported as significant when the absolute
                correlation is strictly greater than this value.

        Returns:
            A dict with ``correlation_matrix`` (nested dict),
            ``significant_correlations`` (list of ``var1``/``var2``/
            ``correlation`` dicts, each pair listed once) and
            ``numeric_columns``.
        """
        df_numeric = self.preprocess_dataframe(df)

        numeric_cols = df_numeric.select_dtypes(include=[np.number]).columns
        corr_matrix = df_numeric[numeric_cols].corr()

        # Walk the upper triangle so every pair is visited exactly once.
        significant = []
        for i, first in enumerate(numeric_cols):
            for j in range(i + 1, len(numeric_cols)):
                corr = corr_matrix.iloc[i, j]
                if abs(corr) > threshold:
                    significant.append({
                        "var1": first,
                        "var2": numeric_cols[j],
                        "correlation": float(corr)
                    })

        return {
            "correlation_matrix": corr_matrix.to_dict(),
            "significant_correlations": significant,
            "numeric_columns": list(numeric_cols)
        }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
components/visualization.py DELETED
@@ -1,230 +0,0 @@
1
- # components/visualization.py
2
-
3
class D3Visualizer:
    """D3.js visualization component.

    Builds self-contained HTML fragments (CSS + container div + script) that
    render a plot with D3 v7 loaded from d3js.org.
    """

    @staticmethod
    def _to_js_literal(value) -> str:
        """Serialize ``value`` as a literal that is safe inside a <script> tag.

        JSON is used instead of Python ``repr`` because ``repr`` emits tokens
        such as ``nan``, ``inf``, ``None`` and ``True`` that are not valid
        JavaScript. ``</`` is escaped so that string content can never close
        the surrounding script element.
        """
        import json

        return json.dumps(value).replace("</", "<\\/")

    @staticmethod
    def create_interactive_plot(plot_type: str, data: dict) -> str:
        """Create an interactive D3 visualization as an HTML string.

        Args:
            plot_type: ``"distribution"`` (histogram) or ``"forecast"``
                (mean line with confidence band). Any other value, including
                ``"correlation"``, is not implemented.
            data: For ``"distribution"`` a dict with a ``"values"`` list; for
                ``"forecast"`` a dict with ``"time"``, ``"mean"``, ``"lower"``
                and ``"upper"`` lists.

        Returns:
            The HTML fragment, or the plain string
            ``"Unsupported visualization type"`` for an unknown ``plot_type``.
        """

        # Base CSS for visualizations
        base_css = """
        <style>
        .visualization-container { width: 100%; height: 500px; }
        .bar { fill: steelblue; }
        .bar:hover { fill: brown; }
        .line { fill: none; stroke: steelblue; stroke-width: 2; }
        .area { fill: steelblue; opacity: 0.2; }
        .tooltip { position: absolute; padding: 8px; background: white; border: 1px solid #ddd; border-radius: 4px; }
        .axis-label { font-size: 12px; }
        </style>
        """

        if plot_type == "distribution":
            values_js = D3Visualizer._to_js_literal(data['values'])
            return base_css + f"""
        <div id="distribution-plot" class="visualization-container"></div>
        <script src="https://d3js.org/d3.v7.min.js"></script>
        <script>
        (function() {{
            const values = {values_js};
            const margin = {{top: 40, right: 40, bottom: 60, left: 60}};
            const width = 800 - margin.left - margin.right;
            const height = 400 - margin.top - margin.bottom;

            // Create SVG
            const svg = d3.select("#distribution-plot")
                .append("svg")
                .attr("width", width + margin.left + margin.right)
                .attr("height", height + margin.top + margin.bottom)
                .append("g")
                .attr("transform", `translate(${{margin.left}},${{margin.top}})`);

            // Create scales
            const x = d3.scaleLinear()
                .domain([d3.min(values), d3.max(values)])
                .range([0, width]);

            const histogram = d3.histogram()
                .domain(x.domain())
                .thresholds(x.ticks(20));

            const bins = histogram(values);

            const y = d3.scaleLinear()
                .domain([0, d3.max(bins, d => d.length)])
                .range([height, 0]);

            // Create bars
            svg.selectAll(".bar")
                .data(bins)
                .enter()
                .append("rect")
                .attr("class", "bar")
                .attr("x", d => x(d.x0))
                .attr("y", d => y(d.length))
                .attr("width", d => Math.max(0, x(d.x1) - x(d.x0) - 1))
                .attr("height", d => height - y(d.length))
                .on("mouseover", function(event, d) {{
                    tooltip.transition()
                        .duration(200)
                        .style("opacity", .9);
                    tooltip.html(
                        `Range: ${{d.x0.toFixed(2)}} - ${{d.x1.toFixed(2)}}<br/>` +
                        `Count: ${{d.length}}`
                    )
                    .style("left", (event.pageX + 5) + "px")
                    .style("top", (event.pageY - 28) + "px");
                }})
                .on("mouseout", function(d) {{
                    tooltip.transition()
                        .duration(500)
                        .style("opacity", 0);
                }});

            // Add axes
            svg.append("g")
                .attr("class", "x-axis")
                .attr("transform", `translate(0,${{height}})`)
                .call(d3.axisBottom(x))
                .append("text")
                .attr("class", "axis-label")
                .attr("x", width/2)
                .attr("y", 40)
                .text("Value");

            svg.append("g")
                .attr("class", "y-axis")
                .call(d3.axisLeft(y))
                .append("text")
                .attr("class", "axis-label")
                .attr("transform", "rotate(-90)")
                .attr("y", -40)
                .attr("x", -height/2)
                .style("text-anchor", "middle")
                .text("Frequency");

            // Add tooltip
            const tooltip = d3.select("#distribution-plot")
                .append("div")
                .attr("class", "tooltip")
                .style("opacity", 0);
        }})();
        </script>
        """

        elif plot_type == "forecast":
            data_js = D3Visualizer._to_js_literal(data)
            return base_css + f"""
        <div id="forecast-plot" class="visualization-container"></div>
        <script src="https://d3js.org/d3.v7.min.js"></script>
        <script>
        (function() {{
            const data = {data_js};
            const margin = {{top: 40, right: 40, bottom: 60, left: 60}};
            const width = 800 - margin.left - margin.right;
            const height = 400 - margin.top - margin.bottom;

            // Create SVG
            const svg = d3.select("#forecast-plot")
                .append("svg")
                .attr("width", width + margin.left + margin.right)
                .attr("height", height + margin.top + margin.bottom)
                .append("g")
                .attr("transform", `translate(${{margin.left}},${{margin.top}})`);

            // Create scales
            const x = d3.scaleLinear()
                .domain([0, data.time.length-1])
                .range([0, width]);

            const y = d3.scaleLinear()
                .domain([
                    d3.min(data.lower),
                    d3.max(data.upper)
                ])
                .range([height, 0]);

            // Create area
            const area = d3.area()
                .x((d, i) => x(i))
                .y0(d => y(d[0]))
                .y1(d => y(d[1]));

            // Add confidence interval area
            svg.append("path")
                .datum(data.time.map((t, i) => [data.lower[i], data.upper[i]]))
                .attr("class", "area")
                .attr("d", area);

            // Add mean line
            const line = d3.line()
                .x((d, i) => x(i))
                .y(d => y(d));

            svg.append("path")
                .datum(data.mean)
                .attr("class", "line")
                .attr("d", line);

            // Add axes
            svg.append("g")
                .attr("class", "x-axis")
                .attr("transform", `translate(0,${{height}})`)
                .call(d3.axisBottom(x))
                .append("text")
                .attr("class", "axis-label")
                .attr("x", width/2)
                .attr("y", 40)
                .text("Time Period");

            svg.append("g")
                .attr("class", "y-axis")
                .call(d3.axisLeft(y))
                .append("text")
                .attr("class", "axis-label")
                .attr("transform", "rotate(-90)")
                .attr("y", -40)
                .attr("x", -height/2)
                .style("text-anchor", "middle")
                .text("Value");

            // Add tooltip for hover
            const tooltip = d3.select("#forecast-plot")
                .append("div")
                .attr("class", "tooltip")
                .style("opacity", 0);

            // Add hover interaction
            const bisect = d3.bisector(d => d).left;

            svg.append("rect")
                .attr("class", "overlay")
                .attr("width", width)
                .attr("height", height)
                .style("fill", "none")
                .style("pointer-events", "all")
                .on("mousemove", function(event) {{
                    const x0 = x.invert(d3.pointer(event)[0]);
                    const i = Math.round(x0);
                    if (i >= 0 && i < data.time.length) {{
                        tooltip.transition()
                            .duration(200)
                            .style("opacity", .9);
                        tooltip.html(
                            `Time: ${{data.time[i]}}<br/>` +
                            `Mean: ${{data.mean[i].toFixed(2)}}<br/>` +
                            `Range: [${{data.lower[i].toFixed(2)}}, ${{data.upper[i].toFixed(2)}}]`
                        )
                        .style("left", (event.pageX + 5) + "px")
                        .style("top", (event.pageY - 28) + "px");
                    }}
                }})
                .on("mouseout", function() {{
                    tooltip.transition()
                        .duration(500)
                        .style("opacity", 0);
                }});
        }})();
        </script>
        """

        return "Unsupported visualization type"