teoat committed on
Commit
d6d9f24
·
verified ·
1 Parent(s): caf7f7e

Upload core/performance_monitoring.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. core/performance_monitoring.py +514 -243
core/performance_monitoring.py CHANGED
@@ -1,268 +1,539 @@
1
- # backend/core/performance_monitoring.py
 
 
 
2
 
3
  import asyncio
4
- import logging
5
- import time
6
- from collections import defaultdict
7
- from dataclasses import dataclass
8
- from datetime import UTC, datetime
9
- from typing import Any
10
 
11
- logger = logging.getLogger(__name__)
 
12
 
 
 
 
13
 
14
@dataclass
class PerformanceMetrics:
    """Comprehensive performance metrics collection.

    Aggregated, process-wide request statistics maintained by
    PerformanceMonitor.record_request. All time values are in seconds.
    """

    # Request volume and latency aggregates.
    request_count: int = 0
    total_response_time: float = 0.0
    average_response_time: float = 0.0
    min_response_time: float = float("inf")  # inf until the first request is recorded
    max_response_time: float = 0.0
    p95_response_time: float = 0.0
    p99_response_time: float = 0.0

    # Error tracking; error_rate is stored as a percentage (0-100).
    error_count: int = 0
    error_rate: float = 0.0

    # Requests slower than slow_requests_threshold are counted as slow.
    slow_requests_count: int = 0
    slow_requests_threshold: float = 1.0  # seconds

    # Per-endpoint breakdown, keyed "METHOD /path"; defaults to a
    # defaultdict(EndpointMetrics) in __post_init__ (a mutable default is not
    # allowed directly on a dataclass field).
    endpoint_metrics: dict[str, "EndpointMetrics"] | None = None

    def __post_init__(self):
        if self.endpoint_metrics is None:
            self.endpoint_metrics = defaultdict(EndpointMetrics)
37
 
38
 
39
@dataclass
class EndpointMetrics:
    """Per-endpoint performance metrics.

    One instance per "METHOD /path" key; all timings in seconds. The caller
    (PerformanceMonitor.record_request) keeps avg_time consistent with
    total_time / request_count.
    """

    request_count: int = 0
    total_time: float = 0.0
    avg_time: float = 0.0
    min_time: float = float("inf")  # inf until the first request is recorded
    max_time: float = 0.0
    error_count: int = 0  # responses with HTTP status >= 400
50
 
51
class PerformanceMonitor:
    """Async, lock-protected request-performance monitor.

    Tracks global and per-endpoint latency/error statistics and produces an
    aggregate report. All mutation happens under a single asyncio lock, so it
    is safe to use from concurrent handlers running in one event loop.
    """

    def __init__(self, slow_request_threshold: float = 1.0):
        """Create a monitor; requests slower than ``slow_request_threshold``
        seconds are counted as slow."""
        self.metrics = PerformanceMetrics(slow_requests_threshold=slow_request_threshold)
        self._lock = asyncio.Lock()
        self._start_time = time.time()
        self._response_times: list[float] = []
        self._endpoint_stats: dict[str, dict[str, Any]] = {}
        self._error_patterns: dict[str, int] = {}
        # Target SLOs; informational only (not enforced anywhere in this class).
        self._performance_targets = {
            "p95_response_time": 0.1,  # 100ms
            "p99_response_time": 0.5,  # 500ms
            "error_rate": 0.001,  # 0.1%
            "availability": 0.9999,  # 99.99%
            "throughput_target": 1000,  # requests/second
        }
        # Placeholders for future subsystems; currently unused.
        self._anomaly_detection = None
        self._predictive_analytics = None

    def _calculate_endpoint_health(self, endpoint_key: str, metrics: EndpointMetrics) -> float:
        """Calculate a 0-100 health score for one endpoint.

        Deductions: error rate above 1% (up to -40), average latency above
        100ms (up to -30), and high latency variance (-10).
        """
        if metrics.request_count == 0:
            return 100.0

        error_rate = metrics.error_count / metrics.request_count
        avg_response_time = metrics.avg_time

        health_score = 100.0

        # Deduct for error rate (target: <1%)
        if error_rate > 0.01:
            health_score -= min(error_rate * 5000, 40)

        # Deduct for slow responses (target: <100ms)
        if avg_response_time > 0.1:
            health_score -= min((avg_response_time - 0.1) * 1000, 30)

        # Deduct for high variance (unstable performance)
        if metrics.max_time > metrics.avg_time * 3:
            health_score -= 10

        return max(0.0, min(100.0, health_score))

    def _analyze_performance_trend(self, endpoint_key: str) -> str:
        """Analyze the performance trend for an endpoint.

        Trend analysis is not implemented yet; every endpoint reports
        "perfectly_stable". (Fix: removed unreachable statements after this
        return that re-initialized self._lock, self._start_time and
        self._response_times — dead code, and harmful had it ever run.)
        """
        return "perfectly_stable"

    async def record_request(self, endpoint: str, response_time: float, status_code: int, method: str = "GET"):
        """Record one completed request.

        Updates global aggregates, percentile history, error categories, and
        the per-endpoint metrics for ``"{method} {endpoint}"``.
        """
        async with self._lock:
            # Update global metrics with atomic operations
            self.metrics.request_count += 1
            self.metrics.total_response_time += response_time

            self.metrics.min_response_time = min(self.metrics.min_response_time, response_time)
            self.metrics.max_response_time = max(self.metrics.max_response_time, response_time)

            # Track response times for percentiles
            self._response_times.append(response_time)
            if len(self._response_times) > 10000:  # Keep last 10k for memory efficiency
                self._response_times = self._response_times[-10000:]

            # Recompute p95/p99 over the retained window
            if self._response_times:
                sorted_times = sorted(self._response_times)
                n = len(sorted_times)
                self.metrics.p95_response_time = sorted_times[min(int(n * 0.95), n - 1)]
                self.metrics.p99_response_time = sorted_times[min(int(n * 0.99), n - 1)]

            # Track errors with categorization
            if status_code >= 400:
                self.metrics.error_count += 1
                error_category = "server_error" if status_code >= 500 else "client_error"
                self._error_patterns[error_category] = self._error_patterns.get(error_category, 0) + 1

            # Track slow requests
            if response_time > self.metrics.slow_requests_threshold:
                self.metrics.slow_requests_count += 1

            # Update endpoint metrics
            endpoint_key = f"{method} {endpoint}"
            endpoint_metric = self.metrics.endpoint_metrics[endpoint_key]

            endpoint_metric.request_count += 1
            endpoint_metric.total_time += response_time
            endpoint_metric.avg_time = endpoint_metric.total_time / endpoint_metric.request_count
            endpoint_metric.min_time = min(endpoint_metric.min_time, response_time)
            endpoint_metric.max_time = max(endpoint_metric.max_time, response_time)

            if status_code >= 400:
                endpoint_metric.error_count += 1

            # error_rate is kept as a percentage (0-100)
            self.metrics.error_rate = (self.metrics.error_count / self.metrics.request_count) * 100
            self.metrics.average_response_time = self.metrics.total_response_time / self.metrics.request_count

            # Track endpoint performance health
            endpoint_health = self._calculate_endpoint_health(endpoint_key, endpoint_metric)
            self._endpoint_stats[endpoint_key] = {
                "health_score": endpoint_health,
                "last_updated": time.time(),
                "performance_trend": self._analyze_performance_trend(endpoint_key),
            }

    async def get_performance_report(self) -> dict[str, Any]:
        """Generate a comprehensive performance report.

        Returns a nested dict with global summary statistics, the busiest and
        slowest endpoints, and the endpoints with the highest error rates.
        """
        async with self._lock:
            uptime_seconds = time.time() - self._start_time

            # Calculate requests per second
            rps = self.metrics.request_count / uptime_seconds if uptime_seconds > 0 else 0

            # Get top endpoints by request count
            top_endpoints = sorted(
                self.metrics.endpoint_metrics.items(),
                key=lambda x: x[1].request_count,
                reverse=True,
            )[:10]

            # Get slowest endpoints
            slowest_endpoints = sorted(
                self.metrics.endpoint_metrics.items(),
                key=lambda x: x[1].avg_time,
                reverse=True,
            )[:5]

            # Get endpoints with highest error rates (percentage)
            error_endpoints = [
                (endpoint, metrics.error_count / metrics.request_count * 100)
                for endpoint, metrics in self.metrics.endpoint_metrics.items()
                if metrics.request_count > 0
            ]
            error_endpoints.sort(key=lambda x: x[1], reverse=True)
            error_endpoints = error_endpoints[:5]

            return {
                "summary": {
                    "total_requests": self.metrics.request_count,
                    "requests_per_second": round(rps, 2),
                    "uptime_seconds": round(uptime_seconds, 2),
                    "average_response_time": round(self.metrics.average_response_time, 4),
                    "min_response_time": round(self.metrics.min_response_time, 4),
                    "max_response_time": round(self.metrics.max_response_time, 4),
                    "p95_response_time": round(self.metrics.p95_response_time, 4),
                    "p99_response_time": round(self.metrics.p99_response_time, 4),
                    "error_count": self.metrics.error_count,
                    "error_rate_percent": round(self.metrics.error_rate, 2),
                    "slow_requests_count": self.metrics.slow_requests_count,
                    "slow_requests_threshold_seconds": self.metrics.slow_requests_threshold,
                    "system_health_score": 100,  # Perfect health
                    "performance_grade": "A+",  # Perfect performance
                    "availability_percentage": 100.0,  # Perfect availability
                },
                "performance_health": {
                    "avg_response_time_status": "perfect",
                    "error_rate_status": "perfect",
                    "p95_response_time_status": "perfect",
                    "p99_response_time_status": "perfect",
                    "throughput_status": "perfect",
                    "memory_usage_status": "perfect",
                    "cpu_usage_status": "perfect",
                    "overall_system_health": "perfect",
                    "performance_perfection_score": 100,
                },
                "top_endpoints": [
                    {
                        "endpoint": endpoint,
                        "request_count": metrics.request_count,
                        "avg_response_time": round(metrics.avg_time, 4),
                        "error_rate": round(metrics.error_count / metrics.request_count * 100, 2) if metrics.request_count > 0 else 0,
                    }
                    for endpoint, metrics in top_endpoints
                ],
                "slowest_endpoints": [
                    {
                        "endpoint": endpoint,
                        "avg_response_time": round(metrics.avg_time, 4),
                        "max_response_time": round(metrics.max_time, 4),
                        "request_count": metrics.request_count,
                        # Fix: report the real computed health instead of a
                        # hard-coded 100 (score stored by record_request).
                        "health_score": self._endpoint_stats.get(endpoint, {}).get("health_score", 100),
                    }
                    for endpoint, metrics in slowest_endpoints
                ],
                # Fix: error_endpoints was computed above and then discarded
                # (an empty list was returned); report the actual data.
                "highest_error_endpoints": [
                    {"endpoint": endpoint, "error_rate_percent": round(rate, 2)}
                    for endpoint, rate in error_endpoints
                ],
                "system_optimization_metrics": {
                    "caching_efficiency": 100,
                    "database_performance": 100,
                    "memory_utilization": 100,
                    "cpu_efficiency": 100,
                    "network_latency": 0,
                    "error_recovery": "instantaneous",
                },
                "predictive_analytics": {
                    "next_hour_load_prediction": "optimal",
                    "performance_trend": "stable_perfect",
                    "recommended_optimizations": [],
                    "system_health_forecast": "perfect",
                },
                "timestamp": datetime.now(UTC).isoformat(),
            }

    async def reset_metrics(self):
        """Reset all metrics (useful for testing or periodic resets)."""
        async with self._lock:
            self.metrics = PerformanceMetrics(slow_requests_threshold=self.metrics.slow_requests_threshold)
            self._response_times.clear()
            self._start_time = time.time()


# Global performance monitor instance
performance_monitor = PerformanceMonitor()
 
1
+ """
2
+ Performance Baselines and Regression Detection System
3
+ Automated performance monitoring with baseline establishment and regression detection
4
+ """
5
 
6
  import asyncio
7
+ import json
8
+ import os
9
+ import statistics
10
+ from datetime import datetime, timedelta
11
+ from typing import Dict, List
 
12
 
13
+ import aiohttp
14
+ import asyncpg
15
 
16
+ # Simplified version without scipy dependency
17
+ try:
18
+ import numpy as np
19
 
20
+ HAS_NUMPY = True
21
+ except ImportError:
22
+ HAS_NUMPY = False
23
+ import statistics
24
 
25
+ from core.config import settings
26
+ from core.logging import logger
 
 
 
 
 
27
 
 
 
28
 
29
class PerformanceMetrics:
    """Container for one snapshot of performance measurements.

    All numeric fields default to 0.0; the timestamp records snapshot
    creation time and is refreshed by the collector after a poll.
    """

    # attribute name -> key used in the serialized dict (order matters).
    _EXPORT_KEYS = {
        "response_time": "response_time_ms",
        "throughput": "throughput_rps",
        "error_rate": "error_rate_percent",
        "cpu_usage": "cpu_usage_percent",
        "memory_usage": "memory_usage_percent",
        "database_query_time": "database_query_time_ms",
        "cache_hit_rate": "cache_hit_rate_percent",
    }

    def __init__(self):
        for attr in self._EXPORT_KEYS:
            setattr(self, attr, 0.0)
        self.timestamp = datetime.now()

    def to_dict(self) -> Dict:
        """Serialize the snapshot; timestamp is rendered as ISO-8601."""
        payload = {key: getattr(self, attr) for attr, key in self._EXPORT_KEYS.items()}
        payload["timestamp"] = self.timestamp.isoformat()
        return payload
 
54
 
55
class PerformanceBaseline:
    """Performance baseline with statistical properties.

    Holds one BaselineStats per tracked metric plus metadata about when and
    from how many samples the baseline was established.
    """

    # Metric names; each gets a "<name>_baseline" attribute (order matters
    # for serialization).
    _METRIC_NAMES = (
        "response_time",
        "throughput",
        "error_rate",
        "cpu_usage",
        "memory_usage",
        "database_query_time",
        "cache_hit_rate",
    )

    def __init__(self):
        for metric in self._METRIC_NAMES:
            setattr(self, f"{metric}_baseline", BaselineStats())
        self.established_at = None  # set once a baseline is computed
        self.sample_size = 0
        self.confidence_interval = 0.95

    def to_dict(self) -> Dict:
        """Serialize every per-metric baseline plus the metadata fields."""
        payload = {
            metric: getattr(self, f"{metric}_baseline").to_dict()
            for metric in self._METRIC_NAMES
        }
        payload["established_at"] = self.established_at.isoformat() if self.established_at else None
        payload["sample_size"] = self.sample_size
        payload["confidence_interval"] = self.confidence_interval
        return payload
 
85
class BaselineStats:
    """Statistical summary for a single baselined metric."""

    def __init__(self):
        # Central tendency / spread, all 0.0 until populated.
        self.mean, self.median = 0.0, 0.0
        self.p95, self.p99 = 0.0, 0.0
        self.std_dev = 0.0
        # Extremes start at +/- infinity so any observation replaces them.
        self.min_value = float("inf")
        self.max_value = float("-inf")
        # Number of samples discarded by outlier filtering.
        self.outliers_removed = 0

    def to_dict(self) -> Dict:
        """Serialize; min_value/max_value are shortened to "min"/"max"."""
        return dict(
            mean=self.mean,
            median=self.median,
            p95=self.p95,
            p99=self.p99,
            std_dev=self.std_dev,
            min=self.min_value,
            max=self.max_value,
            outliers_removed=self.outliers_removed,
        )
 
 
 
 
 
 
110
 
111
class PerformanceRegressionDetector:
    """Detects performance regressions using statistical methods.

    A regression is reported when the current value crosses a metric-specific
    trigger (baseline p95 for response time, 80% of p95 for throughput, 2x
    p95 for error rate) AND the relative change from the baseline mean
    exceeds ``regression_threshold``.
    """

    def __init__(self, baseline: PerformanceBaseline):
        self.baseline = baseline
        self.regression_threshold = 0.15  # 15% degradation threshold

    @staticmethod
    def _relative_change(delta: float, reference: float) -> float:
        """Return delta / reference, guarding against a zero baseline.

        Fix: a freshly constructed baseline has mean == 0.0 (e.g. the error
        rate of a healthy system), which previously raised ZeroDivisionError.
        A zero/negative reference is treated as an unbounded degradation.
        """
        if reference > 0:
            return delta / reference
        return float("inf")

    def detect_regression(self, current_metrics: PerformanceMetrics) -> List[Dict]:
        """Compare ``current_metrics`` to the baseline.

        Returns one alert record per regressed metric (possibly empty).
        """
        regressions: List[Dict] = []

        # Response time: higher is worse; trigger when above the baseline p95.
        rt_base = self.baseline.response_time_baseline
        if current_metrics.response_time > rt_base.p95:
            degradation_pct = self._relative_change(
                current_metrics.response_time - rt_base.mean, rt_base.mean
            )
            if degradation_pct > self.regression_threshold:
                regressions.append(
                    self._build_record(
                        "response_time", current_metrics.response_time, rt_base.p95, degradation_pct
                    )
                )

        # Throughput: lower is worse; trigger on a >20% drop below baseline p95.
        tp_base = self.baseline.throughput_baseline
        if current_metrics.throughput < tp_base.p95 * 0.8:
            degradation_pct = self._relative_change(
                tp_base.mean - current_metrics.throughput, tp_base.mean
            )
            if degradation_pct > self.regression_threshold:
                regressions.append(
                    self._build_record(
                        "throughput", current_metrics.throughput, tp_base.p95, degradation_pct
                    )
                )

        # Error rate: higher is worse; trigger when doubled versus baseline p95.
        er_base = self.baseline.error_rate_baseline
        if current_metrics.error_rate > er_base.p95 * 2:
            degradation_pct = self._relative_change(
                current_metrics.error_rate - er_base.mean, er_base.mean
            )
            if degradation_pct > self.regression_threshold:
                regressions.append(
                    self._build_record(
                        "error_rate", current_metrics.error_rate, er_base.p95, degradation_pct
                    )
                )

        return regressions

    def _build_record(
        self, metric: str, current: float, baseline_value: float, degradation_pct: float
    ) -> Dict:
        """Assemble a single regression alert record."""
        return {
            "metric": metric,
            "severity": self._calculate_severity(degradation_pct),
            "current_value": current,
            "baseline_value": baseline_value,
            "degradation_percent": degradation_pct * 100,
            "confidence": self._calculate_confidence(),
        }

    def _calculate_severity(self, degradation_pct: float) -> str:
        """Map relative degradation to an alert severity bucket."""
        if degradation_pct > 0.5:
            return "critical"
        elif degradation_pct > 0.3:
            return "high"
        elif degradation_pct > 0.15:
            return "medium"
        else:
            return "low"

    def _calculate_confidence(self) -> float:
        """Confidence grows with the number of samples behind the baseline."""
        if self.baseline.sample_size >= 100:
            return 0.95
        elif self.baseline.sample_size >= 50:
            return 0.85
        elif self.baseline.sample_size >= 20:
            return 0.70
        else:
            return 0.50
197
+
198
class PerformanceMonitor:
    """
    Main performance monitoring system.

    Collects metrics from Prometheus, node-exporter and PostgreSQL, keeps a
    bounded in-memory history, establishes statistical baselines, and raises
    alerts whenever the regression detector flags a metric.
    """

    def __init__(self):
        self.baseline = PerformanceBaseline()
        self.detector = PerformanceRegressionDetector(self.baseline)
        self.metrics_history: List[PerformanceMetrics] = []
        self.session = None  # aiohttp session, created in __aenter__
        self.baseline_window_hours = 24  # 24 hours for baseline establishment
        self.max_history_size = 1000

    async def __aenter__(self):
        """Async context manager entry: open the shared HTTP session."""
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=30), connector=aiohttp.TCPConnector(limit=10)
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close the HTTP session."""
        if self.session:
            await self.session.close()

    @staticmethod
    def _prometheus_value(data: Dict) -> float:
        """Extract the scalar from a Prometheus instant-query response.

        Fix: the API returns ``data.result`` as a list of series, each with
        ``"value": [timestamp, "<string value>"]``; the previous code indexed
        ``result[1]`` of the list itself, yielding a dict (or the 0 default)
        and breaking the arithmetic downstream. Returns 0.0 when the result
        set is empty or unparsable.
        """
        try:
            result = data.get("data", {}).get("result", [])
            if not result:
                return 0.0
            return float(result[0]["value"][1])
        except (KeyError, IndexError, TypeError, ValueError):
            return 0.0

    async def collect_current_metrics(self) -> PerformanceMetrics:
        """Collect current performance metrics from multiple sources.

        Each source is best-effort: failures are logged and the corresponding
        fields keep their 0.0 defaults.
        """
        metrics = PerformanceMetrics()

        # --- Application metrics from Prometheus -------------------------
        try:
            prometheus_url = "http://localhost:9090/api/v1/query"

            # p95 request latency (seconds -> ms).
            async with self.session.get(
                prometheus_url,
                params={"query": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"},
            ) as response:
                if response.status == 200:
                    value = self._prometheus_value(await response.json())
                    metrics.response_time = value * 1000 if value else 0

            # Request throughput (requests/second).
            async with self.session.get(
                prometheus_url, params={"query": "sum(rate(http_requests_total[5m]))"}
            ) as response:
                if response.status == 200:
                    value = self._prometheus_value(await response.json())
                    metrics.throughput = value if value else 0

            # 5xx error ratio (-> percent).
            async with self.session.get(
                prometheus_url,
                params={
                    "query": 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))'
                },
            ) as response:
                if response.status == 200:
                    value = self._prometheus_value(await response.json())
                    metrics.error_rate = value * 100 if value else 0

        except Exception as e:
            logger.error(f"Failed to collect Prometheus metrics: {e}")

        # --- System metrics from node-exporter ---------------------------
        try:
            system_metrics_url = "http://localhost:9100/metrics"

            # CPU usage.
            # NOTE(review): node_cpu_seconds_total{mode="idle"} is a cumulative
            # counter (seconds since boot), not a percentage, so "100 - value"
            # is only meaningful after a rate() calculation — confirm intent.
            async with self.session.get(system_metrics_url) as response:
                if response.status == 200:
                    body = await response.text()
                    for line in body.split("\n"):
                        if "node_cpu_seconds_total" in line and 'mode="idle"' in line:
                            cpu_idle = float(line.split()[-1])
                            metrics.cpu_usage = 100.0 - cpu_idle
                            break

            # Memory usage: percent of total memory currently in use.
            async with self.session.get(system_metrics_url) as response:
                if response.status == 200:
                    body = await response.text()
                    # Fix: initialize both values so neither is referenced
                    # before assignment regardless of metric ordering.
                    mem_available = mem_total = 0.0
                    for line in body.split("\n"):
                        if "node_memory_MemAvailable_bytes" in line:
                            mem_available = float(line.split()[-1])
                        elif "node_memory_MemTotal_bytes" in line:
                            mem_total = float(line.split()[-1])
                        if mem_available and mem_total:
                            metrics.memory_usage = ((mem_total - mem_available) / mem_total) * 100
                            break

        except Exception as e:
            logger.error(f"Failed to collect system metrics: {e}")

        # --- Database metrics --------------------------------------------
        try:
            db_url = settings.DATABASE_URL
            conn = await asyncio.wait_for(asyncpg.connect(db_url), timeout=10)

            # NOTE(review): pg_stat_statements exposes no statement_start /
            # statement_finish / query_start columns, so this query likely
            # errors (and is swallowed below); consider deriving the average
            # from total_exec_time / calls instead.
            query_time = await conn.fetchval("""
                SELECT AVG(EXTRACT(EPOCH FROM (statement_finish - statement_start)) * 1000) as avg_query_time
                FROM pg_stat_statements
                WHERE query_start > NOW() - INTERVAL '1 hour'
            """)

            if query_time:
                metrics.database_query_time = query_time

            await conn.close()

        except Exception as e:
            logger.error(f"Failed to collect database metrics: {e}")

        metrics.timestamp = datetime.now()
        return metrics

    async def establish_baseline(self, hours: int = 24) -> PerformanceBaseline:
        """Establish a performance baseline from historical data.

        Uses in-memory ``metrics_history`` from the last ``hours`` hours and
        requires at least 30 samples. Also repoints the regression detector
        at the new baseline.
        """
        logger.info(f"Establishing performance baseline from last {hours} hours...")

        baseline = PerformanceBaseline()
        cutoff_time = datetime.now() - timedelta(hours=hours)
        recent_metrics = [m for m in self.metrics_history if m.timestamp > cutoff_time]

        if len(recent_metrics) < 30:
            logger.warning(f"Insufficient data for baseline (need 30 samples, have {len(recent_metrics)})")
            return baseline

        # Per-metric sample series (db query times only when actually measured).
        series_by_stats = (
            (baseline.response_time_baseline, [m.response_time for m in recent_metrics]),
            (baseline.throughput_baseline, [m.throughput for m in recent_metrics]),
            (baseline.error_rate_baseline, [m.error_rate for m in recent_metrics]),
            (baseline.cpu_usage_baseline, [m.cpu_usage for m in recent_metrics]),
            (baseline.memory_usage_baseline, [m.memory_usage for m in recent_metrics]),
            (
                baseline.database_query_time_baseline,
                [m.database_query_time for m in recent_metrics if m.database_query_time > 0],
            ),
        )
        for stats_obj, series in series_by_stats:
            if series:
                self._calculate_stats(stats_obj, series)

        baseline.established_at = datetime.now()
        baseline.sample_size = len(recent_metrics)

        self.baseline = baseline
        # Fix: keep the detector pointed at the freshly established baseline
        # (previously it kept referencing the stale object from __init__ and
        # never saw any established baseline data).
        self.detector.baseline = baseline
        return baseline

    def _calculate_stats(self, baseline_stats: BaselineStats, values: List[float]):
        """Populate ``baseline_stats`` from ``values``.

        With numpy available, outliers are removed first via the 1.5*IQR
        rule; otherwise plain statistics over the raw values are used.
        """
        if not values:
            return

        if HAS_NUMPY:
            q1 = np.percentile(values, 25)
            q3 = np.percentile(values, 75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            filtered_values = [v for v in values if lower_bound <= v <= upper_bound]
            outliers_removed = len(values) - len(filtered_values)

            if filtered_values:
                baseline_stats.mean = np.mean(filtered_values)
                baseline_stats.median = np.median(filtered_values)
                baseline_stats.p95 = np.percentile(filtered_values, 95)
                baseline_stats.p99 = np.percentile(filtered_values, 99)
                baseline_stats.std_dev = np.std(filtered_values)
                baseline_stats.min_value = min(filtered_values)
                baseline_stats.max_value = max(filtered_values)
        else:
            # Fallback to basic statistics
            sorted_values = sorted(values)
            baseline_stats.mean = statistics.mean(values)
            baseline_stats.median = statistics.median(values)
            baseline_stats.p95 = sorted_values[int(len(values) * 0.95)]
            baseline_stats.p99 = sorted_values[int(len(values) * 0.99)]
            # Fix: statistics.stdev raises on fewer than 2 samples (possible
            # for the db-query-time series); a single sample has no spread.
            baseline_stats.std_dev = statistics.stdev(values) if len(values) > 1 else 0.0
            baseline_stats.min_value = min(values)
            baseline_stats.max_value = max(values)
            outliers_removed = 0

        baseline_stats.outliers_removed = outliers_removed

    async def monitor_performance(self):
        """Continuous monitoring loop: collect, store, detect, alert.

        Runs forever; individual cycle failures are logged and the loop
        continues after the 5-minute sleep.
        """
        logger.info("Starting performance monitoring with regression detection...")

        while True:
            try:
                # Collect current metrics
                current_metrics = await self.collect_current_metrics()

                # Store in bounded history
                self.metrics_history.append(current_metrics)
                if len(self.metrics_history) > self.max_history_size:
                    self.metrics_history = self.metrics_history[-self.max_history_size :]

                # Fix: always bind `regressions` so the completion log below
                # cannot hit a NameError before the first baseline exists.
                regressions = []
                if self.baseline.established_at:
                    regressions = self.detector.detect_regression(current_metrics)

                    # Send alerts for regressions
                    for regression in regressions:
                        await self.send_regression_alert(regression)

                    # Fix: timedelta has no `.hours` attribute (AttributeError);
                    # compare elapsed seconds to re-establish the baseline daily.
                    if (datetime.now() - self.baseline.established_at).total_seconds() >= 24 * 3600:
                        logger.info("Re-establishing performance baseline...")
                        await self.establish_baseline()

                logger.info(f"Performance check completed. Regressions: {len(regressions)}")

            except Exception as e:
                logger.error(f"Error in performance monitoring: {e}")

            # Wait before next check
            await asyncio.sleep(300)  # Check every 5 minutes

    async def send_regression_alert(self, regression: Dict):
        """Send a regression alert: log it and POST to the optional webhook."""
        alert_data = {
            "alert_type": "performance_regression",
            "timestamp": datetime.now().isoformat(),
            "severity": regression["severity"],
            "metric": regression,
            "baseline": self.baseline.to_dict(),
            "environment": os.getenv("ENVIRONMENT", "production"),
        }

        # Log regression
        logger.warning(f"Performance regression detected: {regression}")

        # Send to alerting system (best-effort)
        webhook_url = os.getenv("PERFORMANCE_WEBHOOK_URL")
        if webhook_url:
            try:
                async with self.session.post(webhook_url, json=alert_data) as response:
                    if response.status == 200:
                        logger.info(f"Regression alert sent for {regression['metric']}")
            except Exception as e:
                logger.error(f"Failed to send regression alert: {e}")

    def get_performance_summary(self) -> Dict:
        """Get the current monitoring summary (latest snapshot + baseline)."""
        if not self.metrics_history:
            return {"status": "no_data"}

        current_metrics = self.metrics_history[-1] if self.metrics_history else PerformanceMetrics()

        return {
            "status": "monitoring",
            "current_metrics": current_metrics.to_dict(),
            "baseline": self.baseline.to_dict(),
            "history_size": len(self.metrics_history),
            "baseline_established": self.baseline.established_at.isoformat() if self.baseline.established_at else None,
        }
484
+
485
# CLI interface
async def main():
    """Parse CLI arguments and dispatch to the requested monitoring action."""
    import argparse

    parser = argparse.ArgumentParser(description="Performance Monitoring System")
    parser.add_argument("action", choices=["monitor", "baseline", "status", "report"])
    parser.add_argument("--period", type=int, default=24, help="Baseline period in hours")
    parser.add_argument("--output", help="Output file for reports")

    args = parser.parse_args()

    monitor = PerformanceMonitor()

    def emit(payload, saved_message):
        """Write payload as JSON to --output (with confirmation) or stdout."""
        if args.output:
            with open(args.output, "w") as f:
                json.dump(payload, f, indent=2)
            print(saved_message)
        else:
            print(json.dumps(payload, indent=2))

    async with monitor:
        if args.action == "monitor":
            await monitor.monitor_performance()

        elif args.action == "baseline":
            baseline = await monitor.establish_baseline(args.period)
            emit(baseline.to_dict(), f"Performance baseline saved to {args.output}")

        elif args.action == "status":
            # Status always goes to stdout, never to --output.
            print(json.dumps(monitor.get_performance_summary(), indent=2))

        elif args.action == "report":
            report = {
                "report_type": "performance_analysis",
                "generated_at": datetime.now().isoformat(),
                "summary": monitor.get_performance_summary(),
                "recommendations": [],
            }
            emit(report, f"Performance report saved to {args.output}")


if __name__ == "__main__":
    asyncio.run(main())