Testys committed on
Commit
01ca3ba
·
1 Parent(s): 3664f12

First Layer for ML Model

Browse files
.gitignore ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ data/raw/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py.cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # UV
99
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ #uv.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+ #poetry.toml
111
+
112
+ # pdm
113
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
115
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
116
+ #pdm.lock
117
+ #pdm.toml
118
+ .pdm-python
119
+ .pdm-build/
120
+
121
+ # pixi
122
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
123
+ #pixi.lock
124
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
125
+ # in the .venv directory. It is recommended not to include this directory in version control.
126
+ .pixi
127
+
128
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
129
+ __pypackages__/
130
+
131
+ # Celery stuff
132
+ celerybeat-schedule
133
+ celerybeat.pid
134
+
135
+ # SageMath parsed files
136
+ *.sage.py
137
+
138
+ # Environments
139
+ .env
140
+ .envrc
141
+ .venv
142
+ env/
143
+ venv/
144
+ ENV/
145
+ env.bak/
146
+ venv.bak/
147
+
148
+ # Spyder project settings
149
+ .spyderproject
150
+ .spyproject
151
+
152
+ # Rope project settings
153
+ .ropeproject
154
+
155
+ # mkdocs documentation
156
+ /site
157
+
158
+ # mypy
159
+ .mypy_cache/
160
+ .dmypy.json
161
+ dmypy.json
162
+
163
+ # Pyre type checker
164
+ .pyre/
165
+
166
+ # pytype static type analyzer
167
+ .pytype/
168
+
169
+ # Cython debug symbols
170
+ cython_debug/
171
+
172
+ # PyCharm
173
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
176
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
+ #.idea/
178
+
179
+ # Abstra
180
+ # Abstra is an AI-powered process automation framework.
181
+ # Ignore directories containing user credentials, local state, and settings.
182
+ # Learn more at https://abstra.io/docs
183
+ .abstra/
184
+
185
+ # Visual Studio Code
186
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
187
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
188
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
189
+ # you could uncomment the following to ignore the entire vscode folder
190
+ # .vscode/
191
+
192
+ # Ruff stuff:
193
+ .ruff_cache/
194
+
195
+ # PyPI configuration file
196
+ .pypirc
197
+
198
+ # Cursor
199
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
200
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
201
+ # refer to https://docs.cursor.com/context/ignore-files
202
+ .cursorignore
203
+ .cursorindexingignore
204
+
205
+ # Marimo
206
+ marimo/_static/
207
+ marimo/_lsp/
208
+ __marimo__/
LICENSE ADDED
File without changes
README.md CHANGED
@@ -1,12 +1,32 @@
1
- ---
2
- title: Sentry Ml Api
3
- emoji: 🏃
4
- colorFrom: yellow
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # sentry-predict
2
+
3
+ ML Prediction Service for Project Sentinel: Predictive failure and performance forecasting for Solana RPC nodes.
4
+
5
+ ## Installation
6
+ 1. Clone the repo: `git clone https://github.com/your-org/sentry-predict.git`
7
+ 2. Install: `pip install -e .` (editable mode)
8
+ 3. Install dev deps: `pip install -r requirements-dev.txt`
9
+ 4. Setup pre-commit: `pre-commit install`
10
+
11
+ ## Usage
12
+ - Generate data: `generate-data`
13
+ - Train models: `train-sentry`
14
+ - Run API: `uvicorn api.main:app --reload`
15
+
16
+ ## Testing
17
+ `pytest tests/`
18
+
19
+ ## Contributing
20
+ 1. Fork the repo
21
+ 2. Create branch: `git checkout -b feature/xyz`
22
+ 3. Commit with pre-commit
23
+ 4. PR
24
+
25
+ ## Architecture
26
+ - Data generation with AR(1) for time-series realism
27
+ - Feature engineering for trends/lags
28
+ - Models: Autoencoder (anomaly), LogisticRegression (failure), SARIMA (forecasting)
29
+ - API for predictions
30
+
31
+ ## License
32
+ MIT
config.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ processed_data_path: "./data/processed/engineered_metrics.csv"
3
+
4
+ models_dir:
5
+ anomaly_model_path: "./models/anomaly_model.joblib"
6
+ anomaly_scaler_path: "./models/anomaly_scaler.joblib"
7
+ failure_model_path: "./models/failure_model.joblib"
8
+ failure_scaler_path: "./models/failure_scaler.joblib"
data/processed/engineered_metrics.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/synthetic-data.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from datetime import timedelta
4
+ from typing import List
5
+ from src.utils import logger
6
+
7
def ar1_process(n: int, mean: float, std: float, phi: float = 0.8, seed_offset: int = 0) -> np.ndarray:
    """Generate a stationary AR(1) time-series fluctuating around *mean*.

    BUG FIX: the previous recurrence was ``x[t] = phi * x[t-1] + noise``,
    which decays toward 0 rather than reverting to *mean*, so the generated
    series did not have the documented mean. The mean-reverting form
    ``x[t] = mean + phi * (x[t-1] - mean) + noise`` is used instead.

    Args:
        n (int): Number of samples.
        mean (float): Long-run mean of the process.
        std (float): Marginal (stationary) standard deviation.
        phi (float): Autocorrelation coefficient (|phi| < 1 for stationarity).
        seed_offset (int): Offset added to the base seed for per-caller
            reproducibility.

    Returns:
        np.ndarray: Series of length ``n``, clipped at 0 (metrics are
        non-negative).
    """
    # NOTE: reseeds the GLOBAL NumPy RNG on every call (legacy API);
    # callers rely on this for deterministic output.
    np.random.seed(42 + seed_offset)
    x = np.zeros(n)
    x[0] = np.random.normal(mean, std)
    # Innovation std scaled so the stationary std of the process equals `std`.
    innov_std = std * np.sqrt(1 - phi ** 2)
    for t in range(1, n):
        x[t] = mean + phi * (x[t - 1] - mean) + np.random.normal(0, innov_std)
    return np.clip(x, 0, None)
26
+
27
def generate_data() -> None:
    """Generate synthetic per-node RPC metrics and write CSVs to data/raw/.

    For each node, AR(1) baselines are drawn per metric, failure "ramps"
    (linear degradations over ``ramp_length`` minutes) are injected at random
    positions and labelled via ``failure_imminent``, and latency is then
    derived from CPU and error rate so the signals correlate.

    Side effects:
        Writes ``data/raw/<node>_metrics.csv`` per node and a combined
        ``data/raw/synthetic_rpc_metrics_realistic.csv``.
    """
    try:
        from pathlib import Path  # stdlib; local import keeps module imports unchanged

        np.random.seed(42)

        n_samples = 1000
        start_time = pd.Timestamp("2025-10-01 00:00:00")
        timestamps = pd.date_range(start_time, periods=n_samples, freq="1min")
        nodes = ["agave1", "agave2", "firedancer1", "firedancer2"]

        node_params = {
            "agave1": {"cpu_mean": 45, "cpu_var": 8, "latency_mean": 60, "latency_var": 10, "error_var": 0.005},
            "agave2": {"cpu_mean": 48, "cpu_var": 9, "latency_mean": 65, "latency_var": 12, "error_var": 0.006},
            "firedancer1": {"cpu_mean": 55, "cpu_var": 15, "latency_mean": 50, "latency_var": 20, "error_var": 0.01},
            "firedancer2": {"cpu_mean": 52, "cpu_var": 12, "latency_mean": 55, "latency_var": 18, "error_var": 0.009},
        }

        phi = 0.8
        n_ramps_per_node = 8
        ramp_length = 20
        all_data: List[pd.DataFrame] = []

        # BUG FIX: data/raw/ is gitignored, so it may not exist on a fresh clone.
        out_dir = Path("data/raw")
        out_dir.mkdir(parents=True, exist_ok=True)

        for idx, node in enumerate(nodes):
            params = node_params[node]
            cpu = ar1_process(n_samples, params["cpu_mean"], params["cpu_var"], phi, idx * 10)
            # NOTE: no AR(1) latency base is drawn — latency is fully recomputed
            # from cpu/error below, so the previous latency draw and its ramp
            # were dead work. ar1_process reseeds the global RNG on every call,
            # so dropping the unused draw does not perturb any later draws.
            error_rate = np.abs(ar1_process(n_samples, 0.02, params["error_var"], phi, idx * 30))
            mem = ar1_process(n_samples, 50, 10, phi, idx * 40)
            disk = ar1_process(n_samples, 40, 12, phi, idx * 50)
            block_gap = np.random.choice([0, 1], size=n_samples, p=[0.75, 0.25])
            labels = np.zeros(n_samples, dtype=int)

            # Inject failure ramps: metrics degrade linearly over ramp_length minutes.
            ramp_starts = sorted(np.random.choice(n_samples - ramp_length, n_ramps_per_node, replace=False))
            for start in ramp_starts:
                stop = start + ramp_length
                cpu[start:stop] += np.linspace(0, 45, ramp_length)
                error_rate[start:stop] += np.linspace(0, 0.4, ramp_length)
                mem[start:stop] += np.linspace(0, 30, ramp_length)
                disk[start:stop] += np.linspace(0, 150, ramp_length)
                block_gap[start:stop] = np.maximum(block_gap[start:stop], 5)
                labels[start:stop] = 1

            # Latency is derived from CPU and error rate (plus noise) so the
            # signals are physically correlated; ramps propagate via cpu/error.
            latency = 40 + cpu * 1.5 + error_rate * 200 + np.random.normal(0, 5, n_samples)
            latency = np.clip(latency, 20, 1000)

            node_df = pd.DataFrame({
                "timestamp": timestamps,
                "node": node,
                "cpu_usage": np.round(cpu, 2),
                "memory_usage": np.round(mem, 2),
                "disk_io": np.round(disk, 2),
                "rpc_latency_ms": np.round(latency, 2),
                "rpc_error_rate": np.round(error_rate, 3),
                "block_height_gap": block_gap.astype(int),
                "failure_imminent": labels,
            })

            node_df.to_csv(out_dir / f"{node}_metrics.csv", index=False)
            all_data.append(node_df)

        combined_df = pd.concat(all_data, ignore_index=True)
        combined_df = combined_df.sort_values(["timestamp", "node"]).reset_index(drop=True)
        combined_df.to_csv(out_dir / "synthetic_rpc_metrics_realistic.csv", index=False)
        logger.info("Synthetic data generated.")
    except Exception as e:
        logger.error(f"Error generating data: {e}")
        raise
113
+
114
+ if __name__ == "__main__":
115
+ generate_data()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ pandas==2.2.3
2
+ numpy==1.26.4
3
+ scikit-learn==1.5.2
4
+ statsmodels==0.14.4
5
+ fastapi==0.115.0
6
+ uvicorn==0.30.6
7
+ joblib==1.4.2
8
+ pydantic==2.9.2
9
+ pytest==8.3.3
src/__init__.py ADDED
File without changes
src/features.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/features.py
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import List
6
+ from src.utils import logger
7
+
8
+
9
+ def engineer_features(df:pd.DataFrame) -> pd.DataFrame:
10
+ """
11
+ Engineer Features from raw metrics
12
+
13
+ Args:
14
+ df(pd.DataFrame): Raw Data from the system
15
+
16
+ Returns:
17
+ pf.DataFrame: Data with added Features
18
+ """
19
+ try:
20
+ df["timestamp"] = pd.to_datetime(df["timestamp"])
21
+ df = df.sort_values(["node", "timestamp"])
22
+
23
+ grouped = df.groupby("node")
24
+ df["cpu_trend"] = grouped["cpu_usage"].transform(lambda x:x.diff())
25
+ df["cpu_rolling_mean"] = grouped["cpu_usage"].transform(lambda x:x.rolling(window=5, min_periods=1).mean())
26
+ df["error_rate_lag1"] = grouped["rpc_error_rate"].shift(1)
27
+ df["latency_rolling_std"] = grouped["rpc_latency_ms"].transform(lambda x:x.rolling(window=5).std())
28
+
29
+ df = df.fillna(0)
30
+
31
+ return df
32
+
33
+ except KeyError as e:
34
+ logger.error(f"Missing Column in Data: {e}")
35
+ raise
36
+ except Exception as e:
37
+ logger.error(f"Error engineering features: {e}")
38
+
39
+
40
+ def main(input_path:str = "data/raw/synthetic_rpc_metrics_realistic.csv", output_path:str = "data/processed/engineered_metrics.csv") -> None:
41
+ """
42
+ Main function to engineer features from raw data
43
+
44
+ Args:
45
+ input_path(str): Path to raw data CSV
46
+ output_path(str): Path to save engineered features CSV
47
+ """
48
+ try:
49
+ df = pd.read_csv(input_path)
50
+ df_engineered = engineer_features(df)
51
+ df_engineered.to_csv(output_path, index=False)
52
+ logger.info(f"Engineered features saved to {output_path}")
53
+ except Exception as e:
54
+ logger.error(f"Error in main function: {e}")
55
+
56
+
57
+ if __name__ == "__main__":
58
+ main()
src/predict.py ADDED
File without changes
src/train.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+ from pandas.core.computation.expr import _node_not_implemented
5
+ from sklearn.neural_network import MLPRegressor
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, recall_score, f1_score
9
+ from sklearn.preprocessing import StandardScaler
10
+ from statsmodels.tsa.statespace.sarimax import SARIMAX
11
+ from src.features import engineer_features
12
+ from src.utils import logger, load_config
13
+ from typing import Tuple, Dict
14
+
15
+ config = load_config
16
+
17
def train_anomaly_model(df: pd.DataFrame) -> Tuple[MLPRegressor, StandardScaler]:
    """Fit an MLP autoencoder on healthy rows for anomaly detection.

    The network is trained to reconstruct its own (standardized) input, so a
    high reconstruction error at inference time flags an anomalous sample.

    Args:
        df (pd.DataFrame): Processed metrics including a ``failure_imminent``
            label column.

    Returns:
        Tuple[MLPRegressor, StandardScaler]: Fitted autoencoder and the
        scaler used to standardize its inputs.
    """
    feature_cols = [
        "cpu_usage",
        "rpc_error_rate",
        "rpc_latency_ms",
        "cpu_trend",
        "cpu_rolling_mean",
    ]
    # Only rows without an imminent failure define "normal" behaviour.
    healthy = df.loc[df["failure_imminent"] == 0, feature_cols]
    scaler = StandardScaler()
    healthy_scaled = scaler.fit_transform(healthy)
    autoencoder = MLPRegressor(
        hidden_layer_sizes=(10, 5, 10),
        activation="relu",
        solver="adam",
        max_iter=500,
        random_state=42,
    )
    # Target == input: reconstruction objective.
    autoencoder.fit(healthy_scaled, healthy_scaled)
    return autoencoder, scaler
34
+
35
+
36
def evaluate_anomaly_model(model: MLPRegressor, scaler: StandardScaler, X_test_scaled: np.ndarray) -> float:
    """Compute the autoencoder's reconstruction MSE on a scaled test set.

    Args:
        model (MLPRegressor): Trained autoencoder.
        scaler (StandardScaler): Scaler used during training. Kept for
            interface compatibility; unused here because the input is
            already scaled.
        X_test_scaled (np.ndarray): Standardized test samples.

    Returns:
        float: Mean squared reconstruction error.
    """
    reconstruction = model.predict(X_test_scaled)
    return mean_squared_error(X_test_scaled, reconstruction)
51
+
52
def train_failure_model(df: pd.DataFrame) -> Tuple[LogisticRegression, StandardScaler, np.ndarray, np.ndarray]:
    """Train a Logistic Regression model for failure prediction.

    Args:
        df (pd.DataFrame): Processed data with a ``failure_imminent`` label.

    Returns:
        Tuple[LogisticRegression, StandardScaler, np.ndarray, np.ndarray]:
        Fitted model, fitted scaler, scaled test features, and test labels.
    """
    features = ['cpu_usage', 'rpc_error_rate', 'rpc_latency_ms', 'cpu_trend', 'error_rate_lag1']
    X = df[features]
    y = df['failure_imminent']
    # BUG FIX: split BEFORE scaling so the scaler is fit on training data only.
    # Previously fit_transform ran on the full dataset, leaking test-set
    # statistics into the evaluation. Same random_state keeps the row split
    # identical to before.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = LogisticRegression(random_state=42, max_iter=200, class_weight='balanced')
    model.fit(X_train_scaled, y_train)
    return model, scaler, X_test_scaled, y_test
71
+
72
+
73
def evaluate_failure_model(model: LogisticRegression, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
    """Score the failure-prediction classifier on held-out data.

    Args:
        model (LogisticRegression): Trained failure prediction model.
        X_test (np.ndarray): Test features.
        y_test (np.ndarray): Ground-truth test labels.

    Returns:
        Dict[str, float]: ``accuracy``, ``recall`` and ``f1_score``.
    """
    predictions = model.predict(X_test)
    metrics: Dict[str, float] = {}
    for name, score_fn in (
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    ):
        metrics[name] = score_fn(y_test, predictions)
    return metrics
90
+
91
+
92
def train_latency_model(df: pd.DataFrame) -> Dict[str, SARIMAX]:
    """Train a SARIMAX latency forecaster per node.

    Args:
        df (pd.DataFrame): Processed data with ``node``, ``timestamp`` and
            ``rpc_latency_ms`` columns.

    Returns:
        Dict[str, SARIMAX]: Fitted results object per node, each trained on
        the first 80% of that node's latency series.
    """
    models: Dict[str, SARIMAX] = {}
    for node in df["node"].unique():
        node_df = df[df["node"] == node].sort_values("timestamp")
        series = node_df['rpc_latency_ms']
        # BUG FIX: previously train_size == len(series), so the model was fit
        # on the FULL series while main() evaluated on the last 20% — i.e. on
        # data the model had already seen. Use the same 80/20 split as main().
        train_size = int(len(series) * 0.8)
        train = series[:train_size]
        # Seasonal period 60 ~ one hour of 1-minute samples.
        model = SARIMAX(train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 60), enforce_stationarity=False)
        models[node] = model.fit(disp=False)

    return models
112
+
113
+
114
def evaluate_latency_model(model: "SARIMAX", test_series: pd.Series) -> Tuple[float, float]:
    """Evaluate a fitted SARIMAX latency model on a held-out series.

    Args:
        model: Fitted SARIMAX results object (anything exposing ``forecast``).
        test_series (pd.Series): Held-out latency observations.

    Returns:
        Tuple[float, float]: (RMSE, MAE) of the forecast against the test
        set. BUG FIX: the annotation/docstring previously claimed a single
        float, but two values are returned and main() unpacks both.
    """
    forecast = model.forecast(steps=len(test_series))
    # Compare as plain arrays: pandas index alignment between the forecast
    # index and the test-series index can otherwise produce NaNs.
    errors = np.asarray(test_series, dtype=float) - np.asarray(forecast, dtype=float)
    rmse = float(np.sqrt(np.mean(errors ** 2)))
    mae = float(np.mean(np.abs(errors)))

    return rmse, mae
129
+
130
def main() -> None:
    """Train, evaluate and persist the anomaly, failure and latency models.

    Reads the processed dataset from config, trains each model, logs the
    evaluation metrics, and dumps the fitted artifacts with joblib.

    Raises:
        Exception: Re-raised after logging so schedulers/CI see the failure.
    """
    try:
        from pathlib import Path  # stdlib; local import keeps module imports unchanged

        df = pd.read_csv(config["data"]["processed_data_path"])

        # Ensure the model output directories exist before joblib.dump.
        for key in ("anomaly_model_path", "anomaly_scaler_path", "failure_model_path", "failure_scaler_path"):
            Path(config["models_dir"][key]).parent.mkdir(parents=True, exist_ok=True)

        # --- Anomaly model (autoencoder on healthy rows) ---
        anomaly_model, anomaly_scaler = train_anomaly_model(df)
        anomaly_features = ["cpu_usage", 'rpc_error_rate', 'rpc_latency_ms', 'cpu_trend', 'cpu_rolling_mean']
        healthy_df = df[df["failure_imminent"] == 0]
        X_healthy_scaled = anomaly_scaler.transform(healthy_df[anomaly_features])
        # NOTE(review): train_anomaly_model() fits on ALL healthy rows, so this
        # hold-out overlaps the training data and the MSE below is optimistic —
        # confirm whether a pre-training split is wanted.
        _, X_test, _, _ = train_test_split(
            X_healthy_scaled, np.zeros(len(X_healthy_scaled)), test_size=0.2, random_state=42
        )
        anomaly_mse = evaluate_anomaly_model(anomaly_model, anomaly_scaler, X_test)
        logger.info(f"Anomaly Model MSE: {anomaly_mse}")
        joblib.dump(anomaly_model, config["models_dir"]["anomaly_model_path"])
        joblib.dump(anomaly_scaler, config["models_dir"]["anomaly_scaler_path"])

        # --- Failure model (logistic regression) ---
        failure_model, failure_scaler, X_test, y_test = train_failure_model(df)
        failure_metrics = evaluate_failure_model(failure_model, X_test, y_test)
        logger.info(f"Failure Model Metrics: {failure_metrics}")
        joblib.dump(failure_model, config["models_dir"]["failure_model_path"])
        joblib.dump(failure_scaler, config["models_dir"]["failure_scaler_path"])

        # --- Latency models (SARIMAX per node) ---
        latency_models = train_latency_model(df)
        for node, model in latency_models.items():
            node_series = df[df["node"] == node].sort_values("timestamp")['rpc_latency_ms']
            split_at = int(len(node_series) * 0.8)  # same 80/20 split as training
            test = node_series[split_at:]
            latency_rmse, latency_mae = evaluate_latency_model(model, test)
            logger.info(f"Latency Model for {node} - RMSE: {latency_rmse}, MAE: {latency_mae}")
            # NOTE(review): config.yaml does not define models_dir.latency_model_dir,
            # so this lookup raises KeyError at runtime — add the key (e.g. "./models")
            # to config.yaml before running.
            joblib.dump(model, f"{config['models_dir']['latency_model_dir']}/{node}_latency_model.joblib")

        logger.info("Training Completed Successfully.")

    except Exception as e:
        logger.error(f"Error in training process: {e}")
        raise
168
+
169
+ if __name__ == "__main__":
170
+ main()
src/utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging


def setup_logging() -> logging.Logger:
    """Configure basic INFO-level logging and return this module's logger."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
    return logging.getLogger(__name__)


def get_config() -> dict:
    """Load the project configuration from ./config.yaml.

    Returns:
        dict: Parsed configuration.

    Raises:
        FileNotFoundError: If config.yaml is not in the working directory.
    """
    # BUG FIX: `import yaml` previously sat inside the `with` block; import it
    # at function top. Kept function-local so importing this module's helpers
    # does not hard-require PyYAML until config is actually read.
    import yaml

    with open("./config.yaml", "r") as f:
        config = yaml.safe_load(f)

    logger.info(f"Configuration loaded successfully: {config}")
    return config


# Module-level singletons, created at import time — importing this module
# reads ./config.yaml relative to the current working directory.
logger = setup_logging()
# NOTE(review): despite its name, `load_config` is the loaded config DICT,
# not a function — train.py relies on this via `config = load_config`.
# Renaming would break importers; confirm before changing.
load_config = get_config()
tests/test_features.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ import pandas as pd
3
+ from src.features import engineer_features
4
+
5
@pytest.fixture
def sample_data():
    """Ten minutes of single-node metrics with CPU rising by 1 each minute."""
    n = 10
    frame = pd.DataFrame({
        "timestamp": pd.date_range(start="2025-01-01", periods=n, freq="1min"),
        "node": ["agave1"] * n,
        "cpu_usage": list(range(45, 55)),
        "rpc_error_rate": [0.1] * n,
        "rpc_latency_ms": [60] * n,
        "failure_imminent": [0] * n,
    })
    return frame
16
+
17
def test_engineer_features(sample_data):
    """Engineered columns exist, row count is preserved, values are correct."""
    result = engineer_features(sample_data)
    for col in ("cpu_trend", "cpu_rolling_mean"):
        assert col in result.columns
    assert len(result) == 10
    # CPU rises by exactly 1 per minute in the fixture: 46 - 45.
    assert result["cpu_trend"].iloc[1] == 1
    # Rolling mean over the first five readings (45..49).
    assert result["cpu_rolling_mean"].iloc[4] == pytest.approx(47.0)
24
+
25
def test_engineer_features_empty():
    """An empty frame lacks the required columns and must raise KeyError."""
    empty_frame = pd.DataFrame()
    with pytest.raises(KeyError):
        engineer_features(empty_frame)
+ engineer_features(df)