Spaces:
Configuration error
Configuration error
First Layer for ML Model
Browse files- .gitignore +208 -0
- LICENSE +0 -0
- README.md +32 -12
- config.yaml +8 -0
- data/processed/engineered_metrics.csv +0 -0
- data/synthetic-data.py +115 -0
- requirements.txt +9 -0
- src/__init__.py +0 -0
- src/features.py +58 -0
- src/predict.py +0 -0
- src/train.py +170 -0
- src/utils.py +16 -0
- tests/test_features.py +28 -0
.gitignore
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
data/raw/
|
| 16 |
+
eggs/
|
| 17 |
+
.eggs/
|
| 18 |
+
lib/
|
| 19 |
+
lib64/
|
| 20 |
+
parts/
|
| 21 |
+
sdist/
|
| 22 |
+
var/
|
| 23 |
+
wheels/
|
| 24 |
+
share/python-wheels/
|
| 25 |
+
*.egg-info/
|
| 26 |
+
.installed.cfg
|
| 27 |
+
*.egg
|
| 28 |
+
MANIFEST
|
| 29 |
+
|
| 30 |
+
# PyInstaller
|
| 31 |
+
# Usually these files are written by a python script from a template
|
| 32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 33 |
+
*.manifest
|
| 34 |
+
*.spec
|
| 35 |
+
|
| 36 |
+
# Installer logs
|
| 37 |
+
pip-log.txt
|
| 38 |
+
pip-delete-this-directory.txt
|
| 39 |
+
|
| 40 |
+
# Unit test / coverage reports
|
| 41 |
+
htmlcov/
|
| 42 |
+
.tox/
|
| 43 |
+
.nox/
|
| 44 |
+
.coverage
|
| 45 |
+
.coverage.*
|
| 46 |
+
.cache
|
| 47 |
+
nosetests.xml
|
| 48 |
+
coverage.xml
|
| 49 |
+
*.cover
|
| 50 |
+
*.py.cover
|
| 51 |
+
.hypothesis/
|
| 52 |
+
.pytest_cache/
|
| 53 |
+
cover/
|
| 54 |
+
|
| 55 |
+
# Translations
|
| 56 |
+
*.mo
|
| 57 |
+
*.pot
|
| 58 |
+
|
| 59 |
+
# Django stuff:
|
| 60 |
+
*.log
|
| 61 |
+
local_settings.py
|
| 62 |
+
db.sqlite3
|
| 63 |
+
db.sqlite3-journal
|
| 64 |
+
|
| 65 |
+
# Flask stuff:
|
| 66 |
+
instance/
|
| 67 |
+
.webassets-cache
|
| 68 |
+
|
| 69 |
+
# Scrapy stuff:
|
| 70 |
+
.scrapy
|
| 71 |
+
|
| 72 |
+
# Sphinx documentation
|
| 73 |
+
docs/_build/
|
| 74 |
+
|
| 75 |
+
# PyBuilder
|
| 76 |
+
.pybuilder/
|
| 77 |
+
target/
|
| 78 |
+
|
| 79 |
+
# Jupyter Notebook
|
| 80 |
+
.ipynb_checkpoints
|
| 81 |
+
|
| 82 |
+
# IPython
|
| 83 |
+
profile_default/
|
| 84 |
+
ipython_config.py
|
| 85 |
+
|
| 86 |
+
# pyenv
|
| 87 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 88 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 89 |
+
# .python-version
|
| 90 |
+
|
| 91 |
+
# pipenv
|
| 92 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 93 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 94 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 95 |
+
# install all needed dependencies.
|
| 96 |
+
#Pipfile.lock
|
| 97 |
+
|
| 98 |
+
# UV
|
| 99 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 100 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 101 |
+
# commonly ignored for libraries.
|
| 102 |
+
#uv.lock
|
| 103 |
+
|
| 104 |
+
# poetry
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 106 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 107 |
+
# commonly ignored for libraries.
|
| 108 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 109 |
+
#poetry.lock
|
| 110 |
+
#poetry.toml
|
| 111 |
+
|
| 112 |
+
# pdm
|
| 113 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 114 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 115 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 116 |
+
#pdm.lock
|
| 117 |
+
#pdm.toml
|
| 118 |
+
.pdm-python
|
| 119 |
+
.pdm-build/
|
| 120 |
+
|
| 121 |
+
# pixi
|
| 122 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 123 |
+
#pixi.lock
|
| 124 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 125 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 126 |
+
.pixi
|
| 127 |
+
|
| 128 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 129 |
+
__pypackages__/
|
| 130 |
+
|
| 131 |
+
# Celery stuff
|
| 132 |
+
celerybeat-schedule
|
| 133 |
+
celerybeat.pid
|
| 134 |
+
|
| 135 |
+
# SageMath parsed files
|
| 136 |
+
*.sage.py
|
| 137 |
+
|
| 138 |
+
# Environments
|
| 139 |
+
.env
|
| 140 |
+
.envrc
|
| 141 |
+
.venv
|
| 142 |
+
env/
|
| 143 |
+
venv/
|
| 144 |
+
ENV/
|
| 145 |
+
env.bak/
|
| 146 |
+
venv.bak/
|
| 147 |
+
|
| 148 |
+
# Spyder project settings
|
| 149 |
+
.spyderproject
|
| 150 |
+
.spyproject
|
| 151 |
+
|
| 152 |
+
# Rope project settings
|
| 153 |
+
.ropeproject
|
| 154 |
+
|
| 155 |
+
# mkdocs documentation
|
| 156 |
+
/site
|
| 157 |
+
|
| 158 |
+
# mypy
|
| 159 |
+
.mypy_cache/
|
| 160 |
+
.dmypy.json
|
| 161 |
+
dmypy.json
|
| 162 |
+
|
| 163 |
+
# Pyre type checker
|
| 164 |
+
.pyre/
|
| 165 |
+
|
| 166 |
+
# pytype static type analyzer
|
| 167 |
+
.pytype/
|
| 168 |
+
|
| 169 |
+
# Cython debug symbols
|
| 170 |
+
cython_debug/
|
| 171 |
+
|
| 172 |
+
# PyCharm
|
| 173 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 174 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 175 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 176 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 177 |
+
#.idea/
|
| 178 |
+
|
| 179 |
+
# Abstra
|
| 180 |
+
# Abstra is an AI-powered process automation framework.
|
| 181 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 182 |
+
# Learn more at https://abstra.io/docs
|
| 183 |
+
.abstra/
|
| 184 |
+
|
| 185 |
+
# Visual Studio Code
|
| 186 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 187 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 188 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 189 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 190 |
+
# .vscode/
|
| 191 |
+
|
| 192 |
+
# Ruff stuff:
|
| 193 |
+
.ruff_cache/
|
| 194 |
+
|
| 195 |
+
# PyPI configuration file
|
| 196 |
+
.pypirc
|
| 197 |
+
|
| 198 |
+
# Cursor
|
| 199 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 200 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 201 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 202 |
+
.cursorignore
|
| 203 |
+
.cursorindexingignore
|
| 204 |
+
|
| 205 |
+
# Marimo
|
| 206 |
+
marimo/_static/
|
| 207 |
+
marimo/_lsp/
|
| 208 |
+
__marimo__/
|
LICENSE
ADDED
|
File without changes
|
README.md
CHANGED
|
@@ -1,12 +1,32 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# sentry-predict
|
| 2 |
+
|
| 3 |
+
ML Prediction Service for Project Sentinel: Predictive failure and performance forecasting for Solana RPC nodes.
|
| 4 |
+
|
| 5 |
+
## Installation
|
| 6 |
+
1. Clone the repo: `git clone https://github.com/your-org/sentry-predict.git`
|
| 7 |
+
2. Install: `pip install -e .` (editable mode)
|
| 8 |
+
3. Install dev deps: `pip install -r requirements-dev.txt`
|
| 9 |
+
4. Setup pre-commit: `pre-commit install`
|
| 10 |
+
|
| 11 |
+
## Usage
|
| 12 |
+
- Generate data: `generate-data`
|
| 13 |
+
- Train models: `train-sentry`
|
| 14 |
+
- Run API: `uvicorn api.main:app --reload`
|
| 15 |
+
|
| 16 |
+
## Testing
|
| 17 |
+
`pytest tests/`
|
| 18 |
+
|
| 19 |
+
## Contributing
|
| 20 |
+
1. Fork the repo
|
| 21 |
+
2. Create branch: `git checkout -b feature/xyz`
|
| 22 |
+
3. Commit with pre-commit
|
| 23 |
+
4. PR
|
| 24 |
+
|
| 25 |
+
## Architecture
|
| 26 |
+
- Data generation with AR(1) for time-series realism
|
| 27 |
+
- Feature engineering for trends/lags
|
| 28 |
+
- Models: Autoencoder (anomaly), LogisticRegression (failure), SARIMA (forecasting)
|
| 29 |
+
- API for predictions
|
| 30 |
+
|
| 31 |
+
## License
|
| 32 |
+
MIT
|
config.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data:
  processed_data_path: "./data/processed/engineered_metrics.csv"

models_dir:
  anomaly_model_path: "./models/anomaly_model.joblib"
  anomaly_scaler_path: "./models/anomaly_scaler.joblib"
  failure_model_path: "./models/failure_model.joblib"
  failure_scaler_path: "./models/failure_scaler.joblib"
  # Required by src/train.py when persisting per-node SARIMAX latency models.
  latency_model_dir: "./models"
|
data/processed/engineered_metrics.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/synthetic-data.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from datetime import timedelta
|
| 4 |
+
from typing import List
|
| 5 |
+
from src.utils import logger
|
| 6 |
+
def ar1_process(n: int, mean: float, std: float, phi: float = 0.8, seed_offset: int = 0) -> np.ndarray:
    """Generate a mean-reverting AR(1) time-series, clipped at zero.

    The process is x[t] = mean + phi * (x[t-1] - mean) + eps, with eps drawn
    from N(0, std * sqrt(1 - phi**2)) so the stationary standard deviation
    of the series is approximately `std`.

    Args:
        n (int): Number of samples.
        mean (float): Stationary mean the series reverts to.
        std (float): Target stationary standard deviation.
        phi (float): Autocorrelation coefficient, expected in (-1, 1).
        seed_offset (int): Offset added to the base seed for reproducibility.

    Returns:
        np.ndarray: Generated series of length `n`, clipped to be >= 0.
    """
    # Global seeding preserves the original call-for-call reproducibility.
    np.random.seed(42 + seed_offset)
    x = np.zeros(n)
    x[0] = np.random.normal(mean, std)
    innovation_std = std * np.sqrt(1 - phi ** 2)
    for t in range(1, n):
        # BUG FIX: revert towards `mean` instead of 0 — the previous recursion
        # (phi * x[t-1] + eps) made every series decay towards zero regardless
        # of the requested mean, so node baselines drifted away from their
        # configured cpu/latency means.
        x[t] = mean + phi * (x[t - 1] - mean) + np.random.normal(0, innovation_std)
    return np.clip(x, 0, None)
def generate_data() -> None:
    """Generate synthetic RPC metrics for all nodes and write CSVs to data/raw/.

    Produces one CSV per node plus a combined, time-sorted CSV. Each node gets
    AR(1) baselines for its metrics, with `n_ramps_per_node` linear degradation
    ramps injected and labelled `failure_imminent=1`.
    """
    try:
        import os  # local import: only needed for directory creation below

        np.random.seed(42)

        n_samples = 1000
        start_time = pd.Timestamp("2025-10-01 00:00:00")
        timestamps = pd.date_range(start_time, periods=n_samples, freq="1min")
        nodes = ["agave1", "agave2", "firedancer1", "firedancer2"]

        # Per-node baseline statistics (firedancer nodes are configured noisier).
        node_params = {
            "agave1": {"cpu_mean": 45, "cpu_var": 8, "latency_mean": 60, "latency_var": 10, "error_var": 0.005},
            "agave2": {"cpu_mean": 48, "cpu_var": 9, "latency_mean": 65, "latency_var": 12, "error_var": 0.006},
            "firedancer1": {"cpu_mean": 55, "cpu_var": 15, "latency_mean": 50, "latency_var": 20, "error_var": 0.01},
            "firedancer2": {"cpu_mean": 52, "cpu_var": 12, "latency_mean": 55, "latency_var": 18, "error_var": 0.009}
        }

        phi = 0.8
        n_ramps_per_node = 8
        ramp_length = 20
        all_data: List[pd.DataFrame] = []

        # BUG FIX: ensure the output directory exists before any to_csv call,
        # otherwise the first write raises FileNotFoundError on a fresh clone
        # (data/raw/ is gitignored).
        os.makedirs("data/raw", exist_ok=True)

        for node in nodes:
            params = node_params[node]
            cpu_base = ar1_process(n_samples, params["cpu_mean"], params["cpu_var"], phi, nodes.index(node) * 10)
            latency_base = ar1_process(n_samples, params["latency_mean"], params["latency_var"], phi, nodes.index(node) * 20)
            error_base = np.abs(ar1_process(n_samples, 0.02, params["error_var"], phi, nodes.index(node) * 30))
            mem_base = ar1_process(n_samples, 50, 10, phi, nodes.index(node) * 40)
            disk_base = ar1_process(n_samples, 40, 12, phi, nodes.index(node) * 50)
            block_gap_base = np.random.choice([0, 1], size=n_samples, p=[0.75, 0.25])

            ramp_starts = sorted(np.random.choice(n_samples - ramp_length, n_ramps_per_node, replace=False))
            cpu = cpu_base.copy()
            latency = latency_base.copy()
            error_rate = error_base.copy()
            mem = mem_base.copy()
            disk = disk_base.copy()
            block_gap = block_gap_base.copy()
            labels = np.zeros(n_samples, dtype=int)

            # Inject linear degradation ramps and mark them as imminent failures.
            for start in ramp_starts:
                ramp_cpu = np.linspace(0, 45, ramp_length)
                ramp_latency = np.linspace(0, 250, ramp_length)
                ramp_error = np.linspace(0, 0.4, ramp_length)
                ramp_mem = np.linspace(0, 30, ramp_length)
                ramp_disk = np.linspace(0, 150, ramp_length)
                ramp_gap = np.full(ramp_length, 5)

                cpu[start:start+ramp_length] += ramp_cpu
                latency[start:start+ramp_length] += ramp_latency
                error_rate[start:start+ramp_length] += ramp_error
                mem[start:start+ramp_length] += ramp_mem
                disk[start:start+ramp_length] += ramp_disk
                block_gap[start:start+ramp_length] = np.maximum(block_gap[start:start+ramp_length], ramp_gap)

                labels[start:start+ramp_length] = 1

            # Latency is re-derived from CPU and error rate so the metrics stay
            # correlated. NOTE: this intentionally overwrites the AR(1) latency
            # baseline and the latency ramps applied above.
            latency = 40 + cpu * 1.5 + error_rate * 200 + np.random.normal(0, 5, n_samples)
            latency = np.clip(latency, 20, 1000)

            node_data = []
            for i, t in enumerate(timestamps):
                node_data.append([
                    t, node,
                    round(cpu[i], 2), round(mem[i], 2), round(disk[i], 2),
                    round(latency[i], 2), round(error_rate[i], 3),
                    int(block_gap[i]), labels[i]
                ])

            node_df = pd.DataFrame(node_data, columns=[
                "timestamp", "node", "cpu_usage", "memory_usage", "disk_io",
                "rpc_latency_ms", "rpc_error_rate", "block_height_gap", "failure_imminent"
            ])

            node_filename = f"data/raw/{node}_metrics.csv"
            node_df.to_csv(node_filename, index=False)

            all_data.append(node_df)

        combined_df = pd.concat(all_data, ignore_index=True)
        combined_df = combined_df.sort_values(['timestamp', 'node']).reset_index(drop=True)
        combined_df.to_csv("data/raw/synthetic_rpc_metrics_realistic.csv", index=False)
        logger.info("Synthetic data generated.")
    except Exception as e:
        logger.error(f"Error generating data: {e}")
        raise

if __name__ == "__main__":
    generate_data()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas==2.2.3
|
| 2 |
+
numpy==1.26.4
|
| 3 |
+
scikit-learn==1.5.2
|
| 4 |
+
statsmodels==0.14.4
|
| 5 |
+
fastapi==0.115.0
|
| 6 |
+
uvicorn==0.30.6
|
| 7 |
+
joblib==1.4.2
|
| 8 |
+
pydantic==2.9.2
|
| 9 |
+
pytest==8.3.3
|
src/__init__.py
ADDED
|
File without changes
|
src/features.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/feature.py
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List
|
| 6 |
+
from src.utils import logger
|
| 7 |
+
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Engineer trend/lag/rolling features from raw node metrics.

    Args:
        df (pd.DataFrame): Raw data with at least `timestamp`, `node`,
            `cpu_usage`, `rpc_error_rate` and `rpc_latency_ms` columns.

    Returns:
        pd.DataFrame: Input data sorted by (node, timestamp) with added
        feature columns. NaNs introduced by diff/lag/rolling windows are
        zero-filled.

    Raises:
        KeyError: If a required column is missing.
    """
    try:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        # Sort per node so diffs/lags/rolls never cross node boundaries.
        df = df.sort_values(["node", "timestamp"])

        grouped = df.groupby("node")
        df["cpu_trend"] = grouped["cpu_usage"].transform(lambda x: x.diff())
        df["cpu_rolling_mean"] = grouped["cpu_usage"].transform(
            lambda x: x.rolling(window=5, min_periods=1).mean()
        )
        df["error_rate_lag1"] = grouped["rpc_error_rate"].shift(1)
        df["latency_rolling_std"] = grouped["rpc_latency_ms"].transform(
            lambda x: x.rolling(window=5).std()
        )

        df = df.fillna(0)

        return df

    except KeyError as e:
        logger.error(f"Missing Column in Data: {e}")
        raise
    except Exception as e:
        logger.error(f"Error engineering features: {e}")
        # BUG FIX: previously this branch swallowed the error and the function
        # implicitly returned None; re-raise so callers see the failure.
        raise
def main(
    input_path: str = "data/raw/synthetic_rpc_metrics_realistic.csv",
    output_path: str = "data/processed/engineered_metrics.csv",
) -> None:
    """Load raw metrics, engineer features and persist the result.

    Args:
        input_path (str): Path to the raw metrics CSV.
        output_path (str): Destination path for the engineered-features CSV.
    """
    try:
        df = pd.read_csv(input_path)
        df_engineered = engineer_features(df)
        df_engineered.to_csv(output_path, index=False)
        logger.info(f"Engineered features saved to {output_path}")
    except Exception as e:
        logger.error(f"Error in main function: {e}")
        # BUG FIX: re-raise so the CLI exits non-zero instead of logging and
        # silently reporting success after a failed run.
        raise


if __name__ == "__main__":
    main()
|
src/predict.py
ADDED
|
File without changes
|
src/train.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Dict, Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAX

from src.features import engineer_features
from src.utils import logger, load_config

# BUG FIX: removed `from pandas.core.computation.expr import
# _node_not_implemented` — an unused, private pandas internal that breaks
# across pandas versions.

# `load_config` is the already-parsed configuration dict (see src/utils.py),
# not a callable; `config` is simply an alias for it.
config = load_config
def train_anomaly_model(df: pd.DataFrame) -> Tuple[MLPRegressor, StandardScaler]:
    """Fit an MLP autoencoder on healthy samples for anomaly detection.

    Args:
        df (pd.DataFrame): Processed data containing a `failure_imminent` label.

    Returns:
        Tuple[MLPRegressor, StandardScaler]: Fitted autoencoder and the scaler
        applied to its inputs.
    """
    feature_columns = [
        "cpu_usage",
        "rpc_error_rate",
        "rpc_latency_ms",
        "cpu_trend",
        "cpu_rolling_mean",
    ]
    # Train only on rows labelled healthy, so reconstruction error is large
    # for anomalous inputs at inference time.
    healthy_inputs = df.loc[df["failure_imminent"] == 0, feature_columns]
    scaler = StandardScaler()
    scaled_inputs = scaler.fit_transform(healthy_inputs)
    autoencoder = MLPRegressor(
        hidden_layer_sizes=(10, 5, 10),
        activation="relu",
        solver="adam",
        max_iter=500,
        random_state=42,
    )
    # Autoencoder objective: reconstruct the (scaled) input from itself.
    autoencoder.fit(scaled_inputs, scaled_inputs)
    return autoencoder, scaler
def evaluate_anomaly_model(model: MLPRegressor, scaler: StandardScaler, X_test_scaled: np.ndarray) -> float:
    """Compute the autoencoder's reconstruction error on a held-out set.

    Args:
        model (MLPRegressor): Trained autoencoder.
        scaler (StandardScaler): Scaler used on the inputs (kept for interface
            compatibility; the data passed in is expected to be scaled already).
        X_test_scaled (np.ndarray): Scaled test data.

    Returns:
        float: Mean squared reconstruction error on the test set.
    """
    reconstruction = model.predict(X_test_scaled)
    return mean_squared_error(X_test_scaled, reconstruction)
def train_failure_model(df: pd.DataFrame) -> Tuple[LogisticRegression, StandardScaler, np.ndarray, np.ndarray]:
    """Fit a class-balanced logistic regression for failure prediction.

    Args:
        df (pd.DataFrame): Processed data.

    Returns:
        Tuple[LogisticRegression, StandardScaler, np.ndarray, np.ndarray]:
        Fitted model, fitted scaler, held-out test features and test labels.
    """
    feature_columns = [
        "cpu_usage",
        "rpc_error_rate",
        "rpc_latency_ms",
        "cpu_trend",
        "error_rate_lag1",
    ]
    scaler = StandardScaler()
    inputs_scaled = scaler.fit_transform(df[feature_columns])
    targets = df["failure_imminent"]
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_scaled, targets, test_size=0.2, random_state=42
    )
    # class_weight='balanced' compensates for the rare positive (failure) class.
    classifier = LogisticRegression(random_state=42, max_iter=200, class_weight='balanced')
    classifier.fit(X_train, y_train)
    return classifier, scaler, X_test, y_test
def evaluate_failure_model(model: LogisticRegression, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
    """Score the failure classifier on held-out data.

    Args:
        model (LogisticRegression): Trained failure prediction model.
        X_test (np.ndarray): Test features.
        y_test (np.ndarray): Test labels.

    Returns:
        Dict[str, float]: `accuracy`, `recall` and `f1_score` metrics.
    """
    predictions = model.predict(X_test)
    metrics: Dict[str, float] = {}
    for metric_name, scorer in (
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    ):
        metrics[metric_name] = scorer(y_test, predictions)
    return metrics
def train_latency_model(df: pd.DataFrame) -> Dict[str, SARIMAX]:
    """Train a SARIMAX latency-forecasting model per node.

    Args:
        df (pd.DataFrame): Processed data.

    Returns:
        Dict[str, SARIMAX]: Fitted results object per node name.
    """
    models = {}
    for node in df["node"].unique():
        node_df = df[df["node"] == node].sort_values("timestamp")
        series = node_df['rpc_latency_ms']
        # BUG FIX: the split previously used int(len(series)) — i.e. the
        # whole series — as the training size, so each model was trained on
        # the very data main() later evaluates it on (data leakage). Use the
        # same 80/20 split that main() uses for evaluation.
        train_size = int(len(series) * 0.8)
        train = series[:train_size]
        # Seasonal period 60 matches the 1-minute sampling (hourly season).
        model = SARIMAX(
            train,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 60),
            enforce_stationarity=False,
        )
        models[node] = model.fit(disp=False)

    return models
def evaluate_latency_model(model: SARIMAX, test_series: pd.Series) -> Tuple[float, float]:
    """Evaluate a fitted SARIMAX model on a held-out latency series.

    Args:
        model (SARIMAX): Fitted model (results object exposing `forecast`).
        test_series (pd.Series): Held-out latency observations.

    Returns:
        Tuple[float, float]: (RMSE, MAE) of the forecast against the test set.
    """
    forecast = model.forecast(steps=len(test_series))
    rmse = np.sqrt(mean_squared_error(test_series, forecast))
    mae = np.mean(np.abs(test_series - forecast))

    # BUG FIX (annotation/docs): the function always returned (rmse, mae) but
    # was annotated and documented as returning a single float.
    return rmse, mae
def main():
    """Train, evaluate and persist the anomaly, failure and latency models.

    Reads the processed dataset from the path in `config`, trains each model,
    logs its evaluation metrics and dumps the artifacts with joblib.
    """
    try:
        df = pd.read_csv(config["data"]["processed_data_path"])

        # Anomaly Model Training
        anomaly_model, anomaly_scaler = train_anomaly_model(df)
        healthy_df = df[df["failure_imminent"] == 0]
        # NOTE(review): this feature list duplicates the one hard-coded inside
        # train_anomaly_model — the two must be kept in sync.
        X_healthy = healthy_df[["cpu_usage", 'rpc_error_rate', 'rpc_latency_ms', 'cpu_trend', 'cpu_rolling_mean']]
        X_healthy_scaled = anomaly_scaler.transform(X_healthy)
        # Labels are irrelevant for the autoencoder; zeros are placeholders so
        # train_test_split can be reused for the feature split.
        X_train, X_test, _, _ = train_test_split(X_healthy_scaled, np.zeros(len(X_healthy)), test_size=0.2, random_state=42)
        anomaly_mse = evaluate_anomaly_model(anomaly_model, anomaly_scaler, X_test)
        logger.info(f"Anomaly Model MSE: {anomaly_mse}")
        joblib.dump(anomaly_model, config["models_dir"]["anomaly_model_path"])
        joblib.dump(anomaly_scaler, config["models_dir"]["anomaly_scaler_path"])

        # Failure Model Training
        failure_model, failure_scaler, X_test, y_test = train_failure_model(df)
        failure_metrics = evaluate_failure_model(failure_model, X_test, y_test)
        logger.info(f"Failure Model Metrics: {failure_metrics}")
        joblib.dump(failure_model, config["models_dir"]["failure_model_path"])
        joblib.dump(failure_scaler, config["models_dir"]["failure_scaler_path"])

        # Latency Model Training
        latency_models = train_latency_model(df)
        for node, model in latency_models.items():
            node_df = df[df["node"]== node].sort_values("timestamp")
            series = node_df['rpc_latency_ms']
            # 80/20 chronological split; the last 20% is the evaluation window.
            train_size = int(len(series)*0.8)
            train, test = series[:train_size], series[train_size:]
            latency_rmse, latency_mae = evaluate_latency_model(model, test)
            logger.info(f"Latency Model for {node} - RMSE: {latency_rmse}, MAE: {latency_mae}")
            # NOTE(review): config.yaml as shipped does not define a
            # `latency_model_dir` key under `models_dir` — confirm it exists
            # before running, otherwise this line raises KeyError.
            joblib.dump(model, f"{config['models_dir']['latency_model_dir']}/{node}_latency_model.joblib")

        logger.info("Training Completed Successfully.")

    except Exception as e:
        logger.error(f"Error in training process: {e}")
        raise

if __name__ == "__main__":
    main()
src/utils.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
def setup_logging():
    """Configure root logging (INFO, timestamped) and return a module logger."""
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    return logging.getLogger(__name__)
def get_config():
    """Load and return the YAML configuration from ./config.yaml.

    Returns:
        dict: Parsed configuration mapping.
    """
    # FIX: the yaml import previously sat *inside* the `with` block; hoist it
    # to the top of the function (kept function-local so importing this module
    # does not require PyYAML until a config is actually loaded).
    import yaml

    # Explicit encoding avoids platform-dependent default codecs.
    with open("./config.yaml", "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    logger.info(f"Configuration loaded successfully: {config}")
    return config
# Module-level singletons created at import time.
logger = setup_logging()
# NOTE(review): despite the callable-sounding name, `load_config` is the
# already-parsed config dict (get_config() is invoked here, at import time),
# not a function. The name is kept because src.train imports it as
# `load_config`.
load_config = get_config()
tests/test_features.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from src.features import engineer_features
|
@pytest.fixture
def sample_data():
    """Ten minutes of single-node metrics with CPU rising by 1 per minute."""
    n_rows = 10
    frame = pd.DataFrame(
        {
            "timestamp": pd.date_range(start="2025-01-01", periods=n_rows, freq="1min"),
            "node": ["agave1"] * n_rows,
            "cpu_usage": list(range(45, 55)),
            "rpc_error_rate": [0.1] * n_rows,
            "rpc_latency_ms": [60] * n_rows,
            "failure_imminent": [0] * n_rows,
        }
    )
    return frame
def test_engineer_features(sample_data):
    """Feature engineering adds trend/rolling columns without dropping rows."""
    processed_df = engineer_features(sample_data)

    for column in ("cpu_trend", "cpu_rolling_mean"):
        assert column in processed_df.columns
    assert len(processed_df) == 10
    # CPU rises by exactly 1 per minute, so the first diff is 1 (46 - 45).
    assert processed_df["cpu_trend"].iloc[1] == 1
    # Rolling mean over the first five readings (45..49) is 47.
    assert processed_df["cpu_rolling_mean"].iloc[4] == pytest.approx(47.0)
def test_engineer_features_empty():
    """An empty frame lacks the required columns and must raise KeyError."""
    empty_frame = pd.DataFrame()
    with pytest.raises(KeyError):
        engineer_features(empty_frame)
|