Spaces:
Configuration error
Configuration error
First Layer for ML Model
Browse files- .gitignore +208 -0
- LICENSE +0 -0
- README.md +32 -12
- config.yaml +8 -0
- data/processed/engineered_metrics.csv +0 -0
- data/synthetic-data.py +115 -0
- requirements.txt +9 -0
- src/__init__.py +0 -0
- src/features.py +58 -0
- src/predict.py +0 -0
- src/train.py +170 -0
- src/utils.py +16 -0
- tests/test_features.py +28 -0
.gitignore
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
data/raw/
|
| 16 |
+
eggs/
|
| 17 |
+
.eggs/
|
| 18 |
+
lib/
|
| 19 |
+
lib64/
|
| 20 |
+
parts/
|
| 21 |
+
sdist/
|
| 22 |
+
var/
|
| 23 |
+
wheels/
|
| 24 |
+
share/python-wheels/
|
| 25 |
+
*.egg-info/
|
| 26 |
+
.installed.cfg
|
| 27 |
+
*.egg
|
| 28 |
+
MANIFEST
|
| 29 |
+
|
| 30 |
+
# PyInstaller
|
| 31 |
+
# Usually these files are written by a python script from a template
|
| 32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 33 |
+
*.manifest
|
| 34 |
+
*.spec
|
| 35 |
+
|
| 36 |
+
# Installer logs
|
| 37 |
+
pip-log.txt
|
| 38 |
+
pip-delete-this-directory.txt
|
| 39 |
+
|
| 40 |
+
# Unit test / coverage reports
|
| 41 |
+
htmlcov/
|
| 42 |
+
.tox/
|
| 43 |
+
.nox/
|
| 44 |
+
.coverage
|
| 45 |
+
.coverage.*
|
| 46 |
+
.cache
|
| 47 |
+
nosetests.xml
|
| 48 |
+
coverage.xml
|
| 49 |
+
*.cover
|
| 50 |
+
*.py.cover
|
| 51 |
+
.hypothesis/
|
| 52 |
+
.pytest_cache/
|
| 53 |
+
cover/
|
| 54 |
+
|
| 55 |
+
# Translations
|
| 56 |
+
*.mo
|
| 57 |
+
*.pot
|
| 58 |
+
|
| 59 |
+
# Django stuff:
|
| 60 |
+
*.log
|
| 61 |
+
local_settings.py
|
| 62 |
+
db.sqlite3
|
| 63 |
+
db.sqlite3-journal
|
| 64 |
+
|
| 65 |
+
# Flask stuff:
|
| 66 |
+
instance/
|
| 67 |
+
.webassets-cache
|
| 68 |
+
|
| 69 |
+
# Scrapy stuff:
|
| 70 |
+
.scrapy
|
| 71 |
+
|
| 72 |
+
# Sphinx documentation
|
| 73 |
+
docs/_build/
|
| 74 |
+
|
| 75 |
+
# PyBuilder
|
| 76 |
+
.pybuilder/
|
| 77 |
+
target/
|
| 78 |
+
|
| 79 |
+
# Jupyter Notebook
|
| 80 |
+
.ipynb_checkpoints
|
| 81 |
+
|
| 82 |
+
# IPython
|
| 83 |
+
profile_default/
|
| 84 |
+
ipython_config.py
|
| 85 |
+
|
| 86 |
+
# pyenv
|
| 87 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 88 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 89 |
+
# .python-version
|
| 90 |
+
|
| 91 |
+
# pipenv
|
| 92 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 93 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 94 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 95 |
+
# install all needed dependencies.
|
| 96 |
+
#Pipfile.lock
|
| 97 |
+
|
| 98 |
+
# UV
|
| 99 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 100 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 101 |
+
# commonly ignored for libraries.
|
| 102 |
+
#uv.lock
|
| 103 |
+
|
| 104 |
+
# poetry
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 106 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 107 |
+
# commonly ignored for libraries.
|
| 108 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 109 |
+
#poetry.lock
|
| 110 |
+
#poetry.toml
|
| 111 |
+
|
| 112 |
+
# pdm
|
| 113 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 114 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 115 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 116 |
+
#pdm.lock
|
| 117 |
+
#pdm.toml
|
| 118 |
+
.pdm-python
|
| 119 |
+
.pdm-build/
|
| 120 |
+
|
| 121 |
+
# pixi
|
| 122 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 123 |
+
#pixi.lock
|
| 124 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 125 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 126 |
+
.pixi
|
| 127 |
+
|
| 128 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 129 |
+
__pypackages__/
|
| 130 |
+
|
| 131 |
+
# Celery stuff
|
| 132 |
+
celerybeat-schedule
|
| 133 |
+
celerybeat.pid
|
| 134 |
+
|
| 135 |
+
# SageMath parsed files
|
| 136 |
+
*.sage.py
|
| 137 |
+
|
| 138 |
+
# Environments
|
| 139 |
+
.env
|
| 140 |
+
.envrc
|
| 141 |
+
.venv
|
| 142 |
+
env/
|
| 143 |
+
venv/
|
| 144 |
+
ENV/
|
| 145 |
+
env.bak/
|
| 146 |
+
venv.bak/
|
| 147 |
+
|
| 148 |
+
# Spyder project settings
|
| 149 |
+
.spyderproject
|
| 150 |
+
.spyproject
|
| 151 |
+
|
| 152 |
+
# Rope project settings
|
| 153 |
+
.ropeproject
|
| 154 |
+
|
| 155 |
+
# mkdocs documentation
|
| 156 |
+
/site
|
| 157 |
+
|
| 158 |
+
# mypy
|
| 159 |
+
.mypy_cache/
|
| 160 |
+
.dmypy.json
|
| 161 |
+
dmypy.json
|
| 162 |
+
|
| 163 |
+
# Pyre type checker
|
| 164 |
+
.pyre/
|
| 165 |
+
|
| 166 |
+
# pytype static type analyzer
|
| 167 |
+
.pytype/
|
| 168 |
+
|
| 169 |
+
# Cython debug symbols
|
| 170 |
+
cython_debug/
|
| 171 |
+
|
| 172 |
+
# PyCharm
|
| 173 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 174 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 175 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 176 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 177 |
+
#.idea/
|
| 178 |
+
|
| 179 |
+
# Abstra
|
| 180 |
+
# Abstra is an AI-powered process automation framework.
|
| 181 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 182 |
+
# Learn more at https://abstra.io/docs
|
| 183 |
+
.abstra/
|
| 184 |
+
|
| 185 |
+
# Visual Studio Code
|
| 186 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 187 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 188 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 189 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 190 |
+
# .vscode/
|
| 191 |
+
|
| 192 |
+
# Ruff stuff:
|
| 193 |
+
.ruff_cache/
|
| 194 |
+
|
| 195 |
+
# PyPI configuration file
|
| 196 |
+
.pypirc
|
| 197 |
+
|
| 198 |
+
# Cursor
|
| 199 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 200 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 201 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 202 |
+
.cursorignore
|
| 203 |
+
.cursorindexingignore
|
| 204 |
+
|
| 205 |
+
# Marimo
|
| 206 |
+
marimo/_static/
|
| 207 |
+
marimo/_lsp/
|
| 208 |
+
__marimo__/
|
LICENSE
ADDED
|
File without changes
|
README.md
CHANGED
|
@@ -1,12 +1,32 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# sentry-predict
|
| 2 |
+
|
| 3 |
+
ML Prediction Service for Project Sentinel: Predictive failure and performance forecasting for Solana RPC nodes.
|
| 4 |
+
|
| 5 |
+
## Installation
|
| 6 |
+
1. Clone the repo: `git clone https://github.com/your-org/sentry-predict.git`
|
| 7 |
+
2. Install: `pip install -e .` (editable mode)
|
| 8 |
+
3. Install dev deps: `pip install -r requirements-dev.txt`
|
| 9 |
+
4. Setup pre-commit: `pre-commit install`
|
| 10 |
+
|
| 11 |
+
## Usage
|
| 12 |
+
- Generate data: `generate-data`
|
| 13 |
+
- Train models: `train-sentry`
|
| 14 |
+
- Run API: `uvicorn api.main:app --reload`
|
| 15 |
+
|
| 16 |
+
## Testing
|
| 17 |
+
`pytest tests/`
|
| 18 |
+
|
| 19 |
+
## Contributing
|
| 20 |
+
1. Fork the repo
|
| 21 |
+
2. Create branch: `git checkout -b feature/xyz`
|
| 22 |
+
3. Commit with pre-commit
|
| 23 |
+
4. PR
|
| 24 |
+
|
| 25 |
+
## Architecture
|
| 26 |
+
- Data generation with AR(1) for time-series realism
|
| 27 |
+
- Feature engineering for trends/lags
|
| 28 |
+
- Models: Autoencoder (anomaly), LogisticRegression (failure), SARIMA (forecasting)
|
| 29 |
+
- API for predictions
|
| 30 |
+
|
| 31 |
+
## License
|
| 32 |
+
MIT
|
config.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data:
  processed_data_path: "./data/processed/engineered_metrics.csv"

models_dir:
  anomaly_model_path: "./models/anomaly_model.joblib"
  anomaly_scaler_path: "./models/anomaly_scaler.joblib"
  failure_model_path: "./models/failure_model.joblib"
  failure_scaler_path: "./models/failure_scaler.joblib"
  # Required by src/train.py when persisting per-node SARIMAX latency models.
  latency_model_dir: "./models"
|
data/processed/engineered_metrics.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/synthetic-data.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from datetime import timedelta
|
| 4 |
+
from typing import List
|
| 5 |
+
from src.utils import logger
|
| 6 |
+
def ar1_process(n: int, mean: float, std: float, phi: float = 0.8, seed_offset: int = 0) -> np.ndarray:
    """Generate a mean-reverting AR(1) time-series, clipped at zero.

    The process is x[t] = mean + phi * (x[t-1] - mean) + eps, with eps drawn
    from N(0, std * sqrt(1 - phi**2)) so the stationary standard deviation
    of the series is approximately `std`.

    Args:
        n (int): Number of samples.
        mean (float): Stationary mean the series reverts to.
        std (float): Target stationary standard deviation.
        phi (float): Autocorrelation coefficient, expected in (-1, 1).
        seed_offset (int): Offset added to the base seed for reproducibility.

    Returns:
        np.ndarray: Generated series of length `n`, clipped to be >= 0.
    """
    # Global seeding preserves the original call-for-call reproducibility.
    np.random.seed(42 + seed_offset)
    x = np.zeros(n)
    x[0] = np.random.normal(mean, std)
    innovation_std = std * np.sqrt(1 - phi ** 2)
    for t in range(1, n):
        # BUG FIX: revert towards `mean` instead of 0 — the previous recursion
        # (phi * x[t-1] + eps) made every series decay towards zero regardless
        # of the requested mean, so node baselines drifted away from their
        # configured cpu/latency means.
        x[t] = mean + phi * (x[t - 1] - mean) + np.random.normal(0, innovation_std)
    return np.clip(x, 0, None)
def generate_data() -> None:
    """Generate synthetic RPC metrics for all nodes and write CSVs to data/raw/.

    Produces one CSV per node plus a combined, time-sorted CSV. Each node gets
    AR(1) baselines for its metrics, with `n_ramps_per_node` linear degradation
    ramps injected and labelled `failure_imminent=1`.
    """
    try:
        import os  # local import: only needed for directory creation below

        np.random.seed(42)

        n_samples = 1000
        start_time = pd.Timestamp("2025-10-01 00:00:00")
        timestamps = pd.date_range(start_time, periods=n_samples, freq="1min")
        nodes = ["agave1", "agave2", "firedancer1", "firedancer2"]

        # Per-node baseline statistics (firedancer nodes are configured noisier).
        node_params = {
            "agave1": {"cpu_mean": 45, "cpu_var": 8, "latency_mean": 60, "latency_var": 10, "error_var": 0.005},
            "agave2": {"cpu_mean": 48, "cpu_var": 9, "latency_mean": 65, "latency_var": 12, "error_var": 0.006},
            "firedancer1": {"cpu_mean": 55, "cpu_var": 15, "latency_mean": 50, "latency_var": 20, "error_var": 0.01},
            "firedancer2": {"cpu_mean": 52, "cpu_var": 12, "latency_mean": 55, "latency_var": 18, "error_var": 0.009}
        }

        phi = 0.8
        n_ramps_per_node = 8
        ramp_length = 20
        all_data: List[pd.DataFrame] = []

        # BUG FIX: ensure the output directory exists before any to_csv call,
        # otherwise the first write raises FileNotFoundError on a fresh clone
        # (data/raw/ is gitignored).
        os.makedirs("data/raw", exist_ok=True)

        for node in nodes:
            params = node_params[node]
            cpu_base = ar1_process(n_samples, params["cpu_mean"], params["cpu_var"], phi, nodes.index(node) * 10)
            latency_base = ar1_process(n_samples, params["latency_mean"], params["latency_var"], phi, nodes.index(node) * 20)
            error_base = np.abs(ar1_process(n_samples, 0.02, params["error_var"], phi, nodes.index(node) * 30))
            mem_base = ar1_process(n_samples, 50, 10, phi, nodes.index(node) * 40)
            disk_base = ar1_process(n_samples, 40, 12, phi, nodes.index(node) * 50)
            block_gap_base = np.random.choice([0, 1], size=n_samples, p=[0.75, 0.25])

            ramp_starts = sorted(np.random.choice(n_samples - ramp_length, n_ramps_per_node, replace=False))
            cpu = cpu_base.copy()
            latency = latency_base.copy()
            error_rate = error_base.copy()
            mem = mem_base.copy()
            disk = disk_base.copy()
            block_gap = block_gap_base.copy()
            labels = np.zeros(n_samples, dtype=int)

            # Inject linear degradation ramps and mark them as imminent failures.
            for start in ramp_starts:
                ramp_cpu = np.linspace(0, 45, ramp_length)
                ramp_latency = np.linspace(0, 250, ramp_length)
                ramp_error = np.linspace(0, 0.4, ramp_length)
                ramp_mem = np.linspace(0, 30, ramp_length)
                ramp_disk = np.linspace(0, 150, ramp_length)
                ramp_gap = np.full(ramp_length, 5)

                cpu[start:start+ramp_length] += ramp_cpu
                latency[start:start+ramp_length] += ramp_latency
                error_rate[start:start+ramp_length] += ramp_error
                mem[start:start+ramp_length] += ramp_mem
                disk[start:start+ramp_length] += ramp_disk
                block_gap[start:start+ramp_length] = np.maximum(block_gap[start:start+ramp_length], ramp_gap)

                labels[start:start+ramp_length] = 1

            # Latency is re-derived from CPU and error rate so the metrics stay
            # correlated. NOTE: this intentionally overwrites the AR(1) latency
            # baseline and the latency ramps applied above.
            latency = 40 + cpu * 1.5 + error_rate * 200 + np.random.normal(0, 5, n_samples)
            latency = np.clip(latency, 20, 1000)

            node_data = []
            for i, t in enumerate(timestamps):
                node_data.append([
                    t, node,
                    round(cpu[i], 2), round(mem[i], 2), round(disk[i], 2),
                    round(latency[i], 2), round(error_rate[i], 3),
                    int(block_gap[i]), labels[i]
                ])

            node_df = pd.DataFrame(node_data, columns=[
                "timestamp", "node", "cpu_usage", "memory_usage", "disk_io",
                "rpc_latency_ms", "rpc_error_rate", "block_height_gap", "failure_imminent"
            ])

            node_filename = f"data/raw/{node}_metrics.csv"
            node_df.to_csv(node_filename, index=False)

            all_data.append(node_df)

        combined_df = pd.concat(all_data, ignore_index=True)
        combined_df = combined_df.sort_values(['timestamp', 'node']).reset_index(drop=True)
        combined_df.to_csv("data/raw/synthetic_rpc_metrics_realistic.csv", index=False)
        logger.info("Synthetic data generated.")
    except Exception as e:
        logger.error(f"Error generating data: {e}")
        raise

if __name__ == "__main__":
    generate_data()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas==2.2.3
|
| 2 |
+
numpy==1.26.4
|
| 3 |
+
scikit-learn==1.5.2
|
| 4 |
+
statsmodels==0.14.4
|
| 5 |
+
fastapi==0.115.0
|
| 6 |
+
uvicorn==0.30.6
|
| 7 |
+
joblib==1.4.2
|
| 8 |
+
pydantic==2.9.2
|
| 9 |
+
pytest==8.3.3
|
src/__init__.py
ADDED
|
File without changes
|
src/features.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/feature.py
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List
|
| 6 |
+
from src.utils import logger
|
| 7 |
+
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Engineer trend/lag/rolling features from raw node metrics.

    Args:
        df (pd.DataFrame): Raw data with at least `timestamp`, `node`,
            `cpu_usage`, `rpc_error_rate` and `rpc_latency_ms` columns.

    Returns:
        pd.DataFrame: Input data sorted by (node, timestamp) with added
        feature columns. NaNs introduced by diff/lag/rolling windows are
        zero-filled.

    Raises:
        KeyError: If a required column is missing.
    """
    try:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        # Sort per node so diffs/lags/rolls never cross node boundaries.
        df = df.sort_values(["node", "timestamp"])

        grouped = df.groupby("node")
        df["cpu_trend"] = grouped["cpu_usage"].transform(lambda x: x.diff())
        df["cpu_rolling_mean"] = grouped["cpu_usage"].transform(
            lambda x: x.rolling(window=5, min_periods=1).mean()
        )
        df["error_rate_lag1"] = grouped["rpc_error_rate"].shift(1)
        df["latency_rolling_std"] = grouped["rpc_latency_ms"].transform(
            lambda x: x.rolling(window=5).std()
        )

        df = df.fillna(0)

        return df

    except KeyError as e:
        logger.error(f"Missing Column in Data: {e}")
        raise
    except Exception as e:
        logger.error(f"Error engineering features: {e}")
        # BUG FIX: previously this branch swallowed the error and the function
        # implicitly returned None; re-raise so callers see the failure.
        raise
def main(
    input_path: str = "data/raw/synthetic_rpc_metrics_realistic.csv",
    output_path: str = "data/processed/engineered_metrics.csv",
) -> None:
    """Load raw metrics, engineer features and persist the result.

    Args:
        input_path (str): Path to the raw metrics CSV.
        output_path (str): Destination path for the engineered-features CSV.
    """
    try:
        df = pd.read_csv(input_path)
        df_engineered = engineer_features(df)
        df_engineered.to_csv(output_path, index=False)
        logger.info(f"Engineered features saved to {output_path}")
    except Exception as e:
        logger.error(f"Error in main function: {e}")
        # BUG FIX: re-raise so the CLI exits non-zero instead of logging and
        # silently reporting success after a failed run.
        raise


if __name__ == "__main__":
    main()
|
src/predict.py
ADDED
|
File without changes
|
src/train.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Dict, Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAX

from src.features import engineer_features
from src.utils import logger, load_config

# BUG FIX: removed `from pandas.core.computation.expr import
# _node_not_implemented` — an unused, private pandas internal that breaks
# across pandas versions.

# `load_config` is the already-parsed configuration dict (see src/utils.py),
# not a callable; `config` is simply an alias for it.
config = load_config
def train_anomaly_model(df: pd.DataFrame) -> Tuple[MLPRegressor, StandardScaler]:
    """Fit an MLP autoencoder on healthy samples for anomaly detection.

    Args:
        df (pd.DataFrame): Processed data containing a `failure_imminent` label.

    Returns:
        Tuple[MLPRegressor, StandardScaler]: Fitted autoencoder and the scaler
        applied to its inputs.
    """
    feature_columns = [
        "cpu_usage",
        "rpc_error_rate",
        "rpc_latency_ms",
        "cpu_trend",
        "cpu_rolling_mean",
    ]
    # Train only on rows labelled healthy, so reconstruction error is large
    # for anomalous inputs at inference time.
    healthy_inputs = df.loc[df["failure_imminent"] == 0, feature_columns]
    scaler = StandardScaler()
    scaled_inputs = scaler.fit_transform(healthy_inputs)
    autoencoder = MLPRegressor(
        hidden_layer_sizes=(10, 5, 10),
        activation="relu",
        solver="adam",
        max_iter=500,
        random_state=42,
    )
    # Autoencoder objective: reconstruct the (scaled) input from itself.
    autoencoder.fit(scaled_inputs, scaled_inputs)
    return autoencoder, scaler
def evaluate_anomaly_model(model: MLPRegressor, scaler: StandardScaler, X_test_scaled: np.ndarray) -> float:
    """Compute the autoencoder's reconstruction error on a held-out set.

    Args:
        model (MLPRegressor): Trained autoencoder.
        scaler (StandardScaler): Scaler used on the inputs (kept for interface
            compatibility; the data passed in is expected to be scaled already).
        X_test_scaled (np.ndarray): Scaled test data.

    Returns:
        float: Mean squared reconstruction error on the test set.
    """
    reconstruction = model.predict(X_test_scaled)
    return mean_squared_error(X_test_scaled, reconstruction)
def train_failure_model(df: pd.DataFrame) -> Tuple[LogisticRegression, StandardScaler, np.ndarray, np.ndarray]:
    """Fit a class-balanced logistic regression for failure prediction.

    Args:
        df (pd.DataFrame): Processed data.

    Returns:
        Tuple[LogisticRegression, StandardScaler, np.ndarray, np.ndarray]:
        Fitted model, fitted scaler, held-out test features and test labels.
    """
    feature_columns = [
        "cpu_usage",
        "rpc_error_rate",
        "rpc_latency_ms",
        "cpu_trend",
        "error_rate_lag1",
    ]
    scaler = StandardScaler()
    inputs_scaled = scaler.fit_transform(df[feature_columns])
    targets = df["failure_imminent"]
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_scaled, targets, test_size=0.2, random_state=42
    )
    # class_weight='balanced' compensates for the rare positive (failure) class.
    classifier = LogisticRegression(random_state=42, max_iter=200, class_weight='balanced')
    classifier.fit(X_train, y_train)
    return classifier, scaler, X_test, y_test
def evaluate_failure_model(model: LogisticRegression, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
    """Score the failure classifier on held-out data.

    Args:
        model (LogisticRegression): Trained failure prediction model.
        X_test (np.ndarray): Test features.
        y_test (np.ndarray): Test labels.

    Returns:
        Dict[str, float]: `accuracy`, `recall` and `f1_score` metrics.
    """
    predictions = model.predict(X_test)
    metrics: Dict[str, float] = {}
    for metric_name, scorer in (
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    ):
        metrics[metric_name] = scorer(y_test, predictions)
    return metrics
def train_latency_model(df: pd.DataFrame) -> Dict[str, SARIMAX]:
    """Train a SARIMAX latency-forecasting model per node.

    Args:
        df (pd.DataFrame): Processed data.

    Returns:
        Dict[str, SARIMAX]: Fitted results object per node name.
    """
    models = {}
    for node in df["node"].unique():
        node_df = df[df["node"] == node].sort_values("timestamp")
        series = node_df['rpc_latency_ms']
        # BUG FIX: the split previously used int(len(series)) — i.e. the
        # whole series — as the training size, so each model was trained on
        # the very data main() later evaluates it on (data leakage). Use the
        # same 80/20 split that main() uses for evaluation.
        train_size = int(len(series) * 0.8)
        train = series[:train_size]
        # Seasonal period 60 matches the 1-minute sampling (hourly season).
        model = SARIMAX(
            train,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 60),
            enforce_stationarity=False,
        )
        models[node] = model.fit(disp=False)

    return models
def evaluate_latency_model(model: SARIMAX, test_series: pd.Series) -> Tuple[float, float]:
    """Evaluate a fitted SARIMAX model on a held-out latency series.

    Args:
        model (SARIMAX): Fitted model (results object exposing `forecast`).
        test_series (pd.Series): Held-out latency observations.

    Returns:
        Tuple[float, float]: (RMSE, MAE) of the forecast against the test set.
    """
    forecast = model.forecast(steps=len(test_series))
    rmse = np.sqrt(mean_squared_error(test_series, forecast))
    mae = np.mean(np.abs(test_series - forecast))

    # BUG FIX (annotation/docs): the function always returned (rmse, mae) but
    # was annotated and documented as returning a single float.
    return rmse, mae
def main():
    """Train, evaluate and persist the anomaly, failure and latency models.

    Reads the processed dataset from the path in `config`, trains each model,
    logs its evaluation metrics and dumps the artifacts with joblib.
    """
    try:
        df = pd.read_csv(config["data"]["processed_data_path"])

        # Anomaly Model Training
        anomaly_model, anomaly_scaler = train_anomaly_model(df)
        healthy_df = df[df["failure_imminent"] == 0]
        # NOTE(review): this feature list duplicates the one hard-coded inside
        # train_anomaly_model — the two must be kept in sync.
        X_healthy = healthy_df[["cpu_usage", 'rpc_error_rate', 'rpc_latency_ms', 'cpu_trend', 'cpu_rolling_mean']]
        X_healthy_scaled = anomaly_scaler.transform(X_healthy)
        # Labels are irrelevant for the autoencoder; zeros are placeholders so
        # train_test_split can be reused for the feature split.
        X_train, X_test, _, _ = train_test_split(X_healthy_scaled, np.zeros(len(X_healthy)), test_size=0.2, random_state=42)
        anomaly_mse = evaluate_anomaly_model(anomaly_model, anomaly_scaler, X_test)
        logger.info(f"Anomaly Model MSE: {anomaly_mse}")
        joblib.dump(anomaly_model, config["models_dir"]["anomaly_model_path"])
        joblib.dump(anomaly_scaler, config["models_dir"]["anomaly_scaler_path"])

        # Failure Model Training
        failure_model, failure_scaler, X_test, y_test = train_failure_model(df)
        failure_metrics = evaluate_failure_model(failure_model, X_test, y_test)
        logger.info(f"Failure Model Metrics: {failure_metrics}")
        joblib.dump(failure_model, config["models_dir"]["failure_model_path"])
        joblib.dump(failure_scaler, config["models_dir"]["failure_scaler_path"])

        # Latency Model Training
        latency_models = train_latency_model(df)
        for node, model in latency_models.items():
            node_df = df[df["node"]== node].sort_values("timestamp")
            series = node_df['rpc_latency_ms']
            # 80/20 chronological split; the last 20% is the evaluation window.
            train_size = int(len(series)*0.8)
            train, test = series[:train_size], series[train_size:]
            latency_rmse, latency_mae = evaluate_latency_model(model, test)
            logger.info(f"Latency Model for {node} - RMSE: {latency_rmse}, MAE: {latency_mae}")
            # NOTE(review): config.yaml as shipped does not define a
            # `latency_model_dir` key under `models_dir` — confirm it exists
            # before running, otherwise this line raises KeyError.
            joblib.dump(model, f"{config['models_dir']['latency_model_dir']}/{node}_latency_model.joblib")

        logger.info("Training Completed Successfully.")

    except Exception as e:
        logger.error(f"Error in training process: {e}")
        raise

if __name__ == "__main__":
    main()
src/utils.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
def setup_logging():
    """Configure root logging (INFO, timestamped) and return a module logger."""
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    return logging.getLogger(__name__)
def get_config():
    """Load and return the YAML configuration from ./config.yaml.

    Returns:
        dict: Parsed configuration mapping.
    """
    # FIX: the yaml import previously sat *inside* the `with` block; hoist it
    # to the top of the function (kept function-local so importing this module
    # does not require PyYAML until a config is actually loaded).
    import yaml

    # Explicit encoding avoids platform-dependent default codecs.
    with open("./config.yaml", "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    logger.info(f"Configuration loaded successfully: {config}")
    return config
# Module-level singletons created at import time.
logger = setup_logging()
# NOTE(review): despite the callable-sounding name, `load_config` is the
# already-parsed config dict (get_config() is invoked here, at import time),
# not a function. The name is kept because src.train imports it as
# `load_config`.
load_config = get_config()
tests/test_features.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from src.features import engineer_features
|
@pytest.fixture
def sample_data():
    """Ten minutes of single-node metrics with CPU rising by 1 per minute."""
    n_rows = 10
    frame = pd.DataFrame(
        {
            "timestamp": pd.date_range(start="2025-01-01", periods=n_rows, freq="1min"),
            "node": ["agave1"] * n_rows,
            "cpu_usage": list(range(45, 55)),
            "rpc_error_rate": [0.1] * n_rows,
            "rpc_latency_ms": [60] * n_rows,
            "failure_imminent": [0] * n_rows,
        }
    )
    return frame
def test_engineer_features(sample_data):
    """Feature engineering adds trend/rolling columns without dropping rows."""
    processed_df = engineer_features(sample_data)

    for column in ("cpu_trend", "cpu_rolling_mean"):
        assert column in processed_df.columns
    assert len(processed_df) == 10
    # CPU rises by exactly 1 per minute, so the first diff is 1 (46 - 45).
    assert processed_df["cpu_trend"].iloc[1] == 1
    # Rolling mean over the first five readings (45..49) is 47.
    assert processed_df["cpu_rolling_mean"].iloc[4] == pytest.approx(47.0)
def test_engineer_features_empty():
    """An empty frame lacks the required columns and must raise KeyError."""
    empty_frame = pd.DataFrame()
    with pytest.raises(KeyError):
        engineer_features(empty_frame)
|