# Update Grafana and Prometheus configurations, enhance drift detection
# scripts, and add monitoring dashboard (commit 1396866).
"""
Prepare baseline/reference data for drift detection.
This script samples representative data from the training set.
"""
import pickle
import pandas as pd
import numpy as np
import sqlite3
from pathlib import Path
from sklearn.model_selection import train_test_split
# Paths
# Project root: four directory levels up from this file — presumably the
# repository root; TODO confirm against the actual directory layout.
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
# Baseline artifacts live in a "baseline" directory next to this script's
# parent directory.
BASELINE_DIR = Path(__file__).parent.parent / "baseline"
# NOTE: the directory is created at import time as a module side effect.
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
def load_training_data():
    """Load the original training dataset from the SQLite database.

    Returns:
        DataFrame with up to 10,000 rows from the issue-level table.

    Raises:
        FileNotFoundError: if the database file does not exist.
    """
    db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"
    if not db_path.exists():
        raise FileNotFoundError(f"Database not found at {db_path}")
    print(f"Loading data from database: {db_path}")
    conn = sqlite3.connect(db_path)
    try:
        # Load from the main table; LIMIT keeps baseline preparation fast.
        query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
        df = pd.read_sql_query(query, conn)
    finally:
        # Close even when the query raises — the original leaked the
        # connection on error.
        conn.close()
    print(f"Loaded {len(df)} training samples")
    return df
def prepare_baseline(df, sample_size=1000, random_state=42):
    """
    Sample representative baseline data.

    Args:
        df: Training dataframe
        sample_size: Number of samples for baseline (capped at len(df))
        random_state: Random seed for reproducibility

    Returns:
        Baseline dataframe
    """
    # Cap in BOTH branches: the original only capped in the unlabeled
    # branch, so a stratified request with sample_size >= len(df) crashed
    # inside train_test_split (which requires test_size < len(df)).
    n = min(sample_size, len(df))
    if n >= len(df):
        # Asked for at least as many rows as exist: use everything.
        baseline_df = df
    elif 'label' in df.columns:
        # Stratified sampling keeps the baseline's class distribution
        # close to the training distribution.
        _, baseline_df = train_test_split(
            df,
            test_size=n,
            random_state=random_state,
            stratify=df['label']
        )
    else:
        baseline_df = df.sample(n=n, random_state=random_state)
    print(f"Sampled {len(baseline_df)} baseline samples")
    return baseline_df
def extract_features(df):
    """
    Extract the numeric feature matrix used for drift detection.

    Should match the features used by your model: every numeric column
    except known identifier/label columns.

    Args:
        df: Input dataframe.

    Returns:
        2-D numpy array of shape (n_samples, n_features).
    """
    # Identifier and target columns must never be treated as features.
    excluded = {'label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id'}
    numeric = df.select_dtypes(include=[np.number])
    keep = [name for name in numeric.columns if name not in excluded]
    X = numeric[keep].values
    print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
    return X
def save_baseline(baseline_data, filename="reference_data.pkl"):
    """Persist baseline data to the baseline directory as a pickle.

    Args:
        baseline_data: Array-like with a ``.shape`` attribute (e.g. ndarray).
        filename: Output file name inside BASELINE_DIR.
    """
    baseline_path = BASELINE_DIR / filename
    with baseline_path.open('wb') as f:
        pickle.dump(baseline_data, f)
    print(f"Baseline saved to {baseline_path}")
    print(f" Shape: {baseline_data.shape}")
    print(f" Size: {baseline_path.stat().st_size / 1024:.2f} KB")
def main():
    """Run the full baseline-preparation pipeline: load, sample, extract, save."""
    banner = "=" * 60
    print(banner)
    print("Preparing Baseline Data for Drift Detection")
    print(banner)
    # Load data
    df = load_training_data()
    # Sample baseline
    baseline_df = prepare_baseline(df, sample_size=1000)
    # Extract features
    X_baseline = extract_features(baseline_df)
    # Save
    save_baseline(X_baseline)
    print("\n" + banner)
    print("Baseline preparation complete!")
    print(banner)
# Script entry point: run the pipeline only when executed directly,
# not when imported.
if __name__ == "__main__":
    main()