""" Prepare baseline/reference data for drift detection. This script samples representative data from the training set. """ import pickle import pandas as pd import numpy as np import sqlite3 from pathlib import Path from sklearn.model_selection import train_test_split # Paths PROJECT_ROOT = Path(__file__).parent.parent.parent.parent BASELINE_DIR = Path(__file__).parent.parent / "baseline" BASELINE_DIR.mkdir(parents=True, exist_ok=True) def load_training_data(): """Load the original training dataset from SQLite database.""" # Load from SQLite database db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db" if not db_path.exists(): raise FileNotFoundError(f"Database not found at {db_path}") print(f"Loading data from database: {db_path}") conn = sqlite3.connect(db_path) # Load from the main table query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000" df = pd.read_sql_query(query, conn) conn.close() print(f"Loaded {len(df)} training samples") return df def prepare_baseline(df, sample_size=1000, random_state=42): """ Sample representative baseline data. Args: df: Training dataframe sample_size: Number of samples for baseline random_state: Random seed for reproducibility Returns: Baseline dataframe """ # Stratified sampling if you have labels if 'label' in df.columns: _, baseline_df = train_test_split( df, test_size=sample_size, random_state=random_state, stratify=df['label'] ) else: baseline_df = df.sample(n=min(sample_size, len(df)), random_state=random_state) print(f"Sampled {len(baseline_df)} baseline samples") return baseline_df def extract_features(df): """ Extract features used for drift detection. Should match the features used by your model. """ # Select only numeric columns, exclude labels and IDs numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() exclude_cols = ['label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id'] feature_columns = [col for col in numeric_cols if col not in exclude_cols] X = df[feature_columns].values print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples") return X def save_baseline(baseline_data, filename="reference_data.pkl"): """Save baseline data to disk.""" baseline_path = BASELINE_DIR / filename with open(baseline_path, 'wb') as f: pickle.dump(baseline_data, f) print(f"Baseline saved to {baseline_path}") print(f" Shape: {baseline_data.shape}") print(f" Size: {baseline_path.stat().st_size / 1024:.2f} KB") def main(): """Main execution.""" print("=" * 60) print("Preparing Baseline Data for Drift Detection") print("=" * 60) # Load data df = load_training_data() # Sample baseline baseline_df = prepare_baseline(df, sample_size=1000) # Extract features X_baseline = extract_features(baseline_df) # Save save_baseline(X_baseline) print("\n" + "=" * 60) print("Baseline preparation complete!") print("=" * 60) if __name__ == "__main__": main()