| | """ |
| | Prepare baseline/reference data for drift detection. |
| | This script samples representative data from the training set. |
| | """ |
| |
|
| | import pickle |
| | import pandas as pd |
| | import numpy as np |
| | import sqlite3 |
| | from pathlib import Path |
| | from sklearn.model_selection import train_test_split |
| |
|
| | |
# Repository root, resolved relative to this file's location
# (assumes a src/<pkg>/<subpkg>/ layout — four parents up). TODO confirm.
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
# Directory where baseline/reference artifacts are written.
BASELINE_DIR = Path(__file__).parent.parent / "baseline"
# NOTE: directory is created at import time as a module-load side effect.
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
| |
|
| |
|
def load_training_data():
    """Load the original training dataset from the SQLite database.

    Reads up to 10 000 rows from the
    ``nlbse_tool_competition_data_by_issue`` table under
    ``<PROJECT_ROOT>/data/raw/skillscope_data.db``.

    Returns:
        pandas.DataFrame with the loaded training samples.

    Raises:
        FileNotFoundError: if the database file does not exist.
    """
    db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"

    if not db_path.exists():
        raise FileNotFoundError(f"Database not found at {db_path}")

    print(f"Loading data from database: {db_path}")
    query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"

    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Original code leaked the connection if the query raised;
        # always release it. (sqlite3's own `with` only manages
        # transactions, not closing, so try/finally is the right tool.)
        conn.close()

    print(f"Loaded {len(df)} training samples")
    return df
| |
|
| |
|
def prepare_baseline(df, sample_size=1000, random_state=42):
    """
    Sample representative baseline data.

    Uses a stratified split on the 'label' column when present, so the
    baseline preserves the training class distribution; otherwise (or if
    stratification is impossible) falls back to a plain random sample.

    Args:
        df: Training dataframe
        sample_size: Number of samples for baseline (capped at len(df))
        random_state: Random seed for reproducibility

    Returns:
        Baseline dataframe with min(sample_size, len(df)) rows
    """
    # Cap the request: the original stratified branch raised ValueError
    # whenever sample_size >= len(df) (test_size must be < n_samples).
    n = min(sample_size, len(df))

    if 'label' in df.columns and n < len(df):
        try:
            _, baseline_df = train_test_split(
                df,
                test_size=n,
                random_state=random_state,
                stratify=df['label'],
            )
        except ValueError:
            # Stratification fails when some class has fewer than 2
            # members; degrade gracefully to an unstratified sample.
            baseline_df = df.sample(n=n, random_state=random_state)
    else:
        baseline_df = df.sample(n=n, random_state=random_state)

    print(f"Sampled {len(baseline_df)} baseline samples")
    return baseline_df
| |
|
| |
|
def extract_features(df):
    """
    Extract features used for drift detection.
    Should match the features used by your model.

    Keeps every numeric column except known identifier/label columns
    and returns the result as a 2-D NumPy array.
    """
    # Identifier-like and target columns must never count as features.
    non_features = {'label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id'}

    numeric = df.select_dtypes(include=[np.number])
    keep = [column for column in numeric.columns if column not in non_features]

    X = numeric[keep].values

    print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
    return X
| |
|
| |
|
def save_baseline(baseline_data, filename="reference_data.pkl"):
    """Save baseline data to disk.

    Pickles `baseline_data` (expected to expose a `.shape` attribute,
    e.g. a NumPy array) into BASELINE_DIR under `filename` and reports
    the destination, shape, and on-disk size.
    """
    destination = BASELINE_DIR / filename

    with destination.open('wb') as fh:
        pickle.dump(baseline_data, fh)

    size_kb = destination.stat().st_size / 1024
    print(f"Baseline saved to {destination}")
    print(f" Shape: {baseline_data.shape}")
    print(f" Size: {size_kb:.2f} KB")
| |
|
| |
|
def main():
    """Run the end-to-end baseline preparation pipeline:
    load training data, sample a baseline, extract numeric features,
    and persist them for later drift comparison."""
    banner = "=" * 60
    print(banner)
    print("Preparing Baseline Data for Drift Detection")
    print(banner)

    training_df = load_training_data()
    baseline_df = prepare_baseline(training_df, sample_size=1000)
    features = extract_features(baseline_df)
    save_baseline(features)

    print("\n" + banner)
    print("Baseline preparation complete!")
    print(banner)
| |
|
| |
|
# Allow running this module directly as a standalone script.
if __name__ == "__main__":
    main()