Spaces:
Sleeping
Sleeping
| """ | |
| Prepare baseline/reference data for drift detection. | |
| This script samples representative data from the training set. | |
| """ | |
| import pickle | |
| import pandas as pd | |
| import numpy as np | |
| import sqlite3 | |
| from pathlib import Path | |
| from sklearn.model_selection import train_test_split | |
# Paths
# Repository root: this file sits four directory levels below it.
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
# Directory where baseline/reference artifacts are written.
BASELINE_DIR = Path(__file__).parent.parent / "baseline"
# NOTE: import-time side effect — ensures the baseline directory exists.
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
def load_training_data():
    """Load the original training dataset from the SQLite database.

    Returns:
        pd.DataFrame: Up to 10,000 rows from the
        ``nlbse_tool_competition_data_by_issue`` table.

    Raises:
        FileNotFoundError: If the database file does not exist.
    """
    db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"
    if not db_path.exists():
        raise FileNotFoundError(f"Database not found at {db_path}")
    print(f"Loading data from database: {db_path}")
    conn = sqlite3.connect(db_path)
    try:
        # LIMIT keeps baseline preparation fast on large databases.
        query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
        df = pd.read_sql_query(query, conn)
    finally:
        # Close the connection even if the query fails (the original
        # leaked it on error).
        conn.close()
    print(f"Loaded {len(df)} training samples")
    return df
def prepare_baseline(df, sample_size=1000, random_state=42):
    """
    Sample representative baseline data.

    Uses stratified sampling when a 'label' column is present so the
    baseline keeps the training class distribution; otherwise samples
    uniformly at random.

    Args:
        df: Training dataframe
        sample_size: Number of samples for baseline
        random_state: Random seed for reproducibility

    Returns:
        Baseline dataframe with min(sample_size, len(df)) rows
    """
    # Clamp the request: train_test_split raises when test_size >= len(df),
    # so asking for more rows than exist crashed the stratified path before.
    n = min(sample_size, len(df))
    if 'label' in df.columns and n < len(df):
        # Stratified sampling preserves the label distribution
        _, baseline_df = train_test_split(
            df,
            test_size=n,
            random_state=random_state,
            stratify=df['label']
        )
    else:
        # Uniform sampling (also covers the "take everything" case)
        baseline_df = df.sample(n=n, random_state=random_state)
    print(f"Sampled {len(baseline_df)} baseline samples")
    return baseline_df
def extract_features(df):
    """
    Build the numeric feature matrix used for drift detection.

    Keeps every numeric column except known label/identifier columns,
    so the result should match the features consumed by the model.
    """
    # Identifier and target columns are never drift features.
    excluded = ['label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id']
    numeric = df.select_dtypes(include=[np.number]).columns
    feature_columns = [name for name in numeric if name not in excluded]
    X = df[feature_columns].values
    print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
    return X
def save_baseline(baseline_data, filename="reference_data.pkl"):
    """Persist the baseline array to disk as a pickle in BASELINE_DIR."""
    target = BASELINE_DIR / filename
    # Binary mode is required for pickle payloads.
    with open(target, 'wb') as fh:
        pickle.dump(baseline_data, fh)
    print(f"Baseline saved to {target}")
    print(f" Shape: {baseline_data.shape}")
    print(f" Size: {target.stat().st_size / 1024:.2f} KB")
def main():
    """Run the full baseline-preparation pipeline end to end."""
    banner = "=" * 60
    print(banner)
    print("Preparing Baseline Data for Drift Detection")
    print(banner)

    # Load the training data, sample the baseline subset,
    # reduce it to numeric features, then persist the result.
    training_df = load_training_data()
    baseline_df = prepare_baseline(training_df, sample_size=1000)
    baseline_features = extract_features(baseline_df)
    save_baseline(baseline_features)

    print("\n" + banner)
    print("Baseline preparation complete!")
    print(banner)
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()