File size: 3,340 Bytes
8ed9c1d
 
 
 
 
 
 
 
1396866
8ed9c1d
 
 
 
 
 
 
 
 
 
1396866
 
 
8ed9c1d
1396866
 
 
 
 
 
 
 
 
 
8ed9c1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1396866
 
 
 
 
8ed9c1d
 
1396866
8ed9c1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Prepare baseline/reference data for drift detection.
This script samples representative data from the training set.
"""

import pickle
import pandas as pd
import numpy as np
import sqlite3
from pathlib import Path
from sklearn.model_selection import train_test_split

# Paths
# Repository root: assumes this file lives four directory levels below it
# (e.g. <root>/src/<pkg>/monitoring/<script>.py) — TODO confirm layout.
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
# Output directory for baseline/reference artifacts, a sibling of this module's package.
BASELINE_DIR = Path(__file__).parent.parent / "baseline"
# Created eagerly at import time so later writes never fail; idempotent.
BASELINE_DIR.mkdir(parents=True, exist_ok=True)


def load_training_data():
    """Load the original training dataset from the SQLite database.

    Reads up to 10,000 rows from the main competition table under
    ``PROJECT_ROOT/data/raw/skillscope_data.db``.

    Returns:
        pd.DataFrame: The loaded training samples.

    Raises:
        FileNotFoundError: If the database file does not exist.
    """
    db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"

    if not db_path.exists():
        raise FileNotFoundError(f"Database not found at {db_path}")

    print(f"Loading data from database: {db_path}")
    conn = sqlite3.connect(db_path)

    # try/finally guarantees the connection is released even if the query
    # fails (the original leaked the connection on a read error).
    try:
        # Load from the main table
        query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    print(f"Loaded {len(df)} training samples")
    return df


def prepare_baseline(df, sample_size=1000, random_state=42):
    """
    Sample representative baseline data.

    Uses stratified sampling when a 'label' column is present, so the
    baseline preserves class proportions; otherwise samples uniformly.

    Args:
        df: Training dataframe
        sample_size: Number of samples for baseline
        random_state: Random seed for reproducibility

    Returns:
        Baseline dataframe (at most len(df) rows)
    """
    # Fix: the original passed sample_size to train_test_split unclamped,
    # crashing whenever sample_size >= len(df). Clamp once for both branches.
    n = min(sample_size, len(df))

    # Stratified sampling if you have labels (only meaningful for a proper
    # subset; taking all rows falls through to a plain shuffle-sample).
    if 'label' in df.columns and n < len(df):
        _, baseline_df = train_test_split(
            df,
            test_size=n,
            random_state=random_state,
            stratify=df['label']
        )
    else:
        baseline_df = df.sample(n=n, random_state=random_state)

    print(f"Sampled {len(baseline_df)} baseline samples")
    return baseline_df


def extract_features(df):
    """
    Build the numeric feature matrix used for drift detection.

    Keeps only numeric columns, dropping labels and identifier-like
    columns. Should match the features used by your model.
    """
    # Numeric columns that carry no predictive signal and must be dropped.
    excluded = {'label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id'}

    numeric = df.select_dtypes(include=[np.number])
    keep = [column for column in numeric.columns if column not in excluded]

    X = df[keep].values

    print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
    return X


def save_baseline(baseline_data, filename="reference_data.pkl"):
    """Persist the baseline feature matrix to BASELINE_DIR via pickle.

    Assumes baseline_data has a .shape attribute (e.g. a numpy array).
    """
    baseline_path = BASELINE_DIR / filename

    # pathlib's open() keeps everything on Path objects end to end.
    with baseline_path.open('wb') as fh:
        pickle.dump(baseline_data, fh)

    print(f"Baseline saved to {baseline_path}")
    print(f"   Shape: {baseline_data.shape}")
    print(f"   Size: {baseline_path.stat().st_size / 1024:.2f} KB")


def main():
    """Run the full baseline-preparation pipeline end to end."""
    banner = "=" * 60
    print(banner)
    print("Preparing Baseline Data for Drift Detection")
    print(banner)

    # Pipeline: load -> sample -> featurize -> persist.
    df = load_training_data()
    baseline_df = prepare_baseline(df, sample_size=1000)
    X_baseline = extract_features(baseline_df)
    save_baseline(X_baseline)

    print("\n" + banner)
    print("Baseline preparation complete!")
    print(banner)


if __name__ == "__main__":
    main()