|
|
""" |
|
|
Prepare baseline/reference data for drift detection. |
|
|
This script samples representative data from the training set. |
|
|
""" |
|
|
|
|
|
import pickle |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import sqlite3 |
|
|
from pathlib import Path |
|
|
from sklearn.model_selection import train_test_split |
|
|
|
|
|
|
|
|
# Repository root: four directory levels above this file — TODO confirm
# this matches the actual project layout if the script is moved.
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent

# Output directory for baseline/reference artifacts, a "baseline" folder
# next to this script's parent directory.
BASELINE_DIR = Path(__file__).parent.parent / "baseline"

# Created eagerly at import time so later writes cannot fail on a missing dir.
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
def load_training_data():
    """Load the original training dataset from the SQLite database.

    Reads up to 10,000 rows from the
    ``nlbse_tool_competition_data_by_issue`` table.

    Returns:
        pandas.DataFrame with the raw training samples.

    Raises:
        FileNotFoundError: if the database file does not exist.
    """
    db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"

    if not db_path.exists():
        raise FileNotFoundError(f"Database not found at {db_path}")

    print(f"Loading data from database: {db_path}")
    conn = sqlite3.connect(db_path)
    # try/finally guarantees the connection is closed even if the query
    # fails (the original leaked the connection on error).
    try:
        query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    print(f"Loaded {len(df)} training samples")
    return df
|
|
|
|
|
|
|
|
def prepare_baseline(df, sample_size=1000, random_state=42):
    """
    Sample representative baseline data.

    Uses stratified sampling on the 'label' column when present so class
    proportions are preserved; otherwise (or if stratification is not
    possible) falls back to simple random sampling.

    Args:
        df: Training dataframe
        sample_size: Number of samples for baseline (capped at len(df))
        random_state: Random seed for reproducibility

    Returns:
        Baseline dataframe
    """
    # Cap the request: train_test_split raises when test_size >= len(df),
    # and sampling more rows than exist (without replacement) is impossible.
    n = min(sample_size, len(df))

    baseline_df = None
    if 'label' in df.columns and n < len(df):
        try:
            _, baseline_df = train_test_split(
                df,
                test_size=n,
                random_state=random_state,
                stratify=df['label']
            )
        except ValueError:
            # Stratification fails when some class has fewer than 2 members;
            # degrade gracefully to plain random sampling instead of crashing.
            baseline_df = None

    if baseline_df is None:
        baseline_df = df.sample(n=n, random_state=random_state)

    print(f"Sampled {len(baseline_df)} baseline samples")
    return baseline_df
|
|
|
|
|
|
|
|
def extract_features(df):
    """
    Extract the numeric feature matrix used for drift detection.

    Should match the features used by your model.
    """
    # Identifier/label/metadata columns that must never be treated as features.
    excluded = {'label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id'}

    numeric = df.select_dtypes(include=[np.number])
    keep = [col for col in numeric.columns if col not in excluded]

    X = numeric[keep].values

    print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
    return X
|
|
|
|
|
|
|
|
def save_baseline(baseline_data, filename="reference_data.pkl"):
    """Persist the baseline array to disk as a pickle file."""
    baseline_path = BASELINE_DIR / filename

    with open(baseline_path, 'wb') as fh:
        pickle.dump(baseline_data, fh)

    size_kb = baseline_path.stat().st_size / 1024
    print(f"Baseline saved to {baseline_path}")
    print(f"  Shape: {baseline_data.shape}")
    print(f"  Size: {size_kb:.2f} KB")
|
|
|
|
|
|
|
|
def main():
    """Run the full baseline-preparation pipeline end to end."""
    banner = "=" * 60
    print(banner)
    print("Preparing Baseline Data for Drift Detection")
    print(banner)

    # Load -> sample -> featurize -> persist.
    df = load_training_data()
    baseline_df = prepare_baseline(df, sample_size=1000)
    X_baseline = extract_features(baseline_df)
    save_baseline(X_baseline)

    print("\n" + banner)
    print("Baseline preparation complete!")
    print(banner)
|
|
|
|
|
|
|
|
# Allow the module to be executed directly as a script.
if __name__ == "__main__":


    main()