File size: 3,340 Bytes
8ed9c1d 1396866 8ed9c1d 1396866 8ed9c1d 1396866 8ed9c1d 1396866 8ed9c1d 1396866 8ed9c1d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
"""
Prepare baseline/reference data for drift detection.
This script samples representative data from the training set.
"""
import pickle
import pandas as pd
import numpy as np
import sqlite3
from pathlib import Path
from sklearn.model_selection import train_test_split
# Paths
# Repo root is four levels above this file — assumes layout
# <root>/.../<pkg>/<subpkg>/this_script.py; TODO confirm against actual tree.
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
# Directory where the pickled reference data is written (sibling "baseline" dir).
BASELINE_DIR = Path(__file__).parent.parent / "baseline"
# NOTE: created eagerly at import time as a side effect of loading this module.
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
def load_training_data():
    """Load the original training dataset from the SQLite database.

    Reads up to 10,000 rows from the ``nlbse_tool_competition_data_by_issue``
    table in ``<PROJECT_ROOT>/data/raw/skillscope_data.db``.

    Returns:
        pandas.DataFrame with the sampled training rows.

    Raises:
        FileNotFoundError: If the database file does not exist.
    """
    db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"
    if not db_path.exists():
        raise FileNotFoundError(f"Database not found at {db_path}")
    print(f"Loading data from database: {db_path}")
    conn = sqlite3.connect(db_path)
    try:
        # Cap the row count so we never pull the whole table into memory;
        # the baseline only needs a representative sample downstream.
        query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
        df = pd.read_sql_query(query, conn)
    finally:
        # Close the connection even if the query raises, so it never leaks.
        conn.close()
    print(f"Loaded {len(df)} training samples")
    return df
def prepare_baseline(df, sample_size=1000, random_state=42):
    """
    Sample representative baseline data.

    Uses stratified sampling on the 'label' column when it is present and a
    strict subsample is requested; falls back to plain random sampling when
    there is no label column, when sample_size >= len(df) (the original
    stratified call would raise for test_size >= population), or when
    stratification is impossible (a class with fewer than 2 members).

    Args:
        df: Training dataframe
        sample_size: Number of samples for baseline (clamped to len(df))
        random_state: Random seed for reproducibility

    Returns:
        Baseline dataframe
    """
    # Clamp so we never request more rows than exist.
    n = min(sample_size, len(df))
    if 'label' in df.columns and n < len(df):
        try:
            _, baseline_df = train_test_split(
                df,
                test_size=n,
                random_state=random_state,
                stratify=df['label']
            )
        except ValueError:
            # Stratification fails when some class has < 2 members; degrade
            # gracefully to a plain random sample instead of crashing.
            baseline_df = df.sample(n=n, random_state=random_state)
    else:
        baseline_df = df.sample(n=n, random_state=random_state)
    print(f"Sampled {len(baseline_df)} baseline samples")
    return baseline_df
def extract_features(df):
    """
    Build the numeric feature matrix used for drift detection.

    Keeps every numeric column except known label/identifier columns, so the
    result matches the features consumed by the model.

    Args:
        df: Input dataframe.

    Returns:
        2-D numpy array of shape (n_samples, n_features).
    """
    # Columns that are numeric but are not model features.
    non_features = {'label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id'}
    numeric = df.select_dtypes(include=[np.number])
    keep = [name for name in numeric.columns if name not in non_features]
    X = df[keep].values
    print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
    return X
def save_baseline(baseline_data, filename="reference_data.pkl"):
    """Persist the baseline array as a pickle file under BASELINE_DIR."""
    out_path = BASELINE_DIR / filename
    with open(out_path, 'wb') as fh:
        pickle.dump(baseline_data, fh)
    # Report where the file landed plus a quick size/shape summary.
    print(f"Baseline saved to {out_path}")
    print(f" Shape: {baseline_data.shape}")
    print(f" Size: {out_path.stat().st_size / 1024:.2f} KB")
def main():
    """Run the full baseline-preparation pipeline: load, sample, extract, save."""
    banner = "=" * 60
    print(banner)
    print("Preparing Baseline Data for Drift Detection")
    print(banner)
    # Load data
    training_df = load_training_data()
    # Sample baseline
    reference_df = prepare_baseline(training_df, sample_size=1000)
    # Extract features
    features = extract_features(reference_df)
    # Save
    save_baseline(features)
    print("\n" + banner)
    print("Baseline preparation complete!")
    print(banner)


if __name__ == "__main__":
    main()