| | """ |
| | Prepare baseline/reference data for drift detection. |
| | This script samples representative data from the training set. |
| | """ |
| |
|
| | import pickle |
| | import pandas as pd |
| | import numpy as np |
| | import sqlite3 |
| | from pathlib import Path |
| | from sklearn.model_selection import train_test_split |
| |
|
| | |
# Repository root, resolved relative to this file's location
# (assumes a src/<pkg>/<subpkg>/ layout — four parents up). TODO confirm.
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
# Directory where baseline/reference artifacts are written.
BASELINE_DIR = Path(__file__).parent.parent / "baseline"
# NOTE: directory is created at import time as a module-load side effect.
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
| |
|
| |
|
def load_training_data():
    """Load the original training dataset from the SQLite database.

    Reads up to 10 000 rows from the
    ``nlbse_tool_competition_data_by_issue`` table under
    ``<PROJECT_ROOT>/data/raw/skillscope_data.db``.

    Returns:
        pandas.DataFrame with the loaded training samples.

    Raises:
        FileNotFoundError: if the database file does not exist.
    """
    db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"

    if not db_path.exists():
        raise FileNotFoundError(f"Database not found at {db_path}")

    print(f"Loading data from database: {db_path}")
    query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"

    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Original code leaked the connection if the query raised;
        # always release it. (sqlite3's own `with` only manages
        # transactions, not closing, so try/finally is the right tool.)
        conn.close()

    print(f"Loaded {len(df)} training samples")
    return df
| |
|
| |
|
def prepare_baseline(df, sample_size=1000, random_state=42):
    """
    Sample representative baseline data.

    Uses a stratified split on the 'label' column when present, so the
    baseline preserves the training class distribution; otherwise (or if
    stratification is impossible) falls back to a plain random sample.

    Args:
        df: Training dataframe
        sample_size: Number of samples for baseline (capped at len(df))
        random_state: Random seed for reproducibility

    Returns:
        Baseline dataframe with min(sample_size, len(df)) rows
    """
    # Cap the request: the original stratified branch raised ValueError
    # whenever sample_size >= len(df) (test_size must be < n_samples).
    n = min(sample_size, len(df))

    if 'label' in df.columns and n < len(df):
        try:
            _, baseline_df = train_test_split(
                df,
                test_size=n,
                random_state=random_state,
                stratify=df['label'],
            )
        except ValueError:
            # Stratification fails when some class has fewer than 2
            # members; degrade gracefully to an unstratified sample.
            baseline_df = df.sample(n=n, random_state=random_state)
    else:
        baseline_df = df.sample(n=n, random_state=random_state)

    print(f"Sampled {len(baseline_df)} baseline samples")
    return baseline_df
| |
|
| |
|
def extract_features(df):
    """
    Extract features used for drift detection.
    Should match the features used by your model.

    Keeps every numeric column except known identifier/label columns
    and returns the result as a 2-D NumPy array.
    """
    # Identifier-like and target columns must never count as features.
    non_features = {'label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id'}

    numeric = df.select_dtypes(include=[np.number])
    keep = [column for column in numeric.columns if column not in non_features]

    X = numeric[keep].values

    print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
    return X
| |
|
| |
|
def save_baseline(baseline_data, filename="reference_data.pkl"):
    """Save baseline data to disk.

    Pickles `baseline_data` (expected to expose a `.shape` attribute,
    e.g. a NumPy array) into BASELINE_DIR under `filename` and reports
    the destination, shape, and on-disk size.
    """
    destination = BASELINE_DIR / filename

    with destination.open('wb') as fh:
        pickle.dump(baseline_data, fh)

    size_kb = destination.stat().st_size / 1024
    print(f"Baseline saved to {destination}")
    print(f" Shape: {baseline_data.shape}")
    print(f" Size: {size_kb:.2f} KB")
| |
|
| |
|
def main():
    """Run the end-to-end baseline preparation pipeline:
    load training data, sample a baseline, extract numeric features,
    and persist them for later drift comparison."""
    banner = "=" * 60
    print(banner)
    print("Preparing Baseline Data for Drift Detection")
    print(banner)

    training_df = load_training_data()
    baseline_df = prepare_baseline(training_df, sample_size=1000)
    features = extract_features(baseline_df)
    save_baseline(features)

    print("\n" + banner)
    print("Baseline preparation complete!")
    print(banner)
| |
|
| |
|
# Allow running this module directly as a standalone script.
if __name__ == "__main__":
    main()