Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
38593e7
1
Parent(s):
8e13241
Sync turing folder from GitHub
Browse files- turing/CLI_runner/verify_drift_detection.py +335 -0
- turing/config.py +6 -0
- turing/monitoring/__init__.py +27 -0
- turing/monitoring/baseline_manager.py +148 -0
- turing/monitoring/drift_detector.py +353 -0
- turing/monitoring/feedback/feedback_data.csv +3 -0
- turing/monitoring/feedback_manager.py +65 -0
- turing/monitoring/mlflow_logger.py +97 -0
- turing/monitoring/synthetic_data_generator.py +240 -0
- turing/tests/unit/test_monitoring.py +126 -0
turing/CLI_runner/verify_drift_detection.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import dagshub
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from mlflow.tracking import MlflowClient
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import typer
|
| 11 |
+
|
| 12 |
+
from turing import config
|
| 13 |
+
from turing.modeling.model_selector import get_best_model_by_tag
|
| 14 |
+
from turing.monitoring.baseline_manager import extract_baseline_statistics
|
| 15 |
+
from turing.monitoring.drift_detector import DriftDetector
|
| 16 |
+
from turing.monitoring.feedback_manager import load_feedback_for_language
|
| 17 |
+
from turing.monitoring.synthetic_data_generator import SyntheticDataGenerator
|
| 18 |
+
|
| 19 |
+
app = typer.Typer()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_training_data(dataset_name: str, language: str):
    """
    Load training data for a specific programming language.

    Args:
        dataset_name: Dataset name (e.g., 'clean-k5000')
        language: Programming language (java, python, pharo)

    Returns:
        Tuple of (texts, labels): texts as a list of strings, labels as a
        numpy array (parsed from serialized lists when stored as strings).

    Raises:
        FileNotFoundError: If the dataset directory or the language's
            training CSV cannot be found.
    """
    import ast

    dataset_path = config.INTERIM_DATA_DIR / "features" / dataset_name

    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset path not found: {dataset_path}")

    # Take the first matching training CSV for this language (if any).
    train_file = next(dataset_path.rglob(f"{language}_train*.csv"), None)

    if not train_file:
        raise FileNotFoundError(f"Training file not found for {language} in {dataset_path}")

    logger.info(f"Loading training data from: {train_file}")
    df = pd.read_csv(train_file)

    X_train = df[config.INPUT_COLUMN].tolist()

    if isinstance(df[config.LABEL_COLUMN].iloc[0], str):
        # Labels are serialized lists (e.g. "[0, 1, 0]"). Parse them with
        # ast.literal_eval, which accepts only Python literals — unlike
        # eval(), which would execute arbitrary code embedded in the CSV.
        y_train = np.array([ast.literal_eval(label) for label in df[config.LABEL_COLUMN]])
    else:
        y_train = df[config.LABEL_COLUMN].values

    logger.success(f"Loaded {len(X_train)} training samples for {language}")
    return X_train, y_train
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def print_drift_report(drift_results: dict, drift_type: str, report_lines: list = None):
    """
    Format and display drift detection results for a specific drift type.

    Args:
        drift_results: Dictionary with drift detection metrics and alerts
        drift_type: Name of drift type tested (e.g., 'none', 'text_length_short')
        report_lines: Optional list to collect formatted report lines
    """

    def emit(line: str):
        # Log the line and, when a collector was supplied, also keep it
        # for the saved report.
        logger.info(line)
        if report_lines is not None:
            report_lines.append(line)

    separator = "=" * 60

    emit(f"\n{separator}")
    emit(f"DRIFT DETECTION REPORT - {drift_type.upper()}")
    emit(f"{separator}")

    for name, metric in drift_results.items():
        if name == "overall":
            continue

        # p_value may be absent for deepchecks results; fall back to the
        # nested check_result "passed" flag when present.
        p_value = metric.get("p_value", metric.get("check_result", {}).get("passed", None))
        statistic = metric.get("statistic", None)
        has_drift = metric.get("drifted", False)
        has_alert = metric.get("alert", False)

        status = "ALERT" if has_alert else ("DRIFT" if has_drift else "OK")

        emit(f"\n{name.upper()}")
        emit(f" Status: {status}")
        if p_value is not None:
            emit(f" P-value: {p_value:.6f}")
        if statistic is not None:
            emit(f" Statistic: {statistic:.6f}")
        emit(f" Drift detected: {has_drift}")
        emit(f" Critical alert: {has_alert}")
        emit(f" Method: {metric.get('method', 'unknown')}")

    overall = drift_results.get("overall", {})

    emit(f"\n{separator}")
    emit("OVERALL SUMMARY")
    emit(f" Drift detected: {overall.get('drifted', False)}")
    emit(f" Critical alert: {overall.get('alert', False)}")
    emit(f" Number of drifted metrics: {overall.get('num_drifts', 0)}")
    emit(f" Methods used: {overall.get('methods', [])}")
    emit(f"{separator}\n")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def save_drift_report(
    language: str,
    dataset_name: str,
    baseline_stats: dict,
    test_results: dict,
    report_text: str,
):
    """
    Save drift detection report to TXT and JSON files.

    Args:
        language: Programming language tested
        dataset_name: Name of dataset used
        baseline_stats: Baseline statistics dictionary
        test_results: Dictionary with test results for each drift type
        report_text: Formatted report text
    """

    def _to_builtin(value):
        # Recursively convert numpy scalars/arrays into plain Python types
        # so json.dump can serialize the results.
        if isinstance(value, dict):
            return {key: _to_builtin(val) for key, val in value.items()}
        if isinstance(value, list):
            return [_to_builtin(item) for item in value]
        if isinstance(value, np.bool_):
            return bool(value)
        if isinstance(value, np.integer):
            return int(value)
        if isinstance(value, np.floating):
            return float(value)
        if isinstance(value, np.ndarray):
            return value.tolist()
        return value

    monitoring_dir = config.REPORTS_DIR / "monitoring"
    monitoring_dir.mkdir(parents=True, exist_ok=True)

    divider = "\n" + "=" * 80 + "\n\n"

    # Human-readable TXT report.
    report_file = monitoring_dir / f"drift_report_{language}.txt"
    txt_lines = [
        "DRIFT DETECTION REPORT\n",
        f"Language: {language}\n",
        f"Dataset: {dataset_name}\n",
        f"Timestamp: {datetime.now().isoformat()}\n",
        f"P-value threshold: {config.DRIFT_P_VALUE_THRESHOLD}\n",
        f"Alert threshold: {config.DRIFT_ALERT_THRESHOLD}\n",
        divider,
        "BASELINE STATISTICS\n",
        f" Text length: mean={baseline_stats['text_length_mean']:.2f}, std={baseline_stats['text_length_std']:.2f}\n",
        f" Word count: mean={baseline_stats['word_count_mean']:.2f}, std={baseline_stats['word_count_std']:.2f}\n",
        f" Label counts: {baseline_stats['label_counts']}\n",
        f" Number of samples: {baseline_stats['num_samples']}\n",
        divider,
        report_text,
    ]
    with open(report_file, "w") as f:
        f.writelines(txt_lines)

    # Machine-readable JSON report.
    json_file = monitoring_dir / f"drift_report_{language}.json"
    report_data = {
        "language": language,
        "dataset": dataset_name,
        "timestamp": datetime.now().isoformat(),
        "config": {
            "p_value_threshold": config.DRIFT_P_VALUE_THRESHOLD,
            "alert_threshold": config.DRIFT_ALERT_THRESHOLD,
        },
        "baseline": {
            key: baseline_stats[key]
            for key in (
                "text_length_mean",
                "text_length_std",
                "word_count_mean",
                "word_count_std",
                "label_counts",
                "num_samples",
                "n_labels",
            )
        },
        "test_results": _to_builtin(test_results),
    }

    with open(json_file, "w") as f:
        json.dump(report_data, f, indent=2)

    logger.success("Report saved to:")
    logger.info(f" Text: {report_file}")
    logger.info(f" JSON: {json_file}")
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
@app.command()
def verify(
    language: str = typer.Option("java", help="Language to test (java, python, pharo)"),
    repo_owner: str = typer.Option("se4ai2526-uniba", help="DagsHub repository owner"),
    repo_name: str = typer.Option("Turing", help="DagsHub repository name"),
    n_samples: int = typer.Option(100, help="Number of samples for synthetic data generation"),
    use_feedback: bool = typer.Option(False, help="Include user feedback rows in drift analysis"),
    feedback_path: Path = typer.Option(
        config.PROJ_ROOT / "turing" / "monitoring" / "feedback" / "feedback_data.csv",
        help="Path to user feedback CSV",
    ),
):
    """
    Verify drift detection on best model's training dataset.
    """
    logger.info("Starting drift detection verification...")
    logger.info("Configuration:")
    logger.info(f" Language: {language}")
    logger.info(f" P-value threshold: {config.DRIFT_P_VALUE_THRESHOLD}")
    logger.info(f" Alert threshold: {config.DRIFT_ALERT_THRESHOLD}")
    logger.info(f" Baseline cache: {config.BASELINE_CACHE_DIR}")

    # Connect MLflow tracking to the DagsHub-hosted server.
    dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)

    logger.info(f"\n[1/6] Searching for best model for {language}...")
    best_model = get_best_model_by_tag(language=language)
    if not best_model:
        logger.error(f"No best model found for {language}")
        return

    run_id = best_model["run_id"]

    logger.info(f"\n[2/6] Retrieving dataset information from MLflow run {run_id}...")
    run = MlflowClient().get_run(run_id)
    dataset_name = run.data.tags.get("dataset_name", None)
    if not dataset_name:
        logger.error("Dataset name not found in run tags")
        return

    logger.success(f"Found dataset: {dataset_name}")

    logger.info("\n[3/6] Loading training data...")
    try:
        X_train, y_train = load_training_data(dataset_name, language)
    except Exception as e:
        logger.error(f"Failed to load training data: {e}")
        return

    logger.info("\n[4/6] Extracting baseline statistics...")
    baseline_stats = extract_baseline_statistics(X_train, y_train, language)
    logger.success("Baseline extracted:")
    logger.info(
        f" Text length: mean={baseline_stats['text_length_mean']:.2f}, std={baseline_stats['text_length_std']:.2f}"
    )
    logger.info(
        f" Word count: mean={baseline_stats['word_count_mean']:.2f}, std={baseline_stats['word_count_std']:.2f}"
    )
    logger.info(f" Label counts: {baseline_stats['label_counts']}")

    logger.info("\n[5/6] Initializing drift detection components...")
    detector = DriftDetector()
    generator = SyntheticDataGenerator(seed=42)

    feedback_texts, feedback_labels = [], np.array([])
    if use_feedback:
        try:
            feedback_texts, feedback_labels = load_feedback_for_language(feedback_path, language)
        except Exception as e:
            # Best-effort: missing/invalid feedback must not stop the run.
            logger.warning(f"Feedback load skipped: {e}")

    logger.info("\n[6/6] Testing drift detection on different data types...\n")

    test_cases = [
        ("NORMAL DATA (no drift expected)", "none"),
        ("SHORT TEXT DRIFT", "text_length_short"),
        ("LONG TEXT DRIFT", "text_length_long"),
        ("CORRUPTED VOCABULARY DRIFT", "corrupted_vocab"),
        ("CLASS IMBALANCE DRIFT", "class_imbalance"),
    ]
    if use_feedback and len(feedback_texts) > 0:
        test_cases.append(("USER FEEDBACK", "feedback"))

    all_test_results = {}
    all_report_lines = []
    banner = "#" * 60

    for test_name, drift_type in test_cases:
        logger.info(f"\n{banner}")
        logger.info(f"Test: {test_name}")
        logger.info(f"{banner}")

        if drift_type == "feedback":
            production_texts, production_labels = feedback_texts, feedback_labels
        else:
            production_texts, production_labels = generator.generate_synthetic_batch(
                reference_texts=X_train,
                reference_labels=y_train,
                drift_type=drift_type,
                batch_size=n_samples,
            )

        drift_results = detector.detect_all_drifts(
            production_texts=production_texts,
            production_labels=production_labels,
            reference_texts=X_train,
            reference_labels=y_train,
        )

        all_test_results[drift_type] = drift_results
        print_drift_report(drift_results, drift_type, report_lines=all_report_lines)

    logger.info("\nSaving drift detection report...")
    save_drift_report(
        language=language,
        dataset_name=dataset_name,
        baseline_stats=baseline_stats,
        test_results=all_test_results,
        report_text="\n".join(all_report_lines),
    )

    logger.success("\nDrift detection verification completed!")
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
if __name__ == "__main__":
|
| 335 |
+
app()
|
turing/config.py
CHANGED
|
@@ -49,6 +49,12 @@ MAX_AVG_FLOPS = 5000.0 # GFLOPS
|
|
| 49 |
# Training parameters
|
| 50 |
DEFAULT_BATCH_SIZE = 32
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
# Model configuration mapping
|
| 53 |
MODEL_CONFIG = {
|
| 54 |
"codeberta": {
|
|
|
|
| 49 |
# Training parameters
|
| 50 |
DEFAULT_BATCH_SIZE = 32
|
| 51 |
|
| 52 |
+
# Drift detection parameters
DRIFT_P_VALUE_THRESHOLD = 0.05  # P-value threshold for drift detection warning
DRIFT_ALERT_THRESHOLD = 0.01  # P-value threshold for drift alert (critical)
BASELINE_CACHE_DIR = Path.home() / ".turing_baselines"  # Local cache for baseline statistics
DRIFT_DETECTION_ENABLED = True  # Enable/disable drift detection globally
|
| 57 |
+
|
| 58 |
# Model configuration mapping
|
| 59 |
MODEL_CONFIG = {
|
| 60 |
"codeberta": {
|
turing/monitoring/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Drift Monitoring Module
|
| 3 |
+
|
| 4 |
+
This module provides tools for detecting and logging data drift in production predictions
|
| 5 |
+
against reference baselines from training data. It integrates with MLflow for baseline
|
| 6 |
+
storage and drift metric logging.
|
| 7 |
+
|
| 8 |
+
Components:
|
| 9 |
+
- drift_detector: Core drift detection using statistical tests (KS test, Chi-square)
|
| 10 |
+
- baseline_manager: Extract and manage baseline statistics from training data
|
| 11 |
+
- mlflow_logger: Log drift metrics and alerts to MLflow
|
| 12 |
+
- synthetic_data_generator: Generate synthetic drifted data for testing
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from turing.monitoring.baseline_manager import (
|
| 16 |
+
BaselineManager,
|
| 17 |
+
extract_baseline_statistics,
|
| 18 |
+
)
|
| 19 |
+
from turing.monitoring.drift_detector import DriftDetector
|
| 20 |
+
from turing.monitoring.mlflow_logger import DriftLogger
|
| 21 |
+
|
| 22 |
+
__all__ = [
|
| 23 |
+
"DriftDetector",
|
| 24 |
+
"BaselineManager",
|
| 25 |
+
"DriftLogger",
|
| 26 |
+
"extract_baseline_statistics",
|
| 27 |
+
]
|
turing/monitoring/baseline_manager.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline Management Module
|
| 3 |
+
|
| 4 |
+
Handles extraction of baseline statistics from training data,
|
| 5 |
+
storage as MLflow artifacts, and retrieval for drift detection.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import pickle
|
| 11 |
+
from typing import Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
from loguru import logger
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
from turing import config
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
import mlflow
|
| 20 |
+
from mlflow.tracking import MlflowClient
|
| 21 |
+
except ImportError:
|
| 22 |
+
mlflow = None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def extract_baseline_statistics(
    X_train: List[str],
    y_train: np.ndarray,
    language: str = "java",
) -> Dict:
    """
    Extract baseline statistics from training data.

    Args:
        X_train: List of training comment texts
        y_train: Training labels (binary matrix or label indices)
        language: Language of the training data

    Returns:
        Dictionary containing baseline statistics
    """
    lengths = np.array([len(text) for text in X_train])
    words = np.array([len(text.split()) for text in X_train])

    if y_train.ndim == 1:
        # 1-D labels are class indices: count occurrences per class.
        n_labels = int(np.max(y_train)) + 1
        label_counts = np.bincount(y_train.astype(int), minlength=n_labels)
    else:
        # 2-D labels are a binary indicator matrix: sum each column.
        label_counts = np.sum(y_train, axis=0)
        n_labels = y_train.shape[1]

    baseline_stats = {
        "text_length_distribution": lengths.tolist(),
        "word_count_distribution": words.tolist(),
        "label_counts": label_counts.tolist(),
        "language": language,
        "num_samples": len(X_train),
        "n_labels": int(n_labels),
        "text_length_mean": float(lengths.mean()),
        "text_length_std": float(lengths.std()),
        "text_length_min": float(lengths.min()),
        "text_length_max": float(lengths.max()),
        "word_count_mean": float(words.mean()),
        "word_count_std": float(words.std()),
    }

    logger.info(f"Extracted baseline for {language}: {len(X_train)} samples")

    return baseline_stats
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class BaselineManager:
    """
    Manages baseline statistics for drift detection.

    Baselines are always persisted to a local pickle cache; when MLflow is
    importable and a run id is supplied, a JSON-serializable subset is also
    logged as an MLflow artifact.
    """

    def __init__(self, mlflow_enabled: bool = True, local_cache_dir: Optional[Path] = None):
        """
        Initialize baseline manager.

        Args:
            mlflow_enabled: Enable MLflow artifact logging
            local_cache_dir: Local cache directory (default from config.BASELINE_CACHE_DIR)
        """
        # MLflow support requires both the caller's flag and a successful import.
        self.mlflow_enabled = mlflow_enabled and mlflow is not None
        self.local_cache_dir = local_cache_dir or config.BASELINE_CACHE_DIR
        self.local_cache_dir.mkdir(parents=True, exist_ok=True)

        if self.mlflow_enabled:
            self.mlflow_client = MlflowClient()

        logger.info(f"BaselineManager initialized (cache: {self.local_cache_dir})")

    def save_baseline(
        self,
        baseline_stats: Dict,
        language: str,
        dataset_name: str,
        model_id: str = "default",
        run_id: Optional[str] = None,
    ) -> None:
        """
        Save baseline statistics to MLflow and local cache.
        """
        baseline_path = self._get_baseline_path(language, dataset_name, model_id)
        baseline_path.parent.mkdir(parents=True, exist_ok=True)

        with open(baseline_path, "wb") as f:
            pickle.dump(baseline_stats, f)
        logger.info(f"Saved baseline to {baseline_path}")

        if not (self.mlflow_enabled and run_id):
            return

        try:
            # Only JSON-friendly values can be uploaded as an MLflow artifact;
            # numpy arrays and other objects are filtered out here.
            json_path = baseline_path.with_suffix(".json")
            serializable = {
                key: value
                for key, value in baseline_stats.items()
                if isinstance(value, (int, float, str, list, bool))
            }
            with open(json_path, "w") as f:
                json.dump(serializable, f, indent=2)

            mlflow.log_artifact(str(json_path), artifact_path=f"baselines/{language}")
            logger.info("Logged baseline to MLflow")
        except Exception as e:
            # Artifact logging is best-effort; the local cache already holds the data.
            logger.warning(f"Failed to log baseline to MLflow: {e}")

    def load_baseline(
        self,
        language: str,
        dataset_name: str,
        model_id: str = "default",
    ) -> Dict:
        """
        Load baseline statistics from local cache.
        """
        baseline_path = self._get_baseline_path(language, dataset_name, model_id)

        if not baseline_path.exists():
            raise FileNotFoundError(f"Baseline not found at {baseline_path}")

        with open(baseline_path, "rb") as f:
            stats = pickle.load(f)
        logger.info(f"Loaded baseline from cache: {baseline_path}")
        return stats

    def _get_baseline_path(self, language: str, dataset_name: str, model_id: str) -> Path:
        """Generate local cache path for baseline."""
        return self.local_cache_dir / language / f"{dataset_name}_{model_id}_baseline.pkl"
|
turing/monitoring/drift_detector.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Drift Detection Module using Deepchecks
|
| 3 |
+
|
| 4 |
+
Implements drift detection using Deepchecks integrated checks:
|
| 5 |
+
- Drift check for text properties
|
| 6 |
+
- Label distribution drift
|
| 7 |
+
- Custom metrics comparison
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from typing import Dict, List
|
| 11 |
+
|
| 12 |
+
from loguru import logger
|
| 13 |
+
import numpy as np
|
| 14 |
+
import pandas as pd
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from deepchecks.nlp import SingleDataset
|
| 18 |
+
from deepchecks.nlp.checks import Drift, TextPropertyDrift
|
| 19 |
+
except ImportError:
|
| 20 |
+
logger.warning("Deepchecks not installed. Install with: pip install deepchecks[nlp]")
|
| 21 |
+
Drift = None
|
| 22 |
+
TextPropertyDrift = None
|
| 23 |
+
|
| 24 |
+
from turing import config
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class DriftDetector:
|
| 28 |
+
"""
|
| 29 |
+
Detects data drift using Deepchecks integrated checks comparing production data
|
| 30 |
+
against baseline/reference datasets.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(self, p_value_threshold: float = None, alert_threshold: float = None):
|
| 34 |
+
"""
|
| 35 |
+
Initialize drift detector with Deepchecks.
|
| 36 |
+
|
| 37 |
+
Args:
|
| 38 |
+
p_value_threshold: P-value threshold for drift detection (default from config)
|
| 39 |
+
alert_threshold: More sensitive threshold for critical alerts (default from config)
|
| 40 |
+
"""
|
| 41 |
+
self.p_value_threshold = p_value_threshold or config.DRIFT_P_VALUE_THRESHOLD
|
| 42 |
+
self.alert_threshold = alert_threshold or config.DRIFT_ALERT_THRESHOLD
|
| 43 |
+
self.use_deepchecks = Drift is not None
|
| 44 |
+
|
| 45 |
+
def detect_text_property_drift(
|
| 46 |
+
self,
|
| 47 |
+
production_texts: List[str],
|
| 48 |
+
reference_texts: List[str],
|
| 49 |
+
language: str = "java",
|
| 50 |
+
) -> Dict:
|
| 51 |
+
"""
|
| 52 |
+
Detect drift in text properties using Deepchecks TextPropertyDrift.
|
| 53 |
+
|
| 54 |
+
Args:
|
| 55 |
+
production_texts: Text data in production
|
| 56 |
+
reference_texts: Reference/baseline text data
|
| 57 |
+
language: Language of the texts
|
| 58 |
+
|
| 59 |
+
Returns:
|
| 60 |
+
Dictionary with drift detection results
|
| 61 |
+
"""
|
| 62 |
+
if not self.use_deepchecks:
|
| 63 |
+
logger.warning("Deepchecks not available, using fallback method")
|
| 64 |
+
return self._fallback_text_property_drift(production_texts, reference_texts)
|
| 65 |
+
|
| 66 |
+
try:
|
| 67 |
+
# Create Deepchecks datasets
|
| 68 |
+
ref_df = pd.DataFrame({'text': reference_texts})
|
| 69 |
+
prod_df = pd.DataFrame({'text': production_texts})
|
| 70 |
+
|
| 71 |
+
reference_dataset = SingleDataset(
|
| 72 |
+
ref_df,
|
| 73 |
+
text_column='text',
|
| 74 |
+
task_type='text_classification'
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
production_dataset = SingleDataset(
|
| 78 |
+
prod_df,
|
| 79 |
+
text_column='text',
|
| 80 |
+
task_type='text_classification'
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
# Run TextPropertyDrift check
|
| 84 |
+
check = TextPropertyDrift()
|
| 85 |
+
result = check.run(
|
| 86 |
+
reference_dataset,
|
| 87 |
+
production_dataset,
|
| 88 |
+
model_classes=None
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Extract results
|
| 92 |
+
scores = result.to_dict()
|
| 93 |
+
is_drifted = result.failed
|
| 94 |
+
|
| 95 |
+
drift_dict = {
|
| 96 |
+
"check_result": scores,
|
| 97 |
+
"drifted": is_drifted,
|
| 98 |
+
"alert": is_drifted,
|
| 99 |
+
"method": "deepchecks_text_property_drift",
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
if is_drifted:
|
| 103 |
+
logger.warning("Text property drift detected (Deepchecks)")
|
| 104 |
+
|
| 105 |
+
return drift_dict
|
| 106 |
+
|
| 107 |
+
except Exception as e:
|
| 108 |
+
logger.error(f"Deepchecks TextPropertyDrift failed: {e}")
|
| 109 |
+
return self._fallback_text_property_drift(production_texts, reference_texts)
|
| 110 |
+
|
| 111 |
+
def _fallback_text_property_drift(
|
| 112 |
+
self,
|
| 113 |
+
production_texts: List[str],
|
| 114 |
+
reference_texts: List[str],
|
| 115 |
+
) -> Dict:
|
| 116 |
+
"""Fallback to manual calculation if Deepchecks fails."""
|
| 117 |
+
from scipy.stats import ks_2samp
|
| 118 |
+
|
| 119 |
+
production_lengths = np.array([len(text) for text in production_texts])
|
| 120 |
+
reference_lengths = np.array([len(text) for text in reference_texts])
|
| 121 |
+
statistic, p_value = ks_2samp(reference_lengths, production_lengths)
|
| 122 |
+
|
| 123 |
+
is_drifted = p_value < self.p_value_threshold
|
| 124 |
+
|
| 125 |
+
return {
|
| 126 |
+
"statistic": float(statistic),
|
| 127 |
+
"p_value": float(p_value),
|
| 128 |
+
"drifted": is_drifted,
|
| 129 |
+
"alert": is_drifted and p_value < self.alert_threshold,
|
| 130 |
+
"mean_production": float(np.mean(production_lengths)),
|
| 131 |
+
"mean_reference": float(np.mean(reference_lengths)),
|
| 132 |
+
"method": "fallback_ks_test",
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
def detect_label_distribution_drift(
    self,
    production_labels: np.ndarray,
    reference_labels: np.ndarray,
) -> Dict:
    """
    Detect drift in label distribution using Deepchecks Drift check.

    Falls back to a manual chi-square test when Deepchecks is not
    installed or its check raises.

    Args:
        production_labels: Production label data (numpy array or list)
        reference_labels: Reference/baseline label data

    Returns:
        Dictionary with drift detection results
    """
    if not self.use_deepchecks:
        logger.warning("Deepchecks not available, using fallback method")
        return self._fallback_label_drift(production_labels, reference_labels)

    try:
        # Prepare data: collapse labels into per-class counts.
        # 1-D input is treated as integer class ids; 2-D input is treated
        # as a one-hot / multi-label indicator matrix.
        if len(reference_labels.shape) == 1:
            ref_counts = np.bincount(reference_labels.astype(int))
        else:
            ref_counts = np.sum(reference_labels, axis=0)

        if len(production_labels.shape) == 1:
            # minlength keeps the production vector aligned with the
            # reference class range even if some classes are absent.
            prod_counts = np.bincount(
                production_labels.astype(int),
                minlength=len(ref_counts)
            )
        else:
            prod_counts = np.sum(production_labels, axis=0)

        # Create DataFrames with label columns (one count per column)
        n_labels = len(ref_counts)
        ref_df = pd.DataFrame({
            f'label_{i}': [int(ref_counts[i])] for i in range(n_labels)
        })
        prod_df = pd.DataFrame({
            f'label_{i}': [int(prod_counts[i])] for i in range(n_labels)
        })

        # Run Drift check
        check = Drift()
        reference_dataset = SingleDataset(ref_df, task_type='classification')
        production_dataset = SingleDataset(prod_df, task_type='classification')

        result = check.run(reference_dataset, production_dataset)

        # `failed` reflects whether the Deepchecks condition tripped
        is_drifted = result.failed

        drift_dict = {
            "check_result": result.to_dict(),
            "drifted": is_drifted,
            "alert": is_drifted,
            "reference_counts": ref_counts.tolist(),
            "production_counts": prod_counts.tolist(),
            "method": "deepchecks_drift_check",
        }

        if is_drifted:
            logger.warning("Label distribution drift detected (Deepchecks)")

        return drift_dict

    except Exception as e:
        # Any Deepchecks failure degrades gracefully to the manual test
        logger.error(f"Deepchecks Drift check failed: {e}")
        return self._fallback_label_drift(production_labels, reference_labels)
|
| 204 |
+
|
| 205 |
+
def _fallback_label_drift(
|
| 206 |
+
self,
|
| 207 |
+
production_labels: np.ndarray,
|
| 208 |
+
reference_labels: np.ndarray,
|
| 209 |
+
) -> Dict:
|
| 210 |
+
"""Fallback to manual Chi-Square test if Deepchecks fails."""
|
| 211 |
+
from scipy.stats import chi2_contingency
|
| 212 |
+
|
| 213 |
+
if len(reference_labels.shape) == 1:
|
| 214 |
+
ref_counts = np.bincount(reference_labels.astype(int))
|
| 215 |
+
else:
|
| 216 |
+
ref_counts = np.sum(reference_labels, axis=0)
|
| 217 |
+
|
| 218 |
+
if len(production_labels.shape) == 1:
|
| 219 |
+
prod_counts = np.bincount(
|
| 220 |
+
production_labels.astype(int),
|
| 221 |
+
minlength=len(ref_counts)
|
| 222 |
+
)
|
| 223 |
+
else:
|
| 224 |
+
prod_counts = np.sum(production_labels, axis=0)
|
| 225 |
+
|
| 226 |
+
min_len = min(len(prod_counts), len(ref_counts))
|
| 227 |
+
prod_counts = prod_counts[:min_len]
|
| 228 |
+
ref_counts = ref_counts[:min_len]
|
| 229 |
+
|
| 230 |
+
contingency_table = np.array([ref_counts, prod_counts])
|
| 231 |
+
|
| 232 |
+
try:
|
| 233 |
+
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
|
| 234 |
+
except Exception as e:
|
| 235 |
+
logger.warning(f"Chi-square test failed: {e}")
|
| 236 |
+
return {"statistic": None, "p_value": 1.0, "drifted": False, "alert": False}
|
| 237 |
+
|
| 238 |
+
is_drifted = p_value < self.p_value_threshold
|
| 239 |
+
is_alert = p_value < self.alert_threshold
|
| 240 |
+
|
| 241 |
+
return {
|
| 242 |
+
"statistic": float(chi2),
|
| 243 |
+
"p_value": float(p_value),
|
| 244 |
+
"drifted": is_drifted,
|
| 245 |
+
"alert": is_alert,
|
| 246 |
+
"method": "fallback_chi_square",
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
def detect_word_count_drift(
    self,
    production_texts: List[str],
    reference_texts: List[str],
) -> Dict:
    """
    Detect drift in the word-count distribution.

    Delegates to :meth:`detect_text_property_drift`, whose Deepchecks text
    properties already cover word-count analysis (with the KS-test
    fallback when Deepchecks is unavailable).

    Args:
        production_texts: Text data observed in production.
        reference_texts: Reference/baseline text data.

    Returns:
        Dictionary with drift detection results.
    """
    return self.detect_text_property_drift(
        production_texts, reference_texts, language="unknown"
    )
|
| 271 |
+
|
| 272 |
+
def detect_all_drifts(
    self,
    production_texts: List[str],
    production_labels: np.ndarray,
    reference_texts: List[str],
    reference_labels: np.ndarray,
) -> Dict:
    """
    Run all drift detection checks and aggregate the outcome.

    Args:
        production_texts: Production text data.
        production_labels: Production label data.
        reference_texts: Reference/baseline text data.
        reference_labels: Reference/baseline label data.

    Returns:
        Dictionary with one entry per individual check plus an "overall"
        summary (drift/alert flags, drift count, and methods used).
    """
    checks = {
        "text_property": self.detect_text_property_drift(
            production_texts,
            reference_texts,
        ),
        "label_distribution": self.detect_label_distribution_drift(
            production_labels,
            reference_labels,
        ),
    }

    # Summarize before inserting "overall" so it is not counted twice
    individual = list(checks.values())
    checks["overall"] = {
        "drifted": any(c.get("drifted", False) for c in individual),
        "alert": any(c.get("alert", False) for c in individual),
        "num_drifts": sum(1 for c in individual if c.get("drifted", False)),
        "methods": [c.get("method", "unknown") for c in individual],
    }

    return checks
|
| 312 |
+
|
| 313 |
+
def detect_all_drifts_from_baseline(
    self,
    production_texts: List[str],
    production_labels: np.ndarray,
    baseline_stats: Dict,
) -> Dict:
    """
    Legacy method for backward compatibility.
    Converts baseline_stats dict to reference_texts and reference_labels if available.
    Otherwise reconstructs reference data from baseline statistics.

    Args:
        production_texts: Production text data
        production_labels: Production label data
        baseline_stats: Dictionary with baseline statistics (legacy format)

    Returns:
        Dictionary with aggregated drift detection results
    """

    results = {
        # NOTE(review): production texts are compared against themselves
        # here, so the text_length check can never report drift; it is kept
        # only to preserve the legacy result shape — confirm whether
        # baseline texts should be reconstructed from baseline_stats.
        "text_length": self._fallback_text_property_drift(
            production_texts,
            production_texts,  # Use production as fallback reference
        ),
        # Label drift is still meaningful: compares production labels with
        # the stored baseline class counts (empty array if key is missing).
        "label_distribution": self._fallback_label_drift(
            production_labels,
            np.array(baseline_stats.get("label_counts", [])),
        ),
    }

    any_drifted = any(r.get("drifted", False) for r in results.values())
    any_alert = any(r.get("alert", False) for r in results.values())

    results["overall"] = {
        "drifted": any_drifted,
        "alert": any_alert,
        "num_drifts": sum(1 for r in results.values() if r.get("drifted", False)),
    }

    return results
|
turing/monitoring/feedback/feedback_data.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Timestamp,Input_Text,Language,Model_Prediction,User_Correction
|
| 2 |
+
2025-12-11 22:41:05,# Create output directory,python,Usage,DevelopmentNotes
|
| 3 |
+
2025-12-11 23:05:24,# Entry point for running the API directly with python,python,Usage,DevelopmentNotes
|
turing/monitoring/feedback_manager.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Feedback ingestion utilities for drift analysis."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import List, Tuple
|
| 5 |
+
|
| 6 |
+
from loguru import logger
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
from turing import config
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_feedback_for_language(
    feedback_path: Path,
    language: str,
) -> Tuple[List[str], np.ndarray]:
    """
    Load user feedback for a given language and return texts with one-hot labels.

    Rows with unknown labels are skipped. Returns empty lists if no valid rows.
    """
    if not feedback_path.exists():
        raise FileNotFoundError(f"Feedback file not found: {feedback_path}")

    df = pd.read_csv(feedback_path)
    required = ("Language", "Input_Text", "User_Correction")
    if any(column not in df.columns for column in required):
        raise ValueError(
            "Feedback file must contain Language, Input_Text, and User_Correction columns"
        )

    # Case-insensitive language filter
    df_lang = df[df["Language"].str.lower() == language.lower()]
    if df_lang.empty:
        logger.warning(f"No feedback rows found for language {language}")
        return [], np.array([])

    label_space = config.LABELS_MAP.get(language)
    if not label_space:
        raise ValueError(f"Label map not found for language: {language}")

    # Map normalized label names to their positional index
    label_to_idx = {name.lower(): position for position, name in enumerate(label_space)}

    texts: List[str] = []
    labels: List[np.ndarray] = []
    for _, row in df_lang.iterrows():
        idx = label_to_idx.get(str(row["User_Correction"]).strip().lower())
        if idx is None:
            logger.warning(f"Skipping feedback row with unknown label: {row['User_Correction']}")
            continue

        encoded = np.zeros(len(label_space), dtype=int)
        encoded[idx] = 1

        texts.append(str(row["Input_Text"]))
        labels.append(encoded)

    if not texts:
        logger.warning(f"No valid feedback rows for language {language}")
        return [], np.array([])

    return texts, np.vstack(labels)
|
turing/monitoring/mlflow_logger.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MLflow Logging for Drift Detection
|
| 3 |
+
|
| 4 |
+
Handles logging drift metrics and alerts to MLflow experiment runs.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, Optional
|
| 8 |
+
|
| 9 |
+
from loguru import logger
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
import mlflow
|
| 13 |
+
except ImportError:
|
| 14 |
+
mlflow = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DriftLogger:
    """
    Publishes drift-detection results, baseline statistics, and alerts to
    MLflow (no-op when the mlflow package is not installed).
    """

    def __init__(self, log_artifacts: bool = False):
        """
        Initialize drift logger.
        """
        self.log_artifacts = log_artifacts
        # mlflow is None when the optional import at module top failed
        self.has_mlflow = mlflow is not None

    def log_drift_results(
        self,
        drift_results: Dict,
        step: Optional[int] = None,
        prefix: str = "drift",
    ) -> None:
        """
        Log drift detection results to MLflow.
        """
        if not self.has_mlflow:
            logger.debug("MLflow not available")
            return

        try:
            summary = drift_results.get("overall", {})
            mlflow.log_metric(f"{prefix}/drifted", float(summary.get("drifted", False)), step=step)
            mlflow.log_metric(
                f"{prefix}/num_drifts", float(summary.get("num_drifts", 0)), step=step
            )

            # One p-value metric per individual check (skip the summary)
            for check_name, outcome in drift_results.items():
                if check_name == "overall":
                    continue

                if "p_value" in outcome:
                    mlflow.log_metric(
                        f"{prefix}/{check_name}/p_value", outcome["p_value"], step=step
                    )

            logger.debug("Logged drift results to MLflow")

        except Exception as e:
            logger.warning(f"Failed to log drift to MLflow: {e}")

    def log_baseline_statistics(
        self,
        baseline_stats: Dict,
        prefix: str = "baseline",
    ) -> None:
        """
        Log baseline statistics to MLflow.
        """
        if not self.has_mlflow:
            return

        try:
            numeric_metrics = {
                f"{prefix}/num_samples": baseline_stats.get("num_samples"),
                f"{prefix}/text_length_mean": baseline_stats.get("text_length_mean"),
                f"{prefix}/word_count_mean": baseline_stats.get("word_count_mean"),
            }

            # Missing statistics are simply skipped
            for metric_name, value in numeric_metrics.items():
                if value is not None:
                    mlflow.log_metric(metric_name, float(value))

            mlflow.log_param(f"{prefix}/language", baseline_stats.get("language", "unknown"))

            logger.debug("Logged baseline to MLflow")

        except Exception as e:
            logger.warning(f"Failed to log baseline: {e}")

    def log_alert(self, message: str, severity: str = "warning") -> None:
        """
        Log drift alert message.
        """
        # Unknown severities fall back to warning level
        emit = getattr(logger, severity, logger.warning)
        emit(f"DRIFT ALERT: {message}")
|
turing/monitoring/synthetic_data_generator.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Synthetic Data Generator for Drift Testing
|
| 3 |
+
|
| 4 |
+
Generates synthetic drifted datasets to test drift detection.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import random
|
| 8 |
+
import string
|
| 9 |
+
from typing import List, Tuple
|
| 10 |
+
|
| 11 |
+
from loguru import logger
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SyntheticDataGenerator:
    """
    Generates synthetic code comment data with controlled drift characteristics.

    Each generator derives a new batch from reference texts/labels while
    injecting one known kind of drift, so drift detectors can be validated
    against ground truth. Output is reproducible for a fixed seed because
    both numpy and the stdlib random module are seeded in __init__.
    """

    def __init__(self, seed: int = 42):
        """
        Initialize synthetic data generator.

        Args:
            seed: Seed applied to both numpy's and the stdlib random module's
                global state for reproducible batches.
        """
        self.seed = seed
        np.random.seed(seed)
        random.seed(seed)

    def generate_short_comments(
        self,
        reference_texts: List[str],
        ratio: float = 0.5,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate shorter comments (text length drift).

        Args:
            reference_texts: Reference training texts to sample from.
            ratio: Fraction of each sampled text's words to keep (0.0-1.0);
                at least one word is always kept.
            n_samples: Number of samples to generate.

        Returns:
            List of truncated texts.
        """
        short_comments = []

        for _ in range(n_samples):
            # Sample a reference text (with replacement) and keep only its
            # leading words.
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()
            truncated_len = max(1, int(len(words) * ratio))
            short_text = " ".join(words[:truncated_len])
            short_comments.append(short_text)

        logger.debug(f"Generated {len(short_comments)} short comments")
        return short_comments

    def generate_long_comments(
        self,
        reference_texts: List[str],
        ratio: float = 1.5,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate longer comments (text length drift upward).

        Args:
            reference_texts: Reference training texts to sample from.
            ratio: Target word-count multiplier (>1.0 lengthens the text).
            n_samples: Number of samples to generate.

        Returns:
            List of elongated texts.
        """
        long_comments = []

        for _ in range(n_samples):
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()
            target_len = max(1, int(len(words) * ratio))

            # Pad with words re-sampled from the same text until the target
            # length is reached.
            # NOTE(review): assumes the sampled text is non-empty — an empty
            # text would make np.random.choice(words) fail; confirm inputs.
            extended_words = words.copy()
            while len(extended_words) < target_len:
                extended_words.append(np.random.choice(words))

            long_text = " ".join(extended_words[:target_len])
            long_comments.append(long_text)

        logger.debug(f"Generated {len(long_comments)} long comments")
        return long_comments

    def generate_corrupted_vocabulary(
        self,
        reference_texts: List[str],
        corruption_rate: float = 0.2,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate texts with corrupted vocabulary (typos, character swaps).

        Args:
            reference_texts: Reference training texts
            corruption_rate: Fraction of words to corrupt (0.0-1.0)
            n_samples: Number of samples to generate

        Returns:
            List of corrupted texts
        """
        corrupted_texts = []

        for _ in range(n_samples):
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()

            # Corrupt some words (only those longer than 2 characters)
            for i in range(len(words)):
                if random.random() < corruption_rate:
                    word = words[i]
                    if len(word) > 2:
                        # Random character swap or substitution, 50/50
                        if random.random() < 0.5:
                            # Character swap: transpose two adjacent chars
                            idx = random.randint(0, len(word) - 2)
                            word = word[:idx] + word[idx + 1] + word[idx] + word[idx + 2 :]
                        else:
                            # Character substitution with a random lowercase letter
                            idx = random.randint(0, len(word) - 1)
                            word = (
                                word[:idx]
                                + random.choice(string.ascii_lowercase)
                                + word[idx + 1 :]
                            )
                        words[i] = word

            corrupted_text = " ".join(words)
            corrupted_texts.append(corrupted_text)

        logger.debug(f"Generated {len(corrupted_texts)} corrupted texts (rate={corruption_rate})")
        return corrupted_texts

    def generate_label_shift(
        self,
        reference_texts: List[str],
        reference_labels: np.ndarray,
        shift_type: str = "class_imbalance",
        n_samples: int = 100,
    ) -> Tuple[List[str], np.ndarray]:
        """
        Generate batch with label distribution shift (class imbalance).

        Args:
            reference_texts: Reference training texts
            reference_labels: Reference training labels (binary matrix)
            shift_type: 'class_imbalance' - favor majority class
            n_samples: Number of samples to generate

        Returns:
            Tuple of (texts, shifted_labels)
        """
        texts = []
        shifted_labels = []

        if reference_labels.ndim == 2:
            # Multi-label: get the first label per sample
            label_indices = np.argmax(reference_labels, axis=1)
        else:
            label_indices = reference_labels

        # Get class distribution
        unique_labels, counts = np.unique(label_indices, return_counts=True)
        majority_class = unique_labels[np.argmax(counts)]
        minority_classes = unique_labels[unique_labels != majority_class]

        # Create imbalanced distribution: 80% majority, 20% minority
        n_majority = int(n_samples * 0.8)
        n_minority = n_samples - n_majority

        # Sample indices with bias toward majority class
        majority_indices = np.where(label_indices == majority_class)[0]
        minority_indices = np.where(np.isin(label_indices, minority_classes))[0]

        selected_indices = []
        selected_indices.extend(np.random.choice(majority_indices, size=n_majority, replace=True))
        if len(minority_indices) > 0:
            # NOTE(review): when no minority samples exist, the batch holds
            # only n_majority items (< n_samples) — confirm acceptable.
            selected_indices.extend(
                np.random.choice(minority_indices, size=n_minority, replace=True)
            )

        np.random.shuffle(selected_indices)
        selected_indices = selected_indices[:n_samples]

        # Get texts and labels
        texts = [reference_texts[i] for i in selected_indices]
        shifted_labels = reference_labels[selected_indices]

        logger.debug(f"Generated {len(texts)} samples with class imbalance")
        return texts, shifted_labels

    def generate_synthetic_batch(
        self,
        reference_texts: List[str],
        reference_labels: np.ndarray,
        drift_type: str = "none",
        batch_size: int = 50,
    ) -> Tuple[List[str], np.ndarray]:
        """
        Generate a synthetic batch with specified drift.

        Args:
            reference_texts: Reference training texts
            reference_labels: Reference training labels
            drift_type: Type of drift to introduce:
                - 'none': No drift (baseline)
                - 'text_length_short': Shortened texts
                - 'text_length_long': Elongated texts
                - 'corrupted_vocab': Typos and character swaps
                - 'class_imbalance': Biased label distribution
            batch_size: Number of samples to generate

        Returns:
            Tuple of (texts, labels)

        Raises:
            ValueError: If drift_type is not one of the values above.
        """
        if drift_type == "none":
            # Plain bootstrap resample of the reference data
            indices = np.random.choice(len(reference_texts), size=batch_size, replace=True)
            texts = [reference_texts[i] for i in indices]
            labels = reference_labels[indices]

        elif drift_type == "text_length_short":
            # Texts are drifted; labels are drawn independently at random
            texts = self.generate_short_comments(reference_texts, ratio=0.5, n_samples=batch_size)
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "text_length_long":
            texts = self.generate_long_comments(reference_texts, ratio=1.5, n_samples=batch_size)
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "corrupted_vocab":
            texts = self.generate_corrupted_vocabulary(
                reference_texts, corruption_rate=0.2, n_samples=batch_size
            )
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "class_imbalance":
            texts, labels = self.generate_label_shift(
                reference_texts,
                reference_labels,
                shift_type="class_imbalance",
                n_samples=batch_size,
            )

        else:
            raise ValueError(f"Unknown drift type: {drift_type}")

        logger.info(f"Generated synthetic batch: {drift_type}, size={batch_size}")
        return texts, labels
|
turing/tests/unit/test_monitoring.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit Tests for Monitoring Module
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import tempfile
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from turing.monitoring.baseline_manager import (
|
| 12 |
+
BaselineManager,
|
| 13 |
+
extract_baseline_statistics,
|
| 14 |
+
)
|
| 15 |
+
from turing.monitoring.drift_detector import DriftDetector
|
| 16 |
+
from turing.monitoring.synthetic_data_generator import SyntheticDataGenerator
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class TestBaselineExtraction:
    """Tests for baseline statistics extraction."""

    @pytest.fixture
    def sample_data(self):
        # Five short comments paired with a 5-class multi-label indicator matrix.
        texts = [
            "This is a sample comment",
            "Another test comment here",
            "Short text",
            "Longer comment with more information",
            "Medium length comment",
        ]
        labels = np.array([[1, 0, 1, 0, 0], [0, 1, 0, 1, 0], [1, 1, 0, 0, 0], [0, 0, 1, 1, 1], [1, 0, 0, 0, 1]])
        return texts, labels

    def test_extract_baseline(self, sample_data):
        # Baseline must expose the distributions used by the drift checks
        # and echo back the language tag and sample count.
        texts, labels = sample_data
        baseline = extract_baseline_statistics(X_train=texts, y_train=labels, language="java")

        assert "text_length_distribution" in baseline
        assert "word_count_distribution" in baseline
        assert baseline["language"] == "java"
        assert baseline["num_samples"] == len(texts)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class TestDriftDetector:
    """Tests for drift detection."""

    @pytest.fixture
    def baseline(self):
        # Minimal legacy-style baseline statistics.
        return {
            "text_length_distribution": np.array([20, 25, 30, 35]),
            "word_count_distribution": np.array([3, 4, 5, 6]),
            "label_counts": np.array([5, 3, 2, 4]),
        }

    def test_detector_init(self):
        # Thresholds passed to the constructor must be stored verbatim.
        detector = DriftDetector(p_value_threshold=0.05, alert_threshold=0.01)
        assert detector.p_value_threshold == 0.05

    def test_text_length_drift(self, baseline):
        detector = DriftDetector(p_value_threshold=0.05)

        prod_texts = [
            "Very long test comment with lots of additional information",
            "Another extremely long sample text",
            "Yet another quite lengthy comment",
            "More long production text",
        ]

        # Halve each production text so the reference batch is clearly
        # shorter — only the result shape is asserted, not the verdict.
        ref_texts = [text[:len(text)//2] for text in prod_texts]  # Shorter reference texts

        result = detector.detect_text_property_drift(prod_texts, ref_texts)

        assert "drifted" in result
        assert "method" in result
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class TestSyntheticDataGenerator:
    """Tests for synthetic data generation."""

    @pytest.fixture
    def sample_data(self):
        # Four texts with binary class-id labels.
        texts = ["This is a sample", "Another test", "Short", "Longer text"]
        labels = np.array([0, 1, 0, 1])
        return texts, labels

    def test_generator_init(self):
        gen = SyntheticDataGenerator(seed=42)
        assert gen.seed == 42

    def test_generate_short(self, sample_data):
        texts, labels = sample_data
        gen = SyntheticDataGenerator(seed=42)

        short = gen.generate_short_comments(texts, ratio=0.5, n_samples=10)

        # Truncation keeps the requested count and lowers the mean length.
        assert len(short) == 10
        assert np.mean([len(t) for t in short]) < np.mean([len(t) for t in texts])
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class TestBaselineManager:
    """Tests for baseline management."""

    @pytest.fixture
    def temp_dir(self):
        # Fresh directory per test; removed automatically on teardown.
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir)

    def test_save_and_load(self, temp_dir):
        # Round-trip a baseline through the local cache with MLflow disabled.
        manager = BaselineManager(mlflow_enabled=False, local_cache_dir=temp_dir)

        baseline = {
            "text_length_distribution": [10, 20, 30],
            "label_counts": [5, 3],
            "language": "java",
            "num_samples": 3,
        }

        manager.save_baseline(baseline, "java", "test", "model")
        loaded = manager.load_baseline("java", "test", "model")

        assert loaded["language"] == "java"
        assert loaded["num_samples"] == 3
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Allow running this test module directly (outside the pytest CLI).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|