github-actions[bot] committed on
Commit
38593e7
·
1 Parent(s): 8e13241

Sync turing folder from GitHub

Browse files
turing/CLI_runner/verify_drift_detection.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import ast
import json
from datetime import datetime
from pathlib import Path

import dagshub
import numpy as np
import pandas as pd
import typer
from loguru import logger
from mlflow.tracking import MlflowClient

from turing import config
from turing.modeling.model_selector import get_best_model_by_tag
from turing.monitoring.baseline_manager import extract_baseline_statistics
from turing.monitoring.drift_detector import DriftDetector
from turing.monitoring.feedback_manager import load_feedback_for_language
from turing.monitoring.synthetic_data_generator import SyntheticDataGenerator
18
+
19
+ app = typer.Typer()
20
+
21
+
22
def load_training_data(dataset_name: str, language: str):
    """
    Load training data for a specific programming language.

    Args:
        dataset_name: Dataset name (e.g., 'clean-k5000')
        language: Programming language (java, python, pharo)

    Returns:
        Tuple of (texts, labels): texts as a list of strings, labels as a
        numpy array (parsed from their string representation if necessary).

    Raises:
        FileNotFoundError: If the dataset directory or the language's
            training CSV cannot be found.
    """
    dataset_path = config.INTERIM_DATA_DIR / "features" / dataset_name

    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset path not found: {dataset_path}")

    # First recursive match wins; there is expected to be one train CSV
    # per language under the dataset directory.
    train_file = next(dataset_path.rglob(f"{language}_train*.csv"), None)

    if not train_file:
        raise FileNotFoundError(f"Training file not found for {language} in {dataset_path}")

    logger.info(f"Loading training data from: {train_file}")
    df = pd.read_csv(train_file)

    X_train = df[config.INPUT_COLUMN].tolist()

    # Labels may be serialized as strings (e.g. "[0, 1, 0]"). Parse them with
    # ast.literal_eval instead of eval: literal_eval only accepts Python
    # literals, so malicious/corrupted CSV content cannot execute code.
    if isinstance(df[config.LABEL_COLUMN].iloc[0], str):
        y_train = np.array([ast.literal_eval(label) for label in df[config.LABEL_COLUMN]])
    else:
        y_train = df[config.LABEL_COLUMN].values

    logger.success(f"Loaded {len(X_train)} training samples for {language}")
    return X_train, y_train
58
+
59
+
60
def print_drift_report(drift_results: dict, drift_type: str, report_lines: list = None):
    """
    Format and display drift detection results for a specific drift type.

    Args:
        drift_results: Dictionary with drift detection metrics and alerts
        drift_type: Name of drift type tested (e.g., 'none', 'text_length_short')
        report_lines: Optional list to collect formatted report lines
    """

    def log_and_collect(msg: str):
        # Log to the console and, when requested, accumulate the line so the
        # caller can persist the full report afterwards.
        logger.info(msg)
        if report_lines is not None:
            report_lines.append(msg)

    log_and_collect(f"\n{'=' * 60}")
    log_and_collect(f"DRIFT DETECTION REPORT - {drift_type.upper()}")
    log_and_collect(f"{'=' * 60}")

    for metric_name, result in drift_results.items():
        if metric_name == "overall":
            continue

        # Only the scipy fallback methods expose a numeric "p_value";
        # Deepchecks results carry a boolean pass/fail flag instead.
        # Previously that boolean was substituted in and rendered as
        # "P-value: 1.000000", which was misleading — report a p-value
        # only when it is a genuine number.
        p_value = result.get("p_value")
        if isinstance(p_value, bool) or not isinstance(p_value, (int, float)):
            p_value = None
        statistic = result.get("statistic", None)
        drifted = result.get("drifted", False)
        alert = result.get("alert", False)

        # Severity ordering: a critical alert trumps plain drift.
        if alert:
            status = "ALERT"
        elif drifted:
            status = "DRIFT"
        else:
            status = "OK"

        log_and_collect(f"\n{metric_name.upper()}")
        log_and_collect(f" Status: {status}")
        if p_value is not None:
            log_and_collect(f" P-value: {p_value:.6f}")
        if statistic is not None:
            log_and_collect(f" Statistic: {statistic:.6f}")
        log_and_collect(f" Drift detected: {drifted}")
        log_and_collect(f" Critical alert: {alert}")
        log_and_collect(f" Method: {result.get('method', 'unknown')}")

    overall = drift_results.get("overall", {})
    overall_drifted = overall.get("drifted", False)
    overall_alert = overall.get("alert", False)
    drift_count = overall.get("num_drifts", 0)

    log_and_collect(f"\n{'=' * 60}")
    log_and_collect("OVERALL SUMMARY")
    log_and_collect(f" Drift detected: {overall_drifted}")
    log_and_collect(f" Critical alert: {overall_alert}")
    log_and_collect(f" Number of drifted metrics: {drift_count}")
    log_and_collect(f" Methods used: {overall.get('methods', [])}")
    log_and_collect(f"{'=' * 60}\n")
117
+
118
+
119
def save_drift_report(
    language: str,
    dataset_name: str,
    baseline_stats: dict,
    test_results: dict,
    report_text: str,
) -> None:
    """
    Save drift detection report to TXT and JSON files.

    Writes two sibling files under ``config.REPORTS_DIR / "monitoring"``:
    a human-readable ``drift_report_<language>.txt`` (header + baseline
    summary + the pre-formatted report text) and a machine-readable
    ``drift_report_<language>.json``. Existing files for the same language
    are overwritten.

    Args:
        language: Programming language tested
        dataset_name: Name of dataset used
        baseline_stats: Baseline statistics dictionary
        test_results: Dictionary with test results for each drift type
        report_text: Formatted report text
    """

    def convert_numpy_types(obj):
        # Recursively replace numpy scalars/arrays with native Python types
        # so json.dump can serialize the nested results without a custom
        # encoder. Unknown types are passed through unchanged.
        if isinstance(obj, dict):
            return {k: convert_numpy_types(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [convert_numpy_types(item) for item in obj]
        elif isinstance(obj, np.bool_):
            return bool(obj)
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return obj

    monitoring_dir = config.REPORTS_DIR / "monitoring"
    monitoring_dir.mkdir(parents=True, exist_ok=True)

    # Human-readable report: fixed header, baseline summary, then the
    # already-formatted body collected by print_drift_report.
    report_file = monitoring_dir / f"drift_report_{language}.txt"
    with open(report_file, "w") as f:
        f.write("DRIFT DETECTION REPORT\n")
        f.write(f"Language: {language}\n")
        f.write(f"Dataset: {dataset_name}\n")
        f.write(f"Timestamp: {datetime.now().isoformat()}\n")
        f.write(f"P-value threshold: {config.DRIFT_P_VALUE_THRESHOLD}\n")
        f.write(f"Alert threshold: {config.DRIFT_ALERT_THRESHOLD}\n")
        f.write("\n" + "=" * 80 + "\n\n")
        f.write("BASELINE STATISTICS\n")
        f.write(
            f" Text length: mean={baseline_stats['text_length_mean']:.2f}, std={baseline_stats['text_length_std']:.2f}\n"
        )
        f.write(
            f" Word count: mean={baseline_stats['word_count_mean']:.2f}, std={baseline_stats['word_count_std']:.2f}\n"
        )
        f.write(f" Label counts: {baseline_stats['label_counts']}\n")
        f.write(f" Number of samples: {baseline_stats['num_samples']}\n")
        f.write("\n" + "=" * 80 + "\n\n")
        f.write(report_text)

    # Machine-readable companion: only summary statistics from the baseline
    # (the raw distributions are deliberately left out), plus all per-test
    # results with numpy types converted for JSON.
    json_file = monitoring_dir / f"drift_report_{language}.json"
    report_data = {
        "language": language,
        "dataset": dataset_name,
        "timestamp": datetime.now().isoformat(),
        "config": {
            "p_value_threshold": config.DRIFT_P_VALUE_THRESHOLD,
            "alert_threshold": config.DRIFT_ALERT_THRESHOLD,
        },
        "baseline": {
            "text_length_mean": baseline_stats["text_length_mean"],
            "text_length_std": baseline_stats["text_length_std"],
            "word_count_mean": baseline_stats["word_count_mean"],
            "word_count_std": baseline_stats["word_count_std"],
            "label_counts": baseline_stats["label_counts"],
            "num_samples": baseline_stats["num_samples"],
            "n_labels": baseline_stats["n_labels"],
        },
        "test_results": convert_numpy_types(test_results),
    }

    with open(json_file, "w") as f:
        json.dump(report_data, f, indent=2)

    logger.success("Report saved to:")
    logger.info(f" Text: {report_file}")
    logger.info(f" JSON: {json_file}")
204
+
205
+
206
@app.command()
def verify(
    language: str = typer.Option("java", help="Language to test (java, python, pharo)"),
    repo_owner: str = typer.Option("se4ai2526-uniba", help="DagsHub repository owner"),
    repo_name: str = typer.Option("Turing", help="DagsHub repository name"),
    n_samples: int = typer.Option(100, help="Number of samples for synthetic data generation"),
    use_feedback: bool = typer.Option(False, help="Include user feedback rows in drift analysis"),
    feedback_path: Path = typer.Option(
        config.PROJ_ROOT / "turing" / "monitoring" / "feedback" / "feedback_data.csv",
        help="Path to user feedback CSV",
    ),
) -> None:
    """
    Verify drift detection on best model's training dataset.

    End-to-end pipeline (six logged steps):
      1. Resolve the best model for ``language`` via MLflow tags.
      2. Read the training dataset name from that model's MLflow run.
      3. Load the training split from local interim data.
      4. Extract baseline statistics from the training texts/labels.
      5. Build the drift detector / synthetic generator (optionally
         loading user feedback rows).
      6. Run drift detection on synthetic batches (and feedback), then
         persist a TXT/JSON report via save_drift_report.

    Failures in steps 1-3 log an error and return early (fail-soft) so
    the CLI exits without a traceback.
    """
    logger.info("Starting drift detection verification...")
    logger.info("Configuration:")
    logger.info(f" Language: {language}")
    logger.info(f" P-value threshold: {config.DRIFT_P_VALUE_THRESHOLD}")
    logger.info(f" Alert threshold: {config.DRIFT_ALERT_THRESHOLD}")
    logger.info(f" Baseline cache: {config.BASELINE_CACHE_DIR}")

    # Point MLflow at the DagsHub-hosted tracking server for this repo.
    dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)

    logger.info(f"\n[1/6] Searching for best model for {language}...")
    best_model_info = get_best_model_by_tag(language=language)

    if not best_model_info:
        logger.error(f"No best model found for {language}")
        return

    run_id = best_model_info["run_id"]

    # The dataset the model was trained on is recorded as a run tag; drift
    # must be measured against that exact training distribution.
    logger.info(f"\n[2/6] Retrieving dataset information from MLflow run {run_id}...")
    client = MlflowClient()
    run = client.get_run(run_id)
    dataset_name = run.data.tags.get("dataset_name", None)

    if not dataset_name:
        logger.error("Dataset name not found in run tags")
        return

    logger.success(f"Found dataset: {dataset_name}")

    logger.info("\n[3/6] Loading training data...")
    try:
        X_train, y_train = load_training_data(dataset_name, language)
    except Exception as e:
        # Fail soft: a missing/corrupt dataset should not crash the CLI.
        logger.error(f"Failed to load training data: {e}")
        return

    logger.info("\n[4/6] Extracting baseline statistics...")
    baseline_stats = extract_baseline_statistics(X_train, y_train, language)
    logger.success("Baseline extracted:")
    logger.info(
        f" Text length: mean={baseline_stats['text_length_mean']:.2f}, std={baseline_stats['text_length_std']:.2f}"
    )
    logger.info(
        f" Word count: mean={baseline_stats['word_count_mean']:.2f}, std={baseline_stats['word_count_std']:.2f}"
    )
    logger.info(f" Label counts: {baseline_stats['label_counts']}")

    logger.info("\n[5/6] Initializing drift detection components...")
    drift_detector = DriftDetector()
    # Fixed seed so synthetic batches (and thus the report) are reproducible.
    synthetic_generator = SyntheticDataGenerator(seed=42)

    feedback_texts, feedback_labels = [], np.array([])
    if use_feedback:
        try:
            feedback_texts, feedback_labels = load_feedback_for_language(feedback_path, language)
        except Exception as e:
            # Feedback is optional — skip it on any load error.
            logger.warning(f"Feedback load skipped: {e}")

    logger.info("\n[6/6] Testing drift detection on different data types...\n")

    # (display name, drift_type passed to the synthetic generator).
    # "none" acts as the negative control: no drift should be reported.
    test_cases = [
        ("NORMAL DATA (no drift expected)", "none"),
        ("SHORT TEXT DRIFT", "text_length_short"),
        ("LONG TEXT DRIFT", "text_length_long"),
        ("CORRUPTED VOCABULARY DRIFT", "corrupted_vocab"),
        ("CLASS IMBALANCE DRIFT", "class_imbalance"),
    ]

    if use_feedback and len(feedback_texts) > 0:
        test_cases.append(("USER FEEDBACK", "feedback"))

    all_test_results = {}
    all_report_lines = []

    for test_name, drift_type in test_cases:
        logger.info(f"\n{'#' * 60}")
        logger.info(f"Test: {test_name}")
        logger.info(f"{'#' * 60}")

        # "feedback" uses real user rows; every other case generates a
        # synthetic batch derived from the training data.
        if drift_type == "feedback":
            production_texts = feedback_texts
            production_labels = feedback_labels
        else:
            production_texts, production_labels = synthetic_generator.generate_synthetic_batch(
                reference_texts=X_train,
                reference_labels=y_train,
                drift_type=drift_type,
                batch_size=n_samples,
            )

        drift_results = drift_detector.detect_all_drifts(
            production_texts=production_texts,
            production_labels=production_labels,
            reference_texts=X_train,
            reference_labels=y_train,
        )

        all_test_results[drift_type] = drift_results
        # Collects formatted lines into all_report_lines for the saved report.
        print_drift_report(drift_results, drift_type, report_lines=all_report_lines)

    logger.info("\nSaving drift detection report...")
    report_text = "\n".join(all_report_lines)
    save_drift_report(
        language=language,
        dataset_name=dataset_name,
        baseline_stats=baseline_stats,
        test_results=all_test_results,
        report_text=report_text,
    )

    logger.success("\nDrift detection verification completed!")
332
+
333
+
334
+ if __name__ == "__main__":
335
+ app()
turing/config.py CHANGED
@@ -49,6 +49,12 @@ MAX_AVG_FLOPS = 5000.0 # GFLOPS
49
  # Training parameters
50
  DEFAULT_BATCH_SIZE = 32
51
 
 
 
 
 
 
 
52
  # Model configuration mapping
53
  MODEL_CONFIG = {
54
  "codeberta": {
 
49
  # Training parameters
50
  DEFAULT_BATCH_SIZE = 32
51
 
52
+ # Drift detection parameters
53
+ DRIFT_P_VALUE_THRESHOLD = 0.05 # P-value threshold for drift detection warning
54
+ DRIFT_ALERT_THRESHOLD = 0.01 # P-value threshold for drift alert (critical)
55
+ BASELINE_CACHE_DIR = Path.home() / ".turing_baselines" # Local cache for baseline statistics
56
+ DRIFT_DETECTION_ENABLED = True # Enable/disable drift detection globally
57
+
58
  # Model configuration mapping
59
  MODEL_CONFIG = {
60
  "codeberta": {
turing/monitoring/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Drift Monitoring Module
3
+
4
+ This module provides tools for detecting and logging data drift in production predictions
5
+ against reference baselines from training data. It integrates with MLflow for baseline
6
+ storage and drift metric logging.
7
+
8
+ Components:
9
+ - drift_detector: Core drift detection using statistical tests (KS test, Chi-square)
10
+ - baseline_manager: Extract and manage baseline statistics from training data
11
+ - mlflow_logger: Log drift metrics and alerts to MLflow
12
+ - synthetic_data_generator: Generate synthetic drifted data for testing
13
+ """
14
+
15
+ from turing.monitoring.baseline_manager import (
16
+ BaselineManager,
17
+ extract_baseline_statistics,
18
+ )
19
+ from turing.monitoring.drift_detector import DriftDetector
20
+ from turing.monitoring.mlflow_logger import DriftLogger
21
+
22
+ __all__ = [
23
+ "DriftDetector",
24
+ "BaselineManager",
25
+ "DriftLogger",
26
+ "extract_baseline_statistics",
27
+ ]
turing/monitoring/baseline_manager.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Baseline Management Module
3
+
4
+ Handles extraction of baseline statistics from training data,
5
+ storage as MLflow artifacts, and retrieval for drift detection.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ import pickle
11
+ from typing import Dict, List, Optional
12
+
13
+ from loguru import logger
14
+ import numpy as np
15
+
16
+ from turing import config
17
+
18
+ try:
19
+ import mlflow
20
+ from mlflow.tracking import MlflowClient
21
+ except ImportError:
22
+ mlflow = None
23
+
24
+
25
def extract_baseline_statistics(
    X_train: List[str],
    y_train: np.ndarray,
    language: str = "java",
) -> Dict:
    """
    Extract baseline statistics from training data.

    Computes per-sample text-length and word-count distributions plus
    label counts, and summarizes them (mean/std/min/max) for drift
    comparison against production batches.

    Args:
        X_train: List of training comment texts
        y_train: Training labels (binary matrix or label indices)
        language: Language of the training data

    Returns:
        Dictionary containing baseline statistics
    """
    text_lengths = np.array([len(sample) for sample in X_train])
    word_counts = np.array([len(sample.split()) for sample in X_train])

    if y_train.ndim == 1:
        # Flat label indices: histogram over the full label range.
        n_labels = int(np.max(y_train)) + 1
        label_counts = np.bincount(y_train.astype(int), minlength=n_labels)
    else:
        # Binary indicator matrix: count positives per label column.
        n_labels = y_train.shape[1]
        label_counts = np.sum(y_train, axis=0)

    baseline_stats = {
        "text_length_distribution": text_lengths.tolist(),
        "word_count_distribution": word_counts.tolist(),
        "label_counts": label_counts.tolist(),
        "language": language,
        "num_samples": len(X_train),
        "n_labels": int(n_labels),
        "text_length_mean": float(text_lengths.mean()),
        "text_length_std": float(text_lengths.std()),
        "text_length_min": float(text_lengths.min()),
        "text_length_max": float(text_lengths.max()),
        "word_count_mean": float(word_counts.mean()),
        "word_count_std": float(word_counts.std()),
    }

    logger.info(f"Extracted baseline for {language}: {len(X_train)} samples")

    return baseline_stats
69
+
70
+
71
class BaselineManager:
    """
    Manages baseline statistics for drift detection.

    Baselines are always cached locally as pickle files; when MLflow is
    importable and a run id is supplied, a JSON-serializable subset is
    additionally logged as an MLflow artifact.
    """

    def __init__(self, mlflow_enabled: bool = True, local_cache_dir: Optional[Path] = None):
        """
        Initialize baseline manager.

        Args:
            mlflow_enabled: Enable MLflow artifact logging
            local_cache_dir: Local cache directory (default from config.BASELINE_CACHE_DIR)
        """
        # MLflow logging requires both the caller opting in and the import
        # having succeeded at module load time.
        self.mlflow_enabled = mlflow_enabled and mlflow is not None
        self.local_cache_dir = local_cache_dir or config.BASELINE_CACHE_DIR
        self.local_cache_dir.mkdir(parents=True, exist_ok=True)

        if self.mlflow_enabled:
            self.mlflow_client = MlflowClient()

        logger.info(f"BaselineManager initialized (cache: {self.local_cache_dir})")

    def save_baseline(
        self,
        baseline_stats: Dict,
        language: str,
        dataset_name: str,
        model_id: str = "default",
        run_id: Optional[str] = None,
    ) -> None:
        """
        Save baseline statistics to MLflow and local cache.
        """
        cache_path = self._get_baseline_path(language, dataset_name, model_id)
        cache_path.parent.mkdir(parents=True, exist_ok=True)

        # Local cache holds the complete stats dict, pickled.
        with open(cache_path, "wb") as handle:
            pickle.dump(baseline_stats, handle)
        logger.info(f"Saved baseline to {cache_path}")

        if not (self.mlflow_enabled and run_id):
            return

        try:
            # MLflow gets only the JSON-safe subset of the stats.
            serializable = {
                key: value
                for key, value in baseline_stats.items()
                if isinstance(value, (int, float, str, list, bool))
            }
            json_path = cache_path.with_suffix(".json")
            with open(json_path, "w") as handle:
                json.dump(serializable, handle, indent=2)

            mlflow.log_artifact(str(json_path), artifact_path=f"baselines/{language}")
            logger.info("Logged baseline to MLflow")
        except Exception as e:
            # Best-effort: local cache already succeeded, so only warn.
            logger.warning(f"Failed to log baseline to MLflow: {e}")

    def load_baseline(
        self,
        language: str,
        dataset_name: str,
        model_id: str = "default",
    ) -> Dict:
        """
        Load baseline statistics from local cache.
        """
        cache_path = self._get_baseline_path(language, dataset_name, model_id)

        if not cache_path.exists():
            raise FileNotFoundError(f"Baseline not found at {cache_path}")

        with open(cache_path, "rb") as handle:
            stats = pickle.load(handle)
        logger.info(f"Loaded baseline from cache: {cache_path}")
        return stats

    def _get_baseline_path(self, language: str, dataset_name: str, model_id: str) -> Path:
        """Generate local cache path for baseline."""
        filename = f"{dataset_name}_{model_id}_baseline.pkl"
        return self.local_cache_dir / language / filename
turing/monitoring/drift_detector.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Drift Detection Module using Deepchecks
3
+
4
+ Implements drift detection using Deepchecks integrated checks:
5
+ - Drift check for text properties
6
+ - Label distribution drift
7
+ - Custom metrics comparison
8
+ """
9
+
10
+ from typing import Dict, List
11
+
12
+ from loguru import logger
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+ try:
17
+ from deepchecks.nlp import SingleDataset
18
+ from deepchecks.nlp.checks import Drift, TextPropertyDrift
19
+ except ImportError:
20
+ logger.warning("Deepchecks not installed. Install with: pip install deepchecks[nlp]")
21
+ Drift = None
22
+ TextPropertyDrift = None
23
+
24
+ from turing import config
25
+
26
+
27
class DriftDetector:
    """
    Detects data drift using Deepchecks integrated checks comparing production data
    against baseline/reference datasets.

    Each public detector returns a dict with at least the keys "drifted",
    "alert", and "method"; scipy-based fallbacks are used whenever
    Deepchecks is unavailable or its check raises.
    """

    def __init__(self, p_value_threshold: float = None, alert_threshold: float = None):
        """
        Initialize drift detector with Deepchecks.

        Args:
            p_value_threshold: P-value threshold for drift detection (default from config)
            alert_threshold: More sensitive threshold for critical alerts (default from config)
        """
        # NOTE(review): `or` falls back on any falsy value, so an explicit
        # threshold of 0.0 would be silently replaced by the config default.
        self.p_value_threshold = p_value_threshold or config.DRIFT_P_VALUE_THRESHOLD
        self.alert_threshold = alert_threshold or config.DRIFT_ALERT_THRESHOLD
        # Deepchecks availability was decided at import time (Drift is None
        # when the import failed).
        self.use_deepchecks = Drift is not None

    def detect_text_property_drift(
        self,
        production_texts: List[str],
        reference_texts: List[str],
        language: str = "java",
    ) -> Dict:
        """
        Detect drift in text properties using Deepchecks TextPropertyDrift.

        Args:
            production_texts: Text data in production
            reference_texts: Reference/baseline text data
            language: Language of the texts (currently unused by the check itself)

        Returns:
            Dictionary with drift detection results
        """
        if not self.use_deepchecks:
            logger.warning("Deepchecks not available, using fallback method")
            return self._fallback_text_property_drift(production_texts, reference_texts)

        try:
            # Create Deepchecks datasets
            # NOTE(review): assumes deepchecks.nlp exposes SingleDataset and a
            # two-dataset TextPropertyDrift.run(...) — confirm against the
            # installed deepchecks version; on any failure we fall back below.
            ref_df = pd.DataFrame({'text': reference_texts})
            prod_df = pd.DataFrame({'text': production_texts})

            reference_dataset = SingleDataset(
                ref_df,
                text_column='text',
                task_type='text_classification'
            )

            production_dataset = SingleDataset(
                prod_df,
                text_column='text',
                task_type='text_classification'
            )

            # Run TextPropertyDrift check
            check = TextPropertyDrift()
            result = check.run(
                reference_dataset,
                production_dataset,
                model_classes=None
            )

            # Extract results
            # NOTE(review): presumably the check result exposes `failed` and
            # `to_dict()` — verify on the pinned deepchecks release.
            scores = result.to_dict()
            is_drifted = result.failed

            # Deepchecks gives a pass/fail verdict rather than a p-value, so
            # drift and alert are the same boolean here.
            drift_dict = {
                "check_result": scores,
                "drifted": is_drifted,
                "alert": is_drifted,
                "method": "deepchecks_text_property_drift",
            }

            if is_drifted:
                logger.warning("Text property drift detected (Deepchecks)")

            return drift_dict

        except Exception as e:
            logger.error(f"Deepchecks TextPropertyDrift failed: {e}")
            return self._fallback_text_property_drift(production_texts, reference_texts)

    def _fallback_text_property_drift(
        self,
        production_texts: List[str],
        reference_texts: List[str],
    ) -> Dict:
        """Fallback to manual calculation if Deepchecks fails.

        Runs a two-sample Kolmogorov-Smirnov test on the character-length
        distributions of the two text sets.
        """
        from scipy.stats import ks_2samp

        production_lengths = np.array([len(text) for text in production_texts])
        reference_lengths = np.array([len(text) for text in reference_texts])
        statistic, p_value = ks_2samp(reference_lengths, production_lengths)

        is_drifted = p_value < self.p_value_threshold

        return {
            "statistic": float(statistic),
            "p_value": float(p_value),
            "drifted": is_drifted,
            # Alert only at the stricter (smaller) alert threshold.
            "alert": is_drifted and p_value < self.alert_threshold,
            "mean_production": float(np.mean(production_lengths)),
            "mean_reference": float(np.mean(reference_lengths)),
            "method": "fallback_ks_test",
        }

    def detect_label_distribution_drift(
        self,
        production_labels: np.ndarray,
        reference_labels: np.ndarray,
    ) -> Dict:
        """
        Detect drift in label distribution using Deepchecks Drift check.

        Args:
            production_labels: Production label data (numpy array or list)
            reference_labels: Reference/baseline label data

        Returns:
            Dictionary with drift detection results
        """
        if not self.use_deepchecks:
            logger.warning("Deepchecks not available, using fallback method")
            return self._fallback_label_drift(production_labels, reference_labels)

        try:
            # Prepare data: collapse labels to per-class counts. 1-D input is
            # treated as label indices, 2-D as a binary indicator matrix.
            if len(reference_labels.shape) == 1:
                ref_counts = np.bincount(reference_labels.astype(int))
            else:
                ref_counts = np.sum(reference_labels, axis=0)

            if len(production_labels.shape) == 1:
                # minlength aligns production counts with the reference classes.
                prod_counts = np.bincount(
                    production_labels.astype(int),
                    minlength=len(ref_counts)
                )
            else:
                prod_counts = np.sum(production_labels, axis=0)

            # Create DataFrames with label columns
            # NOTE(review): each DataFrame holds a single row of per-class
            # counts — confirm this is the input shape the Drift check expects.
            n_labels = len(ref_counts)
            ref_df = pd.DataFrame({
                f'label_{i}': [int(ref_counts[i])] for i in range(n_labels)
            })
            prod_df = pd.DataFrame({
                f'label_{i}': [int(prod_counts[i])] for i in range(n_labels)
            })

            # Run Drift check
            check = Drift()
            reference_dataset = SingleDataset(ref_df, task_type='classification')
            production_dataset = SingleDataset(prod_df, task_type='classification')

            result = check.run(reference_dataset, production_dataset)

            is_drifted = result.failed

            drift_dict = {
                "check_result": result.to_dict(),
                "drifted": is_drifted,
                "alert": is_drifted,
                "reference_counts": ref_counts.tolist(),
                "production_counts": prod_counts.tolist(),
                "method": "deepchecks_drift_check",
            }

            if is_drifted:
                logger.warning("Label distribution drift detected (Deepchecks)")

            return drift_dict

        except Exception as e:
            logger.error(f"Deepchecks Drift check failed: {e}")
            return self._fallback_label_drift(production_labels, reference_labels)

    def _fallback_label_drift(
        self,
        production_labels: np.ndarray,
        reference_labels: np.ndarray,
    ) -> Dict:
        """Fallback to manual Chi-Square test if Deepchecks fails.

        Builds a 2 x n_classes contingency table of reference vs production
        label counts and tests for independence.
        """
        from scipy.stats import chi2_contingency

        # Same 1-D-indices / 2-D-indicator convention as the Deepchecks path.
        if len(reference_labels.shape) == 1:
            ref_counts = np.bincount(reference_labels.astype(int))
        else:
            ref_counts = np.sum(reference_labels, axis=0)

        if len(production_labels.shape) == 1:
            prod_counts = np.bincount(
                production_labels.astype(int),
                minlength=len(ref_counts)
            )
        else:
            prod_counts = np.sum(production_labels, axis=0)

        # Truncate to the common class range so the table is rectangular.
        min_len = min(len(prod_counts), len(ref_counts))
        prod_counts = prod_counts[:min_len]
        ref_counts = ref_counts[:min_len]

        contingency_table = np.array([ref_counts, prod_counts])

        try:
            chi2, p_value, dof, expected = chi2_contingency(contingency_table)
        except Exception as e:
            # chi2_contingency raises e.g. on all-zero rows/columns; report
            # "no drift" rather than propagating.
            # NOTE(review): this early return omits the "method" key that the
            # success path includes — callers using result["method"] should
            # use .get(); confirm whether the key should be added.
            logger.warning(f"Chi-square test failed: {e}")
            return {"statistic": None, "p_value": 1.0, "drifted": False, "alert": False}

        is_drifted = p_value < self.p_value_threshold
        is_alert = p_value < self.alert_threshold

        return {
            "statistic": float(chi2),
            "p_value": float(p_value),
            "drifted": is_drifted,
            "alert": is_alert,
            "method": "fallback_chi_square",
        }

    def detect_word_count_drift(
        self,
        production_texts: List[str],
        reference_texts: List[str],
    ) -> Dict:
        """
        Detect drift in word count distribution.
        Uses Deepchecks TextPropertyDrift or fallback KS test.

        Args:
            production_texts: Text data in production
            reference_texts: Reference/baseline text data

        Returns:
            Dictionary with drift detection results
        """
        # Use TextPropertyDrift which includes word count analysis
        return self.detect_text_property_drift(
            production_texts,
            reference_texts,
            language="unknown"
        )

    def detect_all_drifts(
        self,
        production_texts: List[str],
        production_labels: np.ndarray,
        reference_texts: List[str],
        reference_labels: np.ndarray,
    ) -> Dict:
        """
        Run all drift detection checks using Deepchecks.

        Args:
            production_texts: Production text data
            production_labels: Production label data
            reference_texts: Reference/baseline text data
            reference_labels: Reference/baseline label data

        Returns:
            Dictionary with aggregated drift detection results: one entry per
            check plus an "overall" summary (any drift, any alert, drift
            count, and the methods actually used).
        """
        results = {
            "text_property": self.detect_text_property_drift(
                production_texts,
                reference_texts,
            ),
            "label_distribution": self.detect_label_distribution_drift(
                production_labels,
                reference_labels,
            ),
        }

        # Aggregate before inserting "overall" so the summary only reflects
        # the individual checks.
        any_drifted = any(r.get("drifted", False) for r in results.values())
        any_alert = any(r.get("alert", False) for r in results.values())

        results["overall"] = {
            "drifted": any_drifted,
            "alert": any_alert,
            "num_drifts": sum(1 for r in results.values() if r.get("drifted", False)),
            "methods": [r.get("method", "unknown") for r in results.values()],
        }

        return results

    def detect_all_drifts_from_baseline(
        self,
        production_texts: List[str],
        production_labels: np.ndarray,
        baseline_stats: Dict,
    ) -> Dict:
        """
        Legacy method for backward compatibility.
        Converts baseline_stats dict to reference_texts and reference_labels if available.
        Otherwise reconstructs reference data from baseline statistics.

        Args:
            production_texts: Production text data
            production_labels: Production label data
            baseline_stats: Dictionary with baseline statistics (legacy format)

        Returns:
            Dictionary with aggregated drift detection results
        """

        results = {
            # NOTE(review): production texts are compared against themselves
            # here, so the text-length check can never report drift — confirm
            # whether baseline_stats should supply a reference distribution.
            "text_length": self._fallback_text_property_drift(
                production_texts,
                production_texts,  # Use production as fallback reference
            ),
            # NOTE(review): baseline "label_counts" are aggregate per-class
            # counts, but _fallback_label_drift treats its 1-D argument as raw
            # label indices (it bincounts them) — likely a semantic mismatch;
            # verify intended behavior before relying on this path.
            "label_distribution": self._fallback_label_drift(
                production_labels,
                np.array(baseline_stats.get("label_counts", [])),
            ),
        }

        any_drifted = any(r.get("drifted", False) for r in results.values())
        any_alert = any(r.get("alert", False) for r in results.values())

        results["overall"] = {
            "drifted": any_drifted,
            "alert": any_alert,
            "num_drifts": sum(1 for r in results.values() if r.get("drifted", False)),
        }

        return results
turing/monitoring/feedback/feedback_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Timestamp,Input_Text,Language,Model_Prediction,User_Correction
2
+ 2025-12-11 22:41:05,# Create output directory,python,Usage,DevelopmentNotes
3
+ 2025-12-11 23:05:24,# Entry point for running the API directly with python,python,Usage,DevelopmentNotes
turing/monitoring/feedback_manager.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Feedback ingestion utilities for drift analysis."""
2
+
3
+ from pathlib import Path
4
+ from typing import List, Tuple
5
+
6
+ from loguru import logger
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from turing import config
11
+
12
+
13
def load_feedback_for_language(
    feedback_path: Path,
    language: str,
) -> Tuple[List[str], np.ndarray]:
    """
    Load user feedback for a given language and return texts with one-hot labels.

    Rows whose correction does not match a known label for *language* are
    skipped with a warning. Returns ``([], np.array([]))`` when no valid rows
    remain.

    Args:
        feedback_path: CSV file with Timestamp/Input_Text/Language/
            Model_Prediction/User_Correction columns.
        language: Programming language to filter rows by (case-insensitive).

    Raises:
        FileNotFoundError: If ``feedback_path`` does not exist.
        ValueError: If required columns are missing or no label map exists
            for ``language``.
    """
    if not feedback_path.exists():
        raise FileNotFoundError(f"Feedback file not found: {feedback_path}")

    df = pd.read_csv(feedback_path)
    required = ("Language", "Input_Text", "User_Correction")
    if any(column not in df.columns for column in required):
        raise ValueError(
            "Feedback file must contain Language, Input_Text, and User_Correction columns"
        )

    language_mask = df["Language"].str.lower() == language.lower()
    df_lang = df[language_mask]
    if df_lang.empty:
        logger.warning(f"No feedback rows found for language {language}")
        return [], np.array([])

    label_space = config.LABELS_MAP.get(language)
    if not label_space:
        raise ValueError(f"Label map not found for language: {language}")

    # Case-insensitive lookup from label name to its position in the label space.
    index_of = {name.lower(): pos for pos, name in enumerate(label_space)}

    collected_texts: List[str] = []
    collected_rows: List[np.ndarray] = []
    for _, record in df_lang.iterrows():
        key = str(record["User_Correction"]).strip().lower()
        pos = index_of.get(key)
        if pos is None:
            logger.warning(f"Skipping feedback row with unknown label: {record['User_Correction']}")
            continue

        row_vector = np.zeros(len(label_space), dtype=int)
        row_vector[pos] = 1

        collected_texts.append(str(record["Input_Text"]))
        collected_rows.append(row_vector)

    if not collected_texts:
        logger.warning(f"No valid feedback rows for language {language}")
        return [], np.array([])

    return collected_texts, np.vstack(collected_rows)
turing/monitoring/mlflow_logger.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLflow Logging for Drift Detection
3
+
4
+ Handles logging drift metrics and alerts to MLflow experiment runs.
5
+ """
6
+
7
+ from typing import Dict, Optional
8
+
9
+ from loguru import logger
10
+
11
+ try:
12
+ import mlflow
13
+ except ImportError:
14
+ mlflow = None
15
+
16
+
17
class DriftLogger:
    """
    Logs drift detection results to MLflow.

    All MLflow calls are best-effort: failures are logged as warnings and
    never raised to the caller. When the ``mlflow`` import failed at module
    load, every logging method degrades to a no-op.
    """

    def __init__(self, log_artifacts: bool = False):
        """
        Initialize drift logger.

        Args:
            log_artifacts: Reserved flag for artifact logging (stored, unused here).
        """
        self.log_artifacts = log_artifacts
        # True only when the optional mlflow dependency imported successfully.
        self.has_mlflow = mlflow is not None

    def log_drift_results(
        self,
        drift_results: Dict,
        step: Optional[int] = None,
        prefix: str = "drift",
    ) -> None:
        """
        Log drift detection results to MLflow.

        Args:
            drift_results: Per-check results plus an "overall" summary dict.
            step: Optional MLflow step index for the metrics.
            prefix: Metric-name prefix (default "drift").
        """
        if not self.has_mlflow:
            logger.debug("MLflow not available")
            return

        try:
            summary = drift_results.get("overall", {})
            mlflow.log_metric(f"{prefix}/drifted", float(summary.get("drifted", False)), step=step)
            mlflow.log_metric(
                f"{prefix}/num_drifts", float(summary.get("num_drifts", 0)), step=step
            )

            # Log each individual check's p-value when present.
            for check_name, check_result in drift_results.items():
                if check_name == "overall":
                    continue
                if "p_value" in check_result:
                    mlflow.log_metric(
                        f"{prefix}/{check_name}/p_value", check_result["p_value"], step=step
                    )

            logger.debug("Logged drift results to MLflow")

        except Exception as e:
            # Best-effort logging: never let MLflow failures break the caller.
            logger.warning(f"Failed to log drift to MLflow: {e}")

    def log_baseline_statistics(
        self,
        baseline_stats: Dict,
        prefix: str = "baseline",
    ) -> None:
        """
        Log baseline statistics to MLflow.

        Args:
            baseline_stats: Baseline statistics dict (num_samples, means, language).
            prefix: Metric-name prefix (default "baseline").
        """
        if not self.has_mlflow:
            return

        try:
            # Only numeric stats that are actually present get logged.
            for stat_key in ("num_samples", "text_length_mean", "word_count_mean"):
                stat_value = baseline_stats.get(stat_key)
                if stat_value is not None:
                    mlflow.log_metric(f"{prefix}/{stat_key}", float(stat_value))

            mlflow.log_param(f"{prefix}/language", baseline_stats.get("language", "unknown"))

            logger.debug("Logged baseline to MLflow")

        except Exception as e:
            logger.warning(f"Failed to log baseline: {e}")

    def log_alert(self, message: str, severity: str = "warning") -> None:
        """
        Log drift alert message at the requested severity.

        Falls back to warning level when *severity* is not a valid logger method.
        """
        emit = getattr(logger, severity, logger.warning)
        emit(f"DRIFT ALERT: {message}")
turing/monitoring/synthetic_data_generator.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthetic Data Generator for Drift Testing
3
+
4
+ Generates synthetic drifted datasets to test drift detection.
5
+ """
6
+
7
+ import random
8
+ import string
9
+ from typing import List, Tuple
10
+
11
+ from loguru import logger
12
+ import numpy as np
13
+
14
+
15
class SyntheticDataGenerator:
    """
    Generates synthetic code comment data with controlled drift characteristics.

    Each generator perturbs a caller-supplied reference corpus (truncation,
    extension, character corruption, or biased label resampling) so drift
    detection can be exercised against known drift types. Both ``numpy`` and
    the stdlib ``random`` streams are seeded, so a given seed reproduces the
    same batches; the exact statement order of RNG calls therefore matters.
    """

    def __init__(self, seed: int = 42):
        """
        Initialize synthetic data generator.

        Args:
            seed: Seed applied to both `numpy.random` and `random` for
                reproducible output.
        """
        self.seed = seed
        # Both RNG streams are consumed below: numpy for sampling texts and
        # indices, stdlib random for per-character corruption decisions.
        np.random.seed(seed)
        random.seed(seed)

    def generate_short_comments(
        self,
        reference_texts: List[str],
        ratio: float = 0.5,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate shorter comments (text length drift).

        Args:
            reference_texts: Reference training texts to truncate.
            ratio: Fraction of each sampled text's words to keep (at least 1).
            n_samples: Number of samples to generate.

        Returns:
            List of truncated texts sampled (with replacement) from the reference.
        """
        short_comments = []

        for _ in range(n_samples):
            # Sample a reference text with replacement, then keep only a prefix
            # of its words.
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()
            truncated_len = max(1, int(len(words) * ratio))
            short_text = " ".join(words[:truncated_len])
            short_comments.append(short_text)

        logger.debug(f"Generated {len(short_comments)} short comments")
        return short_comments

    def generate_long_comments(
        self,
        reference_texts: List[str],
        ratio: float = 1.5,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate longer comments (text length drift upward).

        Args:
            reference_texts: Reference training texts to extend.
            ratio: Target length as a multiple of the sampled text's word count.
            n_samples: Number of samples to generate.

        Returns:
            List of extended texts padded with words resampled from themselves.
        """
        long_comments = []

        for _ in range(n_samples):
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()
            target_len = max(1, int(len(words) * ratio))

            # Pad by resampling words from the same text until the target
            # length is reached.
            # NOTE(review): assumes each reference text is non-empty —
            # np.random.choice raises on an empty word list; confirm upstream.
            extended_words = words.copy()
            while len(extended_words) < target_len:
                extended_words.append(np.random.choice(words))

            long_text = " ".join(extended_words[:target_len])
            long_comments.append(long_text)

        logger.debug(f"Generated {len(long_comments)} long comments")
        return long_comments

    def generate_corrupted_vocabulary(
        self,
        reference_texts: List[str],
        corruption_rate: float = 0.2,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate texts with corrupted vocabulary (typos, character swaps).

        Args:
            reference_texts: Reference training texts
            corruption_rate: Fraction of words to corrupt (0.0-1.0)
            n_samples: Number of samples to generate

        Returns:
            List of corrupted texts
        """
        corrupted_texts = []

        for _ in range(n_samples):
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()

            # Corrupt some words; words of length <= 2 are left untouched.
            for i in range(len(words)):
                if random.random() < corruption_rate:
                    word = words[i]
                    if len(word) > 2:
                        # Random character swap or substitution (50/50 choice)
                        if random.random() < 0.5:
                            # Character swap: exchange two adjacent characters
                            idx = random.randint(0, len(word) - 2)
                            word = word[:idx] + word[idx + 1] + word[idx] + word[idx + 2 :]
                        else:
                            # Character substitution with a random lowercase letter
                            idx = random.randint(0, len(word) - 1)
                            word = (
                                word[:idx]
                                + random.choice(string.ascii_lowercase)
                                + word[idx + 1 :]
                            )
                    words[i] = word

            corrupted_text = " ".join(words)
            corrupted_texts.append(corrupted_text)

        logger.debug(f"Generated {len(corrupted_texts)} corrupted texts (rate={corruption_rate})")
        return corrupted_texts

    def generate_label_shift(
        self,
        reference_texts: List[str],
        reference_labels: np.ndarray,
        shift_type: str = "class_imbalance",
        n_samples: int = 100,
    ) -> Tuple[List[str], np.ndarray]:
        """
        Generate batch with label distribution shift (class imbalance).

        Args:
            reference_texts: Reference training texts
            reference_labels: Reference training labels (binary matrix)
            shift_type: 'class_imbalance' - favor majority class
                (NOTE(review): currently unused; only class imbalance is
                implemented regardless of this value)
            n_samples: Number of samples to generate

        Returns:
            Tuple of (texts, shifted_labels)
        """
        texts = []
        shifted_labels = []

        if reference_labels.ndim == 2:
            # Multi-label: get the first label per sample
            # (argmax picks the first column with the max value per row)
            label_indices = np.argmax(reference_labels, axis=1)
        else:
            label_indices = reference_labels

        # Get class distribution
        unique_labels, counts = np.unique(label_indices, return_counts=True)
        majority_class = unique_labels[np.argmax(counts)]
        minority_classes = unique_labels[unique_labels != majority_class]

        # Create imbalanced distribution: 80% majority, 20% minority
        n_majority = int(n_samples * 0.8)
        n_minority = n_samples - n_majority

        # Sample indices with bias toward majority class
        majority_indices = np.where(label_indices == majority_class)[0]
        minority_indices = np.where(np.isin(label_indices, minority_classes))[0]

        selected_indices = []
        selected_indices.extend(np.random.choice(majority_indices, size=n_majority, replace=True))
        if len(minority_indices) > 0:
            # Single-class corpora have no minority rows; the batch may then
            # contain fewer than n_samples entries after the slice below.
            selected_indices.extend(
                np.random.choice(minority_indices, size=n_minority, replace=True)
            )

        # np.random.shuffle works in place on plain Python lists too.
        np.random.shuffle(selected_indices)
        selected_indices = selected_indices[:n_samples]

        # Get texts and labels
        texts = [reference_texts[i] for i in selected_indices]
        shifted_labels = reference_labels[selected_indices]

        logger.debug(f"Generated {len(texts)} samples with class imbalance")
        return texts, shifted_labels

    def generate_synthetic_batch(
        self,
        reference_texts: List[str],
        reference_labels: np.ndarray,
        drift_type: str = "none",
        batch_size: int = 50,
    ) -> Tuple[List[str], np.ndarray]:
        """
        Generate a synthetic batch with specified drift.

        Args:
            reference_texts: Reference training texts
            reference_labels: Reference training labels
            drift_type: Type of drift to introduce:
                - 'none': No drift (baseline)
                - 'text_length_short': Shortened texts
                - 'text_length_long': Elongated texts
                - 'corrupted_vocab': Typos and character swaps
                - 'class_imbalance': Biased label distribution
            batch_size: Number of samples to generate

        Raises:
            ValueError: If ``drift_type`` is not one of the options above.

        Returns:
            Tuple of (texts, labels)
        """
        if drift_type == "none":
            # Plain bootstrap resample of the reference data (no drift).
            indices = np.random.choice(len(reference_texts), size=batch_size, replace=True)
            texts = [reference_texts[i] for i in indices]
            labels = reference_labels[indices]

        elif drift_type == "text_length_short":
            texts = self.generate_short_comments(reference_texts, ratio=0.5, n_samples=batch_size)
            # Labels are sampled independently of the generated texts here —
            # text/label pairing is not preserved for the text-drift modes.
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "text_length_long":
            texts = self.generate_long_comments(reference_texts, ratio=1.5, n_samples=batch_size)
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "corrupted_vocab":
            texts = self.generate_corrupted_vocabulary(
                reference_texts, corruption_rate=0.2, n_samples=batch_size
            )
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "class_imbalance":
            texts, labels = self.generate_label_shift(
                reference_texts,
                reference_labels,
                shift_type="class_imbalance",
                n_samples=batch_size,
            )

        else:
            raise ValueError(f"Unknown drift type: {drift_type}")

        logger.info(f"Generated synthetic batch: {drift_type}, size={batch_size}")
        return texts, labels
turing/tests/unit/test_monitoring.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit Tests for Monitoring Module
3
+ """
4
+
5
+ from pathlib import Path
6
+ import tempfile
7
+
8
+ import numpy as np
9
+ import pytest
10
+
11
+ from turing.monitoring.baseline_manager import (
12
+ BaselineManager,
13
+ extract_baseline_statistics,
14
+ )
15
+ from turing.monitoring.drift_detector import DriftDetector
16
+ from turing.monitoring.synthetic_data_generator import SyntheticDataGenerator
17
+
18
+
19
class TestBaselineExtraction:
    """Tests for baseline statistics extraction."""

    @pytest.fixture
    def sample_data(self):
        # Five short comments paired with a 5x5 multi-hot label matrix.
        comments = [
            "This is a sample comment",
            "Another test comment here",
            "Short text",
            "Longer comment with more information",
            "Medium length comment",
        ]
        label_rows = [
            [1, 0, 1, 0, 0],
            [0, 1, 0, 1, 0],
            [1, 1, 0, 0, 0],
            [0, 0, 1, 1, 1],
            [1, 0, 0, 0, 1],
        ]
        return comments, np.array(label_rows)

    def test_extract_baseline(self, sample_data):
        texts, labels = sample_data
        baseline = extract_baseline_statistics(X_train=texts, y_train=labels, language="java")

        # The baseline must expose both distributions plus its metadata.
        for key in ("text_length_distribution", "word_count_distribution"):
            assert key in baseline
        assert baseline["language"] == "java"
        assert baseline["num_samples"] == len(texts)
42
+
43
+
44
class TestDriftDetector:
    """Tests for drift detection."""

    @pytest.fixture
    def baseline(self):
        # Minimal legacy-format baseline statistics.
        raw_stats = {
            "text_length_distribution": [20, 25, 30, 35],
            "word_count_distribution": [3, 4, 5, 6],
            "label_counts": [5, 3, 2, 4],
        }
        return {key: np.array(values) for key, values in raw_stats.items()}

    def test_detector_init(self):
        detector = DriftDetector(p_value_threshold=0.05, alert_threshold=0.01)
        assert detector.p_value_threshold == 0.05

    def test_text_length_drift(self, baseline):
        detector = DriftDetector(p_value_threshold=0.05)

        prod_texts = [
            "Very long test comment with lots of additional information",
            "Another extremely long sample text",
            "Yet another quite lengthy comment",
            "More long production text",
        ]

        # Halve each production text so the reference population is shorter.
        ref_texts = [text[: len(text) // 2] for text in prod_texts]

        result = detector.detect_text_property_drift(prod_texts, ref_texts)

        for key in ("drifted", "method"):
            assert key in result
75
+
76
+
77
class TestSyntheticDataGenerator:
    """Tests for synthetic data generation."""

    @pytest.fixture
    def sample_data(self):
        comments = ["This is a sample", "Another test", "Short", "Longer text"]
        return comments, np.array([0, 1, 0, 1])

    def test_generator_init(self):
        assert SyntheticDataGenerator(seed=42).seed == 42

    def test_generate_short(self, sample_data):
        texts, _ = sample_data
        gen = SyntheticDataGenerator(seed=42)

        short = gen.generate_short_comments(texts, ratio=0.5, n_samples=10)

        assert len(short) == 10
        # Truncated samples should be shorter than the reference on average.
        avg_short = np.mean([len(t) for t in short])
        avg_reference = np.mean([len(t) for t in texts])
        assert avg_short < avg_reference
98
+
99
+
100
class TestBaselineManager:
    """Tests for baseline management."""

    @pytest.fixture
    def temp_dir(self):
        # Fresh directory per test; removed automatically afterwards.
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir)

    def test_save_and_load(self, temp_dir):
        manager = BaselineManager(mlflow_enabled=False, local_cache_dir=temp_dir)

        baseline = {
            "text_length_distribution": [10, 20, 30],
            "label_counts": [5, 3],
            "language": "java",
            "num_samples": 3,
        }

        # Round-trip the baseline through the manager's local cache.
        manager.save_baseline(baseline, "java", "test", "model")
        loaded = manager.load_baseline("java", "test", "model")

        assert loaded["language"] == "java"
        assert loaded["num_samples"] == 3
123
+
124
+
125
# Allow running this test module directly (outside the pytest CLI).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])