Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
38593e7
1
Parent(s):
8e13241
Sync turing folder from GitHub
Browse files- turing/CLI_runner/verify_drift_detection.py +335 -0
- turing/config.py +6 -0
- turing/monitoring/__init__.py +27 -0
- turing/monitoring/baseline_manager.py +148 -0
- turing/monitoring/drift_detector.py +353 -0
- turing/monitoring/feedback/feedback_data.csv +3 -0
- turing/monitoring/feedback_manager.py +65 -0
- turing/monitoring/mlflow_logger.py +97 -0
- turing/monitoring/synthetic_data_generator.py +240 -0
- turing/tests/unit/test_monitoring.py +126 -0
turing/CLI_runner/verify_drift_detection.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import dagshub
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from mlflow.tracking import MlflowClient
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import typer
|
| 11 |
+
|
| 12 |
+
from turing import config
|
| 13 |
+
from turing.modeling.model_selector import get_best_model_by_tag
|
| 14 |
+
from turing.monitoring.baseline_manager import extract_baseline_statistics
|
| 15 |
+
from turing.monitoring.drift_detector import DriftDetector
|
| 16 |
+
from turing.monitoring.feedback_manager import load_feedback_for_language
|
| 17 |
+
from turing.monitoring.synthetic_data_generator import SyntheticDataGenerator
|
| 18 |
+
|
| 19 |
+
app = typer.Typer()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_training_data(dataset_name: str, language: str):
    """
    Load training data for a specific programming language.

    Args:
        dataset_name: Dataset name (e.g., 'clean-k5000')
        language: Programming language (java, python, pharo)

    Returns:
        Tuple of (texts, labels): texts as a list of strings, labels as a
        numpy array (parsed from serialized lists when stored as strings).

    Raises:
        FileNotFoundError: If the dataset directory or the language's
            training CSV cannot be found.
    """
    import ast

    dataset_path = config.INTERIM_DATA_DIR / "features" / dataset_name

    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset path not found: {dataset_path}")

    # Take the first matching training CSV for this language (if any).
    train_file = next(dataset_path.rglob(f"{language}_train*.csv"), None)

    if not train_file:
        raise FileNotFoundError(f"Training file not found for {language} in {dataset_path}")

    logger.info(f"Loading training data from: {train_file}")
    df = pd.read_csv(train_file)

    X_train = df[config.INPUT_COLUMN].tolist()

    if isinstance(df[config.LABEL_COLUMN].iloc[0], str):
        # Labels are serialized lists (e.g. "[0, 1, 0]"). Parse them with
        # ast.literal_eval, which accepts only Python literals — unlike
        # eval(), which would execute arbitrary code embedded in the CSV.
        y_train = np.array([ast.literal_eval(label) for label in df[config.LABEL_COLUMN]])
    else:
        y_train = df[config.LABEL_COLUMN].values

    logger.success(f"Loaded {len(X_train)} training samples for {language}")
    return X_train, y_train
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def print_drift_report(drift_results: dict, drift_type: str, report_lines: list = None):
    """
    Format and display drift detection results for a specific drift type.

    Args:
        drift_results: Dictionary with drift detection metrics and alerts
        drift_type: Name of drift type tested (e.g., 'none', 'text_length_short')
        report_lines: Optional list to collect formatted report lines
    """

    def emit(line: str):
        # Log the line and, when a collector was supplied, also keep it
        # for the saved report.
        logger.info(line)
        if report_lines is not None:
            report_lines.append(line)

    separator = "=" * 60

    emit(f"\n{separator}")
    emit(f"DRIFT DETECTION REPORT - {drift_type.upper()}")
    emit(f"{separator}")

    for name, metric in drift_results.items():
        if name == "overall":
            continue

        # p_value may be absent for deepchecks results; fall back to the
        # nested check_result "passed" flag when present.
        p_value = metric.get("p_value", metric.get("check_result", {}).get("passed", None))
        statistic = metric.get("statistic", None)
        has_drift = metric.get("drifted", False)
        has_alert = metric.get("alert", False)

        status = "ALERT" if has_alert else ("DRIFT" if has_drift else "OK")

        emit(f"\n{name.upper()}")
        emit(f" Status: {status}")
        if p_value is not None:
            emit(f" P-value: {p_value:.6f}")
        if statistic is not None:
            emit(f" Statistic: {statistic:.6f}")
        emit(f" Drift detected: {has_drift}")
        emit(f" Critical alert: {has_alert}")
        emit(f" Method: {metric.get('method', 'unknown')}")

    overall = drift_results.get("overall", {})

    emit(f"\n{separator}")
    emit("OVERALL SUMMARY")
    emit(f" Drift detected: {overall.get('drifted', False)}")
    emit(f" Critical alert: {overall.get('alert', False)}")
    emit(f" Number of drifted metrics: {overall.get('num_drifts', 0)}")
    emit(f" Methods used: {overall.get('methods', [])}")
    emit(f"{separator}\n")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def save_drift_report(
    language: str,
    dataset_name: str,
    baseline_stats: dict,
    test_results: dict,
    report_text: str,
):
    """
    Save drift detection report to TXT and JSON files.

    Args:
        language: Programming language tested
        dataset_name: Name of dataset used
        baseline_stats: Baseline statistics dictionary
        test_results: Dictionary with test results for each drift type
        report_text: Formatted report text
    """

    def _to_builtin(value):
        # Recursively convert numpy scalars/arrays into plain Python types
        # so json.dump can serialize the results.
        if isinstance(value, dict):
            return {key: _to_builtin(val) for key, val in value.items()}
        if isinstance(value, list):
            return [_to_builtin(item) for item in value]
        if isinstance(value, np.bool_):
            return bool(value)
        if isinstance(value, np.integer):
            return int(value)
        if isinstance(value, np.floating):
            return float(value)
        if isinstance(value, np.ndarray):
            return value.tolist()
        return value

    monitoring_dir = config.REPORTS_DIR / "monitoring"
    monitoring_dir.mkdir(parents=True, exist_ok=True)

    divider = "\n" + "=" * 80 + "\n\n"

    # Human-readable TXT report.
    report_file = monitoring_dir / f"drift_report_{language}.txt"
    txt_lines = [
        "DRIFT DETECTION REPORT\n",
        f"Language: {language}\n",
        f"Dataset: {dataset_name}\n",
        f"Timestamp: {datetime.now().isoformat()}\n",
        f"P-value threshold: {config.DRIFT_P_VALUE_THRESHOLD}\n",
        f"Alert threshold: {config.DRIFT_ALERT_THRESHOLD}\n",
        divider,
        "BASELINE STATISTICS\n",
        f" Text length: mean={baseline_stats['text_length_mean']:.2f}, std={baseline_stats['text_length_std']:.2f}\n",
        f" Word count: mean={baseline_stats['word_count_mean']:.2f}, std={baseline_stats['word_count_std']:.2f}\n",
        f" Label counts: {baseline_stats['label_counts']}\n",
        f" Number of samples: {baseline_stats['num_samples']}\n",
        divider,
        report_text,
    ]
    with open(report_file, "w") as f:
        f.writelines(txt_lines)

    # Machine-readable JSON report.
    json_file = monitoring_dir / f"drift_report_{language}.json"
    report_data = {
        "language": language,
        "dataset": dataset_name,
        "timestamp": datetime.now().isoformat(),
        "config": {
            "p_value_threshold": config.DRIFT_P_VALUE_THRESHOLD,
            "alert_threshold": config.DRIFT_ALERT_THRESHOLD,
        },
        "baseline": {
            key: baseline_stats[key]
            for key in (
                "text_length_mean",
                "text_length_std",
                "word_count_mean",
                "word_count_std",
                "label_counts",
                "num_samples",
                "n_labels",
            )
        },
        "test_results": _to_builtin(test_results),
    }

    with open(json_file, "w") as f:
        json.dump(report_data, f, indent=2)

    logger.success("Report saved to:")
    logger.info(f" Text: {report_file}")
    logger.info(f" JSON: {json_file}")
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
@app.command()
def verify(
    language: str = typer.Option("java", help="Language to test (java, python, pharo)"),
    repo_owner: str = typer.Option("se4ai2526-uniba", help="DagsHub repository owner"),
    repo_name: str = typer.Option("Turing", help="DagsHub repository name"),
    n_samples: int = typer.Option(100, help="Number of samples for synthetic data generation"),
    use_feedback: bool = typer.Option(False, help="Include user feedback rows in drift analysis"),
    feedback_path: Path = typer.Option(
        config.PROJ_ROOT / "turing" / "monitoring" / "feedback" / "feedback_data.csv",
        help="Path to user feedback CSV",
    ),
):
    """
    Verify drift detection on best model's training dataset.
    """
    logger.info("Starting drift detection verification...")
    logger.info("Configuration:")
    logger.info(f" Language: {language}")
    logger.info(f" P-value threshold: {config.DRIFT_P_VALUE_THRESHOLD}")
    logger.info(f" Alert threshold: {config.DRIFT_ALERT_THRESHOLD}")
    logger.info(f" Baseline cache: {config.BASELINE_CACHE_DIR}")

    # Connect MLflow tracking to the DagsHub-hosted server.
    dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)

    logger.info(f"\n[1/6] Searching for best model for {language}...")
    best_model = get_best_model_by_tag(language=language)
    if not best_model:
        logger.error(f"No best model found for {language}")
        return

    run_id = best_model["run_id"]

    logger.info(f"\n[2/6] Retrieving dataset information from MLflow run {run_id}...")
    run = MlflowClient().get_run(run_id)
    dataset_name = run.data.tags.get("dataset_name", None)
    if not dataset_name:
        logger.error("Dataset name not found in run tags")
        return

    logger.success(f"Found dataset: {dataset_name}")

    logger.info("\n[3/6] Loading training data...")
    try:
        X_train, y_train = load_training_data(dataset_name, language)
    except Exception as e:
        logger.error(f"Failed to load training data: {e}")
        return

    logger.info("\n[4/6] Extracting baseline statistics...")
    baseline_stats = extract_baseline_statistics(X_train, y_train, language)
    logger.success("Baseline extracted:")
    logger.info(
        f" Text length: mean={baseline_stats['text_length_mean']:.2f}, std={baseline_stats['text_length_std']:.2f}"
    )
    logger.info(
        f" Word count: mean={baseline_stats['word_count_mean']:.2f}, std={baseline_stats['word_count_std']:.2f}"
    )
    logger.info(f" Label counts: {baseline_stats['label_counts']}")

    logger.info("\n[5/6] Initializing drift detection components...")
    detector = DriftDetector()
    generator = SyntheticDataGenerator(seed=42)

    feedback_texts, feedback_labels = [], np.array([])
    if use_feedback:
        try:
            feedback_texts, feedback_labels = load_feedback_for_language(feedback_path, language)
        except Exception as e:
            # Best-effort: missing/invalid feedback must not stop the run.
            logger.warning(f"Feedback load skipped: {e}")

    logger.info("\n[6/6] Testing drift detection on different data types...\n")

    test_cases = [
        ("NORMAL DATA (no drift expected)", "none"),
        ("SHORT TEXT DRIFT", "text_length_short"),
        ("LONG TEXT DRIFT", "text_length_long"),
        ("CORRUPTED VOCABULARY DRIFT", "corrupted_vocab"),
        ("CLASS IMBALANCE DRIFT", "class_imbalance"),
    ]
    if use_feedback and len(feedback_texts) > 0:
        test_cases.append(("USER FEEDBACK", "feedback"))

    all_test_results = {}
    all_report_lines = []
    banner = "#" * 60

    for test_name, drift_type in test_cases:
        logger.info(f"\n{banner}")
        logger.info(f"Test: {test_name}")
        logger.info(f"{banner}")

        if drift_type == "feedback":
            production_texts, production_labels = feedback_texts, feedback_labels
        else:
            production_texts, production_labels = generator.generate_synthetic_batch(
                reference_texts=X_train,
                reference_labels=y_train,
                drift_type=drift_type,
                batch_size=n_samples,
            )

        drift_results = detector.detect_all_drifts(
            production_texts=production_texts,
            production_labels=production_labels,
            reference_texts=X_train,
            reference_labels=y_train,
        )

        all_test_results[drift_type] = drift_results
        print_drift_report(drift_results, drift_type, report_lines=all_report_lines)

    logger.info("\nSaving drift detection report...")
    save_drift_report(
        language=language,
        dataset_name=dataset_name,
        baseline_stats=baseline_stats,
        test_results=all_test_results,
        report_text="\n".join(all_report_lines),
    )

    logger.success("\nDrift detection verification completed!")
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
if __name__ == "__main__":
|
| 335 |
+
app()
|
turing/config.py
CHANGED
|
@@ -49,6 +49,12 @@ MAX_AVG_FLOPS = 5000.0 # GFLOPS
|
|
| 49 |
# Training parameters
|
| 50 |
DEFAULT_BATCH_SIZE = 32
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
# Model configuration mapping
|
| 53 |
MODEL_CONFIG = {
|
| 54 |
"codeberta": {
|
|
|
|
| 49 |
# Training parameters
|
| 50 |
DEFAULT_BATCH_SIZE = 32
|
| 51 |
|
| 52 |
+
# Drift detection parameters
DRIFT_P_VALUE_THRESHOLD = 0.05  # P-value threshold for drift detection warning
DRIFT_ALERT_THRESHOLD = 0.01  # P-value threshold for drift alert (critical)
BASELINE_CACHE_DIR = Path.home() / ".turing_baselines"  # Local cache for baseline statistics
DRIFT_DETECTION_ENABLED = True  # Enable/disable drift detection globally
|
| 57 |
+
|
| 58 |
# Model configuration mapping
|
| 59 |
MODEL_CONFIG = {
|
| 60 |
"codeberta": {
|
turing/monitoring/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Drift Monitoring Module
|
| 3 |
+
|
| 4 |
+
This module provides tools for detecting and logging data drift in production predictions
|
| 5 |
+
against reference baselines from training data. It integrates with MLflow for baseline
|
| 6 |
+
storage and drift metric logging.
|
| 7 |
+
|
| 8 |
+
Components:
|
| 9 |
+
- drift_detector: Core drift detection using statistical tests (KS test, Chi-square)
|
| 10 |
+
- baseline_manager: Extract and manage baseline statistics from training data
|
| 11 |
+
- mlflow_logger: Log drift metrics and alerts to MLflow
|
| 12 |
+
- synthetic_data_generator: Generate synthetic drifted data for testing
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from turing.monitoring.baseline_manager import (
|
| 16 |
+
BaselineManager,
|
| 17 |
+
extract_baseline_statistics,
|
| 18 |
+
)
|
| 19 |
+
from turing.monitoring.drift_detector import DriftDetector
|
| 20 |
+
from turing.monitoring.mlflow_logger import DriftLogger
|
| 21 |
+
|
| 22 |
+
__all__ = [
|
| 23 |
+
"DriftDetector",
|
| 24 |
+
"BaselineManager",
|
| 25 |
+
"DriftLogger",
|
| 26 |
+
"extract_baseline_statistics",
|
| 27 |
+
]
|
turing/monitoring/baseline_manager.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline Management Module
|
| 3 |
+
|
| 4 |
+
Handles extraction of baseline statistics from training data,
|
| 5 |
+
storage as MLflow artifacts, and retrieval for drift detection.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import pickle
|
| 11 |
+
from typing import Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
from loguru import logger
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
from turing import config
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
import mlflow
|
| 20 |
+
from mlflow.tracking import MlflowClient
|
| 21 |
+
except ImportError:
|
| 22 |
+
mlflow = None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def extract_baseline_statistics(
    X_train: List[str],
    y_train: np.ndarray,
    language: str = "java",
) -> Dict:
    """
    Extract baseline statistics from training data.

    Args:
        X_train: List of training comment texts
        y_train: Training labels (binary matrix or label indices)
        language: Language of the training data

    Returns:
        Dictionary containing baseline statistics
    """
    lengths = np.array([len(text) for text in X_train])
    words = np.array([len(text.split()) for text in X_train])

    if y_train.ndim == 1:
        # 1-D labels are class indices: count occurrences per class.
        n_labels = int(np.max(y_train)) + 1
        label_counts = np.bincount(y_train.astype(int), minlength=n_labels)
    else:
        # 2-D labels are a binary indicator matrix: sum each column.
        label_counts = np.sum(y_train, axis=0)
        n_labels = y_train.shape[1]

    baseline_stats = {
        "text_length_distribution": lengths.tolist(),
        "word_count_distribution": words.tolist(),
        "label_counts": label_counts.tolist(),
        "language": language,
        "num_samples": len(X_train),
        "n_labels": int(n_labels),
        "text_length_mean": float(lengths.mean()),
        "text_length_std": float(lengths.std()),
        "text_length_min": float(lengths.min()),
        "text_length_max": float(lengths.max()),
        "word_count_mean": float(words.mean()),
        "word_count_std": float(words.std()),
    }

    logger.info(f"Extracted baseline for {language}: {len(X_train)} samples")

    return baseline_stats
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class BaselineManager:
    """
    Manages baseline statistics for drift detection.

    Baselines are always persisted to a local pickle cache; when MLflow is
    importable and a run id is supplied, a JSON-serializable subset is also
    logged as an MLflow artifact.
    """

    def __init__(self, mlflow_enabled: bool = True, local_cache_dir: Optional[Path] = None):
        """
        Initialize baseline manager.

        Args:
            mlflow_enabled: Enable MLflow artifact logging
            local_cache_dir: Local cache directory (default from config.BASELINE_CACHE_DIR)
        """
        # MLflow support requires both the caller's flag and a successful import.
        self.mlflow_enabled = mlflow_enabled and mlflow is not None
        self.local_cache_dir = local_cache_dir or config.BASELINE_CACHE_DIR
        self.local_cache_dir.mkdir(parents=True, exist_ok=True)

        if self.mlflow_enabled:
            self.mlflow_client = MlflowClient()

        logger.info(f"BaselineManager initialized (cache: {self.local_cache_dir})")

    def save_baseline(
        self,
        baseline_stats: Dict,
        language: str,
        dataset_name: str,
        model_id: str = "default",
        run_id: Optional[str] = None,
    ) -> None:
        """
        Save baseline statistics to MLflow and local cache.
        """
        baseline_path = self._get_baseline_path(language, dataset_name, model_id)
        baseline_path.parent.mkdir(parents=True, exist_ok=True)

        with open(baseline_path, "wb") as f:
            pickle.dump(baseline_stats, f)
        logger.info(f"Saved baseline to {baseline_path}")

        if not (self.mlflow_enabled and run_id):
            return

        try:
            # Only JSON-friendly values can be uploaded as an MLflow artifact;
            # numpy arrays and other objects are filtered out here.
            json_path = baseline_path.with_suffix(".json")
            serializable = {
                key: value
                for key, value in baseline_stats.items()
                if isinstance(value, (int, float, str, list, bool))
            }
            with open(json_path, "w") as f:
                json.dump(serializable, f, indent=2)

            mlflow.log_artifact(str(json_path), artifact_path=f"baselines/{language}")
            logger.info("Logged baseline to MLflow")
        except Exception as e:
            # Artifact logging is best-effort; the local cache already holds the data.
            logger.warning(f"Failed to log baseline to MLflow: {e}")

    def load_baseline(
        self,
        language: str,
        dataset_name: str,
        model_id: str = "default",
    ) -> Dict:
        """
        Load baseline statistics from local cache.
        """
        baseline_path = self._get_baseline_path(language, dataset_name, model_id)

        if not baseline_path.exists():
            raise FileNotFoundError(f"Baseline not found at {baseline_path}")

        with open(baseline_path, "rb") as f:
            stats = pickle.load(f)
        logger.info(f"Loaded baseline from cache: {baseline_path}")
        return stats

    def _get_baseline_path(self, language: str, dataset_name: str, model_id: str) -> Path:
        """Generate local cache path for baseline."""
        return self.local_cache_dir / language / f"{dataset_name}_{model_id}_baseline.pkl"
|
turing/monitoring/drift_detector.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Drift Detection Module using Deepchecks
|
| 3 |
+
|
| 4 |
+
Implements drift detection using Deepchecks integrated checks:
|
| 5 |
+
- Drift check for text properties
|
| 6 |
+
- Label distribution drift
|
| 7 |
+
- Custom metrics comparison
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from typing import Dict, List
|
| 11 |
+
|
| 12 |
+
from loguru import logger
|
| 13 |
+
import numpy as np
|
| 14 |
+
import pandas as pd
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from deepchecks.nlp import SingleDataset
|
| 18 |
+
from deepchecks.nlp.checks import Drift, TextPropertyDrift
|
| 19 |
+
except ImportError:
|
| 20 |
+
logger.warning("Deepchecks not installed. Install with: pip install deepchecks[nlp]")
|
| 21 |
+
Drift = None
|
| 22 |
+
TextPropertyDrift = None
|
| 23 |
+
|
| 24 |
+
from turing import config
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class DriftDetector:
|
| 28 |
+
"""
|
| 29 |
+
Detects data drift using Deepchecks integrated checks comparing production data
|
| 30 |
+
against baseline/reference datasets.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(self, p_value_threshold: float = None, alert_threshold: float = None):
|
| 34 |
+
"""
|
| 35 |
+
Initialize drift detector with Deepchecks.
|
| 36 |
+
|
| 37 |
+
Args:
|
| 38 |
+
p_value_threshold: P-value threshold for drift detection (default from config)
|
| 39 |
+
alert_threshold: More sensitive threshold for critical alerts (default from config)
|
| 40 |
+
"""
|
| 41 |
+
self.p_value_threshold = p_value_threshold or config.DRIFT_P_VALUE_THRESHOLD
|
| 42 |
+
self.alert_threshold = alert_threshold or config.DRIFT_ALERT_THRESHOLD
|
| 43 |
+
self.use_deepchecks = Drift is not None
|
| 44 |
+
|
| 45 |
+
def detect_text_property_drift(
|
| 46 |
+
self,
|
| 47 |
+
production_texts: List[str],
|
| 48 |
+
reference_texts: List[str],
|
| 49 |
+
language: str = "java",
|
| 50 |
+
) -> Dict:
|
| 51 |
+
"""
|
| 52 |
+
Detect drift in text properties using Deepchecks TextPropertyDrift.
|
| 53 |
+
|
| 54 |
+
Args:
|
| 55 |
+
production_texts: Text data in production
|
| 56 |
+
reference_texts: Reference/baseline text data
|
| 57 |
+
language: Language of the texts
|
| 58 |
+
|
| 59 |
+
Returns:
|
| 60 |
+
Dictionary with drift detection results
|
| 61 |
+
"""
|
| 62 |
+
if not self.use_deepchecks:
|
| 63 |
+
logger.warning("Deepchecks not available, using fallback method")
|
| 64 |
+
return self._fallback_text_property_drift(production_texts, reference_texts)
|
| 65 |
+
|
| 66 |
+
try:
|
| 67 |
+
# Create Deepchecks datasets
|
| 68 |
+
ref_df = pd.DataFrame({'text': reference_texts})
|
| 69 |
+
prod_df = pd.DataFrame({'text': production_texts})
|
| 70 |
+
|
| 71 |
+
reference_dataset = SingleDataset(
|
| 72 |
+
ref_df,
|
| 73 |
+
text_column='text',
|
| 74 |
+
task_type='text_classification'
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
production_dataset = SingleDataset(
|
| 78 |
+
prod_df,
|
| 79 |
+
text_column='text',
|
| 80 |
+
task_type='text_classification'
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
# Run TextPropertyDrift check
|
| 84 |
+
check = TextPropertyDrift()
|
| 85 |
+
result = check.run(
|
| 86 |
+
reference_dataset,
|
| 87 |
+
production_dataset,
|
| 88 |
+
model_classes=None
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Extract results
|
| 92 |
+
scores = result.to_dict()
|
| 93 |
+
is_drifted = result.failed
|
| 94 |
+
|
| 95 |
+
drift_dict = {
|
| 96 |
+
"check_result": scores,
|
| 97 |
+
"drifted": is_drifted,
|
| 98 |
+
"alert": is_drifted,
|
| 99 |
+
"method": "deepchecks_text_property_drift",
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
if is_drifted:
|
| 103 |
+
logger.warning("Text property drift detected (Deepchecks)")
|
| 104 |
+
|
| 105 |
+
return drift_dict
|
| 106 |
+
|
| 107 |
+
except Exception as e:
|
| 108 |
+
logger.error(f"Deepchecks TextPropertyDrift failed: {e}")
|
| 109 |
+
return self._fallback_text_property_drift(production_texts, reference_texts)
|
| 110 |
+
|
| 111 |
+
def _fallback_text_property_drift(
|
| 112 |
+
self,
|
| 113 |
+
production_texts: List[str],
|
| 114 |
+
reference_texts: List[str],
|
| 115 |
+
) -> Dict:
|
| 116 |
+
"""Fallback to manual calculation if Deepchecks fails."""
|
| 117 |
+
from scipy.stats import ks_2samp
|
| 118 |
+
|
| 119 |
+
production_lengths = np.array([len(text) for text in production_texts])
|
| 120 |
+
reference_lengths = np.array([len(text) for text in reference_texts])
|
| 121 |
+
statistic, p_value = ks_2samp(reference_lengths, production_lengths)
|
| 122 |
+
|
| 123 |
+
is_drifted = p_value < self.p_value_threshold
|
| 124 |
+
|
| 125 |
+
return {
|
| 126 |
+
"statistic": float(statistic),
|
| 127 |
+
"p_value": float(p_value),
|
| 128 |
+
"drifted": is_drifted,
|
| 129 |
+
"alert": is_drifted and p_value < self.alert_threshold,
|
| 130 |
+
"mean_production": float(np.mean(production_lengths)),
|
| 131 |
+
"mean_reference": float(np.mean(reference_lengths)),
|
| 132 |
+
"method": "fallback_ks_test",
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
def detect_label_distribution_drift(
    self,
    production_labels: np.ndarray,
    reference_labels: np.ndarray,
) -> Dict:
    """
    Detect drift in label distribution using Deepchecks Drift check.

    Falls back to a manual chi-square test when Deepchecks is not
    installed or its check raises.

    Args:
        production_labels: Production label data (numpy array or list)
        reference_labels: Reference/baseline label data

    Returns:
        Dictionary with drift detection results
    """
    if not self.use_deepchecks:
        logger.warning("Deepchecks not available, using fallback method")
        return self._fallback_label_drift(production_labels, reference_labels)

    try:
        # Prepare data: collapse labels into per-class counts.
        # 1-D input is treated as integer class ids; 2-D input is treated
        # as a one-hot / multi-label indicator matrix.
        if len(reference_labels.shape) == 1:
            ref_counts = np.bincount(reference_labels.astype(int))
        else:
            ref_counts = np.sum(reference_labels, axis=0)

        if len(production_labels.shape) == 1:
            # minlength keeps the production vector aligned with the
            # reference class range even if some classes are absent.
            prod_counts = np.bincount(
                production_labels.astype(int),
                minlength=len(ref_counts)
            )
        else:
            prod_counts = np.sum(production_labels, axis=0)

        # Create DataFrames with label columns (one count per column)
        n_labels = len(ref_counts)
        ref_df = pd.DataFrame({
            f'label_{i}': [int(ref_counts[i])] for i in range(n_labels)
        })
        prod_df = pd.DataFrame({
            f'label_{i}': [int(prod_counts[i])] for i in range(n_labels)
        })

        # Run Drift check
        check = Drift()
        reference_dataset = SingleDataset(ref_df, task_type='classification')
        production_dataset = SingleDataset(prod_df, task_type='classification')

        result = check.run(reference_dataset, production_dataset)

        # `failed` reflects whether the Deepchecks condition tripped
        is_drifted = result.failed

        drift_dict = {
            "check_result": result.to_dict(),
            "drifted": is_drifted,
            "alert": is_drifted,
            "reference_counts": ref_counts.tolist(),
            "production_counts": prod_counts.tolist(),
            "method": "deepchecks_drift_check",
        }

        if is_drifted:
            logger.warning("Label distribution drift detected (Deepchecks)")

        return drift_dict

    except Exception as e:
        # Any Deepchecks failure degrades gracefully to the manual test
        logger.error(f"Deepchecks Drift check failed: {e}")
        return self._fallback_label_drift(production_labels, reference_labels)
|
| 204 |
+
|
| 205 |
+
def _fallback_label_drift(
|
| 206 |
+
self,
|
| 207 |
+
production_labels: np.ndarray,
|
| 208 |
+
reference_labels: np.ndarray,
|
| 209 |
+
) -> Dict:
|
| 210 |
+
"""Fallback to manual Chi-Square test if Deepchecks fails."""
|
| 211 |
+
from scipy.stats import chi2_contingency
|
| 212 |
+
|
| 213 |
+
if len(reference_labels.shape) == 1:
|
| 214 |
+
ref_counts = np.bincount(reference_labels.astype(int))
|
| 215 |
+
else:
|
| 216 |
+
ref_counts = np.sum(reference_labels, axis=0)
|
| 217 |
+
|
| 218 |
+
if len(production_labels.shape) == 1:
|
| 219 |
+
prod_counts = np.bincount(
|
| 220 |
+
production_labels.astype(int),
|
| 221 |
+
minlength=len(ref_counts)
|
| 222 |
+
)
|
| 223 |
+
else:
|
| 224 |
+
prod_counts = np.sum(production_labels, axis=0)
|
| 225 |
+
|
| 226 |
+
min_len = min(len(prod_counts), len(ref_counts))
|
| 227 |
+
prod_counts = prod_counts[:min_len]
|
| 228 |
+
ref_counts = ref_counts[:min_len]
|
| 229 |
+
|
| 230 |
+
contingency_table = np.array([ref_counts, prod_counts])
|
| 231 |
+
|
| 232 |
+
try:
|
| 233 |
+
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
|
| 234 |
+
except Exception as e:
|
| 235 |
+
logger.warning(f"Chi-square test failed: {e}")
|
| 236 |
+
return {"statistic": None, "p_value": 1.0, "drifted": False, "alert": False}
|
| 237 |
+
|
| 238 |
+
is_drifted = p_value < self.p_value_threshold
|
| 239 |
+
is_alert = p_value < self.alert_threshold
|
| 240 |
+
|
| 241 |
+
return {
|
| 242 |
+
"statistic": float(chi2),
|
| 243 |
+
"p_value": float(p_value),
|
| 244 |
+
"drifted": is_drifted,
|
| 245 |
+
"alert": is_alert,
|
| 246 |
+
"method": "fallback_chi_square",
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
def detect_word_count_drift(
    self,
    production_texts: List[str],
    reference_texts: List[str],
) -> Dict:
    """
    Detect drift in the word-count distribution.

    Delegates to :meth:`detect_text_property_drift`, whose Deepchecks text
    properties already cover word-count analysis (with the KS-test
    fallback when Deepchecks is unavailable).

    Args:
        production_texts: Text data observed in production.
        reference_texts: Reference/baseline text data.

    Returns:
        Dictionary with drift detection results.
    """
    return self.detect_text_property_drift(
        production_texts, reference_texts, language="unknown"
    )
|
| 271 |
+
|
| 272 |
+
def detect_all_drifts(
    self,
    production_texts: List[str],
    production_labels: np.ndarray,
    reference_texts: List[str],
    reference_labels: np.ndarray,
) -> Dict:
    """
    Run all drift detection checks and aggregate the outcome.

    Args:
        production_texts: Production text data.
        production_labels: Production label data.
        reference_texts: Reference/baseline text data.
        reference_labels: Reference/baseline label data.

    Returns:
        Dictionary with one entry per individual check plus an "overall"
        summary (drift/alert flags, drift count, and methods used).
    """
    checks = {
        "text_property": self.detect_text_property_drift(
            production_texts,
            reference_texts,
        ),
        "label_distribution": self.detect_label_distribution_drift(
            production_labels,
            reference_labels,
        ),
    }

    # Summarize before inserting "overall" so it is not counted twice
    individual = list(checks.values())
    checks["overall"] = {
        "drifted": any(c.get("drifted", False) for c in individual),
        "alert": any(c.get("alert", False) for c in individual),
        "num_drifts": sum(1 for c in individual if c.get("drifted", False)),
        "methods": [c.get("method", "unknown") for c in individual],
    }

    return checks
|
| 312 |
+
|
| 313 |
+
def detect_all_drifts_from_baseline(
    self,
    production_texts: List[str],
    production_labels: np.ndarray,
    baseline_stats: Dict,
) -> Dict:
    """
    Legacy method for backward compatibility.
    Converts baseline_stats dict to reference_texts and reference_labels if available.
    Otherwise reconstructs reference data from baseline statistics.

    Args:
        production_texts: Production text data
        production_labels: Production label data
        baseline_stats: Dictionary with baseline statistics (legacy format)

    Returns:
        Dictionary with aggregated drift detection results
    """

    results = {
        # NOTE(review): production texts are compared against themselves
        # here, so the text_length check can never report drift; it is kept
        # only to preserve the legacy result shape — confirm whether
        # baseline texts should be reconstructed from baseline_stats.
        "text_length": self._fallback_text_property_drift(
            production_texts,
            production_texts,  # Use production as fallback reference
        ),
        # Label drift is still meaningful: compares production labels with
        # the stored baseline class counts (empty array if key is missing).
        "label_distribution": self._fallback_label_drift(
            production_labels,
            np.array(baseline_stats.get("label_counts", [])),
        ),
    }

    any_drifted = any(r.get("drifted", False) for r in results.values())
    any_alert = any(r.get("alert", False) for r in results.values())

    results["overall"] = {
        "drifted": any_drifted,
        "alert": any_alert,
        "num_drifts": sum(1 for r in results.values() if r.get("drifted", False)),
    }

    return results
|
turing/monitoring/feedback/feedback_data.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Timestamp,Input_Text,Language,Model_Prediction,User_Correction
|
| 2 |
+
2025-12-11 22:41:05,# Create output directory,python,Usage,DevelopmentNotes
|
| 3 |
+
2025-12-11 23:05:24,# Entry point for running the API directly with python,python,Usage,DevelopmentNotes
|
turing/monitoring/feedback_manager.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Feedback ingestion utilities for drift analysis."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import List, Tuple
|
| 5 |
+
|
| 6 |
+
from loguru import logger
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
from turing import config
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_feedback_for_language(
    feedback_path: Path,
    language: str,
) -> Tuple[List[str], np.ndarray]:
    """
    Load user feedback for a given language and return texts with one-hot labels.

    Rows with unknown labels are skipped. Returns empty lists if no valid rows.
    """
    if not feedback_path.exists():
        raise FileNotFoundError(f"Feedback file not found: {feedback_path}")

    df = pd.read_csv(feedback_path)
    required = ("Language", "Input_Text", "User_Correction")
    if any(column not in df.columns for column in required):
        raise ValueError(
            "Feedback file must contain Language, Input_Text, and User_Correction columns"
        )

    # Case-insensitive language filter
    df_lang = df[df["Language"].str.lower() == language.lower()]
    if df_lang.empty:
        logger.warning(f"No feedback rows found for language {language}")
        return [], np.array([])

    label_space = config.LABELS_MAP.get(language)
    if not label_space:
        raise ValueError(f"Label map not found for language: {language}")

    # Map normalized label names to their positional index
    label_to_idx = {name.lower(): position for position, name in enumerate(label_space)}

    texts: List[str] = []
    labels: List[np.ndarray] = []
    for _, row in df_lang.iterrows():
        idx = label_to_idx.get(str(row["User_Correction"]).strip().lower())
        if idx is None:
            logger.warning(f"Skipping feedback row with unknown label: {row['User_Correction']}")
            continue

        encoded = np.zeros(len(label_space), dtype=int)
        encoded[idx] = 1

        texts.append(str(row["Input_Text"]))
        labels.append(encoded)

    if not texts:
        logger.warning(f"No valid feedback rows for language {language}")
        return [], np.array([])

    return texts, np.vstack(labels)
|
turing/monitoring/mlflow_logger.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MLflow Logging for Drift Detection
|
| 3 |
+
|
| 4 |
+
Handles logging drift metrics and alerts to MLflow experiment runs.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, Optional
|
| 8 |
+
|
| 9 |
+
from loguru import logger
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
import mlflow
|
| 13 |
+
except ImportError:
|
| 14 |
+
mlflow = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DriftLogger:
    """
    Publishes drift-detection results, baseline statistics, and alerts to
    MLflow (no-op when the mlflow package is not installed).
    """

    def __init__(self, log_artifacts: bool = False):
        """
        Initialize drift logger.
        """
        self.log_artifacts = log_artifacts
        # mlflow is None when the optional import at module top failed
        self.has_mlflow = mlflow is not None

    def log_drift_results(
        self,
        drift_results: Dict,
        step: Optional[int] = None,
        prefix: str = "drift",
    ) -> None:
        """
        Log drift detection results to MLflow.
        """
        if not self.has_mlflow:
            logger.debug("MLflow not available")
            return

        try:
            summary = drift_results.get("overall", {})
            mlflow.log_metric(f"{prefix}/drifted", float(summary.get("drifted", False)), step=step)
            mlflow.log_metric(
                f"{prefix}/num_drifts", float(summary.get("num_drifts", 0)), step=step
            )

            # One p-value metric per individual check (skip the summary)
            for check_name, outcome in drift_results.items():
                if check_name == "overall":
                    continue

                if "p_value" in outcome:
                    mlflow.log_metric(
                        f"{prefix}/{check_name}/p_value", outcome["p_value"], step=step
                    )

            logger.debug("Logged drift results to MLflow")

        except Exception as e:
            logger.warning(f"Failed to log drift to MLflow: {e}")

    def log_baseline_statistics(
        self,
        baseline_stats: Dict,
        prefix: str = "baseline",
    ) -> None:
        """
        Log baseline statistics to MLflow.
        """
        if not self.has_mlflow:
            return

        try:
            numeric_metrics = {
                f"{prefix}/num_samples": baseline_stats.get("num_samples"),
                f"{prefix}/text_length_mean": baseline_stats.get("text_length_mean"),
                f"{prefix}/word_count_mean": baseline_stats.get("word_count_mean"),
            }

            # Missing statistics are simply skipped
            for metric_name, value in numeric_metrics.items():
                if value is not None:
                    mlflow.log_metric(metric_name, float(value))

            mlflow.log_param(f"{prefix}/language", baseline_stats.get("language", "unknown"))

            logger.debug("Logged baseline to MLflow")

        except Exception as e:
            logger.warning(f"Failed to log baseline: {e}")

    def log_alert(self, message: str, severity: str = "warning") -> None:
        """
        Log drift alert message.
        """
        # Unknown severities fall back to warning level
        emit = getattr(logger, severity, logger.warning)
        emit(f"DRIFT ALERT: {message}")
|
turing/monitoring/synthetic_data_generator.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Synthetic Data Generator for Drift Testing
|
| 3 |
+
|
| 4 |
+
Generates synthetic drifted datasets to test drift detection.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import random
|
| 8 |
+
import string
|
| 9 |
+
from typing import List, Tuple
|
| 10 |
+
|
| 11 |
+
from loguru import logger
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SyntheticDataGenerator:
    """
    Generates synthetic code comment data with controlled drift characteristics.

    Each generator derives a new batch from reference texts/labels while
    injecting one known kind of drift, so drift detectors can be validated
    against ground truth. Output is reproducible for a fixed seed because
    both numpy and the stdlib random module are seeded in __init__.
    """

    def __init__(self, seed: int = 42):
        """
        Initialize synthetic data generator.

        Args:
            seed: Seed applied to both numpy's and the stdlib random module's
                global state for reproducible batches.
        """
        self.seed = seed
        np.random.seed(seed)
        random.seed(seed)

    def generate_short_comments(
        self,
        reference_texts: List[str],
        ratio: float = 0.5,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate shorter comments (text length drift).

        Args:
            reference_texts: Reference training texts to sample from.
            ratio: Fraction of each sampled text's words to keep (0.0-1.0);
                at least one word is always kept.
            n_samples: Number of samples to generate.

        Returns:
            List of truncated texts.
        """
        short_comments = []

        for _ in range(n_samples):
            # Sample a reference text (with replacement) and keep only its
            # leading words.
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()
            truncated_len = max(1, int(len(words) * ratio))
            short_text = " ".join(words[:truncated_len])
            short_comments.append(short_text)

        logger.debug(f"Generated {len(short_comments)} short comments")
        return short_comments

    def generate_long_comments(
        self,
        reference_texts: List[str],
        ratio: float = 1.5,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate longer comments (text length drift upward).

        Args:
            reference_texts: Reference training texts to sample from.
            ratio: Target word-count multiplier (>1.0 lengthens the text).
            n_samples: Number of samples to generate.

        Returns:
            List of elongated texts.
        """
        long_comments = []

        for _ in range(n_samples):
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()
            target_len = max(1, int(len(words) * ratio))

            # Pad with words re-sampled from the same text until the target
            # length is reached.
            # NOTE(review): assumes the sampled text is non-empty — an empty
            # text would make np.random.choice(words) fail; confirm inputs.
            extended_words = words.copy()
            while len(extended_words) < target_len:
                extended_words.append(np.random.choice(words))

            long_text = " ".join(extended_words[:target_len])
            long_comments.append(long_text)

        logger.debug(f"Generated {len(long_comments)} long comments")
        return long_comments

    def generate_corrupted_vocabulary(
        self,
        reference_texts: List[str],
        corruption_rate: float = 0.2,
        n_samples: int = 100,
    ) -> List[str]:
        """
        Generate texts with corrupted vocabulary (typos, character swaps).

        Args:
            reference_texts: Reference training texts
            corruption_rate: Fraction of words to corrupt (0.0-1.0)
            n_samples: Number of samples to generate

        Returns:
            List of corrupted texts
        """
        corrupted_texts = []

        for _ in range(n_samples):
            ref_text = np.random.choice(reference_texts)
            words = ref_text.split()

            # Corrupt some words (only those longer than 2 characters)
            for i in range(len(words)):
                if random.random() < corruption_rate:
                    word = words[i]
                    if len(word) > 2:
                        # Random character swap or substitution, 50/50
                        if random.random() < 0.5:
                            # Character swap: transpose two adjacent chars
                            idx = random.randint(0, len(word) - 2)
                            word = word[:idx] + word[idx + 1] + word[idx] + word[idx + 2 :]
                        else:
                            # Character substitution with a random lowercase letter
                            idx = random.randint(0, len(word) - 1)
                            word = (
                                word[:idx]
                                + random.choice(string.ascii_lowercase)
                                + word[idx + 1 :]
                            )
                        words[i] = word

            corrupted_text = " ".join(words)
            corrupted_texts.append(corrupted_text)

        logger.debug(f"Generated {len(corrupted_texts)} corrupted texts (rate={corruption_rate})")
        return corrupted_texts

    def generate_label_shift(
        self,
        reference_texts: List[str],
        reference_labels: np.ndarray,
        shift_type: str = "class_imbalance",
        n_samples: int = 100,
    ) -> Tuple[List[str], np.ndarray]:
        """
        Generate batch with label distribution shift (class imbalance).

        Args:
            reference_texts: Reference training texts
            reference_labels: Reference training labels (binary matrix)
            shift_type: 'class_imbalance' - favor majority class
            n_samples: Number of samples to generate

        Returns:
            Tuple of (texts, shifted_labels)
        """
        texts = []
        shifted_labels = []

        if reference_labels.ndim == 2:
            # Multi-label: get the first label per sample
            label_indices = np.argmax(reference_labels, axis=1)
        else:
            label_indices = reference_labels

        # Get class distribution
        unique_labels, counts = np.unique(label_indices, return_counts=True)
        majority_class = unique_labels[np.argmax(counts)]
        minority_classes = unique_labels[unique_labels != majority_class]

        # Create imbalanced distribution: 80% majority, 20% minority
        n_majority = int(n_samples * 0.8)
        n_minority = n_samples - n_majority

        # Sample indices with bias toward majority class
        majority_indices = np.where(label_indices == majority_class)[0]
        minority_indices = np.where(np.isin(label_indices, minority_classes))[0]

        selected_indices = []
        selected_indices.extend(np.random.choice(majority_indices, size=n_majority, replace=True))
        if len(minority_indices) > 0:
            # NOTE(review): when no minority samples exist, the batch holds
            # only n_majority items (< n_samples) — confirm acceptable.
            selected_indices.extend(
                np.random.choice(minority_indices, size=n_minority, replace=True)
            )

        np.random.shuffle(selected_indices)
        selected_indices = selected_indices[:n_samples]

        # Get texts and labels
        texts = [reference_texts[i] for i in selected_indices]
        shifted_labels = reference_labels[selected_indices]

        logger.debug(f"Generated {len(texts)} samples with class imbalance")
        return texts, shifted_labels

    def generate_synthetic_batch(
        self,
        reference_texts: List[str],
        reference_labels: np.ndarray,
        drift_type: str = "none",
        batch_size: int = 50,
    ) -> Tuple[List[str], np.ndarray]:
        """
        Generate a synthetic batch with specified drift.

        Args:
            reference_texts: Reference training texts
            reference_labels: Reference training labels
            drift_type: Type of drift to introduce:
                - 'none': No drift (baseline)
                - 'text_length_short': Shortened texts
                - 'text_length_long': Elongated texts
                - 'corrupted_vocab': Typos and character swaps
                - 'class_imbalance': Biased label distribution
            batch_size: Number of samples to generate

        Returns:
            Tuple of (texts, labels)

        Raises:
            ValueError: If drift_type is not one of the values above.
        """
        if drift_type == "none":
            # Plain bootstrap resample of the reference data
            indices = np.random.choice(len(reference_texts), size=batch_size, replace=True)
            texts = [reference_texts[i] for i in indices]
            labels = reference_labels[indices]

        elif drift_type == "text_length_short":
            # Texts are drifted; labels are drawn independently at random
            texts = self.generate_short_comments(reference_texts, ratio=0.5, n_samples=batch_size)
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "text_length_long":
            texts = self.generate_long_comments(reference_texts, ratio=1.5, n_samples=batch_size)
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "corrupted_vocab":
            texts = self.generate_corrupted_vocabulary(
                reference_texts, corruption_rate=0.2, n_samples=batch_size
            )
            indices = np.random.choice(len(reference_labels), size=batch_size)
            labels = reference_labels[indices]

        elif drift_type == "class_imbalance":
            texts, labels = self.generate_label_shift(
                reference_texts,
                reference_labels,
                shift_type="class_imbalance",
                n_samples=batch_size,
            )

        else:
            raise ValueError(f"Unknown drift type: {drift_type}")

        logger.info(f"Generated synthetic batch: {drift_type}, size={batch_size}")
        return texts, labels
|
turing/tests/unit/test_monitoring.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit Tests for Monitoring Module
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import tempfile
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from turing.monitoring.baseline_manager import (
|
| 12 |
+
BaselineManager,
|
| 13 |
+
extract_baseline_statistics,
|
| 14 |
+
)
|
| 15 |
+
from turing.monitoring.drift_detector import DriftDetector
|
| 16 |
+
from turing.monitoring.synthetic_data_generator import SyntheticDataGenerator
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class TestBaselineExtraction:
    """Tests for baseline statistics extraction."""

    @pytest.fixture
    def sample_data(self):
        # Five short comments paired with a 5-class multi-label indicator matrix.
        texts = [
            "This is a sample comment",
            "Another test comment here",
            "Short text",
            "Longer comment with more information",
            "Medium length comment",
        ]
        labels = np.array([[1, 0, 1, 0, 0], [0, 1, 0, 1, 0], [1, 1, 0, 0, 0], [0, 0, 1, 1, 1], [1, 0, 0, 0, 1]])
        return texts, labels

    def test_extract_baseline(self, sample_data):
        # Baseline must expose the distributions used by the drift checks
        # and echo back the language tag and sample count.
        texts, labels = sample_data
        baseline = extract_baseline_statistics(X_train=texts, y_train=labels, language="java")

        assert "text_length_distribution" in baseline
        assert "word_count_distribution" in baseline
        assert baseline["language"] == "java"
        assert baseline["num_samples"] == len(texts)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class TestDriftDetector:
    """Tests for drift detection."""

    @pytest.fixture
    def baseline(self):
        # Minimal legacy-style baseline statistics.
        return {
            "text_length_distribution": np.array([20, 25, 30, 35]),
            "word_count_distribution": np.array([3, 4, 5, 6]),
            "label_counts": np.array([5, 3, 2, 4]),
        }

    def test_detector_init(self):
        # Thresholds passed to the constructor must be stored verbatim.
        detector = DriftDetector(p_value_threshold=0.05, alert_threshold=0.01)
        assert detector.p_value_threshold == 0.05

    def test_text_length_drift(self, baseline):
        detector = DriftDetector(p_value_threshold=0.05)

        prod_texts = [
            "Very long test comment with lots of additional information",
            "Another extremely long sample text",
            "Yet another quite lengthy comment",
            "More long production text",
        ]

        # Halve each production text so the reference batch is clearly
        # shorter — only the result shape is asserted, not the verdict.
        ref_texts = [text[:len(text)//2] for text in prod_texts]  # Shorter reference texts

        result = detector.detect_text_property_drift(prod_texts, ref_texts)

        assert "drifted" in result
        assert "method" in result
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class TestSyntheticDataGenerator:
    """Tests for synthetic data generation."""

    @pytest.fixture
    def sample_data(self):
        # Four texts with binary class-id labels.
        texts = ["This is a sample", "Another test", "Short", "Longer text"]
        labels = np.array([0, 1, 0, 1])
        return texts, labels

    def test_generator_init(self):
        gen = SyntheticDataGenerator(seed=42)
        assert gen.seed == 42

    def test_generate_short(self, sample_data):
        texts, labels = sample_data
        gen = SyntheticDataGenerator(seed=42)

        short = gen.generate_short_comments(texts, ratio=0.5, n_samples=10)

        # Truncation keeps the requested count and lowers the mean length.
        assert len(short) == 10
        assert np.mean([len(t) for t in short]) < np.mean([len(t) for t in texts])
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class TestBaselineManager:
    """Tests for baseline management."""

    @pytest.fixture
    def temp_dir(self):
        # Fresh directory per test; removed automatically on teardown.
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir)

    def test_save_and_load(self, temp_dir):
        # Round-trip a baseline through the local cache with MLflow disabled.
        manager = BaselineManager(mlflow_enabled=False, local_cache_dir=temp_dir)

        baseline = {
            "text_length_distribution": [10, 20, 30],
            "label_counts": [5, 3],
            "language": "java",
            "num_samples": 3,
        }

        manager.save_baseline(baseline, "java", "test", "model")
        loaded = manager.load_baseline("java", "test", "model")

        assert loaded["language"] == "java"
        assert loaded["num_samples"] == 3
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Allow running this test module directly (outside the pytest CLI).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|