|
|
"""Script to generate reference statistics for drift detection.""" |
|
|
|
|
|
import logging |
|
|
import argparse |
|
|
from pathlib import Path |
|
|
import pandas as pd |
|
|
|
|
|
from data.data_loader import load_data |
|
|
from utils.text_processing import normalise_text |
|
|
from monitoring.data_drift import DataDriftDetector |
|
|
|
|
|
# Configure the root logger at import time so INFO-level records are emitted
# (basicConfig has no effect if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
def generate_reference_stats(
    data_path: str,
    output_path: str = "monitoring/reference_stats.json",
) -> None:
    """
    Build drift-detection reference statistics and write them to disk.

    The training data is loaded, its text columns are normalised, and a
    ``DataDriftDetector`` constructed on the cleaned columns is asked to
    save its reference statistics to ``output_path``. Missing parent
    directories of ``output_path`` are created first.

    Args:
        data_path: Path to training data TSV file
        output_path: Path to save reference statistics
    """
    logger.info(f"Loading data from {data_path}")

    # load_data yields three values; only the first (the data table) is used.
    frame, _, _ = load_data(data_path)

    # NOTE(review): titles are normalised without a fillna(""), unlike
    # snippets below -- confirm load_data guarantees non-null titles.
    frame['title_clean'] = frame['title'].apply(normalise_text)

    # Snippets are optional: fall back to an empty string for every row when
    # the column is absent, otherwise blank out missing values and normalise.
    frame['snippet_clean'] = (
        frame['snippet'].fillna("").apply(normalise_text)
        if 'snippet' in frame.columns
        else ""
    )

    cleaned_columns = ['title_clean', 'snippet_clean']
    detector = DataDriftDetector(reference_data=frame[cleaned_columns])

    # Make sure the destination directory exists before saving.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    detector.save_reference_stats(str(destination))

    logger.info(f"Reference statistics saved to {destination}")
    logger.info(f"Statistics computed for {len(frame)} samples")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the two options and run the generator.
    cli = argparse.ArgumentParser(description="Generate reference statistics")
    cli.add_argument(
        "--data-path",
        required=True,
        type=str,
        help="Path to training data TSV file",
    )
    cli.add_argument(
        "--output",
        default="monitoring/reference_stats.json",
        type=str,
        help="Output path for reference statistics",
    )
    options = cli.parse_args()

    generate_reference_stats(
        data_path=options.data_path,
        output_path=options.output,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|