File size: 2,020 Bytes
198ccb0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
"""Script to generate reference statistics for drift detection."""
import logging
import argparse
from pathlib import Path
import pandas as pd
from data.data_loader import load_data
from utils.text_processing import normalise_text
from monitoring.data_drift import DataDriftDetector

# Module-level logger; basicConfig makes INFO messages visible when run as a script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def generate_reference_stats(
    data_path: str,
    output_path: str = "monitoring/reference_stats.json",
) -> None:
    """
    Generate reference statistics from training data.

    Loads the training TSV, normalises the title/snippet text columns,
    fits a DataDriftDetector on them, and writes the detector's
    reference statistics to ``output_path`` (parent directories are
    created as needed).

    Args:
        data_path: Path to training data TSV file
        output_path: Path to save reference statistics
    """
    # Lazy %-style args: logging formats only if the record is emitted.
    logger.info("Loading data from %s", data_path)

    # load_data returns a 3-tuple; only the DataFrame is needed here.
    df, _, _ = load_data(data_path)

    # Normalise the text columns the drift detector is fitted on.
    df['title_clean'] = df['title'].apply(normalise_text)
    if 'snippet' in df.columns:
        # fillna("") so normalise_text never receives NaN.
        df['snippet_clean'] = df['snippet'].fillna("").apply(normalise_text)
    else:
        # Keep the column present so the detector's schema is stable.
        df['snippet_clean'] = ""

    # Fit the detector on the cleaned text columns only.
    detector = DataDriftDetector(reference_data=df[['title_clean', 'snippet_clean']])

    # Use a distinct local instead of rebinding the str-annotated parameter
    # to a Path object (the original shadowed `output_path` with a Path).
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    detector.save_reference_stats(str(out))

    logger.info("Reference statistics saved to %s", out)
    logger.info("Statistics computed for %d samples", len(df))
if __name__ == "__main__":
    # Build the CLI: one required input path, one optional output path.
    cli = argparse.ArgumentParser(description="Generate reference statistics")
    cli.add_argument(
        "--data-path",
        type=str,
        required=True,
        help="Path to training data TSV file",
    )
    cli.add_argument(
        "--output",
        type=str,
        default="monitoring/reference_stats.json",
        help="Output path for reference statistics",
    )
    opts = cli.parse_args()

    # Delegate all real work to the library function above.
    generate_reference_stats(data_path=opts.data_path, output_path=opts.output)
|