"""Tests for the baseline classifier and label encoding.""" import pandas as pd import pytest from sklearn.pipeline import Pipeline from src.models.baseline import build_pipeline from src.models.intent_classifier import LABEL2ID, ID2LABEL, INTENT_CATEGORIES def _make_dummy_df(n: int = 20) -> pd.DataFrame: import itertools labels = list(itertools.islice(itertools.cycle(sorted(INTENT_CATEGORIES)), n)) texts = [f"sample query number {i} for {labels[i]}" for i in range(n)] return pd.DataFrame({"text": texts, "label": labels}) def test_label_encoding_roundtrip(): for label in INTENT_CATEGORIES: idx = LABEL2ID[label] assert ID2LABEL[idx] == label def test_label_encoding_count(): assert len(LABEL2ID) == 6 assert len(ID2LABEL) == 6 def test_build_pipeline_returns_sklearn_pipeline(): pipeline = build_pipeline() assert isinstance(pipeline, Pipeline) def test_baseline_fit_predict(): df = _make_dummy_df(60) pipeline = build_pipeline(max_features=100, min_df=1) pipeline.fit(df["text"], df["label"]) preds = pipeline.predict(df["text"][:5]) assert len(preds) == 5 for pred in preds: assert pred in INTENT_CATEGORIES