# SignalMod/tests/test_hybrid_clean.py
# Mirae Kang
# feat: implement new models and improve UI, #23
# 46cc63a
"""Tests for dual-input hybrid helpers."""
import numpy as np
import pandas as pd
import pytest
from src.features.metadata_features import extract_metadata_features
from src.models.hybrid_ensemble import compute_performance_weights
def test_extract_metadata_features():
    """Check the column layout and one value from ``extract_metadata_features``.

    A two-row frame is passed in. The result must expose exactly the five
    expected columns, in order, and the first row (whose text contains
    ``!`` characters) must have a positive ``exclamation_ratio``.
    """
    expected_columns = [
        "char_length",
        "word_count",
        "exclamation_ratio",
        "question_ratio",
        "caps_ratio",
    ]
    frame = pd.DataFrame(
        {
            "Text": ["Hello!! WORLD?", "ok"],
            "IsToxic": [1, 0],
        }
    )

    features = extract_metadata_features(frame)

    # Column order matters: the comparison is against an ordered list.
    assert list(features.columns) == expected_columns
    assert features.loc[0, "exclamation_ratio"] > 0
def test_compute_performance_weights_bounds():
    """Check the weight pair returned by ``compute_performance_weights``.

    With ``min_lr_weight=0.15`` and ``max_lr_weight=0.45`` the two returned
    weights must sum to 1, the second weight must lie inside the requested
    bounds, and the returned info object must contain ``"bert_val_score"``.
    """
    labels = np.array([0, 1, 1, 0])
    # Two sets of per-sample scores, passed positionally before the labels.
    # NOTE(review): presumably BERT and LR probabilities, judging by the
    # keyword and info-key names -- confirm against the implementation.
    bert_scores = np.array([0.2, 0.8, 0.7, 0.3])
    lr_scores = np.array([0.3, 0.6, 0.75, 0.25])
    # Named once so the call and the bounds assertion cannot drift apart.
    min_lr_weight, max_lr_weight = 0.15, 0.45

    bert_weight, lr_weight, info = compute_performance_weights(
        bert_scores,
        lr_scores,
        labels,
        min_lr_weight=min_lr_weight,
        max_lr_weight=max_lr_weight,
    )

    # The expected value goes inside approx() so the relative tolerance is
    # scaled against the target (1.0) rather than the measured sum.
    assert bert_weight + lr_weight == pytest.approx(1.0, rel=1e-5)
    assert min_lr_weight <= lr_weight <= max_lr_weight
    assert "bert_val_score" in info
def test_load_dual_track_data_generates_clean_text(tmp_path, monkeypatch):
    """Check that ``load_dual_track_data`` returns the expected columns.

    A tiny raw CSV is written under ``tmp_path`` and ``TextPreprocessor`` is
    patched so that construction does nothing and ``transform`` simply
    lower-cases its input. The loaded frame must then contain a
    ``clean_text`` column, and every ``char_length`` value must be positive.
    The test is skipped when ``spacy`` cannot be imported.
    """
    pytest.importorskip("spacy")

    raw = tmp_path / "raw.csv"
    comments = pd.DataFrame(
        {
            "CommentId": [1, 2],
            "Text": ["Visit http://x.com now!!!", "nice video"],
            "IsToxic": [1, 0],
        }
    )
    comments.to_csv(raw, index=False)

    # Imported only after the spacy check above, as in the original layout.
    from src.data import dual_loader as dl

    def _skip_init(self, config_path=None):
        # Replacement constructor: accept the same argument and do nothing.
        return None

    def _lowercase(self, s):
        # Replacement transform: lower-case the incoming string series.
        return s.str.lower()

    monkeypatch.setattr(dl.TextPreprocessor, "__init__", _skip_init)
    monkeypatch.setattr(dl.TextPreprocessor, "transform", _lowercase)

    preprocessed_path = tmp_path / "pre.csv"
    stats_path = tmp_path / "missing_stats.csv"
    out = dl.load_dual_track_data(
        raw,
        processed_preprocessed=preprocessed_path,
        processed_stats=stats_path,
        write_preprocessed_if_missing=True,
        project_root=tmp_path,
    )

    assert "clean_text" in out.columns
    assert (out["char_length"] > 0).all()