Mirae Kang committed on
Commit ·
447c4a0
1
Parent(s): df89287
feat: implement unit tests, #13
Browse files- tests/.gitkeep +0 -0
- tests/conftest.py +40 -0
- tests/test_api.py +60 -0
- tests/test_model.py +45 -0
- tests/test_preprocessor.py +37 -0
- tests/test_vectorizer.py +36 -0
tests/.gitkeep
DELETED
|
File without changes
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Fixtures compartidas para tests del proyecto."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
# Directory that contains the folder this file lives in (two levels up from the file).
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Put that directory first on the import path, unless it is already listed.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@pytest.fixture(scope="session", autouse=True)
def _project_cwd():
    """Run the test session with the repository root as working directory.

    Modules and configs use paths relative to the repo root, so the process
    changes into ``PROJECT_ROOT`` before yielding and changes back to the
    directory it started in afterwards.
    """
    original_dir = os.getcwd()
    os.chdir(PROJECT_ROOT)
    yield
    os.chdir(original_dir)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@pytest.fixture(scope="session")
def project_root() -> Path:
    """Expose the module-level ``PROJECT_ROOT`` path as a session fixture."""
    root: Path = PROJECT_ROOT
    return root
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@pytest.fixture(scope="session")
def features_config(project_root: Path) -> str:
    """Return the path of ``configs/features.yaml`` under the project root, as a string."""
    config_file = project_root.joinpath("configs", "features.yaml")
    return str(config_file)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@pytest.fixture(scope="session")
def models_config(project_root: Path) -> str:
    """Return the path of ``configs/models.yaml`` under the project root, as a string."""
    config_file = project_root.joinpath("configs", "models.yaml")
    return str(config_file)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@pytest.fixture(scope="session")
def best_params_config(project_root: Path) -> str:
    """Return the path of ``configs/best_params.yaml`` under the project root, as a string."""
    config_file = project_root.joinpath("configs", "best_params.yaml")
    return str(config_file)
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests del endpoint POST /predict."""
|
| 2 |
+
|
| 3 |
+
from unittest.mock import MagicMock
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from fastapi.testclient import TestClient
|
| 7 |
+
|
| 8 |
+
from src.api import main as api_main
|
| 9 |
+
|
| 10 |
+
# Keys the tests in this module require in every /predict response body.
PREDICT_RESPONSE_KEYS = {
    "text", "is_toxic", "probability",
    "labels", "model_used", "latency_ms",
}
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@pytest.fixture
def client():
    """Yield a ``TestClient`` for the API app with a mocked prediction service.

    A ``MagicMock`` whose ``predict`` returns a fixed non-toxic result is stored
    in ``api_main._state`` together with a model name and a zeroed prediction
    counter. Once the client context has closed, the ``service`` and
    ``model_name`` entries are set back to ``None``.
    """
    canned_prediction = {
        "is_toxic": False,
        "probability": 0.12,
        "labels": [],
        "model_used": "LR + TF-IDF (local)",
    }
    fake_service = MagicMock()
    fake_service.predict.return_value = canned_prediction

    state_overrides = {
        "service": fake_service,
        "model_name": "LR + TF-IDF (local)",
        "predictions_served": 0,
    }

    with TestClient(api_main.app) as http:
        # Applied inside the client context — presumably so the overrides win
        # over anything the app sets during startup; confirm in src.api.main.
        for key, value in state_overrides.items():
            api_main._state[key] = value
        yield http

    for key in ("service", "model_name"):
        api_main._state[key] = None
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_predict_returns_correct_structure(client: TestClient):
    """POST /predict with a valid payload answers 200 with a well-typed body."""
    payload = {"text": "This is a sample comment", "threshold": 0.5}
    response = client.post("/predict", json=payload)

    assert response.status_code == 200

    body = response.json()
    # Every required key must be present; extra keys are tolerated.
    assert PREDICT_RESPONSE_KEYS.issubset(body.keys())
    assert body["text"] == "This is a sample comment"
    assert isinstance(body["is_toxic"], bool)
    assert 0.0 <= body["probability"] <= 1.0
    assert isinstance(body["labels"], list)
    assert isinstance(body["model_used"], str)
    assert isinstance(body["latency_ms"], (int, float))
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def test_predict_rejects_empty_text(client: TestClient):
    """A whitespace-only ``text`` field is rejected with HTTP 422."""
    blank_payload = {"text": " "}
    response = client.post("/predict", json=blank_payload)

    assert response.status_code == 422
|
tests/test_model.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests de salida binaria de modelos sklearn."""
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from src.models.baseline import build_model
|
| 7 |
+
|
| 8 |
+
# Toy training corpus as (text, label) pairs; label 1 marks the two hostile sentences.
_LABELED_SAMPLES = [
    ("the quick brown fox is nice", 0),
    ("the lazy dog sleeps well", 0),
    ("the fox and dog are friends", 0),
    ("another calm peaceful day today", 0),
    ("you are stupid and worthless idiot", 1),
    ("kill them all right now attack", 1),
]
X_TRAIN = [text for text, _ in _LABELED_SAMPLES]
Y_TRAIN = [label for _, label in _LABELED_SAMPLES]

# Inputs used by the prediction tests.
X_PRED = ["the fox is calm", "you idiot fool"]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@pytest.fixture(scope="module")
def trained_lr(models_config: str, features_config: str, best_params_config: str):
    """Build the ``"lr"`` model from the three config paths and fit it on the toy corpus.

    The fitted model is shared by every test in this module.
    """
    config_paths = {
        "config_path": models_config,
        "feat_config_path": features_config,
        "best_params_path": best_params_config,
    }
    classifier = build_model("lr", **config_paths)
    classifier.fit(X_TRAIN, Y_TRAIN)
    return classifier
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_predict_binary_labels(trained_lr):
    """``predict`` returns one label per input, and only the values 0 and 1."""
    labels = trained_lr.predict(X_PRED)

    expected_shape = (len(X_PRED),)
    assert labels.shape == expected_shape
    assert set(np.unique(labels)) <= {0, 1}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def test_predict_proba_valid_binary_distribution(trained_lr):
    """``predict_proba`` returns two columns per sample that form a probability distribution."""
    probabilities = trained_lr.predict_proba(X_PRED)

    n_samples = len(X_PRED)
    assert probabilities.shape == (n_samples, 2)
    assert (probabilities >= 0.0).all()
    assert (probabilities <= 1.0).all()

    # Each row must sum to 1 within the relative tolerance.
    row_totals = probabilities.sum(axis=1)
    np.testing.assert_allclose(row_totals, 1.0, rtol=1e-5)
|
tests/test_preprocessor.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests del pipeline de preprocesamiento de texto."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from src.features.text_preprocessor import TextPreprocessor
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@pytest.fixture(scope="module")
def preprocessor(features_config: str) -> TextPreprocessor:
    """Create one ``TextPreprocessor`` per test module from the features config path."""
    instance = TextPreprocessor(config_path=features_config)
    return instance
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_empty_text_returns_empty_string(preprocessor: TextPreprocessor):
    """Empty and whitespace-only inputs both transform to the empty string."""
    for blank in ("", " "):
        assert preprocessor.transform(blank) == ""
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_url_text_removes_urls(preprocessor: TextPreprocessor):
    """No trace of the scheme-prefixed or ``www.`` URL remains after transforming."""
    raw = "Visit https://example.com/path and www.test.org now"
    clean = preprocessor.transform(raw)

    for fragment in ("http", "www.", "example.com"):
        assert fragment not in clean
    assert re.search(r"https?://", clean) is None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_normal_text_lowercase_and_lemmatized(preprocessor: TextPreprocessor):
    """Ordinary text comes back as a non-empty lowercase string containing the token ``run``."""
    cleaned = preprocessor.transform("The runners are running quickly")

    assert isinstance(cleaned, str)
    assert cleaned == cleaned.lower()
    assert cleaned != ""

    tokens = cleaned.split()
    assert "run" in tokens
|
tests/test_vectorizer.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests del vectorizador TF-IDF."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from src.features.vectorizer import Vectorizer
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# configs/features.yaml sets min_df=3: a term must appear in at least 3 documents.
CORPUS_TRAIN = [
    "the quick brown fox jumps",
    "the lazy dog runs fast",
    "the fox and dog play together",
    "another quick fox story here",
]

# Single document used to check transform() against the fitted vocabulary.
CORPUS_TEST = ["the fox is quick today"]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@pytest.fixture(scope="module")
def vectorizer(features_config: str) -> Vectorizer:
    """Create one ``Vectorizer`` per test module, using the ``"tfidf"`` method."""
    instance = Vectorizer(config_path=features_config, method="tfidf")
    return instance
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_fit_transform_output_shape(vectorizer: Vectorizer):
    """``fit_transform`` returns one row per document and between 1 and 5000 columns."""
    matrix = vectorizer.fit_transform(CORPUS_TRAIN)

    n_rows, n_features = matrix.shape[0], matrix.shape[1]
    assert n_rows == len(CORPUS_TRAIN)
    assert n_features > 0
    assert n_features <= 5000
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_transform_preserves_sample_count(vectorizer: Vectorizer):
    """``transform`` keeps the input row count and the column count learned during fitting."""
    fitted = vectorizer.fit_transform(CORPUS_TRAIN)
    transformed = vectorizer.transform(CORPUS_TEST)

    n_test_rows, n_test_cols = transformed.shape[0], transformed.shape[1]
    assert n_test_rows == len(CORPUS_TEST)
    assert n_test_cols == fitted.shape[1]
|