Mirae Kang committed on
Commit
447c4a0
·
1 Parent(s): df89287

feat: implement unit tests, #13

Browse files
tests/.gitkeep DELETED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fixtures compartidas para tests del proyecto."""
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
# Repository root: two levels up from this file (the parent of its directory).
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Put the repository root at the front of sys.path (once) so that
# project packages can be imported from the tests.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
12
+
13
+
14
@pytest.fixture(scope="session", autouse=True)
def _project_cwd():
    """Run the test session with the repository root as working directory.

    Modules and configs use paths relative to the repo root. The working
    directory that was active before the session is restored afterwards.
    """
    original_dir = os.getcwd()
    os.chdir(PROJECT_ROOT)
    yield
    # Teardown: go back to wherever pytest was launched from.
    os.chdir(original_dir)
21
+
22
+
23
@pytest.fixture(scope="session")
def project_root() -> Path:
    """Expose the module-level ``PROJECT_ROOT`` path as a fixture."""
    root: Path = PROJECT_ROOT
    return root
26
+
27
+
28
@pytest.fixture(scope="session")
def features_config(project_root: Path) -> str:
    """Return the path to ``configs/features.yaml`` under the project root, as a string."""
    config_file = project_root.joinpath("configs", "features.yaml")
    return str(config_file)
31
+
32
+
33
@pytest.fixture(scope="session")
def models_config(project_root: Path) -> str:
    """Return the path to ``configs/models.yaml`` under the project root, as a string."""
    config_file = project_root.joinpath("configs", "models.yaml")
    return str(config_file)
36
+
37
+
38
@pytest.fixture(scope="session")
def best_params_config(project_root: Path) -> str:
    """Return the path to ``configs/best_params.yaml`` under the project root, as a string."""
    config_file = project_root.joinpath("configs", "best_params.yaml")
    return str(config_file)
tests/test_api.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests del endpoint POST /predict."""
2
+
3
+ from unittest.mock import MagicMock
4
+
5
+ import pytest
6
+ from fastapi.testclient import TestClient
7
+
8
+ from src.api import main as api_main
9
+
10
# Keys the tests require to be present in a /predict response body.
PREDICT_RESPONSE_KEYS = set(
    (
        "text",
        "is_toxic",
        "probability",
        "labels",
        "model_used",
        "latency_ms",
    )
)
18
+
19
+
20
@pytest.fixture
def client():
    """Yield a ``TestClient`` for the API app backed by a mocked prediction service.

    The mock's ``predict`` always returns a fixed non-toxic result. On
    teardown the ``service`` and ``model_name`` state entries are reset to
    ``None``.
    """
    model_label = "LR + TF-IDF (local)"
    canned_prediction = {
        "is_toxic": False,
        "probability": 0.12,
        "labels": [],
        "model_used": model_label,
    }
    mock_service = MagicMock()
    mock_service.predict.return_value = canned_prediction

    with TestClient(api_main.app) as test_client:
        # State is overwritten only after the client context is entered.
        # NOTE(review): presumably app start-up populates _state first - confirm.
        api_main._state["service"] = mock_service
        api_main._state["model_name"] = model_label
        api_main._state["predictions_served"] = 0
        yield test_client

    for state_key in ("service", "model_name"):
        api_main._state[state_key] = None
38
+
39
+
40
def test_predict_returns_correct_structure(client: TestClient):
    """POST /predict answers 200 with every expected key and sensibly typed values."""
    comment = "This is a sample comment"
    payload = {"text": comment, "threshold": 0.5}

    response = client.post("/predict", json=payload)

    assert response.status_code == 200
    body = response.json()
    assert PREDICT_RESPONSE_KEYS.issubset(body.keys())
    # The submitted text must be echoed back unchanged.
    assert body["text"] == comment
    assert isinstance(body["is_toxic"], bool)
    assert 0.0 <= body["probability"] <= 1.0
    assert isinstance(body["labels"], list)
    assert isinstance(body["model_used"], str)
    assert isinstance(body["latency_ms"], (int, float))
55
+
56
+
57
def test_predict_rejects_empty_text(client: TestClient):
    """A whitespace-only ``text`` must be rejected with HTTP 422."""
    blank_payload = {"text": " "}

    response = client.post("/predict", json=blank_payload)

    assert response.status_code == 422
tests/test_model.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests de salida binaria de modelos sklearn."""
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ from src.models.baseline import build_model
7
+
8
# Tiny training set, grouped by label so texts and labels stay aligned.
_LABEL_0_TEXTS = [
    "the quick brown fox is nice",
    "the lazy dog sleeps well",
    "the fox and dog are friends",
    "another calm peaceful day today",
]
_LABEL_1_TEXTS = [
    "you are stupid and worthless idiot",
    "kill them all right now attack",
]
X_TRAIN = _LABEL_0_TEXTS + _LABEL_1_TEXTS
Y_TRAIN = [0] * len(_LABEL_0_TEXTS) + [1] * len(_LABEL_1_TEXTS)

# Texts used to exercise predict / predict_proba after fitting.
X_PRED = ["the fox is calm", "you idiot fool"]
18
+
19
+
20
@pytest.fixture(scope="module")
def trained_lr(models_config: str, features_config: str, best_params_config: str):
    """Build the ``"lr"`` model from the project configs and fit it on the toy data.

    Module-scoped, so the fit happens once per test module.
    """
    build_kwargs = {
        "config_path": models_config,
        "feat_config_path": features_config,
        "best_params_path": best_params_config,
    }
    classifier = build_model("lr", **build_kwargs)
    classifier.fit(X_TRAIN, Y_TRAIN)
    return classifier
30
+
31
+
32
def test_predict_binary_labels(trained_lr):
    """``predict`` returns one label per input and only the values 0 or 1."""
    predictions = trained_lr.predict(X_PRED)
    expected_shape = (len(X_PRED),)

    assert predictions.shape == expected_shape
    distinct_labels = set(np.unique(predictions))
    assert distinct_labels <= {0, 1}
37
+
38
+
39
def test_predict_proba_valid_binary_distribution(trained_lr):
    """``predict_proba`` yields two columns per sample forming a valid distribution."""
    probabilities = trained_lr.predict_proba(X_PRED)
    expected_shape = (len(X_PRED), 2)

    assert probabilities.shape == expected_shape
    assert np.all(probabilities >= 0.0)
    assert np.all(probabilities <= 1.0)
    # Each row must sum to 1 within the relative tolerance.
    row_totals = probabilities.sum(axis=1)
    np.testing.assert_allclose(row_totals, 1.0, rtol=1e-5)
tests/test_preprocessor.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests del pipeline de preprocesamiento de texto."""
2
+
3
+ import re
4
+
5
+ import pytest
6
+
7
+ from src.features.text_preprocessor import TextPreprocessor
8
+
9
+
10
@pytest.fixture(scope="module")
def preprocessor(features_config: str) -> TextPreprocessor:
    """Create one ``TextPreprocessor`` per module from the features config."""
    instance = TextPreprocessor(config_path=features_config)
    return instance
13
+
14
+
15
def test_empty_text_returns_empty_string(preprocessor: TextPreprocessor):
    """Empty and whitespace-only inputs both transform to the empty string."""
    for blank_input in ("", " "):
        assert preprocessor.transform(blank_input) == ""
18
+
19
+
20
def test_url_text_removes_urls(preprocessor: TextPreprocessor):
    """No URL fragment (scheme, ``www.`` prefix, host) survives preprocessing."""
    source_text = "Visit https://example.com/path and www.test.org now"

    cleaned = preprocessor.transform(source_text)

    for fragment in ("http", "www.", "example.com"):
        assert fragment not in cleaned
    assert re.search(r"https?://", cleaned) is None
28
+
29
+
30
def test_normal_text_lowercase_and_lemmatized(preprocessor: TextPreprocessor):
    """Ordinary text comes back as a non-empty lowercase string containing the token ``run``."""
    cleaned = preprocessor.transform("The runners are running quickly")

    assert isinstance(cleaned, str)
    assert cleaned == cleaned.lower()
    assert cleaned != ""
    tokens = cleaned.split()
    assert "run" in tokens
tests/test_vectorizer.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests del vectorizador TF-IDF."""
2
+
3
+ import pytest
4
+
5
+ from src.features.vectorizer import Vectorizer
6
+
7
+
8
# min_df=3 in configs/features.yaml: terms must appear in at least 3 documents
CORPUS_TRAIN = list(
    (
        "the quick brown fox jumps",
        "the lazy dog runs fast",
        "the fox and dog play together",
        "another quick fox story here",
    )
)
# Single unseen document used to exercise transform().
CORPUS_TEST = list(("the fox is quick today",))
16
+
17
+
18
@pytest.fixture(scope="module")
def vectorizer(features_config: str) -> Vectorizer:
    """Create one ``Vectorizer`` per module using the ``"tfidf"`` method and the features config."""
    instance = Vectorizer(config_path=features_config, method="tfidf")
    return instance
21
+
22
+
23
def test_fit_transform_output_shape(vectorizer: Vectorizer):
    """``fit_transform`` gives one row per document and between 1 and 5000 columns."""
    features = vectorizer.fit_transform(CORPUS_TRAIN)

    assert features.shape[0] == len(CORPUS_TRAIN)
    # NOTE(review): 5000 is presumably the configured feature cap - confirm against configs/features.yaml.
    assert 0 < features.shape[1] <= 5000
29
+
30
+
31
def test_transform_preserves_sample_count(vectorizer: Vectorizer):
    """``transform`` keeps the row count of its input and the column count learned in fit."""
    fitted_features = vectorizer.fit_transform(CORPUS_TRAIN)
    unseen_features = vectorizer.transform(CORPUS_TEST)

    n_rows = unseen_features.shape[0]
    n_cols = unseen_features.shape[1]
    assert n_rows == len(CORPUS_TEST)
    assert n_cols == fitted_features.shape[1]