File size: 2,671 Bytes
46cc63a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""Unit tests for stable training helpers (no full fine-tune)."""

import numpy as np
import pytest

from src.models.hybrid_ensemble import fit_lr_with_gap_control, soft_vote_probs
from src.models.transformer_trainer import freeze_distilbert_partial


def test_fit_lr_with_gap_control_picks_regularized_c():
    """Smoke-test ``fit_lr_with_gap_control`` with gap search enabled.

    Fits on 100 synthetic text rows, evaluates on the remaining 20, and checks
    that the returned model reports itself as fitted and that the returned
    metadata carries a ``"C"`` entry.
    """
    import pandas as pd

    n_samples = 120
    n_train = 100
    generator = np.random.default_rng(0)

    # Synthetic corpus: zero to four repetitions of "word " followed by the row index.
    documents = []
    for idx in range(n_samples):
        documents.append("word " * (idx % 5) + str(idx))
    texts = pd.Series(documents)
    # Random binary labels drawn from the seeded generator.
    labels = pd.Series(generator.integers(0, 2, size=n_samples))

    train_texts, test_texts = texts.iloc[:n_train], texts.iloc[n_train:]
    train_labels, test_labels = labels.iloc[:n_train], labels.iloc[n_train:]

    # Two candidate settings for the gap search.
    candidates = [
        {"C": 0.05, "max_features": 200},
        {"C": 0.001, "max_features": 50},
    ]
    lr_settings = {
        "C": 0.05,
        "max_iter": 500,
        "class_weight": "balanced",
        "solver": "lbfgs",
        "gap_search": {"enabled": True, "param_grid": candidates},
    }
    tfidf_settings = {"max_features": 200, "ngram_range": [1, 1], "min_df": 1}

    fitted, info = fit_lr_with_gap_control(
        train_texts,
        train_labels,
        test_texts,
        test_labels,
        lr_settings,
        tfidf_settings,
        max_gap=0.05,
    )

    assert fitted.is_fitted
    assert "C" in info


def test_soft_vote_equal_weights():
    """With weights 0.5/0.5 the vote is expected to equal the element-wise average."""
    probs_first, probs_second = np.array([0.8, 0.2]), np.array([0.4, 0.6])
    expected = [0.6, 0.4]

    blended = soft_vote_probs(probs_first, probs_second, 0.5, 0.5)

    np.testing.assert_allclose(blended, expected)


def test_deduplicate_by_cosine_drops_near_duplicates(monkeypatch):
    """Check that ``deduplicate_by_cosine`` drops a candidate that is too close to a reference.

    ``SentenceTransformer`` is replaced by a stub that returns fixed 2-D vectors,
    so no real embedding model is loaded. The test is skipped when
    ``sentence_transformers`` is not installed.
    """
    # Patching by dotted path makes pytest import the target module first; without
    # this guard a missing optional dependency surfaces as an error, not a skip.
    pytest.importorskip("sentence_transformers")

    from src.features import augmentation as aug

    class FakeModel:
        """Stub encoder returning hard-coded embeddings keyed on input length."""

        def encode(self, texts, **kwargs):
            # One text -> the vector [1, 0]; presumably this is the reference
            # batch (["ref"]) -- verify against deduplicate_by_cosine.
            if len(texts) == 1:
                return np.array([[1.0, 0.0]])
            # Otherwise: first row is almost parallel to [1, 0] (cosine > 0.95),
            # second row is orthogonal to it (cosine == 0).
            return np.array([[0.99, 0.01], [0.0, 1.0]])

    monkeypatch.setattr(
        "sentence_transformers.SentenceTransformer",
        lambda *_a, **_k: FakeModel(),
    )

    kept_t, kept_l = aug.deduplicate_by_cosine(
        ["near dup", "different"],
        [1, 1],
        ["ref"],
        threshold=0.95,
    )
    # Only the candidate that is dissimilar to the reference should survive.
    assert kept_t == ["different"]
    assert kept_l == [1]


def test_partial_freeze_distilbert():
    """Check that ``freeze_distilbert_partial`` freezes only the first N transformer layers.

    A tiny, randomly initialised 6-layer DistilBERT classifier is built from a
    config instead of downloading pretrained weights: the test only inspects
    ``requires_grad`` flags, so it needs no checkpoint and no network access.
    Skipped when ``torch`` or ``transformers`` is not installed.
    """
    pytest.importorskip("torch")
    pytest.importorskip("transformers")
    from transformers import AutoModelForSequenceClassification, DistilBertConfig

    n_frozen = 4
    # Small dimensions keep construction fast; n_layers=6 gives both frozen
    # and trainable layers for n_frozen=4.
    config = DistilBertConfig(
        vocab_size=100,
        max_position_embeddings=32,
        n_layers=6,
        n_heads=2,
        dim=16,
        hidden_dim=32,
        num_labels=2,
    )
    model = AutoModelForSequenceClassification.from_config(config)

    freeze_distilbert_partial(model, freeze_first_n=n_frozen)

    layers = list(model.distilbert.transformer.layer)
    # Guard against a vacuous pass: there must be layers beyond the frozen prefix.
    assert len(layers) > n_frozen
    for i, layer in enumerate(layers):
        # A layer counts as frozen when none of its parameters require gradients.
        frozen = not any(p.requires_grad for p in layer.parameters())
        if i < n_frozen:
            assert frozen, f"layer {i} should be frozen"
        else:
            assert not frozen, f"layer {i} should be trainable"

    # The classification head must remain trainable.
    assert model.pre_classifier.weight.requires_grad
    assert model.classifier.weight.requires_grad