File size: 1,903 Bytes
3df5819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""Tests for the style fingerprinting module."""

import pytest
import torch
from src.style.fingerprinter import StyleFingerprinter, StyleProjectionMLP
from src.style.style_vector import cosine_similarity, average_style_vectors


@pytest.fixture
def fingerprinter(tmp_path):
    """Build a StyleFingerprinter that reads a four-word ``awl.txt`` from ``tmp_path``.

    The word list is written into pytest's per-test temporary directory, so
    the fixture does not depend on any word-list file shipped with the repo.
    """
    word_list_file = tmp_path / "awl.txt"
    # One word per line, with a trailing newline after the last entry.
    word_list_file.write_text("analysis\nconsider\nestablish\nsignificant\n")
    return StyleFingerprinter(
        awl_path=str(word_list_file),
        spacy_model="en_core_web_sm",
    )


def test_style_vector_shape(fingerprinter):
    """Check that an extracted style vector is one-dimensional with 512 entries."""
    sample_text = "This is a test sentence for analysis."
    expected_shape = (512,)
    style_vec = fingerprinter.extract_vector(sample_text)
    assert style_vec.shape == expected_shape


def test_style_vector_different_texts(fingerprinter):
    """Check that a formal and an informal sentence do not yield near-identical vectors."""
    formal_text = "The analysis demonstrates significant correlations between variables."
    informal_text = "yo this stuff is like totally awesome and cool"
    # Extract in a fixed order: formal first, then informal.
    formal_vec, informal_vec = [
        fingerprinter.extract_vector(text) for text in (formal_text, informal_text)
    ]
    # Only near-identity is ruled out; a similarity below 0.99 is enough to pass.
    assert cosine_similarity(formal_vec, informal_vec) < 0.99


def test_style_blend(fingerprinter):
    """Check that blending two style vectors gives a vector of (approximately) unit norm."""
    academic_vec = fingerprinter.extract_vector("Academic formal text with analysis.")
    casual_vec = fingerprinter.extract_vector("Casual informal text with stuff.")
    mixed = fingerprinter.blend_vectors(academic_vec, casual_vec, alpha=0.6)
    deviation_from_unit = abs(torch.norm(mixed).item() - 1.0)
    # The blend is expected to be L2-normalised, so its norm must be within 0.01 of 1.
    assert deviation_from_unit < 0.01


def test_raw_features_keys(fingerprinter):
    """Check that the raw features include each of the expected feature names."""
    sample_text = "The quick brown fox jumps over the lazy dog."
    raw = fingerprinter.extract_raw_features(sample_text)
    expected_keys = (
        "sentence_length_mean",
        "type_token_ratio",
        "passive_voice_ratio",
        "lexical_density",
    )
    # Checked one at a time, in order, so the first missing key fails the test.
    for key in expected_keys:
        assert key in raw