File size: 2,618 Bytes
cf4ac41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pytest
from datetime import datetime
from app.services.parser.schemas import RawMessage
from app.services.preprocessing.cleaner import MessageCleaner

def test_basic_cleaning():
    """Whitespace-padded input yields the expected ``base_clean`` text and word count."""
    message = RawMessage(
        timestamp=datetime.now(),
        sender="User 1",
        content="   Hello!   How are you?   ",
    )
    outcome = MessageCleaner().process_message(1, message)

    # Leading/trailing padding and the inner run of spaces should be gone.
    assert outcome.base_clean == "Hello! How are you?"
    assert outcome.metadata.word_count == 4

def test_emoji_extraction():
    """Three emojis in the message are counted and listed in the metadata.

    NOTE: the input must contain genuine emoji code points. A mis-decoded
    byte sequence (mojibake) would not contain the characters asserted on
    below, so the fixture and the assertions would disagree.
    """
    cleaner = MessageCleaner()
    raw = RawMessage(
        timestamp=datetime.now(),
        sender="User 1",
        content="Love this! 😍✨🚀",
    )
    processed = cleaner.process_message(1, raw)

    assert processed.metadata.emoji_count == 3
    assert "😍" in processed.metadata.emoji_list
    assert "✨" in processed.metadata.emoji_list

def test_intensity_signals():
    """An all-caps message with ``?!?!`` reports a high caps ratio and repeated punctuation."""
    shouted = RawMessage(
        timestamp=datetime.now(),
        sender="User 2",
        content="WHAT IS GOING ON?!?!",
    )
    outcome = MessageCleaner().process_message(1, shouted)

    assert outcome.metadata.caps_ratio > 0.8
    assert outcome.metadata.repeated_punctuation_count > 0

def test_topic_variant():
    """The ``topic`` variant drops emojis, URLs and punctuation and is lowercased.

    NOTE: the emoji in both the input and the assertion must be a genuine
    emoji code point. With a mis-decoded (mojibake) sequence, lowercasing
    alone changes the text, so the "no emoji" check would pass without
    exercising emoji removal.
    """
    cleaner = MessageCleaner()
    # Topic variant should be lowercase, no emojis, stripped punctuation
    raw = RawMessage(
        timestamp=datetime.now(),
        sender="User 1",
        content="Check this: https://example.com/foo AND 🚀!",
    )
    processed = cleaner.process_message(1, raw)

    topic_v = processed.variants["topic"]
    assert "🚀" not in topic_v
    assert "http" not in topic_v
    assert "and" in topic_v
    assert "check" in topic_v
    assert ":" not in topic_v

def test_repeated_chars():
    """Elongated words and ``!!!`` are reflected in the repeated-char and punctuation counts."""
    stretched = RawMessage(
        timestamp=datetime.now(),
        sender="User 2",
        content="Looooooooooooool that is so coooooool!!!",
    )
    outcome = MessageCleaner().process_message(1, stretched)

    # The message has two elongated words and one punctuation run.
    assert outcome.metadata.repeated_char_count >= 2
    assert outcome.metadata.repeated_punctuation_count >= 1

def test_process_sequence():
    """A three-message sequence yields per-participant stats and emoji totals.

    NOTE: the waving-hand emoji must be a genuine emoji code point in both
    the message and the ``total_emojis`` lookup key; a mis-decoded
    (mojibake) sequence would not test emoji aggregation.
    """
    cleaner = MessageCleaner()
    messages = [
        RawMessage(timestamp=datetime.now(), sender="Alice", content="Hi! 👋"),
        RawMessage(timestamp=datetime.now(), sender="Bob", content="Hello Alice."),
        RawMessage(
            timestamp=datetime.now(),
            sender="Alice",
            content="Check this link out: https://cool.com",
        ),
    ]
    result = cleaner.process_sequence(messages)

    assert len(result.messages) == 3
    assert "Alice" in result.global_metadata["participant_stats"]
    assert "Bob" in result.global_metadata["participant_stats"]
    assert result.global_metadata["total_emojis"]["👋"] == 1