Spaces:
Sleeping
Sleeping
File size: 2,618 Bytes
cf4ac41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | import pytest
from datetime import datetime
from app.services.parser.schemas import RawMessage
from app.services.preprocessing.cleaner import MessageCleaner
def test_basic_cleaning():
    """Surrounding whitespace is trimmed and the word count is recorded."""
    message = RawMessage(
        timestamp=datetime.now(),
        sender="User 1",
        content=" Hello! How are you? ",
    )

    result = MessageCleaner().process_message(1, message)

    # The padded input must come back without its leading/trailing spaces.
    assert result.base_clean == "Hello! How are you?"
    assert result.metadata.word_count == 4
def test_emoji_extraction():
    """Emojis in the content are counted and listed in the message metadata."""
    # Written as escapes so the code points survive any re-encoding of this
    # file; three distinct emojis keep the expected count unambiguous.
    heart_eyes = "\U0001F60D"  # SMILING FACE WITH HEART-SHAPED EYES
    sparkles = "\u2728"  # SPARKLES
    fire = "\U0001F525"  # FIRE

    cleaner = MessageCleaner()
    raw = RawMessage(
        timestamp=datetime.now(),
        sender="User 1",
        content=f"Love this! {heart_eyes}{sparkles}{fire}",
    )
    processed = cleaner.process_message(1, raw)

    assert processed.metadata.emoji_count == 3
    assert heart_eyes in processed.metadata.emoji_list
    assert sparkles in processed.metadata.emoji_list
def test_intensity_signals():
    """An all-caps message with "?!?!" yields high caps and punctuation signals."""
    shouting = RawMessage(
        timestamp=datetime.now(),
        sender="User 2",
        content="WHAT IS GOING ON?!?!",
    )

    metadata = MessageCleaner().process_message(1, shouting).metadata

    assert metadata.caps_ratio > 0.8
    assert metadata.repeated_punctuation_count > 0
def test_topic_variant():
    """The "topic" variant is lowercased with URLs, emojis and punctuation removed."""
    # Written as an escape so the code point survives any re-encoding of this
    # file; a real emoji is required for the removal assertion to mean anything.
    fire = "\U0001F525"  # FIRE

    cleaner = MessageCleaner()
    # Topic variant should be lowercase, no emojis, stripped punctuation
    raw = RawMessage(
        timestamp=datetime.now(),
        sender="User 1",
        content=f"Check this: https://example.com/foo AND {fire}!",
    )
    processed = cleaner.process_message(1, raw)
    topic_v = processed.variants["topic"]

    assert fire not in topic_v
    assert "http" not in topic_v
    # "AND" / "Check" must survive, but only in lowercase form.
    assert "and" in topic_v
    assert "check" in topic_v
    assert ":" not in topic_v
def test_repeated_chars():
    """Stretched words and "!!!" are reflected in the repetition counters."""
    stretched = RawMessage(
        timestamp=datetime.now(),
        sender="User 2",
        content="Looooooooooooool that is so coooooool!!!",
    )

    metadata = MessageCleaner().process_message(1, stretched).metadata

    # Two elongated words ("Looo...l", "cooo...l") and one run of "!".
    assert metadata.repeated_char_count >= 2
    assert metadata.repeated_punctuation_count >= 1
def test_process_sequence():
    """process_sequence keeps every message and aggregates global metadata."""
    # Written as an escape so the code point survives any re-encoding of this
    # file; the same value is reused as the lookup key below.
    wave = "\U0001F44B"  # WAVING HAND SIGN

    cleaner = MessageCleaner()
    messages = [
        RawMessage(timestamp=datetime.now(), sender="Alice", content=f"Hi! {wave}"),
        RawMessage(timestamp=datetime.now(), sender="Bob", content="Hello Alice."),
        RawMessage(
            timestamp=datetime.now(),
            sender="Alice",
            content="Check this link out: https://cool.com",
        ),
    ]
    result = cleaner.process_sequence(messages)

    assert len(result.messages) == 3
    assert "Alice" in result.global_metadata["participant_stats"]
    assert "Bob" in result.global_metadata["participant_stats"]
    # The emoji appears once across the whole sequence.
    assert result.global_metadata["total_emojis"][wave] == 1
|