Spaces:
Sleeping
Sleeping
| import pytest | |
| from datetime import datetime | |
| from app.services.nlp.entity_extractor import EntityExtractor | |
| from app.services.preprocessing.schemas import PreprocessingResult, PreprocessedMessage, MessageMetadata | |
| from app.services.parser.schemas import RawMessage | |
def sample_preprocessed_data():
    """Build a three-message mock chat used by the entity-extraction test.

    "Alice Walker" sends messages 1 and 3, "Bob Builder" sends message 2,
    and "Charlie" appears only inside message text, never as a sender.

    This stays a plain function so the ``__main__`` runner can call it
    directly; pytest receives the same data through the fixture registered
    below under the identical name.

    Returns:
        A ``PreprocessingResult`` wrapping the three ``PreprocessedMessage``
        objects, in message-id order.
    """
    messages = [
        PreprocessedMessage(
            message_id=1,
            raw=RawMessage(
                timestamp=datetime.now(),
                sender="Alice Walker",
                content="Hey Bob, did you see Charlie today?",
            ),
            base_clean="Hey Bob, did you see Charlie today?",
            variants={"speaker": "Hey Bob, did you see Charlie today?"},
            metadata=MessageMetadata(word_count=7),
        ),
        PreprocessedMessage(
            message_id=2,
            raw=RawMessage(
                timestamp=datetime.now(),
                sender="Bob Builder",
                content="No Alice, I haven't seen Charlie.",
            ),
            base_clean="No Alice, I haven't seen Charlie.",
            variants={"speaker": "No Alice, I haven't seen Charlie."},
            metadata=MessageMetadata(word_count=6),
        ),
        PreprocessedMessage(
            message_id=3,
            raw=RawMessage(
                timestamp=datetime.now(),
                sender="Alice Walker",
                content="Okay, thanks Bob.",
            ),
            base_clean="Okay, thanks Bob.",
            variants={"speaker": "Okay, thanks Bob."},
            metadata=MessageMetadata(word_count=3),
        ),
    ]
    return PreprocessingResult(messages=messages)


# Registered under the builder's name so tests can request
# ``sample_preprocessed_data`` as an argument. A separate function is used
# because pytest refuses direct calls to fixture-decorated functions, and the
# ``__main__`` runner calls the builder directly.
@pytest.fixture(name="sample_preprocessed_data")
def _sample_preprocessed_data_fixture():
    """Provide the mock chat from ``sample_preprocessed_data`` to pytest tests."""
    return sample_preprocessed_data()
def test_entity_extraction_and_resolution(sample_preprocessed_data):
    """Check that speaker analysis ties first-name mentions to full-name senders.

    Runs ``EntityExtractor.analyze_speakers`` on the mock chat and verifies
    the sender profiles (Alice Walker, Bob Builder), the mention-only
    profile (Charlie), and that both senders get a positive centrality
    score. Profiles and global metrics are printed for manual inspection.

    Args:
        sample_preprocessed_data: The mock ``PreprocessingResult`` to analyze.
    """
    ner = EntityExtractor(model_name="en_core_web_md")
    result = ner.analyze_speakers(sample_preprocessed_data)

    # Sender "Alice Walker" wrote messages 1 and 3; the bare "Alice" in
    # Bob's message is expected to resolve to this same profile.
    alice_profile = next(
        filter(lambda p: p.name == "Alice Walker", result.participants)
    )
    assert alice_profile.is_sender is True
    assert alice_profile.messages_sent == 2

    # Sender "Bob Builder" wrote message 2; the "Bob" mentions in Alice's
    # messages are expected to resolve to this same profile.
    bob_profile = next(
        filter(lambda p: p.name == "Bob Builder", result.participants)
    )
    assert bob_profile.is_sender is True
    assert bob_profile.messages_sent == 1

    # "Charlie" never sends anything, so the profile must exist purely
    # because of mentions. A missing profile yields None and fails below.
    charlie_profile = next(
        filter(lambda p: "Charlie" in p.name, result.participants),
        None,
    )
    assert charlie_profile is not None
    assert charlie_profile.is_sender is False
    assert charlie_profile.mention_count >= 1

    # Each sender is named in the other's message, so both centrality
    # scores are expected to be strictly positive.
    assert alice_profile.centrality_score > 0
    assert bob_profile.centrality_score > 0

    # Diagnostic output (visible with ``pytest -s`` or when run as a script).
    print(f"Global Metrics: {result.global_metrics}")
    for p in result.participants:
        print(f"Profile: {p.name} | S: {p.is_sender} | M: {p.mention_count} | D: {p.dominance_score:.2f} | C: {p.centrality_score:.2f}")
if __name__ == "__main__":
    # Manual runner: make sure the spaCy model is available, then execute the
    # test once against freshly built sample data (no pytest involved).
    import subprocess
    import sys

    try:
        import spacy

        spacy.load("en_core_web_md")
    except (ImportError, OSError):
        # spaCy raises OSError when the model package cannot be found.
        # ImportError is kept so a missing spaCy still reaches the download
        # attempt, as before. Catching only these two (instead of a bare
        # ``except``) lets Ctrl-C and SystemExit propagate rather than
        # silently kicking off a download.
        subprocess.check_call(
            [sys.executable, "-m", "spacy", "download", "en_core_web_md"]
        )

    data = sample_preprocessed_data()
    test_entity_extraction_and_resolution(data)