Spaces:
Running
Running
| """Tests for the dynamic content detector.""" | |
| import pytest | |
| from headroom.cache.dynamic_detector import ( | |
| DetectionResult, | |
| DetectorConfig, | |
| DynamicCategory, | |
| DynamicContentDetector, | |
| RegexDetector, | |
| detect_dynamic_content, | |
| ) | |
class TestRegexDetector:
    """Exercise the Tier 1 regex detector in isolation.

    Every test receives a fresh ``RegexDetector`` through the ``detector``
    fixture and inspects the spans returned by ``detect()``.
    """

    @pytest.fixture
    def detector(self):
        """Provide a ``RegexDetector`` configured with only the regex tier."""
        config = DetectorConfig(tiers=["regex"])
        return RegexDetector(config)

    def test_iso_date(self, detector):
        """An ISO date in free text yields one DATE span tagged ``regex``."""
        spans = detector.detect("The date is 2024-01-15.")
        assert len(spans) == 1
        assert spans[0].text == "2024-01-15"
        assert spans[0].category == DynamicCategory.DATE
        assert spans[0].tier == "regex"

    def test_structural_detection(self, detector):
        """Values following a 'Label: value' pattern are detected."""
        # Structural approach: the value after a "Label:" prefix is the span.
        spans = detector.detect("Date: 2024-01-15")
        assert len(spans) == 1
        assert spans[0].text == "2024-01-15"
        assert spans[0].category == DynamicCategory.DATE

        # A "User:" label should produce exactly one USER_DATA span.
        spans = detector.detect("User: john.doe@example.com")
        user_spans = [s for s in spans if s.category == DynamicCategory.USER_DATA]
        assert len(user_spans) == 1

    def test_datetime_iso(self, detector):
        """A full ISO datetime is reported as a single DATETIME span."""
        spans = detector.detect("Timestamp: 2024-01-15T10:30:00Z")
        assert len(spans) == 1
        assert spans[0].text == "2024-01-15T10:30:00Z"
        assert spans[0].category == DynamicCategory.DATETIME

    def test_uuid(self, detector):
        """A canonical hyphenated UUID is reported as a UUID span."""
        spans = detector.detect("ID: 550e8400-e29b-41d4-a716-446655440000")
        assert len(spans) == 1
        assert spans[0].text == "550e8400-e29b-41d4-a716-446655440000"
        assert spans[0].category == DynamicCategory.UUID

    def test_request_id(self, detector):
        """A ``req_``-prefixed token is reported as a REQUEST_ID span."""
        spans = detector.detect("Request: req_abc123def456ghi789")
        assert len(spans) == 1
        assert "req_" in spans[0].text
        assert spans[0].category == DynamicCategory.REQUEST_ID

    def test_unix_timestamp(self, detector):
        """A 10-digit Unix timestamp is reported as a TIMESTAMP span."""
        spans = detector.detect("Time: 1705312200")
        assert len(spans) == 1
        assert spans[0].text == "1705312200"
        assert spans[0].category == DynamicCategory.TIMESTAMP

    def test_time(self, detector):
        """A 12-hour clock time including its AM marker is one TIME span."""
        spans = detector.detect("Meeting at 10:30 AM")
        assert len(spans) == 1
        assert spans[0].text == "10:30 AM"
        assert spans[0].category == DynamicCategory.TIME

    def test_version(self, detector):
        """A ``v``-prefixed version with a pre-release tag is one VERSION span."""
        spans = detector.detect("Running v2.3.1-beta")
        assert len(spans) == 1
        assert spans[0].text == "v2.3.1-beta"
        assert spans[0].category == DynamicCategory.VERSION

    def test_date_prefix_pattern(self, detector):
        """A 'Today is <weekday>, <month> <day>, <year>' phrase yields a DATE span."""
        spans = detector.detect("Today is Monday, January 15, 2024. You are an assistant.")
        assert len(spans) >= 1
        # At least one of the spans must be categorised as a date.
        date_spans = [s for s in spans if s.category == DynamicCategory.DATE]
        assert len(date_spans) >= 1

    def test_multiple_dynamic_elements(self, detector):
        """Four labelled values on separate lines yield four distinct spans."""
        content = """
Date: 2024-01-15
Time: 10:30:00
Request ID: req_abc123def456ghi789xyz
UUID: 550e8400-e29b-41d4-a716-446655440000
"""
        spans = detector.detect(content)
        assert len(spans) == 4
        categories = {s.category for s in spans}
        assert DynamicCategory.DATE in categories
        assert DynamicCategory.TIME in categories
        assert DynamicCategory.REQUEST_ID in categories
        assert DynamicCategory.UUID in categories

    def test_no_false_positives_on_static(self, detector):
        """Purely static instructions produce no spans."""
        spans = detector.detect("You are a helpful assistant. Answer questions clearly.")
        assert len(spans) == 0

    def test_positions_are_correct(self, detector):
        """``start``/``end`` offsets slice the input back to the span text."""
        content = "Date: 2024-01-15"
        spans = detector.detect(content)
        assert len(spans) == 1
        assert content[spans[0].start : spans[0].end] == spans[0].text
class TestDynamicContentDetector:
    """Cover the unified detector that runs the configured tiers."""

    def test_regex_only(self):
        """A regex-only detector finds the ISO date and reports the tier used."""
        engine = DynamicContentDetector(DetectorConfig(tiers=["regex"]))
        outcome = engine.detect("Today is 2024-01-15. You are helpful.")

        assert len(outcome.spans) == 1
        assert outcome.spans[0].text == "2024-01-15"
        assert "regex" in outcome.tiers_used
        # The regex tier alone is expected to finish in a few milliseconds.
        assert outcome.processing_time_ms < 10

    def test_static_dynamic_split(self):
        """Detected text moves to the dynamic part; the rest stays static."""
        engine = DynamicContentDetector(DetectorConfig(tiers=["regex"]))
        outcome = engine.detect("Today is 2024-01-15. You are helpful.")

        assert "2024-01-15" not in outcome.static_content
        assert "2024-01-15" in outcome.dynamic_content
        assert "You are helpful" in outcome.static_content

    def test_complex_content(self):
        """A realistic system prompt yields several spans of expected kinds."""
        prompt = """You are a helpful AI assistant.
Today is January 15, 2024.
Current session: sess_abc123def456ghi789xyz
Instructions:
1. Be concise
2. Be accurate
3. Be helpful
Request ID: req_xyz789abc123def456ghi"""
        engine = DynamicContentDetector(DetectorConfig(tiers=["regex"]))
        outcome = engine.detect(prompt)

        # The prompt holds a date, a session ID and a request ID; require at
        # least two spans and at least one DATE or REQUEST_ID category.
        assert len(outcome.spans) >= 2
        seen = {span.category for span in outcome.spans}
        assert DynamicCategory.DATE in seen or DynamicCategory.REQUEST_ID in seen

    def test_empty_content(self):
        """An empty string yields no spans and empty static/dynamic parts."""
        outcome = DynamicContentDetector().detect("")

        assert len(outcome.spans) == 0
        assert outcome.static_content == ""
        assert outcome.dynamic_content == ""

    def test_no_dynamic_content(self):
        """Fully static text is returned unchanged as the static part."""
        text = "You are a helpful assistant. Answer questions clearly and concisely."
        outcome = DynamicContentDetector().detect(text)

        assert len(outcome.spans) == 0
        assert outcome.static_content == text
        assert outcome.dynamic_content == ""

    def test_custom_patterns(self):
        """A user-supplied regex pattern contributes its own span."""
        extra_patterns = [(r"CUSTOM_\d{4}", DynamicCategory.REQUEST_ID)]
        settings = DetectorConfig(tiers=["regex"], custom_patterns=extra_patterns)
        outcome = DynamicContentDetector(settings).detect("Code: CUSTOM_1234")

        matching = [span for span in outcome.spans if span.text == "CUSTOM_1234"]
        assert len(matching) == 1

    def test_available_tiers(self):
        """``available_tiers`` always lists the regex tier."""
        settings = DetectorConfig(tiers=["regex", "ner", "semantic"])
        engine = DynamicContentDetector(settings)

        # NER and semantic tiers rely on optional dependencies, so only the
        # regex tier is required to be present.
        assert "regex" in engine.available_tiers

    def test_warnings_for_missing_dependencies(self):
        """Detection runs without error whether or not optional tiers exist."""
        settings = DetectorConfig(tiers=["regex", "ner", "semantic"])
        engine = DynamicContentDetector(settings)

        # Informational only: nothing is asserted about warnings because the
        # outcome depends on which optional packages are installed.
        engine.detect("Test content")
class TestConvenienceFunction:
    """Cover the module-level ``detect_dynamic_content`` helper."""

    def test_basic_usage(self):
        """Default arguments return a DetectionResult holding the date span."""
        outcome = detect_dynamic_content("Date: 2024-01-15")

        assert isinstance(outcome, DetectionResult)
        found = outcome.spans
        assert len(found) == 1
        assert found[0].text == "2024-01-15"

    def test_with_tiers(self):
        """An explicit ``tiers`` argument is reflected in ``tiers_used``."""
        outcome = detect_dynamic_content("Date: 2024-01-15", tiers=["regex"])

        assert "regex" in outcome.tiers_used
class TestEntropyDetection:
    """Cover entropy-based detection of random-looking IDs and tokens."""

    def test_high_entropy_string(self):
        """Random-looking strings score high; repetitive strings score low."""
        from headroom.cache.dynamic_detector import calculate_entropy

        # Random-looking samples must score above 0.7.
        for sample in ("abc123xyz789def", "550e8400e29b41d4"):
            assert calculate_entropy(sample) > 0.7

        # Repetitive samples must stay below their respective ceilings.
        assert calculate_entropy("aaaaaaaaaa") < 0.3
        assert calculate_entropy("abababab") < 0.6

    def test_entropy_detection_finds_ids(self):
        """A random token after a label is picked up by the default detector."""
        # The token matches none of the well-known ID formats, so it has to
        # be found through entropy or structural detection.
        outcome = DynamicContentDetector().detect("Auth: xK7mN2pQr9sT4vW")

        assert len(outcome.spans) >= 1

    def test_entropy_skips_common_words(self):
        """Ordinary words such as 'username' and 'password' are not flagged."""
        outcome = DynamicContentDetector().detect("Use username and password correctly.")

        flagged = [span.text for span in outcome.spans]
        for word in ("username", "password"):
            assert word not in flagged
class TestEdgeCases:
    """Cover boundary conditions and awkward inputs for the default detector."""

    def test_overlapping_patterns(self):
        """An ISO datetime is one DATETIME span, not a date plus extras."""
        # The datetime contains an ISO date; only the longer match may be
        # reported.
        outcome = DynamicContentDetector().detect("Time: 2024-01-15T10:30:00Z")

        assert len(outcome.spans) == 1
        assert outcome.spans[0].category == DynamicCategory.DATETIME

    def test_adjacent_dynamic_content(self):
        """A date immediately followed by a time yields two spans."""
        outcome = DynamicContentDetector().detect("2024-01-15 10:30:00")

        assert len(outcome.spans) == 2

    def test_very_long_content(self):
        """One date inside a long block of static filler is found quickly."""
        filler = "This is static text. " * 100
        text = filler + "Date: 2024-01-15. " + filler

        outcome = DynamicContentDetector().detect(text)

        assert len(outcome.spans) == 1
        # Long input must still be processed in well under 100 ms.
        assert outcome.processing_time_ms < 100

    def test_special_characters(self):
        """Newlines and a '---' rule do not disturb date and UUID detection."""
        text = "Date: 2024-01-15\nUUID: 550e8400-e29b-41d4-a716-446655440000\n\n---\n"

        outcome = DynamicContentDetector().detect(text)

        assert len(outcome.spans) == 2

    def test_unicode_content(self):
        """A date surrounded by non-ASCII text is still the only span."""
        text = "日期: 2024-01-15. Héllo wörld!"

        outcome = DynamicContentDetector().detect(text)

        assert len(outcome.spans) == 1
        assert outcome.spans[0].text == "2024-01-15"
class TestCacheAlignmentScenarios:
    """Test scenarios relevant to cache alignment.

    Each test feeds a prompt-like string to a default
    ``DynamicContentDetector`` and checks the detected spans and the
    static/dynamic split.
    """

    def test_system_prompt_dates(self):
        """A system prompt with a date and a time yields dynamic content."""
        detector = DynamicContentDetector()
        content = """You are Claude, an AI assistant by Anthropic.
Today is Monday, January 15, 2024.
Current time: 10:30 AM PST.
Your task is to help users with coding questions."""
        result = detector.detect(content)

        # At least one of the date/time values must be detected.
        assert len(result.spans) >= 1
        # NOTE(review): deliberately lenient - passes when the year has been
        # removed from the static part, or when "January" is still present.
        assert "2024" not in result.static_content or "January" in result.static_content
        # Some date or time fragment must have moved to the dynamic part.
        assert (
            "January" in result.dynamic_content
            or "2024-01-15" in result.dynamic_content
            or "10:30" in result.dynamic_content
        )

    def test_request_metadata(self):
        """Request metadata lines yield at least two spans."""
        detector = DynamicContentDetector()
        content = """Request ID: req_abc123xyz789
Trace ID: 550e8400-e29b-41d4-a716-446655440000
Timestamp: 1705312200
Process the following query:"""
        result = detector.detect(content)

        # The input holds a request ID, a UUID and a timestamp; at least two
        # of them must be detected. (A set of categories was previously built
        # here and discarded; that no-op statement has been removed.)
        assert len(result.spans) >= 2

    def test_mixed_static_dynamic(self):
        """Static sentences survive when interleaved with dynamic values."""
        detector = DynamicContentDetector()
        content = """You are helpful (static).
Today is 2024-01-15 (dynamic).
Always be accurate (static).
Session: sess_abc123xyz789 (dynamic).
Never lie (static)."""
        result = detector.detect(content)

        # At least one of the date / session ID must be detected.
        assert len(result.spans) >= 1
        # The static sentences must remain in the static part.
        assert "helpful" in result.static_content
        assert "accurate" in result.static_content
class TestNERDetector:
    """Test the Tier 2 NER detector (skipped when spaCy is unavailable).

    These are smoke tests: what the NER tier tags depends on the installed
    spaCy model, so no particular entity is required to be found.
    """

    @pytest.fixture
    def ner_detector(self):
        """Provide an ``NERDetector``; skip the test if it cannot be used."""
        from headroom.cache.dynamic_detector import _SPACY_AVAILABLE, NERDetector

        if not _SPACY_AVAILABLE:
            pytest.skip("spaCy not installed")
        config = DetectorConfig(tiers=["ner"])
        detector = NERDetector(config)
        if not detector.is_available:
            pytest.skip("spaCy model not available")
        return detector

    def test_person_detection(self, ner_detector):
        """Running NER on a sentence with a person name completes cleanly."""
        # Unpacking also checks that detect() returns a two-item result.
        spans, _ = ner_detector.detect("John Smith sent the message.")
        # Whether "John Smith" is tagged as PERSON depends on the model, so
        # only check that every returned span exposes a category.
        for span in spans:
            assert hasattr(span, "category")

    def test_money_detection(self, ner_detector):
        """Running NER on a sentence with a money amount completes cleanly."""
        # Unpacking also checks that detect() returns a two-item result.
        spans, _ = ner_detector.detect("The total is $500.00")
        # Whether "$500.00" is tagged as MONEY depends on the model, so only
        # check that every returned span exposes a category.
        for span in spans:
            assert hasattr(span, "category")
class TestSemanticDetector:
    """Test the Tier 3 semantic detector (skipped without sentence-transformers)."""

    @pytest.fixture
    def semantic_detector(self):
        """Provide a ``SemanticDetector``; skip the test if it cannot be used."""
        from headroom.cache.dynamic_detector import (
            _SENTENCE_TRANSFORMERS_AVAILABLE,
            SemanticDetector,
        )

        if not _SENTENCE_TRANSFORMERS_AVAILABLE:
            pytest.skip("sentence-transformers not installed")
        config = DetectorConfig(tiers=["semantic"])
        detector = SemanticDetector(config)
        if not detector.is_available:
            pytest.skip("Embedding model not available")
        return detector

    def test_realtime_detection(self, semantic_detector):
        """Running the semantic tier on volatile-sounding text completes cleanly."""
        content = "The current stock price is updated every minute."
        # Smoke test: whether this sentence is flagged depends on the
        # similarity threshold, so nothing is asserted about the spans.
        # Unpacking still checks that detect() returns a two-item result.
        spans, _ = semantic_detector.detect(content)
class TestIntegrationWithAllTiers:
    """Integration checks that request every tier the detector supports."""

    def test_all_tiers_together(self):
        """All requested tiers run on mixed content within a generous budget."""
        text = """Today is January 15, 2024.
John paid $500 for the service.
Request ID: req_abc123xyz789.
The stock price updates in real-time.
Be helpful and accurate."""
        settings = DetectorConfig(tiers=["regex", "ner", "semantic"])
        outcome = DynamicContentDetector(settings).detect(text)

        # The regex tier alone guarantees at least one match.
        assert len(outcome.spans) >= 1
        # NER and semantic tiers may add 50-100 ms; the limit is very generous.
        assert outcome.processing_time_ms < 5000
        # Regex must have run regardless of which optional tiers are installed.
        assert "regex" in outcome.tiers_used

    def test_tier_precedence(self):
        """A date matched by the regex tier is attributed to regex, not NER."""
        settings = DetectorConfig(tiers=["regex", "ner"])
        outcome = DynamicContentDetector(settings).detect("Date: 2024-01-15")

        found = outcome.spans
        assert len(found) == 1
        assert found[0].tier == "regex"