File size: 4,448 Bytes
f884e6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""Integration tests for Phase 2A components."""

import shutil
import tempfile

from src.embedding.embedding_service import EmbeddingService
from src.vector_store.vector_db import VectorDatabase


class TestPhase2AIntegration:
    """Integration tests exercising EmbeddingService together with VectorDatabase.

    Each test gets a fresh EmbeddingService and a VectorDatabase persisted in
    its own temporary directory; the directory is removed after the test.
    """

    def setup_method(self):
        """Create the temporary persistence directory and the objects under test."""
        self.test_dir = tempfile.mkdtemp()
        try:
            self.embedding_service = EmbeddingService()
            self.vector_db = VectorDatabase(
                persist_path=self.test_dir,
                collection_name="test_integration",
            )
        except Exception:
            # pytest does not run teardown_method when setup_method fails, so
            # the directory must be removed here or it would be leaked.
            shutil.rmtree(self.test_dir, ignore_errors=True)
            raise

    def teardown_method(self):
        """Remove the temporary persistence directory, if one was created."""
        test_dir = getattr(self, "test_dir", None)
        if test_dir is not None:
            # Best-effort cleanup: removal errors are deliberately ignored.
            shutil.rmtree(test_dir, ignore_errors=True)

    def test_embedding_vector_storage_workflow(self):
        """Test the complete workflow: text -> embedding -> storage -> search."""

        # Sample policy texts
        documents = [
            "Employees must complete security training annually to "
            "maintain access to company systems.",
            "Remote work policy allows employees to work from home up to "
            "3 days per week.",
            "All expenses over $500 require manager approval before "
            "reimbursement.",
            "Code review is mandatory for all pull requests before "
            "merging to main branch.",
        ]

        # Generate embeddings and verify one vector of the reported size per text
        embeddings = self.embedding_service.embed_texts(documents)
        expected_dim = self.embedding_service.get_embedding_dimension()

        assert len(embeddings) == len(documents)
        assert all(len(emb) == expected_dim for emb in embeddings)

        # Store embeddings with metadata (using the collection from setup_method)
        doc_ids = [f"doc_{i}" for i in range(len(documents))]
        metadatas = [{"type": "policy", "doc_id": doc_id} for doc_id in doc_ids]

        success = self.vector_db.add_embeddings(
            embeddings=embeddings,
            chunk_ids=doc_ids,
            documents=documents,
            metadatas=metadatas,
        )

        assert success is True

        # Search for the remote-work policy
        top_k = 2
        query = "remote work from home policy"
        query_embedding = self.embedding_service.embed_text(query)

        results = self.vector_db.search(query_embedding=query_embedding, top_k=top_k)

        # Search results should be a list of dictionaries
        assert isinstance(results, list)
        # Four documents were stored successfully, so an empty result means
        # the search is broken; it must fail the test rather than skip the
        # relevance check below.
        assert results, "search returned no results for a populated collection"
        assert len(results) <= top_k
        assert all(isinstance(result, dict) for result in results)

        # At least one hit must contain remote-work related content
        documents_found = [result.get("document", "") for result in results]
        assert any(
            "remote work" in doc.lower() or "work from home" in doc.lower()
            for doc in documents_found
        )

    def test_basic_embedding_dimension_consistency(self):
        """Test that embeddings have the same dimension regardless of text length."""

        # Inputs of clearly different lengths
        texts = [
            "Short text.",
            "This is a medium length text with several words to test "
            "embedding consistency.",
            "This is a much longer text that contains multiple sentences "
            "and various types of content to ensure that the embedding "
            "service can handle longer inputs without issues and still "
            "produce consistent dimensional output vectors.",
        ]

        embeddings = self.embedding_service.embed_texts(texts)

        # Guard the indexing below: a missing embedding should surface as an
        # assertion failure, not as an IndexError.
        assert len(embeddings) == len(texts)

        # All embeddings should have the same dimension
        dimensions = [len(emb) for emb in embeddings]
        assert all(dim == dimensions[0] for dim in dimensions)

        # Dimension should match the service's reported dimension
        assert dimensions[0] == self.embedding_service.get_embedding_dimension()

    def test_empty_collection_handling(self):
        """Test that searching an empty collection yields an empty list."""

        # Nothing has been added to the collection in this test
        query_embedding = self.embedding_service.embed_text("test query")

        results = self.vector_db.search(query_embedding=query_embedding, top_k=5)

        # Should handle the empty collection gracefully
        assert isinstance(results, list)
        assert len(results) == 0