# Spaces: Running | File size: 2,426 Bytes | daafb32
"""Verify embedding model works correctly before full pipeline run."""
import numpy as np
from src.utils.logger import setup_logger, get_logger
from src.embeddings.embedding_model import EmbeddingModel
# setup_logger() runs at import time, before the module-level logger is
# requested. NOTE(review): presumably it installs the project's logging
# configuration — confirm in src.utils.logger.
setup_logger()
# Module-level logger named after this module; used by main() for all output.
logger = get_logger(__name__)
def _require(condition, message):
    """Raise ``AssertionError(message)`` unless *condition* is truthy.

    Used instead of a bare ``assert`` statement so the verification checks
    still execute when Python runs with ``-O`` (which strips ``assert``).
    """
    if not condition:
        raise AssertionError(message)


def main():
    """Run sanity checks on ``EmbeddingModel`` and log the outcome of each.

    Checks performed, in order:

    1. ``embed_documents`` returns one 768-wide row per input document.
    2. ``embed_query`` returns a single 768-element vector.
    3. Pairwise dot-product similarities between the documents are logged
       (informational only; nothing is asserted here).
    4. The query about attention is most similar to ``docs[0]``.
    5. Every document embedding has an L2 norm of 1.0 (within ``1e-5``).

    Raises:
        AssertionError: If any of checks 1, 2, 4 or 5 fails.
    """
    expected_dim = 768
    model = EmbeddingModel()

    # Test 1: Document embedding shape
    docs = [
        "The transformer model uses self-attention mechanisms.",
        "UAV delivery systems require multi-agent coordination.",
        "Gradient descent optimizes neural network parameters.",
    ]
    doc_embeddings = model.embed_documents(docs, show_progress=False)
    _require(
        doc_embeddings.shape == (len(docs), expected_dim),
        f"Wrong shape: {doc_embeddings.shape}",
    )
    logger.info(f"✅ Document embedding shape: {doc_embeddings.shape}")

    # Test 2: Query embedding shape
    query_emb = model.embed_query("what is attention mechanism?")
    _require(
        query_emb.shape == (expected_dim,),
        f"Wrong shape: {query_emb.shape}",
    )
    logger.info(f"✅ Query embedding shape: {query_emb.shape}")

    # Test 3: Semantic similarity ordering
    # docs[0] (transformer) and docs[2] (gradient descent) are both about ML,
    # so they should be more similar to each other than either is to the UAV
    # doc (docs[1]). The values are only logged for inspection, not asserted.
    # Dot products are used as the similarity score; normalization of the
    # vectors is verified separately in Test 5.
    sim_01 = float(np.dot(doc_embeddings[0], doc_embeddings[1]))
    sim_02 = float(np.dot(doc_embeddings[0], doc_embeddings[2]))
    sim_12 = float(np.dot(doc_embeddings[1], doc_embeddings[2]))
    logger.info(f"Similarity (transformer ↔ gradient descent): {sim_02:.3f}")
    logger.info(f"Similarity (transformer ↔ UAV): {sim_01:.3f}")
    logger.info(f"Similarity (UAV ↔ gradient descent): {sim_12:.3f}")

    # Test 4: Query-document similarity direction
    # Query about attention should be closest to doc[0]
    sims = doc_embeddings @ query_emb  # one score per document
    best_match = int(np.argmax(sims))
    logger.info(
        f"✅ Query 'attention mechanism' matched doc[{best_match}]: "
        f"'{docs[best_match][:50]}'"
    )
    _require(best_match == 0, f"Expected doc[0] but got doc[{best_match}]")

    # Test 5: Verify normalization (all vectors should have magnitude ≈ 1.0)
    norms = np.linalg.norm(doc_embeddings, axis=1)
    _require(np.allclose(norms, 1.0, atol=1e-5), f"Not normalized: {norms}")
    logger.info(f"✅ All embeddings L2-normalized (norms: {norms})")

    logger.info("\n✅ All embedding tests passed. Ready for full pipeline.")
# Run the verification checks only when executed as a script, not on import.
if __name__ == "__main__":
    main()