"""
Embedding Generation (LangChain-based)
=========================================================
Supports multiple embedding providers using LangChain:
- Azure OpenAI
- Sentence Transformers (local, free)
- Cohere (via Azure Foundry)
"""

from typing import List, Optional
import os
from dataclasses import dataclass

import numpy as np

# LangChain embeddings
from langchain_openai import AzureOpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Config
from dotenv import load_dotenv

load_dotenv()


@dataclass
class Embedding:
    """Container for an embedding vector with metadata.

    Attributes:
        vector: The embedding values.
        text: The text that was embedded.
        model: Model / deployment name that produced the vector.
        dimension: Length of ``vector`` (stored for convenience).
        metadata: Free-form extra info (e.g. provider name).
    """
    vector: List[float]
    text: str
    model: str
    dimension: int
    # Optional[dict], not dict: the default really is None and is
    # normalized to a fresh dict below (avoids a shared mutable default).
    metadata: Optional[dict] = None

    def __post_init__(self):
        # Give every instance its own dict when none was supplied.
        if self.metadata is None:
            self.metadata = {}


class Embedder:
    """Universal embedder supporting multiple providers via LangChain."""

    def __init__(
        self,
        provider: str = "sentence-transformers",
        model_name: str = "all-MiniLM-L6-v2",
        azure_endpoint: Optional[str] = None,
        azure_api_key: Optional[str] = None,
        azure_deployment: Optional[str] = None,
    ):
        """
        Initialize embedder

        Args:
            provider: 'azure-openai' or 'sentence-transformers'
            model_name: Model name/deployment name
            azure_endpoint: Azure OpenAI endpoint (if using Azure)
            azure_api_key: Azure API key (if using Azure)
            azure_deployment: Azure deployment name (if using Azure)

        Raises:
            ValueError: If ``provider`` is not one of the supported names.
        """
        self.provider = provider
        self.model_name = model_name

        if provider == "azure-openai":
            # Use LangChain's AzureOpenAIEmbeddings; explicit args win over env vars.
            self.embeddings = AzureOpenAIEmbeddings(
                azure_endpoint=azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT"),
                api_key=azure_api_key or os.getenv("AZURE_OPENAI_API_KEY"),
                azure_deployment=azure_deployment or model_name,
                api_version="2024-02-01",
            )
            # Heuristic: text-embedding-3-large is 3072-dim, other deployments
            # (ada-002, 3-small) are 1536-dim.
            self.dimension = 3072 if "large" in model_name.lower() else 1536

        elif provider == "sentence-transformers":
            # Use LangChain's HuggingFaceEmbeddings (wraps sentence-transformers).
            # normalize_embeddings=True makes dot product equal cosine similarity.
            self.embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu', 'trust_remote_code': False},
                encode_kwargs={'normalize_embeddings': True}
            )
            # Get dimension from model
            self.dimension = self._get_embedding_dimension()

        else:
            raise ValueError(f"Unsupported provider: {provider}")

    def _get_embedding_dimension(self) -> int:
        """Get embedding dimension by testing with a sample text."""
        try:
            sample = self.embeddings.embed_query("test")
            return len(sample)
        except Exception:
            # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
            # Fallback dimensions for common models.
            dim_map = {
                "all-MiniLM-L6-v2": 384,
                "all-MiniLM-L12-v2": 384,
                "all-mpnet-base-v2": 768,
            }
            return dim_map.get(self.model_name, 768)

    def embed(self, text: str) -> Embedding:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding object
        """
        vector = self.embeddings.embed_query(text)
        return Embedding(
            vector=vector,
            text=text,
            model=self.model_name,
            dimension=len(vector),
            metadata={'provider': self.provider}
        )

    def embed_batch(self, texts: List[str]) -> List[Embedding]:
        """
        Generate embeddings for multiple texts

        Args:
            texts: List of texts to embed

        Returns:
            List of Embedding objects
        """
        # LangChain handles batching automatically
        vectors = self.embeddings.embed_documents(texts)
        return [
            Embedding(
                vector=vector,
                text=text,
                model=self.model_name,
                dimension=len(vector),
                metadata={'provider': self.provider}
            )
            for text, vector in zip(texts, vectors)
        ]

    def similarity(self, text1: str, text2: str) -> float:
        """
        Calculate cosine similarity between two texts

        Args:
            text1: First text
            text2: Second text

        Returns:
            Similarity score (0-1)
        """
        # numpy is imported once at module level (was re-imported per call).
        emb1 = self.embed(text1)
        emb2 = self.embed(text2)

        vec1 = np.array(emb1.vector)
        vec2 = np.array(emb2.vector)

        # Cosine similarity: dot(a, b) / (|a| * |b|)
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)

        return float(dot_product / (norm1 * norm2))


# ============================================================================
# USAGE EXAMPLE
# ============================================================================
if __name__ == "__main__":
    # Demo / smoke test for both providers. Requires sentence-transformers
    # locally; the Azure section only runs when env vars are configured.
    print("šŸ”¢ Embedder Test (LangChain-based)")
    print("=" * 80)

    # Test sentence-transformers (free, local)
    print("\nšŸ“ Testing local embeddings (sentence-transformers)...")
    local_embedder = Embedder(
        provider="sentence-transformers",
        model_name="all-MiniLM-L6-v2"
    )

    test_texts = [
        "RAG systems combine retrieval and generation.",
        "Vector databases store embeddings efficiently.",
        "Machine learning models require training data.",
    ]

    print(f" Provider: {local_embedder.provider}")
    print(f" Model: {local_embedder.model_name}")
    print(f" Dimension: {local_embedder.dimension}")

    # Single embedding
    print("\n1ļøāƒ£ Single embedding test...")
    emb1 = local_embedder.embed(test_texts[0])
    print(f" āœ… Generated embedding: {emb1.dimension} dimensions")
    print(f" šŸ“ Text: '{emb1.text}'")
    print(f" šŸ”¢ Vector preview: [{emb1.vector[0]:.4f}, {emb1.vector[1]:.4f}, {emb1.vector[2]:.4f}, ...]")

    # Batch embeddings
    print("\n2ļøāƒ£ Batch embedding test...")
    embeddings = local_embedder.embed_batch(test_texts)
    print(f" āœ… Generated {len(embeddings)} embeddings")
    for i, emb in enumerate(embeddings):
        print(f" Text {i+1}: {emb.dimension}D - '{emb.text[:40]}...'")

    # Similarity
    print("\n3ļøāƒ£ Similarity test...")
    sim_1_2 = local_embedder.similarity(test_texts[0], test_texts[1])
    sim_1_3 = local_embedder.similarity(test_texts[0], test_texts[2])
    print(f" Text 1 vs Text 2: {sim_1_2:.4f} (related: RAG & vector DBs)")
    print(f" Text 1 vs Text 3: {sim_1_3:.4f} (less related: RAG & ML training)")
    print(f" āœ… Similarity working (higher score = more similar)")

    # Test Azure OpenAI (if configured)
    print("\n" + "=" * 80)
    print("4ļøāƒ£ Azure OpenAI Embeddings Test")
    print("-" * 80)

    # Check if you have an embedding deployment
    azure_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")

    if azure_embedding_deployment:
        try:
            azure_embedder = Embedder(
                provider="azure-openai",
                # Fall back to the deployment name: Embedder.__init__ calls
                # model_name.lower(), which crashes if the env var is unset.
                model_name=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")
                or azure_embedding_deployment,
                azure_endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
                azure_api_key=os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY"),
                azure_deployment=azure_embedding_deployment
            )

            print(f" Provider: {azure_embedder.provider}")
            print(f" Model: {azure_embedder.model_name}")
            print(f" Dimension: {azure_embedder.dimension}")

            # Test single embedding
            test_text = "Azure AI Foundry provides embedding models"
            azure_emb = azure_embedder.embed(test_text)
            print(f"\n āœ… Single embedding: {len(azure_emb.vector)} dimensions")
            print(f" šŸ“ Text: '{test_text}'")
            print(f" šŸ”¢ Vector preview: [{azure_emb.vector[0]:.4f}, {azure_emb.vector[1]:.4f}, ...]")

            # Test batch embeddings
            batch_texts = [
                "RAG systems combine retrieval and generation",
                "Vector databases store embeddings",
                "Azure OpenAI provides powerful models"
            ]
            azure_batch = azure_embedder.embed_batch(batch_texts)
            print(f"\n āœ… Batch embeddings: {len(azure_batch)} embeddings generated")

            # Test similarity
            sim = azure_embedder.similarity(batch_texts[0], batch_texts[1])
            print(f"\n šŸ“Š Similarity test: {sim:.4f}")
            print(f" āœ… Azure OpenAI embeddings fully working!")

        except Exception as e:
            # Demo boundary: report and continue rather than crash the test run.
            print(f" āŒ Azure embedding failed: {str(e)[:100]}")
    else:
        print(f" āš ļø No Azure embedding deployment configured")
        print(f" šŸ’” Using FREE local embeddings for all pipelines (recommended for dev)")

    print("\n" + "=" * 80)
    print("āœ… Embedder test complete!")