# NOTE: "Spaces / Sleeping" status-badge text from a web scrape was removed here;
# it was page residue, not part of this module.
"""
Embedding Generation (LangChain-based)
=========================================================
Supports multiple embedding providers using LangChain:
- Azure OpenAI
- Sentence Transformers (local, free)
- Cohere (via Azure Foundry)
"""
from typing import List, Optional
import os
from dataclasses import dataclass

# LangChain embeddings
from langchain_openai import AzureOpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Config
from dotenv import load_dotenv

load_dotenv()
@dataclass
class Embedding:
    """Container for an embedding vector with metadata.

    BUGFIX: the ``@dataclass`` decorator was missing. Without it the class
    has no generated ``__init__``/``__post_init__`` hook, so the keyword
    construction used by ``Embedder.embed`` (``Embedding(vector=..., ...)``)
    raised ``TypeError``.
    """

    vector: List[float]              # the embedding values
    text: str                        # the text that was embedded
    model: str                       # model/deployment name that produced the vector
    dimension: int                   # number of components (len(vector))
    metadata: Optional[dict] = None  # optional extra info; normalized to {} below

    def __post_init__(self):
        # Use None as the declared default and replace it here so every
        # instance gets its own fresh dict (avoids a shared mutable default).
        if self.metadata is None:
            self.metadata = {}
class Embedder:
    """Universal embedder supporting multiple providers via LangChain.

    Supported providers:
        - ``"azure-openai"``:          LangChain's ``AzureOpenAIEmbeddings``
        - ``"sentence-transformers"``: LangChain's ``HuggingFaceEmbeddings``
          (wraps sentence-transformers; local and free)
    """

    def __init__(
        self,
        provider: str = "sentence-transformers",
        model_name: str = "all-MiniLM-L6-v2",
        azure_endpoint: Optional[str] = None,
        azure_api_key: Optional[str] = None,
        azure_deployment: Optional[str] = None,
    ):
        """
        Initialize embedder.

        Args:
            provider: 'azure-openai' or 'sentence-transformers'
            model_name: Model name/deployment name
            azure_endpoint: Azure OpenAI endpoint (if using Azure)
            azure_api_key: Azure API key (if using Azure)
            azure_deployment: Azure deployment name (if using Azure)

        Raises:
            ValueError: If ``provider`` is not a supported value.
        """
        self.provider = provider
        self.model_name = model_name
        if provider == "azure-openai":
            # Explicit arguments take precedence; fall back to env vars.
            self.embeddings = AzureOpenAIEmbeddings(
                azure_endpoint=azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT"),
                api_key=azure_api_key or os.getenv("AZURE_OPENAI_API_KEY"),
                azure_deployment=azure_deployment or model_name,
                api_version="2024-02-01",
            )
            # "large" models (e.g. text-embedding-3-large) emit 3072-dim
            # vectors; other OpenAI embedding models are assumed 1536-dim.
            self.dimension = 3072 if "large" in model_name.lower() else 1536
        elif provider == "sentence-transformers":
            # Local CPU inference; normalized vectors make cosine similarity
            # equivalent to a dot product.
            self.embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu', 'trust_remote_code': False},
                encode_kwargs={'normalize_embeddings': True},
            )
            # Probe the model once to learn its output dimension.
            self.dimension = self._get_embedding_dimension()
        else:
            raise ValueError(f"Unsupported provider: {provider}")

    def _get_embedding_dimension(self) -> int:
        """Get embedding dimension by embedding a sample text.

        Falls back to known dimensions for common sentence-transformers
        models (768 as a last resort) if the probe raises.
        """
        try:
            sample = self.embeddings.embed_query("test")
            return len(sample)
        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        except Exception:
            # Fallback dimensions for common models
            dim_map = {
                "all-MiniLM-L6-v2": 384,
                "all-MiniLM-L12-v2": 384,
                "all-mpnet-base-v2": 768,
            }
            return dim_map.get(self.model_name, 768)

    def embed(self, text: str) -> Embedding:
        """
        Generate embedding for a single text.

        Args:
            text: Text to embed

        Returns:
            Embedding object (dimension taken from the actual vector length)
        """
        vector = self.embeddings.embed_query(text)
        return Embedding(
            vector=vector,
            text=text,
            model=self.model_name,
            dimension=len(vector),
            metadata={'provider': self.provider}
        )

    def embed_batch(self, texts: List[str]) -> List[Embedding]:
        """
        Generate embeddings for multiple texts.

        Args:
            texts: List of texts to embed

        Returns:
            List of Embedding objects, one per input text, in input order
        """
        # LangChain handles provider-side batching internally.
        vectors = self.embeddings.embed_documents(texts)
        return [
            Embedding(
                vector=vector,
                text=text,
                model=self.model_name,
                dimension=len(vector),
                metadata={'provider': self.provider}
            )
            for text, vector in zip(texts, vectors)
        ]

    def similarity(self, text1: str, text2: str) -> float:
        """
        Calculate cosine similarity between two texts.

        Args:
            text1: First text
            text2: Second text

        Returns:
            Cosine similarity in [-1, 1]; typically close to [0, 1] for
            natural-language sentence embeddings. (The previous docstring
            claimed 0-1, which cosine similarity does not guarantee.)
        """
        # Imported lazily so numpy is only required when similarity is used.
        import numpy as np
        emb1 = self.embed(text1)
        emb2 = self.embed(text2)
        vec1 = np.array(emb1.vector)
        vec2 = np.array(emb2.vector)
        # Cosine similarity = dot(a, b) / (|a| * |b|)
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        return float(dot_product / (norm1 * norm2))
# ============================================================================
# USAGE EXAMPLE
# ============================================================================
if __name__ == "__main__":
    print("π’ Embedder Test (LangChain-based)")
    print("=" * 80)

    # --- Local embeddings (sentence-transformers): free, no credentials ---
    print("\nπ Testing local embeddings (sentence-transformers)...")
    local_embedder = Embedder(
        provider="sentence-transformers",
        model_name="all-MiniLM-L6-v2"
    )
    test_texts = [
        "RAG systems combine retrieval and generation.",
        "Vector databases store embeddings efficiently.",
        "Machine learning models require training data.",
    ]
    print(f" Provider: {local_embedder.provider}")
    print(f" Model: {local_embedder.model_name}")
    print(f" Dimension: {local_embedder.dimension}")

    # 1) Single embedding
    print("\n1οΈβ£ Single embedding test...")
    emb1 = local_embedder.embed(test_texts[0])
    print(f" β Generated embedding: {emb1.dimension} dimensions")
    print(f" π Text: '{emb1.text}'")
    print(f" π’ Vector preview: [{emb1.vector[0]:.4f}, {emb1.vector[1]:.4f}, {emb1.vector[2]:.4f}, ...]")

    # 2) Batch embeddings
    print("\n2οΈβ£ Batch embedding test...")
    embeddings = local_embedder.embed_batch(test_texts)
    print(f" β Generated {len(embeddings)} embeddings")
    for i, emb in enumerate(embeddings):
        print(f" Text {i+1}: {emb.dimension}D - '{emb.text[:40]}...'")

    # 3) Cosine similarity sanity check (related pair should score higher)
    print("\n3οΈβ£ Similarity test...")
    sim_1_2 = local_embedder.similarity(test_texts[0], test_texts[1])
    sim_1_3 = local_embedder.similarity(test_texts[0], test_texts[2])
    print(f" Text 1 vs Text 2: {sim_1_2:.4f} (related: RAG & vector DBs)")
    print(f" Text 1 vs Text 3: {sim_1_3:.4f} (less related: RAG & ML training)")
    print(f" β Similarity working (higher score = more similar)")

    # --- Azure OpenAI embeddings (only runs when a deployment is set) ---
    print("\n" + "=" * 80)
    print("4οΈβ£ Azure OpenAI Embeddings Test")
    print("-" * 80)
    azure_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
    if azure_embedding_deployment:
        try:
            azure_embedder = Embedder(
                provider="azure-openai",
                # BUGFIX: fall back to the deployment name so a missing
                # MODEL_NAME env var doesn't pass None into Embedder, which
                # would crash on model_name.lower() in __init__.
                model_name=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME") or azure_embedding_deployment,
                azure_endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
                azure_api_key=os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY"),
                azure_deployment=azure_embedding_deployment
            )
            print(f" Provider: {azure_embedder.provider}")
            print(f" Model: {azure_embedder.model_name}")
            print(f" Dimension: {azure_embedder.dimension}")

            # Single embedding round-trip against the live service
            test_text = "Azure AI Foundry provides embedding models"
            azure_emb = azure_embedder.embed(test_text)
            print(f"\n β Single embedding: {len(azure_emb.vector)} dimensions")
            print(f" π Text: '{test_text}'")
            print(f" π’ Vector preview: [{azure_emb.vector[0]:.4f}, {azure_emb.vector[1]:.4f}, ...]")

            # Batch round-trip
            batch_texts = [
                "RAG systems combine retrieval and generation",
                "Vector databases store embeddings",
                "Azure OpenAI provides powerful models"
            ]
            azure_batch = azure_embedder.embed_batch(batch_texts)
            print(f"\n β Batch embeddings: {len(azure_batch)} embeddings generated")

            # Similarity round-trip
            sim = azure_embedder.similarity(batch_texts[0], batch_texts[1])
            print(f"\n π Similarity test: {sim:.4f}")
            print(f" β Azure OpenAI embeddings fully working!")
        except Exception as e:
            # Deliberately best-effort: network/auth failures are reported,
            # not raised — this is a demo script.
            print(f" β Azure embedding failed: {str(e)[:100]}")
    else:
        print(f" β οΈ No Azure embedding deployment configured")
        print(f" π‘ Using FREE local embeddings for all pipelines (recommended for dev)")

    print("\n" + "=" * 80)
    print("β Embedder test complete!")