# puji4ml's picture
# Upload 30 files
# 2b22a59 verified
# NOTE(review): the three lines above are Hugging Face upload-page residue,
# not source code; commented out so the module parses.
"""
Embedding Generation (LangChain-based)
=========================================================
Supports multiple embedding providers using LangChain:
- Azure OpenAI
- Sentence Transformers (local, free)
- Cohere (via Azure Foundry)
"""
from typing import List, Optional
import os
from dataclasses import dataclass
# LangChain embeddings
from langchain_openai import AzureOpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
# Config
from dotenv import load_dotenv
load_dotenv()
@dataclass
class Embedding:
    """Container for an embedding vector plus its provenance.

    Attributes:
        vector: The embedding values produced by the provider.
        text: The original text that was embedded.
        model: Name of the model that produced the vector.
        dimension: Length of ``vector``.
        metadata: Optional free-form metadata (e.g. provider name).
            ``None`` is accepted and normalized to an empty dict.
    """
    vector: List[float]
    text: str
    model: str
    dimension: int
    # Annotated Optional (was plain `dict`): None is a valid default here.
    # A None default + __post_init__ is the idiomatic way to avoid the
    # shared-mutable-default pitfall of `metadata: dict = {}`.
    metadata: Optional[dict] = None

    def __post_init__(self):
        # Give every instance its own dict so callers can mutate freely.
        if self.metadata is None:
            self.metadata = {}
class Embedder:
    """Universal embedder supporting multiple providers via LangChain.

    Wraps either Azure OpenAI embeddings or a local sentence-transformers
    model behind a single interface that returns :class:`Embedding` objects.
    """

    def __init__(
        self,
        provider: str = "sentence-transformers",
        model_name: str = "all-MiniLM-L6-v2",
        azure_endpoint: Optional[str] = None,
        azure_api_key: Optional[str] = None,
        azure_deployment: Optional[str] = None,
    ):
        """
        Initialize embedder

        Args:
            provider: 'azure-openai' or 'sentence-transformers'
            model_name: Model name/deployment name
            azure_endpoint: Azure OpenAI endpoint (if using Azure)
            azure_api_key: Azure API key (if using Azure)
            azure_deployment: Azure deployment name (if using Azure)

        Raises:
            ValueError: If ``provider`` is not one of the supported values.
        """
        self.provider = provider
        self.model_name = model_name
        if provider == "azure-openai":
            # Use LangChain's AzureOpenAIEmbeddings.
            # Explicit arguments take precedence over environment variables.
            self.embeddings = AzureOpenAIEmbeddings(
                azure_endpoint=azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT"),
                api_key=azure_api_key or os.getenv("AZURE_OPENAI_API_KEY"),
                azure_deployment=azure_deployment or model_name,
                api_version="2024-02-01",
            )
            # Known OpenAI embedding sizes: text-embedding-3-large -> 3072;
            # ada-002 / 3-small -> 1536. Heuristic keyed on "large" in the name.
            self.dimension = 3072 if "large" in model_name.lower() else 1536
        elif provider == "sentence-transformers":
            # Use LangChain's HuggingFaceEmbeddings (wraps sentence-transformers).
            # normalize_embeddings=True yields unit vectors, so dot product
            # equals cosine similarity for this provider.
            self.embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu', 'trust_remote_code': False},
                encode_kwargs={'normalize_embeddings': True},
            )
            # Probe the model once to discover its output dimension.
            self.dimension = self._get_embedding_dimension()
        else:
            raise ValueError(f"Unsupported provider: {provider}")

    def _get_embedding_dimension(self) -> int:
        """Get embedding dimension by testing with a sample text.

        Falls back to a table of known model dimensions (default 768) if
        the probe fails, e.g. because the model could not be loaded.
        """
        try:
            sample = self.embeddings.embed_query("test")
            return len(sample)
        except Exception:
            # Was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit; Exception is the correct net here.
            dim_map = {
                "all-MiniLM-L6-v2": 384,
                "all-MiniLM-L12-v2": 384,
                "all-mpnet-base-v2": 768,
            }
            return dim_map.get(self.model_name, 768)

    def embed(self, text: str) -> Embedding:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding object
        """
        vector = self.embeddings.embed_query(text)
        return Embedding(
            vector=vector,
            text=text,
            model=self.model_name,
            # Use the actual vector length rather than self.dimension, so the
            # recorded dimension is always truthful even if the heuristic above
            # guessed wrong.
            dimension=len(vector),
            metadata={'provider': self.provider},
        )

    def embed_batch(self, texts: List[str]) -> List[Embedding]:
        """
        Generate embeddings for multiple texts

        Args:
            texts: List of texts to embed

        Returns:
            List of Embedding objects, in the same order as ``texts``
        """
        # LangChain handles batching automatically via embed_documents.
        vectors = self.embeddings.embed_documents(texts)
        return [
            Embedding(
                vector=vector,
                text=text,
                model=self.model_name,
                dimension=len(vector),
                metadata={'provider': self.provider},
            )
            for text, vector in zip(texts, vectors)
        ]

    def similarity(self, text1: str, text2: str) -> float:
        """
        Calculate cosine similarity between two texts

        Args:
            text1: First text
            text2: Second text

        Returns:
            Similarity score (0-1 for non-negative-correlated vectors;
            cosine similarity can in general range over [-1, 1])
        """
        # Local import keeps numpy optional for callers that never use this.
        import numpy as np
        vec1 = np.array(self.embed(text1).vector)
        vec2 = np.array(self.embed(text2).vector)
        # Cosine similarity: dot(a, b) / (|a| * |b|)
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        return float(dot_product / (norm1 * norm2))
# ============================================================================
# USAGE EXAMPLE
# ============================================================================
# Smoke-test script: exercises the local provider unconditionally, then the
# Azure provider only if AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME is set.
# NOTE(review): indentation below was reconstructed from a whitespace-mangled
# paste — verify against the original file.
if __name__ == "__main__":
    print("πŸ”’ Embedder Test (LangChain-based)")
    print("=" * 80)
    # Test sentence-transformers (free, local)
    print("\nπŸ“ Testing local embeddings (sentence-transformers)...")
    local_embedder = Embedder(
        provider="sentence-transformers",
        model_name="all-MiniLM-L6-v2"
    )
    test_texts = [
        "RAG systems combine retrieval and generation.",
        "Vector databases store embeddings efficiently.",
        "Machine learning models require training data.",
    ]
    print(f" Provider: {local_embedder.provider}")
    print(f" Model: {local_embedder.model_name}")
    print(f" Dimension: {local_embedder.dimension}")
    # Single embedding
    print("\n1️⃣ Single embedding test...")
    emb1 = local_embedder.embed(test_texts[0])
    print(f" βœ… Generated embedding: {emb1.dimension} dimensions")
    print(f" πŸ“ Text: '{emb1.text}'")
    print(f" πŸ”’ Vector preview: [{emb1.vector[0]:.4f}, {emb1.vector[1]:.4f}, {emb1.vector[2]:.4f}, ...]")
    # Batch embeddings
    print("\n2️⃣ Batch embedding test...")
    embeddings = local_embedder.embed_batch(test_texts)
    print(f" βœ… Generated {len(embeddings)} embeddings")
    for i, emb in enumerate(embeddings):
        print(f" Text {i+1}: {emb.dimension}D - '{emb.text[:40]}...'")
    # Similarity: texts 1 & 2 share a topic, so their score should exceed 1 & 3.
    print("\n3️⃣ Similarity test...")
    sim_1_2 = local_embedder.similarity(test_texts[0], test_texts[1])
    sim_1_3 = local_embedder.similarity(test_texts[0], test_texts[2])
    print(f" Text 1 vs Text 2: {sim_1_2:.4f} (related: RAG & vector DBs)")
    print(f" Text 1 vs Text 3: {sim_1_3:.4f} (less related: RAG & ML training)")
    print(f" βœ… Similarity working (higher score = more similar)")
    # Test Azure OpenAI (if configured)
    print("\n" + "=" * 80)
    print("4️⃣ Azure OpenAI Embeddings Test")
    print("-" * 80)
    # Check if you have an embedding deployment; skip the Azure test otherwise.
    azure_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
    if azure_embedding_deployment:
        # Any failure (bad credentials, network, quota) is reported, not raised,
        # so the local-provider results above still stand.
        try:
            azure_embedder = Embedder(
                provider="azure-openai",
                model_name=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME"),
                azure_endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
                azure_api_key=os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY"),
                azure_deployment=azure_embedding_deployment
            )
            print(f" Provider: {azure_embedder.provider}")
            print(f" Model: {azure_embedder.model_name}")
            print(f" Dimension: {azure_embedder.dimension}")
            # Test single embedding
            test_text = "Azure AI Foundry provides embedding models"
            azure_emb = azure_embedder.embed(test_text)
            print(f"\n βœ… Single embedding: {len(azure_emb.vector)} dimensions")
            print(f" πŸ“ Text: '{test_text}'")
            print(f" πŸ”’ Vector preview: [{azure_emb.vector[0]:.4f}, {azure_emb.vector[1]:.4f}, ...]")
            # Test batch embeddings
            batch_texts = [
                "RAG systems combine retrieval and generation",
                "Vector databases store embeddings",
                "Azure OpenAI provides powerful models"
            ]
            azure_batch = azure_embedder.embed_batch(batch_texts)
            print(f"\n βœ… Batch embeddings: {len(azure_batch)} embeddings generated")
            # Test similarity
            sim = azure_embedder.similarity(batch_texts[0], batch_texts[1])
            print(f"\n πŸ“Š Similarity test: {sim:.4f}")
            print(f" βœ… Azure OpenAI embeddings fully working!")
        except Exception as e:
            # Truncate the error: SDK messages can be very long.
            print(f" ❌ Azure embedding failed: {str(e)[:100]}")
    else:
        print(f" ⚠️ No Azure embedding deployment configured")
        print(f" πŸ’‘ Using FREE local embeddings for all pipelines (recommended for dev)")
    print("\n" + "=" * 80)
    print("βœ… Embedder test complete!")