""" Vector embedding utilities for the AI Learning Path Generator. Handles text vectorization for semantic search. """ from typing import List, Dict, Any, Optional, Union import numpy as np # Import from langchain (old version compatible with Pydantic v1) from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.schema import Document from src.utils.config import OPENAI_API_KEY, EMBEDDING_MODEL class EmbeddingService: """ Service for generating and managing text embeddings. """ def __init__(self, api_key: Optional[str] = None): """ Initialize the embedding service. Args: api_key: Optional OpenAI API key """ self.api_key = api_key or OPENAI_API_KEY # Try to use free HuggingFace embeddings first, fallback to OpenAI try: from langchain.embeddings import HuggingFaceEmbeddings self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") print("✅ EmbeddingService using free HuggingFace embeddings") except ImportError: if self.api_key: from langchain.embeddings import OpenAIEmbeddings self.embeddings = OpenAIEmbeddings( api_key=self.api_key, model=EMBEDDING_MODEL ) print("✅ EmbeddingService using OpenAI embeddings") else: raise ValueError("HuggingFace embeddings not available and no OpenAI API key provided") # Initialize text splitter for chunking self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=100, length_function=len, ) def embed_text(self, text: str) -> List[float]: """ Generate embedding vector for a text string. Args: text: The text to embed Returns: Embedding vector as a list of floats """ try: return self.embeddings.embed_query(text) except Exception as e: raise ValueError(f"Failed to generate embedding: {str(e)}") def embed_documents(self, texts: List[str]) -> List[List[float]]: """ Generate embeddings for multiple texts. Args: texts: List of texts to embed Returns: List of embedding vectors """ try: return self.embeddings.embed_documents(texts) except Exception as e: raise ValueError(f"Failed to generate document embeddings: {str(e)}") def chunk_text( self, text: str, metadata: Optional[Dict[str, Any]] = None ) -> List[Document]: """ Split text into chunks for embedding. Args: text: The text to split metadata: Optional metadata to add to each chunk Returns: List of Document objects with text chunks """ # Create a document with metadata doc = Document(page_content=text, metadata=metadata or {}) # Split into chunks chunks = self.text_splitter.split_documents([doc]) return chunks def calculate_similarity( self, embedding1: List[float], embedding2: List[float] ) -> float: """ Calculate cosine similarity between two embeddings. Args: embedding1: First embedding vector embedding2: Second embedding vector Returns: Similarity score (0-1) """ # Convert to numpy arrays vec1 = np.array(embedding1) vec2 = np.array(embedding2) # Calculate cosine similarity dot_product = np.dot(vec1, vec2) norm1 = np.linalg.norm(vec1) norm2 = np.linalg.norm(vec2) if norm1 == 0 or norm2 == 0: return 0 # Handle zero vectors return dot_product / (norm1 * norm2)