Spaces:
Sleeping
Sleeping
| """ | |
| Vector embedding utilities for the AI Learning Path Generator. | |
| Handles text vectorization for semantic search. | |
| """ | |
| from typing import List, Dict, Any, Optional, Union | |
| import numpy as np | |
| # Import from langchain (old version compatible with Pydantic v1) | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.schema import Document | |
| from src.utils.config import OPENAI_API_KEY, EMBEDDING_MODEL | |
class EmbeddingService:
    """
    Service for generating and managing text embeddings.

    Prefers free local HuggingFace embeddings when the package is
    installed, and falls back to OpenAI embeddings when an API key is
    configured.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the embedding service.

        Args:
            api_key: Optional OpenAI API key; falls back to the
                OPENAI_API_KEY config value when not provided.

        Raises:
            ValueError: If HuggingFace embeddings are unavailable and no
                OpenAI API key is configured.
        """
        self.api_key = api_key or OPENAI_API_KEY
        # Try to use free HuggingFace embeddings first, fallback to OpenAI
        try:
            from langchain.embeddings import HuggingFaceEmbeddings
            self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            print("✅ EmbeddingService using free HuggingFace embeddings")
        except ImportError:
            # NOTE(review): only a missing package triggers the fallback;
            # a failed model download would still propagate — confirm
            # that is the intended behavior.
            if self.api_key:
                from langchain.embeddings import OpenAIEmbeddings
                self.embeddings = OpenAIEmbeddings(
                    api_key=self.api_key,
                    model=EMBEDDING_MODEL
                )
                print("✅ EmbeddingService using OpenAI embeddings")
            else:
                raise ValueError("HuggingFace embeddings not available and no OpenAI API key provided")
        # Text splitter that chunks documents into ~1000-char windows
        # with 100 chars of overlap so context is preserved across chunks.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            length_function=len,
        )

    def embed_text(self, text: str) -> List[float]:
        """
        Generate an embedding vector for a single text string.

        Args:
            text: The text to embed.

        Returns:
            Embedding vector as a list of floats.

        Raises:
            ValueError: If the underlying embedding backend fails.
        """
        try:
            return self.embeddings.embed_query(text)
        except Exception as e:
            # Chain the original exception so the root cause is preserved.
            raise ValueError(f"Failed to generate embedding: {str(e)}") from e

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for multiple texts in one backend call.

        Args:
            texts: List of texts to embed.

        Returns:
            List of embedding vectors, one per input text.

        Raises:
            ValueError: If the underlying embedding backend fails.
        """
        try:
            return self.embeddings.embed_documents(texts)
        except Exception as e:
            # Chain the original exception so the root cause is preserved.
            raise ValueError(f"Failed to generate document embeddings: {str(e)}") from e

    def chunk_text(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List["Document"]:
        """
        Split text into overlapping chunks suitable for embedding.

        Args:
            text: The text to split.
            metadata: Optional metadata copied onto each chunk.

        Returns:
            List of Document objects holding the text chunks.
        """
        # Wrap the raw text in a Document so the splitter propagates
        # the metadata to every resulting chunk.
        doc = Document(page_content=text, metadata=metadata or {})
        return self.text_splitter.split_documents([doc])

    def calculate_similarity(
        self,
        embedding1: List[float],
        embedding2: List[float]
    ) -> float:
        """
        Calculate cosine similarity between two embeddings.

        Args:
            embedding1: First embedding vector.
            embedding2: Second embedding vector.

        Returns:
            Cosine similarity in [-1, 1] (1 means identical direction).
            Returns 0.0 if either vector has zero magnitude.
        """
        vec1 = np.asarray(embedding1, dtype=float)
        vec2 = np.asarray(embedding2, dtype=float)
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            # Zero vectors have no direction; define similarity as 0.0.
            return 0.0
        # Cast the numpy scalar to a plain Python float per the signature.
        return float(dot_product / (norm1 * norm2))