"""Sentence-Transformers based text embedder that always runs on the CPU."""

import os
import pickle
from typing import Any, Dict, List

import numpy as np
import torch
from sentence_transformers import SentenceTransformer

from config.config import Config

# Set environment variables to prevent device issues
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Force CPU


class Embedder:
    """Wrap a SentenceTransformer model and expose list-based embedding helpers.

    The model name is read from ``Config().EMBEDDING_MODEL`` and the model is
    always loaded with ``device="cpu"``.
    """

    def __init__(self):
        """Load the configured SentenceTransformer model on the CPU.

        Raises:
            RuntimeError: If the model cannot be loaded; the underlying
                exception is attached as ``__cause__``.
        """
        self.config = Config()

        # Always force CPU usage, even if CUDA is available
        # (Streamlit Cloud doesn't support GPU).
        device = "cpu"

        if torch.cuda.is_available():
            print("CUDA is available, but we're forcing the use of CPU.")

        try:
            print(f"Loading model: {self.config.EMBEDDING_MODEL} on {device}")
            self.model = SentenceTransformer(
                self.config.EMBEDDING_MODEL, device=device
            )
        except Exception as e:
            # Chain explicitly so the original failure stays visible in tracebacks.
            raise RuntimeError(
                f"Failed to load SentenceTransformer model: {e}"
            ) from e

        # Sentence transformers don't need fitting, but we can save/load if
        # needed. No method in this class reads this path yet.
        self.model_path = "data/processed/sentence_transformer.pkl"

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a list of texts using Sentence Transformers.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors, one per input text. An empty input
            yields an empty list without calling the model.

        Raises:
            RuntimeError: If encoding fails; the underlying exception is
                attached as ``__cause__``.
        """
        if not texts:
            return []

        try:
            # Request a numpy array so the result can be converted with tolist().
            embeddings = self.model.encode(texts, convert_to_numpy=True)
            return embeddings.tolist()
        except Exception as e:
            raise RuntimeError(f"Failed to generate embeddings: {e}") from e

    def embed_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Generate embeddings for document chunks and store them on each chunk.

        Each chunk dictionary is modified in place: its embedding is written
        under the ``"embedding"`` key.

        Args:
            chunks: List of chunk dictionaries; each must have a ``"text"`` key.

        Returns:
            The same list of chunks, with embeddings added.

        Raises:
            KeyError: If a chunk has no ``"text"`` key.
            RuntimeError: If encoding fails (propagated from ``embed_texts``).
        """
        if not chunks:
            return []

        texts = [chunk["text"] for chunk in chunks]
        embeddings = self.embed_texts(texts)

        for chunk, embedding in zip(chunks, embeddings):
            chunk["embedding"] = embedding

        return chunks

    def embed_query(self, query: str) -> List[float]:
        """Generate the embedding for a single query.

        Args:
            query: Query text.

        Returns:
            The query embedding vector, or an empty list if no embedding was
            produced.

        Raises:
            RuntimeError: If encoding fails (propagated from ``embed_texts``).
        """
        embeddings = self.embed_texts([query])
        return embeddings[0] if embeddings else []

    # Legacy methods for compatibility

    def fit_on_texts(self, texts: List[str]) -> None:
        """Do nothing; fitting is not needed for sentence transformers."""
        pass

    def save_vectorizer(self) -> None:
        """Do nothing; kept for compatibility."""
        pass

    def load_vectorizer(self) -> bool:
        """Return True unconditionally; kept for compatibility."""
        return True