from sentence_transformers import SentenceTransformer
from src.utils.helpers import load_chunks_from_disk, save_embeddings
from src.configs.config import CHUNKS_FILE, EMBEDDINGS_FILE, EMBEDDING_MODEL, BATCH_SIZE, LOG_DIR
import numpy as np
import logging
import os

# Retained for a future switch to file-based logging (the previous
# filename-based basicConfig was disabled); currently unused at runtime.
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)


def generate_embeddings():
    """Generate embeddings for all document chunks and persist them to disk.

    Loads chunk records from CHUNKS_FILE, flattens the per-document chunk
    lists into one list, embeds the chunk texts in batches of BATCH_SIZE
    with the configured SentenceTransformer model, saves the resulting
    matrix to EMBEDDINGS_FILE, and returns it together with the chunks.

    Returns:
        tuple[np.ndarray, list]: (embeddings matrix of shape
        (n_chunks, dim), flattened list of chunk dicts). Returns
        (empty array, empty list) when there are no chunks to embed.
    """
    model = SentenceTransformer(EMBEDDING_MODEL)
    chunks_data = load_chunks_from_disk(CHUNKS_FILE)

    # Flatten so we get one flat list of chunk dicts rather than a
    # list-of-lists (each record carries its chunks under "chunks").
    all_chunks = [chunk for item in chunks_data for chunk in item["chunks"]]
    if not all_chunks:
        logger.warning("No chunks to embed")
        return np.array([]), []

    # Embed in batches to bound memory use on large corpora.
    embeddings = []
    for i in range(0, len(all_chunks), BATCH_SIZE):
        batch = all_chunks[i:i + BATCH_SIZE]
        batch_texts = [chunk["text"] for chunk in batch]
        batch_embeddings = model.encode(
            batch_texts, convert_to_tensor=False, show_progress_bar=True
        )
        embeddings.extend(batch_embeddings)
        # Lazy %-style args: the message is only formatted if emitted.
        logger.info("Embedded batch %d", i // BATCH_SIZE + 1)

    embeddings = np.array(embeddings)
    # The saved file is git-ignored but consumed later by the indexing step.
    save_embeddings(embeddings, EMBEDDINGS_FILE)
    return embeddings, all_chunks