Spaces:
Sleeping
Sleeping
| from sentence_transformers import SentenceTransformer | |
| from src.utils.helpers import load_chunks_from_disk, save_embeddings | |
| from src.configs.config import CHUNKS_FILE, EMBEDDINGS_FILE, EMBEDDING_MODEL, BATCH_SIZE, LOG_DIR | |
| import numpy as np | |
| import logging | |
| import os | |
# Log file target for this preprocessing stage (kept for the file-based
# config below if/when it is re-enabled; basicConfig currently logs to stderr).
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
def generate_embeddings():
    """Generate embeddings for all chunks and persist them to disk.

    Loads chunked documents from CHUNKS_FILE, flattens them into one flat
    list of chunk dicts, encodes each chunk's "text" field in batches of
    BATCH_SIZE with the configured SentenceTransformer model, saves the
    resulting matrix to EMBEDDINGS_FILE, and returns it.

    Returns:
        tuple: (embeddings, all_chunks) where embeddings is a 2-D
        np.ndarray of shape (num_chunks, dim) and all_chunks is the
        flattened list of chunk dicts. Returns (np.array([]), []) when
        there are no chunks to embed.
    """
    model = SentenceTransformer(EMBEDDING_MODEL)
    chunks_data = load_chunks_from_disk(CHUNKS_FILE)
    # Flatten into a single list of chunk dicts (not a list of lists),
    # so batching below can slice across document boundaries.
    all_chunks = [chunk for item in chunks_data for chunk in item["chunks"]]
    if not all_chunks:
        logging.warning("No chunks to embed")
        return np.array([]), []

    # Batch the encode calls to bound memory and log per-batch progress.
    embeddings = []
    for i in range(0, len(all_chunks), BATCH_SIZE):
        batch = all_chunks[i:i + BATCH_SIZE]
        batch_texts = [chunk["text"] for chunk in batch]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=False, show_progress_bar=True)
        embeddings.extend(batch_embeddings)
        # Lazy %-args: message is only formatted if the record is emitted.
        logging.info("Embedded batch %d", i // BATCH_SIZE + 1)

    embeddings = np.array(embeddings)
    # Saved file is .gitignored but consumed by the downstream indexing step.
    save_embeddings(embeddings, EMBEDDINGS_FILE)
    return embeddings, all_chunks