from sentence_transformers import SentenceTransformer
from src.utils.helpers import load_chunks_from_disk, save_embeddings
from src.configs.config import CHUNKS_FILE, EMBEDDINGS_FILE, EMBEDDING_MODEL, BATCH_SIZE, LOG_DIR
import numpy as np
import logging
import os
# Path for this step's dedicated log file.
# NOTE(review): currently unused — the active basicConfig below logs to the
# console; pass filename=LOG_FILE to it if file logging is intended.
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")

# Console logging for the whole module (the previous file-based configuration
# existed only as dead commented-out code and has been removed).
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
def generate_embeddings():
    """Generate embeddings for all chunks loaded from disk and persist them.

    Loads chunk records from CHUNKS_FILE, flattens each record's "chunks"
    list into one flat list, encodes the chunks' "text" fields in batches
    of BATCH_SIZE, saves the stacked embeddings to EMBEDDINGS_FILE, and
    returns them alongside the flat chunk list.

    Returns:
        tuple: (embeddings, all_chunks) where embeddings is a 2-D
        ``np.ndarray`` (empty array when there are no chunks) and
        all_chunks is the flat list of chunk dicts in embedding order.
    """
    model = SentenceTransformer(EMBEDDING_MODEL)
    chunks_data = load_chunks_from_disk(CHUNKS_FILE)
    # Flatten per-document chunk lists into one flat list so batching is
    # uniform (avoids a list of lists).
    all_chunks = [chunk for item in chunks_data for chunk in item["chunks"]]
    if not all_chunks:
        logging.warning("No chunks to embed")
        return np.array([]), []
    # Encode in fixed-size batches to bound memory use.
    embeddings = []
    for start in range(0, len(all_chunks), BATCH_SIZE):
        batch = all_chunks[start:start + BATCH_SIZE]
        batch_texts = [chunk["text"] for chunk in batch]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=False, show_progress_bar=True)
        embeddings.extend(batch_embeddings)
        # Lazy %-style args: the message is only formatted if emitted.
        logging.info("Embedded batch %d", start // BATCH_SIZE + 1)
    embeddings = np.array(embeddings)
    # The saved file is git-ignored but consumed later by the indexing step.
    save_embeddings(embeddings, EMBEDDINGS_FILE)
    return embeddings, all_chunks