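"""Generate sentence-transformer embeddings for previously saved document chunks.

Loads chunks from CHUNKS_FILE, embeds their text in batches with the configured
EMBEDDING_MODEL, and saves the resulting matrix to EMBEDDINGS_FILE for later indexing.
"""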
from sentence_transformers import SentenceTransformer
from src.utils.helpers import load_chunks_from_disk, save_embeddings
from src.configs.config import CHUNKS_FILE, EMBEDDINGS_FILE, EMBEDDING_MODEL, BATCH_SIZE, LOG_DIR
import numpy as np
import logging
import os

LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")
os.makedirs(LOG_DIR, exist_ok=True)  # make sure the log directory exists before logging to a file
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

def generate_embeddings():
    """Generate embeddings for all chunks."""
    model = SentenceTransformer(EMBEDDING_MODEL)
    chunks_data = load_chunks_from_disk(CHUNKS_FILE)
    
    # Flatten each item's "chunks" list into a single flat list of chunk dicts,
    # so we do not end up with a list of lists.
    all_chunks = [chunk for item in chunks_data for chunk in item["chunks"]]

    if not all_chunks:
        logging.warning("No chunks to embed")
        return np.array([]), []
    
    # Embed the chunk texts in batches of BATCH_SIZE.
    embeddings = []
    for i in range(0, len(all_chunks), BATCH_SIZE):
        batch = all_chunks[i:i + BATCH_SIZE]
        batch_texts = [chunk["text"] for chunk in batch]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=False, show_progress_bar=True)
        embeddings.extend(batch_embeddings)
        logging.info(f"Embedded batch {i // BATCH_SIZE + 1}")

    
    embeddings = np.array(embeddings)
    save_embeddings(embeddings, EMBEDDINGS_FILE)#the saved file is in .gitignore but gonna be used in indexing
    return embeddings, all_chunks
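

# A minimal usage sketch, assuming this module is run directly as a script;
# it only uses names defined above and assumes nothing else about the project.
if __name__ == "__main__":
    embeddings, chunks = generate_embeddings()
    logging.info(f"Generated {len(embeddings)} embeddings for {len(chunks)} chunks")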