| |
| import numpy as np |
| from tqdm import tqdm |
| from sentence_transformers import SentenceTransformer |
| from utils import setup_logger |
| from config import Config |
|
|
| logger = setup_logger('embedding_generator') |
|
|
| def generate_embeddings(documents): |
| model = SentenceTransformer(Config.EMBEDDING_MODEL) |
| embeddings = [] |
| for i in tqdm(range(0, len(documents), Config.BATCH_SIZE), desc="Generating embeddings"): |
| batch = documents['content'][i:i+Config.BATCH_SIZE].tolist() |
| try: |
| batch_embeddings = model.encode(batch, show_progress_bar=False) |
| embeddings.extend(batch_embeddings) |
| except Exception as e: |
| logger.error(f"Error encoding batch: {e}") |
| return np.array(embeddings) |