from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer

from src.configs.config import PROCESSED_CHUNKS_CSV, EMBEDDING_MODEL, BATCH_SIZE


def run_embedding() -> None:
    """Encode the text chunks of the processed CSV and pickle the result.

    Reads ``PROCESSED_CHUNKS_CSV``, discards rows whose ``texte`` column is
    missing or whitespace-only, encodes the remaining texts with the
    ``EMBEDDING_MODEL`` sentence-transformer (in batches of ``BATCH_SIZE``),
    stores each vector as a list in a new ``embedding`` column, and writes the
    DataFrame next to the input file as ``<input stem>_embedded.pkl``.

    Raises:
        KeyError: If the CSV has no ``texte`` column (raised by pandas).
    """
    print(f"-> Lecture du dataset: {PROCESSED_CHUNKS_CSV}")
    df = pd.read_csv(PROCESSED_CHUNKS_CSV)

    # Filter invalid rows (missing or empty chunks).
    df = df.dropna(subset=["texte"])
    # Take an explicit copy of the filtered frame: the "embedding" column is
    # assigned below, and assigning onto a boolean-indexed slice is what
    # triggers pandas' SettingWithCopyWarning.
    df = df[df["texte"].str.strip() != ""].copy()
    print(f"-> {len(df)} chunks prêts pour l'encodage.")

    # Load embedding model.
    print(f"-> Chargement du modèle d'embedding: {EMBEDDING_MODEL}")
    model = SentenceTransformer(EMBEDDING_MODEL)

    # Encode chunks.
    print("-> Génération des embeddings...")
    embeddings = model.encode(
        df["texte"].tolist(),
        batch_size=BATCH_SIZE,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,  # Optional: good for cosine similarity
    )

    # One plain Python list per row, so each cell holds a whole vector.
    df["embedding"] = embeddings.tolist()

    # Save as pickle alongside the input CSV. Path() is a no-op when the
    # config value is already a Path and lets a plain string path work too.
    source_path = Path(PROCESSED_CHUNKS_CSV)
    output_path = source_path.with_name(source_path.stem + "_embedded.pkl")
    df.to_pickle(output_path)
    print(f">> Embeddings sauvegardés dans: {output_path}")