# sxid003's picture
# Upload 83 files
# 3107242 verified
import pandas as pd
from sentence_transformers import SentenceTransformer
from src.configs.config import PROCESSED_CHUNKS_CSV, EMBEDDING_MODEL, BATCH_SIZE
def run_embedding():
    """Embed every text chunk of the processed CSV and pickle the result.

    Steps:
      1. Read ``PROCESSED_CHUNKS_CSV`` into a DataFrame.
      2. Drop rows whose ``texte`` value is missing or whitespace-only.
      3. Encode the remaining texts with the ``EMBEDDING_MODEL``
         sentence-transformer, in batches of ``BATCH_SIZE``.
      4. Store each vector as a plain list in a new ``embedding`` column.
      5. Write the DataFrame to ``<csv stem>_embedded.pkl`` in the same
         directory as the input CSV.

    Progress messages are printed to stdout. Returns ``None``.

    Note: ``PROCESSED_CHUNKS_CSV`` must expose ``.stem`` and ``.with_name``
    (i.e. behave like a ``pathlib.Path``), since the output path is derived
    from those attributes.
    """
    print(f"-> Lecture du dataset: {PROCESSED_CHUNKS_CSV}")
    df = pd.read_csv(PROCESSED_CHUNKS_CSV)

    # Filter invalid rows (missing or whitespace-only chunks).
    df = df.dropna(subset=["texte"])
    # .copy() detaches the filtered frame from its parent so that adding the
    # "embedding" column below is an unambiguous write and does not trigger
    # pandas' SettingWithCopyWarning.
    df = df[df["texte"].str.strip() != ""].copy()
    print(f"-> {len(df)} chunks prêts pour l'encodage.")

    # Load embedding model.
    print(f"-> Chargement du modèle d'embedding: {EMBEDDING_MODEL}")
    model = SentenceTransformer(EMBEDDING_MODEL)

    # Encode chunks.
    print("-> Génération des embeddings...")
    embeddings = model.encode(
        df["texte"].tolist(),
        batch_size=BATCH_SIZE,
        show_progress_bar=True,
        convert_to_numpy=True,
        # Unit-length vectors make a dot product equal to cosine similarity.
        normalize_embeddings=True,
    )

    # One Python list per row: a list of lists is assigned row by row, which
    # keeps each vector in a single cell of the "embedding" column.
    df["embedding"] = embeddings.tolist()

    # Save as pickle next to the input CSV.
    output_path = PROCESSED_CHUNKS_CSV.with_name(
        PROCESSED_CHUNKS_CSV.stem + "_embedded.pkl"
    )
    df.to_pickle(output_path)
    print(f">> Embeddings sauvegardés dans: {output_path}")