"""Embedding pipeline step: encode processed text chunks with a SentenceTransformer model."""
# Third-party
import pandas as pd
from sentence_transformers import SentenceTransformer

# Local
from src.configs.config import PROCESSED_CHUNKS_CSV, EMBEDDING_MODEL, BATCH_SIZE
def run_embedding():
    """Encode processed text chunks into sentence embeddings and persist them.

    Reads the processed-chunks CSV (``PROCESSED_CHUNKS_CSV``), drops rows whose
    ``texte`` column is missing or whitespace-only, encodes the remaining chunks
    with the configured SentenceTransformer model, and writes the DataFrame —
    extended with an ``embedding`` column (one list of floats per row) — to a
    pickle file ``<stem>_embedded.pkl`` next to the source CSV.

    Returns:
        None. Side effects: reads one CSV, writes one pickle, prints progress.
    """
    print(f"-> Lecture du dataset: {PROCESSED_CHUNKS_CSV}")
    df = pd.read_csv(PROCESSED_CHUNKS_CSV)

    # Filter invalid rows: missing or empty/whitespace-only chunks.
    df = df.dropna(subset=["texte"])
    df = df[df["texte"].str.strip() != ""]
    print(f"-> {len(df)} chunks prêts pour l'encodage.")

    # Guard: nothing to encode — skip loading the (heavy) model entirely.
    if df.empty:
        print(">> Aucun chunk valide à encoder.")
        return

    print(f"-> Chargement du modèle d'embedding: {EMBEDDING_MODEL}")
    model = SentenceTransformer(EMBEDDING_MODEL)

    print("-> Génération des embeddings...")
    embeddings = model.encode(
        df["texte"].tolist(),
        batch_size=BATCH_SIZE,
        show_progress_bar=True,
        convert_to_numpy=True,
        # Unit-norm vectors make dot product equal to cosine similarity.
        normalize_embeddings=True,
    )

    # Store one plain list per row so the column pickles/serializes cleanly.
    df["embedding"] = embeddings.tolist()

    # Persist next to the input CSV as <stem>_embedded.pkl.
    output_path = PROCESSED_CHUNKS_CSV.with_name(PROCESSED_CHUNKS_CSV.stem + "_embedded.pkl")
    df.to_pickle(output_path)
    print(f">> Embeddings sauvegardés dans: {output_path}")