Spaces:

HOLOKIATEAM
/

RAG_APP

Sleeping

App Files Files Community

RAG_APP / src /youtube_embd /index.py

sxid003

Upload 83 files

3107242 verified 9 months ago

raw

history blame contribute delete

1.78 kB

	import os
	import faiss
	import pandas as pd
	import numpy as np
	import pickle
	from src.configs.config import (
	PROCESSED_CHUNKS_CSV,
	FAISS_INDEX_FILE_YT,
	FAISS_METADATA_FILE_YT
	)

	def run_indexing():
	    """Build a FAISS index from the embedded chunks and persist it to disk.

	    Loads the pickled DataFrame stored next to ``PROCESSED_CHUNKS_CSV`` under
	    the name ``<stem>_embedded.pkl``, stacks its ``embedding`` column into a
	    float32 matrix, adds it to a flat inner-product index, and writes:

	    * the index to ``FAISS_INDEX_FILE_YT``;
	    * the remaining columns (``embedding`` dropped, row index reset) as a
	      pickle to ``FAISS_METADATA_FILE_YT``.

	    Raises:
	        ValueError: if the ``embedding`` column is missing or the DataFrame
	            contains no rows.
	        TypeError: if the first embedding is neither a list nor a numpy array.
	    """
	    print("-> Chargement du fichier de chunks avec embeddings...")
	    embedded_path = PROCESSED_CHUNKS_CSV.with_name(
	        PROCESSED_CHUNKS_CSV.stem + "_embedded.pkl"
	    )
	    # NOTE: unpickling can execute arbitrary code; only load files that were
	    # produced by this project's own pipeline.
	    df = pd.read_pickle(embedded_path)

	    if "embedding" not in df.columns:
	        raise ValueError("La colonne 'embedding' est absente du fichier.")

	    # Guard against an empty frame: `.iloc[0]` below would otherwise fail
	    # with an IndexError that says nothing about the actual cause.
	    if df.empty:
	        raise ValueError("Le fichier ne contient aucun chunk à indexer.")

	    # Check embedding type (only the first row is inspected)
	    first_embedding = df["embedding"].iloc[0]
	    if not isinstance(first_embedding, (list, np.ndarray)):
	        raise TypeError("Les embeddings doivent être des listes ou des vecteurs numpy.")

	    print(f"-> Nombre total de chunks à indexer : {len(df)}")
	    dim = len(first_embedding)
	    print(f"-> Dimension des vecteurs : {dim}")

	    # Build FAISS index. Inner product equals cosine similarity only when the
	    # stored embeddings are already unit-normalized; vectors are added as-is
	    # here, so use IndexFlatL2 instead if they are NOT normalized.
	    index = faiss.IndexFlatIP(dim)

	    # FAISS expects a 2-D float32 matrix with one vector per row.
	    vectors = np.vstack(df["embedding"].values).astype("float32")
	    index.add(vectors)

	    # Ensure both output directories exist before writing anything, so a
	    # missing metadata directory cannot fail after the index is on disk.
	    for target in (FAISS_INDEX_FILE_YT, FAISS_METADATA_FILE_YT):
	        parent_dir = os.path.dirname(target)
	        if parent_dir:
	            os.makedirs(parent_dir, exist_ok=True)

	    # Save index
	    faiss.write_index(index, str(FAISS_INDEX_FILE_YT))
	    print(f">> Index FAISS sauvegardé: {FAISS_INDEX_FILE_YT}")
	    print(f"-> Nombre de vecteurs indexés : {index.ntotal}")

	    # Save metadata (without embedding column); the reset row index keeps
	    # metadata positions aligned with the insertion order of the vectors.
	    metadata = df.drop(columns=["embedding"]).reset_index(drop=True)
	    with open(FAISS_METADATA_FILE_YT, "wb") as f:
	        pickle.dump(metadata, f)

	    print(f">> Métadonnées sauvegardées: {FAISS_METADATA_FILE_YT}")