import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer

from src.configs.config import RAW_CSV, PROCESSED_CHUNKS_CSV, MAX_TOKENS

# Optional: suppress token length warnings from Transformers
import transformers
transformers.logging.set_verbosity_error()

# Load the tokenizer from the embedding model so chunk boundaries match
# the token counts the model will actually see
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)


def split_into_chunks(text, max_tokens, overlap_ratio=0.2):
    """Split text into overlapping chunks of at most max_tokens tokens."""
    tokens = tokenizer.tokenize(text)
    if not tokens:
        return []

    # Advance by less than max_tokens so consecutive chunks overlap;
    # guard against a zero step when max_tokens is very small
    step = max(1, int(max_tokens * (1 - overlap_ratio)))
    chunks = []
    for i in range(0, len(tokens), step):
        chunk_tokens = tokens[i:i + max_tokens]
        if not chunk_tokens:
            continue
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
        # Stop once the current chunk reaches the end of the text
        if i + max_tokens >= len(tokens):
            break
    return chunks


def run_preprocessing():
    print("-> Preprocessing subtitles...")
    df = pd.read_csv(RAW_CSV)

    # Drop rows missing the subtitle text, title, or date
    df = df.dropna(subset=["sous-titre", "titre", "date"])

    all_chunks = []
    print(f"-> Reading dataset: {RAW_CSV}")
    for _, row in df.iterrows():
        texte = str(row["sous-titre"]).strip()
        if not texte:
            continue

        chunks = split_into_chunks(texte, MAX_TOKENS, overlap_ratio=0.2)
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "id_video": row["id"],
                "titre": row["titre"],
                "date": row["date"],
                "langue": row.get("langue", "ar"),
                "chunk_id": f"{row['id']}_{i}",
                "texte": chunk,
                "lien": row.get("lien", ""),
            })

    chunk_df = pd.DataFrame(all_chunks)

    # Ensure the output directory exists
    output_path = Path(PROCESSED_CHUNKS_CSV)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save to CSV; utf-8-sig writes a BOM so spreadsheet tools detect
    # the encoding correctly for non-Latin (e.g. Arabic) text
    chunk_df.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f">> Chunks saved to {output_path}")
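
# A minimal entry point (a sketch, not part of the original script): running
# the file directly triggers the preprocessing step, while importing it from
# a pipeline runner only exposes run_preprocessing() without side effects.
if __name__ == "__main__":
    run_preprocessing()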