# RAG_APP/src/youtube_embd/preprocess.py
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer
from src.configs.config import RAW_CSV, PROCESSED_CHUNKS_CSV, MAX_TOKENS
# Optional: suppress token length warnings from Transformers
import transformers
transformers.logging.set_verbosity_error()
# Load tokenizer from your embedding model
# NOTE(review): module-level load runs at import time (may download the model);
# presumably must match the embedding model used downstream — confirm.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
def split_into_chunks(text, max_tokens, overlap_ratio=0.2):
    """Split *text* into overlapping token chunks of at most *max_tokens* tokens.

    Args:
        text: Raw text; tokenized with the module-level ``tokenizer``.
        max_tokens: Maximum number of tokens per chunk.
        overlap_ratio: Fraction of each chunk shared with the next one
            (0.2 -> consecutive windows advance by ~80% of ``max_tokens``).

    Returns:
        List of chunk strings; empty list when the text yields no tokens.
    """
    tokens = tokenizer.tokenize(text)
    if not tokens:
        return []
    # Guard against a zero step (e.g. max_tokens == 1, or overlap_ratio ~ 1.0):
    # range(0, n, 0) raises ValueError in the original code.
    step = max(1, int(max_tokens * (1 - overlap_ratio)))
    chunks = []
    for start in range(0, len(tokens), step):
        chunk_tokens = tokens[start:start + max_tokens]
        if not chunk_tokens:
            continue
        chunks.append(tokenizer.convert_tokens_to_string(chunk_tokens))
        # Stop once a chunk reaches the end of the token list, otherwise the
        # overlap would emit redundant tail chunks covering the same tokens.
        if start + max_tokens >= len(tokens):
            break
    return chunks
def run_preprocessing():
    """Read the raw subtitles CSV, split each subtitle into token chunks,
    and write the flattened chunk table to ``PROCESSED_CHUNKS_CSV``.

    Each output row carries the source video metadata plus a ``chunk_id``
    of the form ``"<video_id>_<chunk_index>"``.
    """
    print("-> Prétraitement des sous-titres...")
    df = pd.read_csv(RAW_CSV)
    # Drop rows missing any field required to build a chunk record.
    # "id" is included so a missing id never yields a "nan_<i>" chunk_id.
    df = df.dropna(subset=["id", "sous-titre", "titre", "date"])
    all_chunks = []
    print(f"-> Lecture du dataset: {RAW_CSV}")
    for _, row in df.iterrows():
        texte = str(row["sous-titre"]).strip()
        if not texte:
            continue
        chunks = split_into_chunks(texte, MAX_TOKENS, overlap_ratio=0.2)
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "id_video": row["id"],
                "titre": row["titre"],
                "date": row["date"],
                # Default language follows the dataset's convention ("ar").
                "langue": row.get("langue", "ar"),
                "chunk_id": f"{row['id']}_{i}",
                "texte": chunk,
                "lien": row.get("lien", "")
            })
    chunk_df = pd.DataFrame(all_chunks)
    # Ensure the output directory exists before writing.
    output_path = Path(PROCESSED_CHUNKS_CSV)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # utf-8-sig adds a BOM so spreadsheet tools detect the encoding correctly.
    chunk_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f">> Chunks enregistrés dans {output_path}")