# Hugging Face Space status banner ("Spaces: Sleeping") — scrape artifact, not code.
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer
from src.configs.config import RAW_CSV, PROCESSED_CHUNKS_CSV, MAX_TOKENS

# Optional: suppress token length warnings from Transformers
import transformers

transformers.logging.set_verbosity_error()

# Load tokenizer from your embedding model (multilingual MPNet sentence encoder).
# NOTE(review): this runs at import time, so importing the module triggers a
# model download/load — consider lazy initialization if import cost matters.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
def split_into_chunks(text, max_tokens, overlap_ratio=0.2, tok=None):
    """Split *text* into overlapping token-window chunks.

    Parameters
    ----------
    text : str
        Raw text to split.
    max_tokens : int
        Maximum number of tokens per chunk.
    overlap_ratio : float, optional
        Fraction of ``max_tokens`` shared between consecutive chunks
        (default 0.2, i.e. 20% overlap).
    tok : optional
        Tokenizer exposing ``tokenize`` and ``convert_tokens_to_string``;
        defaults to the module-level ``tokenizer``.

    Returns
    -------
    list[str]
        Chunk texts; empty list when *text* tokenizes to nothing.
    """
    if tok is None:
        tok = tokenizer
    tokens = tok.tokenize(text)
    if not tokens:
        return []
    # Stride between window starts. Guard against 0 (small max_tokens or
    # overlap_ratio close to 1), which would make range() raise ValueError.
    step = max(1, int(max_tokens * (1 - overlap_ratio)))
    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start:start + max_tokens]
        if not window:
            continue
        chunks.append(tok.convert_tokens_to_string(window))
        # Stop once the window reaches the end of the token stream so later,
        # fully-overlapped starts do not re-emit the final partial window.
        if start + max_tokens >= len(tokens):
            break
    return chunks
def run_preprocessing():
    """Read the raw subtitle CSV, split each subtitle into token chunks,
    and write one row per chunk to ``PROCESSED_CHUNKS_CSV``.

    Side effects: reads ``RAW_CSV``, creates the output directory if
    needed, and writes the chunk CSV (UTF-8 with BOM for Excel).
    """
    print("-> Prétraitement des sous-titres...")
    df = pd.read_csv(RAW_CSV)
    # Drop rows missing any of the required fields.
    df = df.dropna(subset=["sous-titre", "titre", "date"])

    all_chunks = []
    print(f"-> Lecture du dataset: {RAW_CSV}")
    for _, row in df.iterrows():
        texte = str(row["sous-titre"]).strip()
        if not texte:
            continue
        for i, chunk in enumerate(split_into_chunks(texte, MAX_TOKENS, overlap_ratio=0.2)):
            all_chunks.append({
                "id_video": row["id"],
                "titre": row["titre"],
                "date": row["date"],
                # Default language is Arabic when the column is absent.
                "langue": row.get("langue", "ar"),
                "chunk_id": f"{row['id']}_{i}",
                "texte": chunk,
                "lien": row.get("lien", ""),
            })

    # Build the frame with explicit columns so an empty run still yields a
    # CSV with the expected header instead of a column-less file.
    columns = ["id_video", "titre", "date", "langue", "chunk_id", "texte", "lien"]
    chunk_df = pd.DataFrame(all_chunks, columns=columns)

    # Ensure output directory exists before writing.
    output_path = Path(PROCESSED_CHUNKS_CSV)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # utf-8-sig (BOM) keeps Arabic/French text readable when opened in Excel.
    chunk_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f">> Chunks enregistrés dans {output_path}")