File size: 2,132 Bytes
3107242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Preprocessing pipeline setup: reads raw subtitle rows, splits them into
# token-bounded overlapping chunks, and writes them out for embedding.
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer
from src.configs.config import RAW_CSV, PROCESSED_CHUNKS_CSV, MAX_TOKENS

# Optional: suppress token-length warnings from Transformers (long subtitle
# texts routinely exceed the model's max sequence length before chunking).
import transformers
transformers.logging.set_verbosity_error()

# Load the tokenizer matching the embedding model so chunk boundaries are
# measured in the same token units the encoder will see.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

def split_into_chunks(text, max_tokens, overlap_ratio=0.2):
    """Split *text* into overlapping chunks of at most *max_tokens* tokens.

    Consecutive chunks share roughly ``overlap_ratio`` of their tokens so
    that content cut at a chunk boundary still appears whole in one chunk.

    Args:
        text: Raw input string to split.
        max_tokens: Maximum number of tokens per chunk (expected >= 1).
        overlap_ratio: Fraction of each chunk shared with the next
            (expected 0 <= overlap_ratio < 1).

    Returns:
        List of chunk strings; empty list if *text* tokenizes to nothing.
    """
    tokens = tokenizer.tokenize(text)

    if not tokens:
        return []

    # BUGFIX: the stride could round down to 0 (small max_tokens, or
    # overlap_ratio close to 1), making range(0, n, 0) raise ValueError.
    # Clamp to at least 1 so the loop always advances.
    step = max(1, int(max_tokens * (1 - overlap_ratio)))
    chunks = []

    for start in range(0, len(tokens), step):
        window = tokens[start:start + max_tokens]
        if not window:
            continue
        chunks.append(tokenizer.convert_tokens_to_string(window))
        # Once a window reaches the end of the token list, stop: any further
        # windows would be pure overlap already covered by this chunk.
        if start + max_tokens >= len(tokens):
            break

    return chunks

def run_preprocessing():
    """Read the raw subtitle CSV, chunk every subtitle text, and persist the
    resulting chunk table to ``PROCESSED_CHUNKS_CSV`` (UTF-8 with BOM)."""
    print("-> Prétraitement des sous-titres...")
    df = pd.read_csv(RAW_CSV)

    # Rows missing the subtitle text, title or date are unusable downstream.
    df = df.dropna(subset=["sous-titre", "titre", "date"])

    records = []
    print(f"-> Lecture du dataset: {RAW_CSV}")
    for _, row in df.iterrows():
        subtitle_text = str(row["sous-titre"]).strip()
        if not subtitle_text:
            continue

        # One output record per chunk; chunk_id ties it back to its video.
        chunk_list = split_into_chunks(subtitle_text, MAX_TOKENS, overlap_ratio=0.2)
        for chunk_index, chunk_text in enumerate(chunk_list):
            records.append({
                "id_video": row["id"],
                "titre": row["titre"],
                "date": row["date"],
                "langue": row.get("langue", "ar"),
                "chunk_id": f"{row['id']}_{chunk_index}",
                "texte": chunk_text,
                "lien": row.get("lien", "")
            })

    # Make sure the destination directory exists before writing.
    output_path = Path(PROCESSED_CHUNKS_CSV)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # utf-8-sig keeps the BOM so Excel opens the Arabic/French text correctly.
    pd.DataFrame(records).to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f">> Chunks enregistrés dans {output_path}")