"""Clean the raw CEFR-labelled CSV files and merge them into one processed CSV."""

import re
from pathlib import Path

import pandas as pd

RAW = Path("ml/data/raw")
OUT = Path("ml/data/processed")
# Created at import time so that writing the processed file never fails on a
# missing directory.
OUT.mkdir(parents=True, exist_ok=True)

# Labels accepted in the ``cefr_level`` column; rows with anything else are dropped.
LEVELS = {"A1", "A2", "B1", "B2", "C1", "C2"}

# Default upper bound (in characters) on the cleaned text length.
MAX_TEXT_LEN = 400

# Compiled once: used for every row of every input file.
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(t: str) -> str:
    """Return *t* stripped, with every whitespace run collapsed to one space.

    Non-string values (e.g. the NaN pandas produces for an empty CSV cell)
    yield an empty string, so callers can filter them out by length.
    """
    if not isinstance(t, str):
        return ""
    return _WHITESPACE_RE.sub(" ", t.strip())


def load_and_clean(path: Path, *, max_len: int = MAX_TEXT_LEN) -> pd.DataFrame:
    """Load one raw CSV and return only its usable ``text`` / ``cefr_level`` rows.

    Args:
        path: CSV file containing at least the columns ``text`` and
            ``cefr_level``; any other columns are discarded.
        max_len: Longest cleaned text (in characters) to keep.

    Returns:
        A DataFrame with the columns ``text`` and ``cefr_level`` in which every
        text is non-empty, at most ``max_len`` characters long, and every label
        is a member of ``LEVELS``. The index of the surviving rows is not reset.

    Raises:
        KeyError: If ``text`` or ``cefr_level`` is missing from the file.
    """
    df = pd.read_csv(path)
    df = df[["text", "cefr_level"]].copy()

    df["text"] = df["text"].apply(clean_text)
    # Normalise labels so that values such as " b1 " are recognised instead of
    # being silently dropped by the membership test below.
    df["cefr_level"] = df["cefr_level"].astype(str).str.strip().str.upper()

    lengths = df["text"].str.len()
    keep = (
        (lengths > 0)
        # Cap extreme lengths (optional filter).
        & (lengths <= max_len)
        & df["cefr_level"].isin(LEVELS)
    )
    return df[keep]


def main() -> None:
    """Clean both raw datasets, merge them, and write the processed CSV."""
    sp = load_and_clean(RAW / "cefr_sp_en_raw.csv")
    asag = load_and_clean(RAW / "cefr_asag_en_raw.csv")

    # Only rows identical in both text and label are treated as duplicates.
    df = pd.concat([sp, asag], ignore_index=True).drop_duplicates()

    out_path = OUT / "cefr_en_processed.csv"
    df.to_csv(out_path, index=False, encoding="utf-8")

    print("Saved:", out_path, "rows=", len(df))
    print(df["cefr_level"].value_counts())


if __name__ == "__main__":
    main()