"""Clean the raw BBC text dataset and write a processed CSV copy."""

import os
import re

import pandas as pd

# Compiled once at import time so the per-row cleaning does not repeat the
# pattern lookup for every document.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: object) -> str:
    """Return a normalized, lowercase, alphanumeric-only version of *text*.

    The value is coerced to ``str``, lowercased, stripped of every character
    that is not an ASCII letter, digit, or whitespace, and then has runs of
    whitespace collapsed to a single space with the ends trimmed.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``""`` when *text* is missing.
    """
    # A missing cell would otherwise be stringified into the literal token
    # "nan" / "none" and leak into the cleaned corpus as a real word.
    # ``text != text`` is the dependency-free NaN test.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    alnum_only = _NON_ALNUM_RE.sub("", lowered)
    return _WHITESPACE_RE.sub(" ", alnum_only).strip()


def preprocess_data(
    input_path: str = "data/raw/bbc-text.csv",
    output_path: str = "data/processed/processed_bbc.csv",
) -> None:
    """Read a CSV, add a ``clean_text`` column, and write the result to disk.

    Args:
        input_path: Path of the CSV file to read. It must contain a ``text``
            column.
        output_path: Path of the CSV file to write. Missing parent
            directories are created.

    Raises:
        KeyError: If the input file has no ``text`` column.
    """
    df = pd.read_csv(input_path)
    if "text" not in df.columns:
        raise KeyError(
            f"'text' column not found in {input_path!r}; "
            f"available columns: {list(df.columns)}"
        )

    df["clean_text"] = df["text"].apply(clean_text)

    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)

    print("Preprocessing completed!")
    print(df.head())


if __name__ == "__main__":
    preprocess_data()