File size: 617 Bytes
492754f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import os
import re
import pandas as pd


def clean_text(text):
    """Normalize a value into lowercase, punctuation-free, single-spaced text.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN (the value pandas produces for empty CSV
            cells) are treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Without this guard, missing values are stringified into the bogus
    # tokens "none" / "nan" and pollute the cleaned text.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Drop every character that is not an ASCII letter, a digit or whitespace.
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Collapse whitespace runs (tabs, newlines, ...) into single spaces.
    text = re.sub(r"\s+", " ", text).strip()
    return text


def preprocess_data(
    input_path="data/raw/bbc-text.csv",
    output_path="data/processed/processed_bbc.csv"
):
    """Read a CSV, add a cleaned copy of its ``text`` column, and save it.

    The input is loaded with pandas, ``clean_text`` is applied to every value
    of the ``text`` column, and the result is stored in a new ``clean_text``
    column. The full frame is then written to ``output_path`` without the
    index, and a short preview is printed.

    Args:
        input_path: Path of the CSV file to read. It must contain a ``text``
            column.
        output_path: Path of the CSV file to write. Missing parent
            directories are created.

    Raises:
        KeyError: If the input has no ``text`` column.
    """
    df = pd.read_csv(input_path)

    df["clean_text"] = df["text"].apply(clean_text)

    # For a bare filename such as "out.csv", dirname is "" and
    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when the path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)

    print("Preprocessing completed!")
    print(df.head())


# Run the preprocessing with its default input/output paths when this file is
# executed as a script; nothing runs when the module is merely imported.
if __name__ == "__main__":
    preprocess_data()