Spaces:
Sleeping
Sleeping
File size: 617 Bytes
492754f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 | import os
import re
import pandas as pd
def clean_text(text):
    """Return a normalized, lowercase, alphanumeric-only version of *text*.

    The input is converted with ``str()`` and lowercased, every character
    that is not an ASCII letter, a digit, or whitespace is dropped, and runs
    of whitespace are collapsed to a single space with the ends trimmed.
    """
    lowered = str(text).lower()
    # Removed characters are deleted outright (not replaced by a space), so
    # "well-known" becomes "wellknown".
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    collapsed = re.sub(r"\s+", " ", alnum_only)
    return collapsed.strip()
def preprocess_data(
    input_path="data/raw/bbc-text.csv",
    output_path="data/processed/processed_bbc.csv"
):
    """Clean the ``text`` column of a CSV file and save the result.

    Reads ``input_path`` with pandas, adds a ``clean_text`` column holding
    ``clean_text`` applied to every value of the ``text`` column, writes the
    frame to ``output_path`` without the index, and prints a completion
    message followed by the first rows.

    Args:
        input_path: CSV file to read; it must contain a ``text`` column.
        output_path: Destination CSV file. Its parent directory, if the path
            has one, is created when it does not exist yet.
    """
    df = pd.read_csv(input_path)
    df["clean_text"] = df["text"].apply(clean_text)

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)

    print("Preprocessing completed!")
    print(df.head())
if __name__ == "__main__":
    # Run the preprocessing step with its default input and output paths
    # when this file is executed as a script.
    preprocess_data()