Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import pandas as pd | |
def clean_text(text):
    """Normalize a value into lowercase, punctuation-free, single-spaced text.

    The value is converted to ``str`` and lowercased, every character that is
    not an ASCII letter, a digit, or whitespace is removed, and each run of
    whitespace is collapsed to a single space with both ends trimmed.

    Missing values (``None`` or a float NaN, e.g. an empty cell in a loaded
    CSV) yield ``""`` instead of the literal words "none" / "nan".

    Args:
        text: Value to clean; non-string values are passed through ``str``.

    Returns:
        The cleaned string, possibly empty.
    """
    # NaN is the only float value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    # Disallowed characters are deleted, not replaced by a space,
    # so "don't" becomes "dont".
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    return re.sub(r"\s+", " ", stripped).strip()
def preprocess_data(
    input_path="data/raw/bbc-text.csv",
    output_path="data/processed/processed_bbc.csv",
):
    """Read a CSV, add a cleaned copy of its ``text`` column, and save it.

    The cleaned text is stored in a new ``clean_text`` column produced by
    applying ``clean_text`` to every value of ``text``. The result is written
    to ``output_path`` without the index, and a completion message plus the
    first rows of the frame are printed.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write; its parent directory is
            created if it does not exist.

    Raises:
        KeyError: If the input CSV has no ``text`` column.
    """
    df = pd.read_csv(input_path)
    df["clean_text"] = df["text"].apply(clean_text)

    # os.path.dirname() returns "" for a bare filename such as "out.csv",
    # and os.makedirs("") raises FileNotFoundError, so only create the
    # directory when the path actually contains one.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print("Preprocessing completed!")
    print(df.head())
# Script entry point: run the preprocessing step with its default paths
# only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    preprocess_data()