Spaces:

Pearll12
/

bbc-document-classifier

Sleeping

Deploy document classifier app

492754f about 1 month ago

617 Bytes

	import os
	import re
	import pandas as pd


	def clean_text(text):
	    """Normalise a raw value into lowercase alphanumeric text.

	    The value is converted with ``str()``, lowercased, stripped of every
	    character that is not an ASCII letter, a digit or whitespace, and then
	    has each run of whitespace collapsed to a single space with leading and
	    trailing whitespace removed.

	    Missing values (``None`` or a float NaN, e.g. what pandas typically
	    yields for an empty CSV cell) return ``""`` rather than being
	    stringified into the bogus tokens ``"none"`` / ``"nan"``.

	    Args:
	        text: The value to clean. Non-string values are converted with
	            ``str()`` first.

	    Returns:
	        The cleaned string; ``""`` for missing or empty input.
	    """
	    # NaN is the only float that is not equal to itself; the isinstance
	    # guard keeps the comparison away from array-like objects.
	    if text is None or (isinstance(text, float) and text != text):
	        return ""

	    lowered = str(text).lower()
	    # Punctuation/symbols are deleted outright (not replaced by a space),
	    # so "don't" becomes "dont".
	    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
	    return re.sub(r"\s+", " ", alnum_only).strip()


	def preprocess_data(
	    input_path="data/raw/bbc-text.csv",
	    output_path="data/processed/processed_bbc.csv",
	):
	    """Clean the ``text`` column of a CSV file and write the result to disk.

	    Reads ``input_path`` with pandas, adds a ``clean_text`` column obtained
	    by applying ``clean_text`` to every value of the ``text`` column, creates
	    the output directory if needed, and writes the whole frame (original
	    columns plus ``clean_text``) to ``output_path`` without the index.
	    A completion message and the first rows of the frame are printed.

	    Args:
	        input_path: Path of the CSV file to read; it must contain a
	            ``text`` column.
	        output_path: Path of the CSV file to write. Its parent directory
	            is created when missing; a bare filename writes to the current
	            working directory.

	    Raises:
	        KeyError: If the input file has no ``text`` column.
	    """
	    df = pd.read_csv(input_path)

	    df["clean_text"] = df["text"].apply(clean_text)

	    # os.path.dirname() returns "" for a bare filename, and os.makedirs("")
	    # raises FileNotFoundError, so only create a directory when there is one.
	    output_dir = os.path.dirname(output_path)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    df.to_csv(output_path, index=False)

	    print("Preprocessing completed!")
	    print(df.head())


	# Script entry point: run the preprocessing step with its default paths
	# only when this file is executed directly, not when it is imported.
	if __name__ == "__main__":
	    preprocess_data()