Spaces:

opinder2906
/

a1

Sleeping

a1 / src /data_processing.py

Update src/data_processing.py

d0c0a4a verified 7 months ago

917 Bytes

	import re
	import pandas as pd

	# 1️⃣ Data loading + cleaning

	def load_and_clean_data():
	    """Download both dataset splits, merge them, and add a cleaned-text column.

	    Each source is read with pandas as a ';'-delimited file without a header
	    row; the two columns are named 'sentence' and 'label'. The splits are
	    concatenated, rows that are exact duplicates are dropped (first
	    occurrence kept), and ``clean_text`` is applied to every sentence.

	    Returns:
	        pandas.DataFrame with columns 'sentence', 'label' and 'clean'.
	        The index is not reset after de-duplication, so it may have gaps.

	    Raises:
	        Whatever ``pd.read_csv`` raises when a download or parse fails.
	    """
	    # Both files share one layout, so the parsing options are defined once.
	    read_options = {
	        "delimiter": ";",
	        "header": None,
	        "names": ["sentence", "label"],
	    }
	    sources = (
	        # Train + validation split
	        "https://drive.google.com/uc?export=download&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo",
	        # Test split
	        "https://drive.google.com/uc?export=download&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm",
	    )
	    frames = [pd.read_csv(url, **read_options) for url in sources]

	    df = pd.concat(frames, ignore_index=True)
	    # De-duplicate before cleaning so only raw (sentence, label) pairs are
	    # compared and clean_text is not run on rows that get discarded.
	    df.drop_duplicates(inplace=True)
	    df["clean"] = df["sentence"].apply(clean_text)
	    return df

	# 2️⃣ Text cleaning utility
	def clean_text(text):
	    """Normalize raw text to lowercase a-z words separated by single spaces.

	    Steps, in order: lowercase; remove URL-like tokens (anything starting
	    at "http" or "www" up to the next whitespace); remove @mentions and
	    '#' characters; remove every character that is not a-z or whitespace;
	    collapse whitespace runs and trim both ends.

	    Args:
	        text: Raw text. Missing values (None / NaN) yield "". Other
	            non-string values are converted with ``str()`` first.

	    Returns:
	        The cleaned string (possibly empty).
	    """
	    if pd.isnull(text):
	        return ""
	    t = str(text).lower()
	    # Alternation needs a bare '|'. An escaped '\|' matches a literal pipe
	    # character, which would leave URLs and mentions in place to be mashed
	    # into junk tokens by the letters-only filter below.
	    t = re.sub(r"http\S+|www\S+", "", t)
	    t = re.sub(r"@\w+|#", "", t)
	    t = re.sub(r"[^a-z\s]", "", t)
	    return re.sub(r"\s+", " ", t).strip()