a1 / src /data_processing.py
opinder2906's picture
Update src/data_processing.py
d0c0a4a verified
import re
import pandas as pd
# 1️⃣ Data loading + cleaning
def load_and_clean_data():
    """Download the dataset splits, merge them, and add a cleaned-text column.

    Two header-less, semicolon-delimited CSV files are read from Google
    Drive (train+val first, then test), stacked into a single frame, and
    exact duplicate rows are dropped. Each ``sentence`` is then passed
    through ``clean_text`` and the result stored in a new ``clean`` column.

    Returns:
        pandas.DataFrame with the columns ``sentence``, ``label`` and
        ``clean``. Duplicates are removed in place, so the index keeps the
        labels of the surviving rows and may contain gaps.
    """
    sources = (
        # train + validation split
        "https://drive.google.com/uc?export=download&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo",
        # test split
        "https://drive.google.com/uc?export=download&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm",
    )
    frames = [
        pd.read_csv(url, delimiter=';', header=None, names=['sentence', 'label'])
        for url in sources
    ]

    combined = pd.concat(frames, ignore_index=True)
    # In-place de-duplication keeps `combined` a plain frame (not a derived
    # copy), so the column assignment below is unambiguous.
    combined.drop_duplicates(inplace=True)
    combined['clean'] = combined['sentence'].apply(clean_text)
    return combined
# 2️⃣ Text cleaning utility
# Patterns are compiled once at import time because clean_text runs per row.
# The leading \b stops the URL pattern from eating the tail of ordinary words
# that merely contain "www"/"http" (e.g. "awwww" must not become "a").
_URL_RE = re.compile(r"\b(?:http|www)\S+")
_MENTION_OR_HASH_RE = re.compile(r"@\w+|#")
_NON_LETTER_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize a raw sentence to lowercase ASCII letters and single spaces.

    Steps, in order: lowercase; drop URL-like tokens starting with ``http``
    or ``www``; drop ``@mentions`` and the ``#`` sign (the hashtag word is
    kept); drop every character that is not ``a-z`` or whitespace; collapse
    whitespace runs to one space and strip the ends.

    Args:
        text: The raw value to clean. Null-like scalars (``None``, ``NaN``,
            ``pd.NA``) yield an empty string; other non-string scalars are
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, possibly empty.
    """
    if pd.isnull(text):
        return ""
    # A CSV cell may be parsed as a number; coerce rather than crash on .lower().
    if not isinstance(text, str):
        text = str(text)

    cleaned = text.lower()
    cleaned = _URL_RE.sub("", cleaned)
    cleaned = _MENTION_OR_HASH_RE.sub("", cleaned)
    cleaned = _NON_LETTER_RE.sub("", cleaned)
    return _WHITESPACE_RE.sub(" ", cleaned).strip()