"""Load the sentence/label CSV splits and add a normalised text column."""

import re

import pandas as pd

# Default dataset locations (semicolon-delimited, header-less CSV files).
TRAIN_VAL_URL = (
    "https://drive.google.com/uc?export=download"
    "&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo"
)
TEST_URL = (
    "https://drive.google.com/uc?export=download"
    "&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm"
)

# Patterns are compiled once because clean_text runs once per row.
# NOTE: a separate ``https\S+`` alternative is unnecessary -- ``http\S+``
# already matches every string that starts with "https".
_URL_RE = re.compile(r"http\S+|www\S+")
_MENTION_OR_HASH_RE = re.compile(r"\@\w+|\#")
_NON_LETTER_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def _read_split(url: str) -> pd.DataFrame:
    """Read one ';'-delimited, header-less file into sentence/label columns."""
    return pd.read_csv(
        url,
        delimiter=';',
        header=None,
        names=['sentence', 'label'],
    )


# 1️⃣ Data loading + cleaning
def load_and_clean_data(
    train_url: str = TRAIN_VAL_URL,
    test_url: str = TEST_URL,
) -> pd.DataFrame:
    """Load both splits, merge them, drop duplicate rows and clean the text.

    Args:
        train_url: Path or URL of the train+val file. Defaults to the
            location originally hard-coded in this module.
        test_url: Path or URL of the test file. Defaults to the location
            originally hard-coded in this module.

    Returns:
        A DataFrame with columns ``sentence``, ``label`` and ``clean``,
        where ``clean`` is ``clean_text`` applied to ``sentence``.
        Exact duplicate (sentence, label) rows are removed; the row index
        is not reset afterwards, so it may contain gaps.
    """
    # Load train+val and test, then stack them with a fresh 0..n-1 index.
    df = pd.concat(
        [_read_split(train_url), _read_split(test_url)],
        ignore_index=True,
    )
    # Duplicates are detected on the raw rows, before cleaning.
    df = df.drop_duplicates()
    df['clean'] = df['sentence'].apply(clean_text)
    return df


# 2️⃣ Text cleaning utility
def clean_text(text: object) -> str:
    """Normalise a piece of text to lowercase ASCII letters and single spaces.

    Steps, in order: lowercase; remove URL-like tokens (``http...`` and
    ``www...``); remove ``@mention`` tokens and ``#`` characters; remove
    every character that is not ``a``-``z`` or whitespace; collapse runs of
    whitespace to one space and strip the ends.

    Args:
        text: The value to clean. Missing values (``None``, ``NaN``, ...)
            yield ``""``. Other non-string values are converted with
            ``str()`` instead of raising ``AttributeError``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        if pd.isnull(text):
            return ""
        # A numeric cell can end up in the column; treat it as text.
        text = str(text)

    lowered = text.lower()
    without_urls = _URL_RE.sub("", lowered)
    without_tags = _MENTION_OR_HASH_RE.sub("", without_urls)
    letters_only = _NON_LETTER_RE.sub("", without_tags)
    return _WHITESPACE_RE.sub(" ", letters_only).strip()