import re
import pandas as pd

# 1️⃣ Data loading + cleaning

# Default dataset locations (Google Drive direct-download links).
_TRAIN_VAL_URL = (
    "https://drive.google.com/uc?export=download"
    "&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo"
)
_TEST_URL = (
    "https://drive.google.com/uc?export=download"
    "&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm"
)


def load_and_clean_data(train_url=_TRAIN_VAL_URL, test_url=_TEST_URL):
    """Load both dataset files, merge them and add a cleaned-text column.

    Each source is parsed as a header-less, semicolon-delimited file whose
    two columns are named ``sentence`` and ``label``. The two frames are
    concatenated, exact duplicate rows are dropped, and ``clean_text`` is
    applied to every ``sentence`` to populate a new ``clean`` column.

    Args:
        train_url: Location of the train+validation file. Defaults to the
            original Google Drive link. Any source ``pd.read_csv`` accepts
            (URL, local path, file-like object) may be passed instead.
        test_url: Location of the test file; same conventions as
            ``train_url``.

    Returns:
        A ``DataFrame`` with columns ``sentence``, ``label`` and ``clean``.
        The index is the 0..n-1 index of the concatenated frame with the
        dropped duplicate rows missing, i.e. it may contain gaps.
    """
    frames = [
        pd.read_csv(
            source, delimiter=';', header=None, names=['sentence', 'label']
        )
        for source in (train_url, test_url)
    ]
    df = pd.concat(frames, ignore_index=True)
    # In-place on a frame we own: avoids handing a "copy of a slice" to the
    # column assignment below (which can trigger SettingWithCopyWarning).
    df.drop_duplicates(inplace=True)
    df['clean'] = df['sentence'].apply(clean_text)
    return df

# 2️⃣ Text cleaning utility
def clean_text(text):
    """Normalise raw text to lowercase a-z words separated by single spaces.

    Processing order: lowercase, remove URLs (tokens starting with ``http``
    or ``www``), remove ``@mentions`` and ``#`` signs, delete every
    character that is not ``a``-``z`` or whitespace, then collapse runs of
    whitespace and trim both ends.

    Args:
        text: The value to clean. Null-like scalars (``None``, ``NaN``)
            yield an empty string; other non-string scalars are converted
            with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).

    Example:
        >>> clean_text("Visit https://example.com NOW, @bob! #Happy")
        'visit now happy'
    """
    if pd.isnull(text):
        return ""
    # A numeric cell (int/float) has no .lower(); coerce instead of raising
    # AttributeError.
    t = str(text).lower()
    # "http\S+" already covers "https...", so no separate alternative needed.
    t = re.sub(r"http\S+|www\S+", "", t)
    # Drop whole @mentions, but only the '#' of a hashtag (the word stays).
    t = re.sub(r"@\w+|#", "", t)
    t = re.sub(r"[^a-z\s]", "", t)
    return re.sub(r"\s+", " ", t).strip()