Spaces:
Sleeping
Sleeping
| import re | |
| import pandas as pd | |
# 1️⃣ Data loading + cleaning
# Default dataset locations (Google Drive direct-download links).
_TRAIN_VAL_URL = "https://drive.google.com/uc?export=download&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo"
_TEST_URL = "https://drive.google.com/uc?export=download&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm"


def _read_labelled_csv(source):
    """Read one header-less, semicolon-delimited ``sentence;label`` file."""
    return pd.read_csv(
        source,
        delimiter=';', header=None, names=['sentence', 'label']
    )


def load_and_clean_data(train_val_source=_TRAIN_VAL_URL, test_source=_TEST_URL):
    """Load both dataset files, merge them, de-duplicate, and add cleaned text.

    Args:
        train_val_source: Path, URL, or file-like object accepted by
            ``pandas.read_csv`` for the train+val file. Defaults to the
            original hosted copy.
        test_source: Same, for the test file. Defaults to the original
            hosted copy.

    Returns:
        A DataFrame with columns ``sentence``, ``label`` and ``clean``, where
        ``clean`` is ``clean_text`` applied to each ``sentence``.

    Raises:
        Whatever ``pandas.read_csv`` raises when a source cannot be fetched
        or parsed; no errors are swallowed here.
    """
    df = pd.concat(
        [_read_labelled_csv(train_val_source), _read_labelled_csv(test_source)],
        ignore_index=True,
    )
    # Only rows identical in every column (sentence AND label) are dropped.
    # In-place keeps the surviving rows' index labels, so the index may have
    # gaps afterwards; it also avoids chained-assignment warnings on the
    # column assignment below.
    df.drop_duplicates(inplace=True)
    df['clean'] = df['sentence'].apply(clean_text)
    return df
# 2️⃣ Text cleaning utility
# Compiled once at import time; the pattern strings are unchanged.
_URL_RE = re.compile(r"http\S+|www\S+|https\S+")
_MENTION_HASH_RE = re.compile(r"\@\w+|\#")
_NON_LETTER_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize a raw sentence to lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase; remove URL-like tokens (``http…``/``www…``);
    remove ``@mentions`` and bare ``#`` characters (the hashtag word is kept);
    delete every character that is not ``a``-``z`` or whitespace; collapse
    whitespace runs to a single space and strip both ends.

    Args:
        text: The raw value to clean. Null values (``None``, ``NaN``, …)
            yield ``""``. Non-string values are converted with ``str()``
            rather than raising ``AttributeError``.

    Returns:
        The cleaned string, possibly empty.
    """
    if pd.isnull(text):
        return ""
    if not isinstance(text, str):
        # A non-string cell (e.g. a number) has no .lower(); coerce it.
        text = str(text)
    t = text.lower()
    t = _URL_RE.sub("", t)
    t = _MENTION_HASH_RE.sub("", t)
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    t = _NON_LETTER_RE.sub("", t)
    return _WHITESPACE_RE.sub(" ", t).strip()