"""Text-cleaning helpers: strip HTML markup and links from dataset rows."""

import re
from typing import Optional

from bs4 import BeautifulSoup as bs4

from load_data import load_dataset_

# Matches http(s):// URLs and bare www.* links up to the next whitespace.
# Case-insensitive so that "HTTP://..." and "WWW...." are removed as well;
# the text is only lower-cased *after* the substitution runs.
_LINK_PATTERN = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)


def remove_html(text: Optional[str]) -> Optional[str]:
    """Return *text* with HTML markup removed.

    ``None`` is passed through unchanged. Text containing neither ``<`` nor
    ``>`` is returned as-is without invoking the parser; anything else is
    parsed with BeautifulSoup's ``html.parser`` and reduced to its text.
    """
    if text is None:
        return None
    # Fast path: no angle brackets at all, so skip parsing entirely.
    if "<" not in text and ">" not in text:
        return text
    soup = bs4(text, "html.parser")
    return soup.get_text()


def remove_links(text: Optional[str]) -> Optional[str]:
    """Return *text* with links removed, lower-cased and stripped.

    ``None`` is passed through unchanged. Links are matched
    case-insensitively (``http://``, ``https://`` or ``www.`` followed by
    non-whitespace characters) and replaced with the empty string.
    """
    if text is None:
        return None
    return _LINK_PATTERN.sub('', text).lower().strip()


def preprocess(dataset):
    """Clean the ``'train'`` split of *dataset* into parallel tuples.

    Each row of ``dataset['train']`` is expected to be a mapping exposing
    ``.get('text')`` and a ``'label'`` key. Rows that are falsy, whose text
    is missing or empty, or whose text is empty after cleaning are skipped.

    Returns:
        A ``(texts, labels)`` pair of tuples of equal length. Both tuples
        are empty when no row survives filtering.

    Raises:
        KeyError: if a row that survives filtering has no ``'label'`` key.
    """
    texts = []
    labels = []
    for row in dataset['train']:
        if not row:
            continue
        raw_text = row.get('text')
        if not raw_text:
            continue
        # Run the cleaning pipeline once per row; remove_links already
        # lower-cases and strips, so no further normalisation is needed.
        cleaned = remove_links(remove_html(raw_text))
        if not cleaned:
            continue
        texts.append(cleaned)
        labels.append(row['label'])
    # Building the tuples directly (instead of zip(*pairs)) keeps the
    # two-value return shape even when every row was filtered out.
    return tuple(texts), tuple(labels)