phishing_detection / preprocess_data.py
nijatmammadov's picture
Upload folder using huggingface_hub
96a8ab7 verified
raw
history blame contribute delete
790 Bytes
from load_data import load_dataset_
from bs4 import BeautifulSoup as bs4
import re
def remove_html(text):
    """Return ``text`` with HTML markup stripped out.

    ``None`` is passed through unchanged. Input that contains neither
    ``<`` nor ``>`` is returned as-is without invoking the parser;
    anything else is parsed with BeautifulSoup's ``html.parser`` backend
    and only its text content is returned.
    """
    if text is None:
        return None

    # Cheap pre-check so plain text never pays for an HTML parse.
    has_markup = "<" in text or ">" in text
    if not has_markup:
        return text

    return bs4(text, "html.parser").get_text()
def remove_links(text):
    """Remove URLs from ``text`` and normalise the result.

    Anything starting with ``http://``, ``https://`` or ``www.`` up to the
    next whitespace character is deleted. The remaining text is then
    lowercased and stripped of leading/trailing whitespace.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The cleaned, lowercased, stripped string, or ``None`` when
        ``text`` is ``None``.
    """
    if text is None:
        return None

    pattern = r'https?://\S+|www\.\S+'
    # Match case-insensitively: lowercasing only happens after the
    # substitution, so without this flag links written as "HTTP://..." or
    # "WWW...." would slip through and remain in the output.
    without_links = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return without_links.lower().strip()
def preprocess(dataset):
    """Clean the ``'train'`` split of ``dataset`` into parallel tuples.

    Each row's ``'text'`` is passed through ``remove_html`` and then
    ``remove_links``, lowercased and stripped. Rows that are falsy, have a
    missing/empty ``'text'``, or whose cleaned text is empty are dropped.

    Args:
        dataset: A mapping with a ``'train'`` entry that iterates over
            dict-like rows carrying ``'text'`` and ``'label'`` keys.

    Returns:
        A ``(texts, labels)`` pair of equal-length tuples. Both are empty
        tuples when no row survives filtering.
    """
    texts = []
    labels = []
    for row in dataset['train']:
        if not row or not row.get('text'):
            continue
        # Run the cleaning pipeline once per row and reuse the result for
        # both the emptiness filter and the stored value.
        cleaned = remove_links(remove_html(row['text'])).lower().strip()
        if not cleaned:
            continue
        texts.append(cleaned)
        labels.append(row['label'])
    # Build the tuples directly rather than via zip(*pairs): unpacking an
    # empty zip raises ValueError when every row has been filtered out.
    return tuple(texts), tuple(labels)