File size: 790 Bytes
96a8ab7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from load_data import load_dataset_
from bs4 import BeautifulSoup as bs4
import re

def remove_html(text):
    """Return ``text`` with HTML markup stripped out.

    ``None`` is passed through unchanged. A string containing neither
    ``<`` nor ``>`` is returned as-is without being parsed. Anything else
    is parsed with the ``html.parser`` backend and reduced to its text
    content.
    """
    if text is None:
        return None

    # Only strings that contain an angle bracket are handed to the parser.
    has_angle_bracket = "<" in text or ">" in text
    if not has_angle_bracket:
        return text

    return bs4(text, "html.parser").get_text()

def remove_links(text):
    """Delete URLs from ``text`` and normalise the remainder.

    Substrings starting with ``http://``, ``https://`` or ``www.`` (up to
    the next whitespace character) are removed. The result is then
    lower-cased and stripped of leading/trailing whitespace. ``None`` is
    passed through unchanged.
    """
    if text is None:
        return None

    url_regex = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_regex.sub('', text)
    return without_urls.lower().strip()

def preprocess(dataset):
    """Clean the ``'train'`` split of ``dataset`` into parallel tuples.

    Each record's ``'text'`` is passed through ``remove_html`` and
    ``remove_links``, then lower-cased and stripped. Records that are
    falsy, have no (or empty) ``'text'``, or whose text is empty after
    cleaning are dropped.

    Args:
        dataset: Mapping with a ``'train'`` entry that iterates over
            dict-like records exposing ``'text'`` and ``'label'``.

    Returns:
        A ``(texts, labels)`` pair of equal-length tuples. Both are empty
        tuples when no record survives filtering.

    Raises:
        KeyError: If ``dataset`` has no ``'train'`` entry, or a kept
            record has no ``'label'``.
    """
    texts = []
    labels = []
    for record in dataset['train']:
        # Skip missing records and records without any text to clean.
        if not record or not record.get('text'):
            continue

        # Clean once and reuse the result for both the emptiness check and
        # the output, instead of parsing/substituting twice per record.
        cleaned = remove_links(remove_html(record['text'])).lower().strip()
        if not cleaned:
            continue

        texts.append(cleaned)
        labels.append(record['label'])

    # Built explicitly rather than via zip(*pairs): unpacking zip(*[]) into
    # two names raises ValueError when nothing survives filtering.
    return tuple(texts), tuple(labels)