File size: 549 Bytes
e2b99db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import re

def text_preprocessing(text):
  text = text.lower()
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'[-+]?[0-9]+', '', text)
  text = re.sub(r'[^\w\s]','', text)
  text = text.strip()
  return text
  
%time data['Text'] = data['Text'].apply(text_preprocessing)

raw_data = data.copy()
raw_data.head()

from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(data, test_size=0.2)
df_val, df_test = train_test_split(df_test, test_size=0.6)

df_train.shape, df_test.shape, df_val.shape