sentiment-Analysis / src /data_preprocessing.py
Anki2004's picture
Upload 16 files
a01a0b1 verified
Raw
History Blame Contribute Delete
980 Bytes
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
def load_data(file_path):
data = pd.read_csv(file_path)
return data['review'], data['sentiment']
def data_preprocess(X, y, max_words = 10000, max_len = 100):
y = (y == 'positive').astype(int)
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen = max_len)
return X_pad, y, tokenizer
def prepare_data(file_path, max_words = 10000, max_len = 1000):
X, y = load_data(file_path)
X_pad, y, tokenizer = data_preprocess(X, y, max_words, max_len)
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)
return X_train, X_test, y_train, y_test, tokenizer