| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
| from tensorflow.keras.preprocessing.text import Tokenizer | |
| from tensorflow.keras.preprocessing.sequence import pad_sequences | |
def load_data(file_path):
    """Read a review dataset from CSV and return its text and label columns.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(reviews, sentiments)`` tuple holding the ``review`` and
        ``sentiment`` columns of the file as pandas Series.

    Raises:
        KeyError: If either the ``review`` or ``sentiment`` column is absent.
    """
    frame = pd.read_csv(file_path)
    reviews = frame['review']
    sentiments = frame['sentiment']
    return reviews, sentiments
def data_preprocess(X, y, max_words=10000, max_len=100, tokenizer=None):
    """Turn raw review texts and sentiment labels into model-ready arrays.

    Args:
        X: Iterable of review strings.
        y: Label container supporting element-wise ``==`` and ``.astype``
            (e.g. a pandas Series). Entries equal to ``'positive'`` become 1;
            every other value becomes 0.
        max_words: Vocabulary cap given to a newly created ``Tokenizer``.
            Ignored when ``tokenizer`` is supplied.
        max_len: Length each integer sequence is padded or truncated to.
        tokenizer: Optional already-fitted ``Tokenizer``. When given, it is
            used as-is (no re-fitting), which lets validation, test or
            inference text be encoded with the training vocabulary. When
            ``None`` (the default) a new tokenizer is fitted on ``X``.

    Returns:
        ``(X_pad, y, tokenizer)``: the padded sequence array, the 0/1 labels,
        and the tokenizer that produced the sequences.
    """
    labels = (y == 'positive').astype(int)

    if tokenizer is None:
        # No vocabulary supplied: learn one from the texts passed in.
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(X)

    sequences = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(sequences, maxlen=max_len)
    return padded, labels, tokenizer
def prepare_data(file_path, max_words=10000, max_len=1000):
    """Load a review CSV and return padded train/test sequences.

    The rows are split 80/20 (``random_state=42``) *before* the tokenizer is
    fitted, so the vocabulary is learned from the training reviews only and
    the held-out reviews cannot leak into it.

    Args:
        file_path: CSV path handed to ``load_data``; the file must provide
            ``review`` and ``sentiment`` columns.
        max_words: Vocabulary cap forwarded to the tokenizer.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        ``(X_train, X_test, y_train, y_test, tokenizer)``: padded sequence
        arrays, the matching 0/1 labels (1 for ``'positive'``), and the
        tokenizer fitted on the training reviews.
    """
    # NOTE(review): max_len defaults to 1000 here but to 100 in
    # data_preprocess -- confirm which value is intended.
    reviews, sentiments = load_data(file_path)

    # Split the raw data first; fitting the tokenizer on all reviews would
    # let test-set vocabulary influence the training representation.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        reviews, sentiments, test_size=0.2, random_state=42
    )

    # Fit the vocabulary on the training portion only.
    X_train, y_train, tokenizer = data_preprocess(
        train_texts, train_labels, max_words, max_len
    )

    # Encode the held-out reviews with the training vocabulary.
    X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len)
    y_test = (test_labels == 'positive').astype(int)

    return X_train, X_test, y_train, y_test, tokenizer