import re

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the corpus and cap it at 500,000 characters to keep training time manageable.
input_file = 'holmes.txt'

with open(input_file, 'r', encoding='utf-8') as infile:
    data = infile.read()

data = data[:500000]

def remove_emojis_and_special_characters(text):
    # Ranges covering the common emoji and pictograph code points.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # geometric shapes extended
                               u"\U0001F800-\U0001F8FF"  # supplemental arrows-C
                               u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                               u"\U0001FA00-\U0001FA6F"  # chess symbols
                               u"\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    # Strip emojis first, then drop every remaining non-alphanumeric character.
    text = emoji_pattern.sub('', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse the runs of spaces left behind by the removals.
    text = re.sub(' +', ' ', text)
    return text

def preprocess_pipeline(data) -> list:
    # Split the raw text into lines, clean each one, and drop empty lines.
    sentences = data.split('\n')
    sentences = [remove_emojis_and_special_characters(s) for s in sentences]
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]

    # Lowercase everything; the Keras Tokenizer below handles the actual
    # word-level tokenization.
    return [sentence.lower() for sentence in sentences]


tokenized_sentences = preprocess_pipeline(data)
""" |
|
|
What is an OOV Token? |
|
|
An out-of-vocabulary (OOV) token is a special token used in natural language processing (NLP) tasks to represent words that |
|
|
are not present in the vocabulary of the model or tokenizer. When a word that is not in the vocabulary is encountered during |
|
|
tokenization or text processing, it is replaced with the OOV token. |
|
|
|
|
|
Why Use an OOV Token? |
|
|
Using an OOV token helps handle unseen or unknown words during the training or inference phase of an NLP model. |
|
|
Instead of encountering errors or issues when encountering unknown words, the model can gracefully handle them by |
|
|
representing them with the OOV token. This is particularly useful when working with real-world data where the vocabulary |
|
|
of the model may not cover all possible words. |
|
|
""" |
|
|
|
|
|

# Fit the tokenizer on the cleaned sentences. Any word outside the learned
# vocabulary will be mapped to the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(tokenized_sentences)
total_words = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
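
# Quick, illustrative sanity check of the OOV behaviour. 'zzqx' is a made-up
# token assumed not to occur in the corpus; it should map to the '<oov>'
# index (1 by default), while 'holmes' maps to its learned index.
demo_ids = tokenizer.texts_to_sequences(['zzqx holmes'])[0]
print(demo_ids)  # e.g. [1, <id of 'holmes'>]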
""" |
|
|
n_gram example: |
|
|
[3, 15, 8, 7, 20, 12, 6] |
|
|
|
|
|
For the above sentece sentence, the code would generate the following n-gram sequences: |
|
|
|
|
|
[3, 15] |
|
|
[3, 15, 8] |
|
|
[3, 15, 8, 7] |
|
|
[3, 15, 8, 7, 20] |
|
|
[3, 15, 8, 7, 20, 12] |
|
|
[3, 15, 8, 7, 20, 12, 6] |
|
|
""" |
|
|
|
|
|
|
|
|

# Build every prefix (length >= 2) of every tokenized sentence.
input_sequences = []
for line in tokenized_sentences:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)
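
# Illustration only: decode the first few generated sequences back to words
# to confirm they are growing prefixes of the first sentence.
for seq in input_sequences[:3]:
    print([tokenizer.index_word.get(idx, '<oov>') for idx in seq])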

# Pre-pad every sequence to the same length so they can form one tensor.
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# The last token of each sequence is the label; the rest is the input context.
X, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# 80% train, 10% validation, 10% test.
X_train_temp, X_val_test, y_train_temp, y_val_test = train_test_split(X, ys, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Embedding -> bidirectional LSTM -> softmax over the whole vocabulary.
model = Sequential()
model.add(Embedding(total_words, 100))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))

# One-hot labels (ys) pair with categorical_crossentropy.
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

history = model.fit(X_train_temp, y_train_temp, epochs=50, validation_data=(X_val, y_val), verbose=1)
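
# A minimal sketch of how the trained model could be used for next-word
# suggestion. This helper is not part of the pipeline above: the name
# predict_next_words and the seed text are illustrative, and greedy argmax
# decoding is just one simple choice.
def predict_next_words(seed_text, n_words=3):
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad to the same width as the training inputs (max_sequence_len - 1).
        padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted_id = int(np.argmax(model.predict(padded, verbose=0), axis=-1)[0])
        seed_text += ' ' + tokenizer.index_word.get(predicted_id, '<oov>')
    return seed_text


print(predict_next_words('sherlock holmes', n_words=3))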