import os
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

input_file = 'holmes.txt'

# Read the contents of the file
with open(input_file, 'r', encoding='utf-8') as infile:
    data = infile.read()

# Keep only the first 500,000 characters to limit training time
data = data[:500000]


def remove_emojis_and_special_characters(text):
    # Remove emojis
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub('', text)
    # Remove special characters (keep letters, digits, and whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse repeated spaces
    text = re.sub(' +', ' ', text)
    return text


def preprocess_pipeline(data) -> list:
    # Split by newline character
    sentences = data.split('\n')
    # Strip emojis and special characters from each line
    sentences = [remove_emojis_and_special_characters(s) for s in sentences]
    # Remove leading and trailing spaces
    sentences = [s.strip() for s in sentences]
    # Drop empty sentences
    sentences = [s for s in sentences if len(s) > 0]
    # Lowercase each sentence
    return [sentence.lower() for sentence in sentences]


# Clean and lowercase the corpus, one sentence per list entry
tokenized_sentences = preprocess_pipeline(data)

"""
What is an OOV token?

An out-of-vocabulary (OOV) token is a special token used in natural language
processing (NLP) to represent words that are not present in the vocabulary of
the model or tokenizer. When an unknown word is encountered during tokenization
or text processing, it is replaced with the OOV token.

Why use an OOV token?

An OOV token lets a model handle unseen words gracefully during training or
inference: instead of failing on unknown words, the tokenizer maps them to the
OOV token. This is particularly useful with real-world data, where the model's
vocabulary may not cover every possible word.
"""
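# A minimal sketch of the OOV behaviour described above (this demo, including
# 'demo_tokenizer' and the toy sentences, is illustrative and not part of the
# pipeline): any word missing from the fitted vocabulary maps to the OOV index.
demo_tokenizer = Tokenizer(oov_token='<oov>')
demo_tokenizer.fit_on_texts(['the dog sat'])
# word_index is now {'<oov>': 1, 'the': 2, 'dog': 3, 'sat': 4}
print(demo_tokenizer.texts_to_sequences(['the cat sat']))
# 'cat' was never seen, so it falls back to the <oov> index: [[2, 1, 4]]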
""" # Tokenize words tokenizer = Tokenizer(oov_token='') tokenizer.fit_on_texts(tokenized_sentences) total_words = len(tokenizer.word_index) + 1 # tokenizer.word_counts # tokenizer.word_index """ n_gram example: [3, 15, 8, 7, 20, 12, 6] For the above sentece sentence, the code would generate the following n-gram sequences: [3, 15] [3, 15, 8] [3, 15, 8, 7] [3, 15, 8, 7, 20] [3, 15, 8, 7, 20, 12] [3, 15, 8, 7, 20, 12, 6] """ # Generate input sequences input_sequences = [] for line in tokenized_sentences: token_list = tokenizer.texts_to_sequences([line])[0] for i in range(1, len(token_list)): n_gram_sequence = token_list[:i + 1] input_sequences.append(n_gram_sequence) # Pad sequences max_sequence_len = max([len(x) for x in input_sequences]) input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) X,labels = input_sequences[:,:-1],input_sequences[:,-1] ys = tf.keras.utils.to_categorical(labels, num_classes=total_words) from sklearn.model_selection import train_test_split X_train_temp, X_val_test, y_train_temp, y_val_test = train_test_split(X, ys, test_size=0.2, random_state=42) X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42) model = Sequential() model.add(Embedding(total_words, 100)) model.add(Bidirectional(LSTM(150))) model.add(Dense(total_words, activation='softmax')) adam = Adam(learning_rate=0.01) model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) # Train the model history = model.fit(X_train_temp, y_train_temp, epochs=50, validation_data=(X_val, y_val), verbose=1)