1qwsd committed · Commit 1a8e220 · verified · 1 Parent(s): 6180132

Create app.py

Files changed (1)
  1. app.py +131 -0
app.py ADDED
@@ -0,0 +1,131 @@
+ import numpy as np
+ import tensorflow as tf
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.optimizers import Adam
+ import re
+
+ input_file = 'holmes.txt'
+
+ # Read the contents of the file
+ with open(input_file, 'r', encoding='utf-8') as infile:
+     data = infile.read()
+
+ # Keep only the first 500,000 characters to bound memory and training time
+ data = data[:500000]
+
+ def remove_emojis_and_special_characters(text):
+     # Remove emojis
+     emoji_pattern = re.compile("["
+                                u"\U0001F600-\U0001F64F"  # emoticons
+                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                                u"\U0001F700-\U0001F77F"  # alchemical symbols
+                                u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
+                                u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
+                                u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
+                                u"\U0001FA00-\U0001FA6F"  # Chess Symbols
+                                u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
+                                u"\U00002702-\U000027B0"  # Dingbats
+                                u"\U000024C2-\U0001F251"
+                                "]+", flags=re.UNICODE)
+     text = emoji_pattern.sub('', text)
+
+     # Remove special characters
+     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
+
+     # Collapse repeated spaces
+     text = re.sub(' +', ' ', text)
+
+     return text
+
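+ # Illustrative check (editor's sketch; the input string is hypothetical):
+ # remove_emojis_and_special_characters("Hello!!  World 😀")  # -> 'Hello World '
+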
+ def preprocess_pipeline(data) -> list:
+     # Split by newline character
+     sentences = data.split('\n')
+     for i in range(len(sentences)):
+         sentences[i] = remove_emojis_and_special_characters(sentences[i])
+     # Remove leading and trailing spaces
+     sentences = [s.strip() for s in sentences]
+     # Drop empty sentences
+     sentences = [s for s in sentences if len(s) > 0]
+     # Lowercase each sentence; word-level tokenization happens later via Tokenizer
+     tokenized = []
+     for sentence in sentences:
+         sentence = sentence.lower()
+         tokenized.append(sentence)
+     return tokenized
+
+ # Clean and lowercase the sentences
+ tokenized_sentences = preprocess_pipeline(data)
+
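+ # Illustrative example (editor's sketch, hypothetical input):
+ # preprocess_pipeline("To Sherlock Holmes she is always THE woman.\n\n")
+ # -> ['to sherlock holmes she is always the woman']
+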
+ """
+ What is an OOV Token?
+ An out-of-vocabulary (OOV) token is a special token used in natural language processing (NLP)
+ to represent words that are not present in the vocabulary of the model or tokenizer. When such
+ a word is encountered during tokenization or text processing, it is replaced with the OOV token.
+
+ Why Use an OOV Token?
+ An OOV token lets an NLP model handle unseen or unknown words during training and inference.
+ Instead of failing on unknown words, the model handles them gracefully by representing them
+ with the OOV token. This is particularly useful with real-world data, where the vocabulary
+ may not cover every possible word.
+ """
+ # Build the word-level vocabulary
+ tokenizer = Tokenizer(oov_token='<oov>')
+ tokenizer.fit_on_texts(tokenized_sentences)
+ total_words = len(tokenizer.word_index) + 1  # +1 for the reserved padding index 0
+ # tokenizer.word_counts  # word -> frequency
+ # tokenizer.word_index   # word -> integer id
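+ # Illustrative check (editor's sketch; assumes this word never occurs in holmes.txt):
+ # Keras assigns the oov_token the lowest index (1), so any unseen word maps to it.
+ # tokenizer.texts_to_sequences(['floccinaucinihilipilification'])  # -> [[1]]
+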
+ """
+ n-gram example:
+ [3, 15, 8, 7, 20, 12, 6]
+
+ For the above tokenized sentence, the code below generates the following n-gram sequences:
+
+ [3, 15]
+ [3, 15, 8]
+ [3, 15, 8, 7]
+ [3, 15, 8, 7, 20]
+ [3, 15, 8, 7, 20, 12]
+ [3, 15, 8, 7, 20, 12, 6]
+ """
+
+ # Generate input sequences: every prefix (length >= 2) of each tokenized sentence
+ input_sequences = []
+ for line in tokenized_sentences:
+     token_list = tokenizer.texts_to_sequences([line])[0]
+     for i in range(1, len(token_list)):
+         n_gram_sequence = token_list[:i + 1]
+         input_sequences.append(n_gram_sequence)
+
+ # Pad sequences on the left ('pre') so the target word is always the last column
+ max_sequence_len = max([len(x) for x in input_sequences])
+ input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
+
+ # Context = all but the last token; label = the last token, one-hot encoded
+ X, labels = input_sequences[:, :-1], input_sequences[:, -1]
+ ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
+
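+ # Worked example (illustrative, assuming max_sequence_len == 7): the n-gram [3, 15, 8]
+ # pads to [0, 0, 0, 0, 3, 15, 8]; its X row is [0, 0, 0, 0, 3, 15] and its label is 8.
+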
+ # 80/10/10 train/validation/test split
+ from sklearn.model_selection import train_test_split
+ X_train_temp, X_val_test, y_train_temp, y_val_test = train_test_split(X, ys, test_size=0.2, random_state=42)
+ X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)
+
+ # Embedding -> bidirectional LSTM -> softmax over the full vocabulary
+ model = Sequential()
+ model.add(Embedding(total_words, 100))
+ model.add(Bidirectional(LSTM(150)))
+ model.add(Dense(total_words, activation='softmax'))
+
+ adam = Adam(learning_rate=0.01)
+ model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
+
+ # Train the model
+ history = model.fit(X_train_temp, y_train_temp, epochs=50, validation_data=(X_val, y_val), verbose=1)
+
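+ # A minimal inference sketch (hypothetical helper; assumes the model, tokenizer,
+ # and max_sequence_len defined above): greedy next-word generation from a seed string.
+ def generate_text(seed_text, next_words=10):
+     for _ in range(next_words):
+         token_list = tokenizer.texts_to_sequences([seed_text.lower()])[0]
+         padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
+         predicted_id = int(np.argmax(model.predict(padded, verbose=0), axis=-1)[0])
+         # Reverse-lookup the predicted id in the vocabulary
+         for word, index in tokenizer.word_index.items():
+             if index == predicted_id:
+                 seed_text += ' ' + word
+                 break
+     return seed_text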