1qwsd committed · Commit 1a8e220 · verified · 1 Parent(s): 6180132

Create app.py

Files changed (1)
  1. app.py +131 -0
app.py ADDED
@@ -0,0 +1,131 @@
+ import numpy as np
+ import tensorflow as tf
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.optimizers import Adam
+ import re
+
+ input_file = 'holmes.txt'
+
+ # Read the contents of the file
+ with open(input_file, 'r', encoding='utf-8') as infile:
+     data = infile.read()
+
+ # Keep only the first 500,000 characters to bound memory and training time
+ data = data[:500000]
+
+ def remove_emojis_and_special_characters(text):
+     # Remove emojis
+     emoji_pattern = re.compile("["
+                                u"\U0001F600-\U0001F64F"  # emoticons
+                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                                u"\U0001F700-\U0001F77F"  # alchemical symbols
+                                u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
+                                u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
+                                u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
+                                u"\U0001FA00-\U0001FA6F"  # Chess Symbols
+                                u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
+                                u"\U00002702-\U000027B0"  # Dingbats
+                                u"\U000024C2-\U0001F251"
+                                "]+", flags=re.UNICODE)
+     text = emoji_pattern.sub('', text)
+
+     # Remove special characters
+     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
+
+     # Collapse repeated spaces
+     text = re.sub(' +', ' ', text)
+
+     return text
+
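+ # Illustrative check (editor's sketch; the input string is hypothetical):
+ # remove_emojis_and_special_characters("Hello!!  World 😀")  # -> 'Hello World '
+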
+ def preprocess_pipeline(data) -> list:
+     # Split by newline character
+     sentences = data.split('\n')
+     for i in range(len(sentences)):
+         sentences[i] = remove_emojis_and_special_characters(sentences[i])
+     # Remove leading and trailing spaces
+     sentences = [s.strip() for s in sentences]
+     # Drop empty sentences
+     sentences = [s for s in sentences if len(s) > 0]
+     # Lowercase each sentence; word-level tokenization happens later via Tokenizer
+     tokenized = []
+     for sentence in sentences:
+         sentence = sentence.lower()
+         tokenized.append(sentence)
+     return tokenized
+
+ # Clean and lowercase the sentences
+ tokenized_sentences = preprocess_pipeline(data)
+
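+ # Illustrative example (editor's sketch, hypothetical input):
+ # preprocess_pipeline("To Sherlock Holmes she is always THE woman.\n\n")
+ # -> ['to sherlock holmes she is always the woman']
+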
+ """
+ What is an OOV Token?
+ An out-of-vocabulary (OOV) token is a special token used in natural language processing (NLP)
+ to represent words that are not present in the vocabulary of the model or tokenizer. When such
+ a word is encountered during tokenization or text processing, it is replaced with the OOV token.
+
+ Why Use an OOV Token?
+ An OOV token lets an NLP model handle unseen or unknown words during training and inference.
+ Instead of failing on unknown words, the model handles them gracefully by representing them
+ with the OOV token. This is particularly useful with real-world data, where the vocabulary
+ may not cover every possible word.
+ """
+ # Build the word-level vocabulary
+ tokenizer = Tokenizer(oov_token='<oov>')
+ tokenizer.fit_on_texts(tokenized_sentences)
+ total_words = len(tokenizer.word_index) + 1  # +1 for the reserved padding index 0
+ # tokenizer.word_counts  # word -> frequency
+ # tokenizer.word_index   # word -> integer id
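+ # Illustrative check (editor's sketch; assumes this word never occurs in holmes.txt):
+ # Keras assigns the oov_token the lowest index (1), so any unseen word maps to it.
+ # tokenizer.texts_to_sequences(['floccinaucinihilipilification'])  # -> [[1]]
+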
+ """
+ n-gram example:
+ [3, 15, 8, 7, 20, 12, 6]
+
+ For the above tokenized sentence, the code below generates the following n-gram sequences:
+
+ [3, 15]
+ [3, 15, 8]
+ [3, 15, 8, 7]
+ [3, 15, 8, 7, 20]
+ [3, 15, 8, 7, 20, 12]
+ [3, 15, 8, 7, 20, 12, 6]
+ """
+
+ # Generate input sequences: every prefix (length >= 2) of each tokenized sentence
+ input_sequences = []
+ for line in tokenized_sentences:
+     token_list = tokenizer.texts_to_sequences([line])[0]
+     for i in range(1, len(token_list)):
+         n_gram_sequence = token_list[:i + 1]
+         input_sequences.append(n_gram_sequence)
+
+ # Pad sequences on the left ('pre') so the target word is always the last column
+ max_sequence_len = max([len(x) for x in input_sequences])
+ input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
+
+ # Context = all but the last token; label = the last token, one-hot encoded
+ X, labels = input_sequences[:, :-1], input_sequences[:, -1]
+ ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
+
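+ # Worked example (illustrative, assuming max_sequence_len == 7): the n-gram [3, 15, 8]
+ # pads to [0, 0, 0, 0, 3, 15, 8]; its X row is [0, 0, 0, 0, 3, 15] and its label is 8.
+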
+ # 80/10/10 train/validation/test split
+ from sklearn.model_selection import train_test_split
+ X_train_temp, X_val_test, y_train_temp, y_val_test = train_test_split(X, ys, test_size=0.2, random_state=42)
+ X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)
+
+ # Embedding -> bidirectional LSTM -> softmax over the full vocabulary
+ model = Sequential()
+ model.add(Embedding(total_words, 100))
+ model.add(Bidirectional(LSTM(150)))
+ model.add(Dense(total_words, activation='softmax'))
+
+ adam = Adam(learning_rate=0.01)
+ model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
+
+ # Train the model
+ history = model.fit(X_train_temp, y_train_temp, epochs=50, validation_data=(X_val, y_val), verbose=1)
+
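+ # A minimal inference sketch (hypothetical helper; assumes the model, tokenizer,
+ # and max_sequence_len defined above): greedy next-word generation from a seed string.
+ def generate_text(seed_text, next_words=10):
+     for _ in range(next_words):
+         token_list = tokenizer.texts_to_sequences([seed_text.lower()])[0]
+         padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
+         predicted_id = int(np.argmax(model.predict(padded, verbose=0), axis=-1)[0])
+         # Reverse-lookup the predicted id in the vocabulary
+         for word, index in tokenizer.word_index.items():
+             if index == predicted_id:
+                 seed_text += ' ' + word
+                 break
+     return seed_text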