import re

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the corpus and cap it at 500,000 characters to keep training time manageable.
input_file = 'holmes.txt'

with open(input_file, 'r', encoding='utf-8') as infile:
    data = infile.read()

data = data[:500000]

def remove_emojis_and_special_characters(text):
    # Ranges covering the common emoji and pictograph code points.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # geometric shapes extended
                               u"\U0001F800-\U0001F8FF"  # supplemental arrows-C
                               u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                               u"\U0001FA00-\U0001FA6F"  # chess symbols
                               u"\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    # Strip emojis first, then drop every remaining non-alphanumeric character.
    text = emoji_pattern.sub('', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse the runs of spaces left behind by the removals.
    text = re.sub(' +', ' ', text)
    return text

def preprocess_pipeline(data) -> list:
    # Split the raw text into lines, clean each one, and drop empty lines.
    sentences = data.split('\n')
    sentences = [remove_emojis_and_special_characters(s) for s in sentences]
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]

    # Lowercase everything; the Keras Tokenizer below handles the actual
    # word-level tokenization.
    return [sentence.lower() for sentence in sentences]


tokenized_sentences = preprocess_pipeline(data)
""" |
|
|
What is an OOV Token? |
|
|
An out-of-vocabulary (OOV) token is a special token used in natural language processing (NLP) tasks to represent words that |
|
|
are not present in the vocabulary of the model or tokenizer. When a word that is not in the vocabulary is encountered during |
|
|
tokenization or text processing, it is replaced with the OOV token. |
|
|
|
|
|
Why Use an OOV Token? |
|
|
Using an OOV token helps handle unseen or unknown words during the training or inference phase of an NLP model. |
|
|
Instead of encountering errors or issues when encountering unknown words, the model can gracefully handle them by |
|
|
representing them with the OOV token. This is particularly useful when working with real-world data where the vocabulary |
|
|
of the model may not cover all possible words. |
|
|
""" |
|
|
|
|
|

# Fit the tokenizer on the cleaned sentences. Any word outside the learned
# vocabulary will be mapped to the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(tokenized_sentences)
total_words = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
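
# Quick, illustrative sanity check of the OOV behaviour. 'zzqx' is a made-up
# token assumed not to occur in the corpus; it should map to the '<oov>'
# index (1 by default), while 'holmes' maps to its learned index.
demo_ids = tokenizer.texts_to_sequences(['zzqx holmes'])[0]
print(demo_ids)  # e.g. [1, <id of 'holmes'>]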
""" |
|
|
n_gram example: |
|
|
[3, 15, 8, 7, 20, 12, 6] |
|
|
|
|
|
For the above sentece sentence, the code would generate the following n-gram sequences: |
|
|
|
|
|
[3, 15] |
|
|
[3, 15, 8] |
|
|
[3, 15, 8, 7] |
|
|
[3, 15, 8, 7, 20] |
|
|
[3, 15, 8, 7, 20, 12] |
|
|
[3, 15, 8, 7, 20, 12, 6] |
|
|
""" |
|
|
|
|
|
|
|
|

# Build every prefix (length >= 2) of every tokenized sentence.
input_sequences = []
for line in tokenized_sentences:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)
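
# Illustration only: decode the first few generated sequences back to words
# to confirm they are growing prefixes of the first sentence.
for seq in input_sequences[:3]:
    print([tokenizer.index_word.get(idx, '<oov>') for idx in seq])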

# Pre-pad every sequence to the same length so they can form one tensor.
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# The last token of each sequence is the label; the rest is the input context.
X, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# 80% train, 10% validation, 10% test.
X_train_temp, X_val_test, y_train_temp, y_val_test = train_test_split(X, ys, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Embedding -> bidirectional LSTM -> softmax over the whole vocabulary.
model = Sequential()
model.add(Embedding(total_words, 100))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))

# One-hot labels (ys) pair with categorical_crossentropy.
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

history = model.fit(X_train_temp, y_train_temp, epochs=50, validation_data=(X_val, y_val), verbose=1)
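
# A minimal sketch of how the trained model could be used for next-word
# suggestion. This helper is not part of the pipeline above: the name
# predict_next_words and the seed text are illustrative, and greedy argmax
# decoding is just one simple choice.
def predict_next_words(seed_text, n_words=3):
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad to the same width as the training inputs (max_sequence_len - 1).
        padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted_id = int(np.argmax(model.predict(padded, verbose=0), axis=-1)[0])
        seed_text += ' ' + tokenizer.index_word.get(predicted_id, '<oov>')
    return seed_text


print(predict_next_words('sherlock holmes', n_words=3))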