DiegoTheExplorar committed on
Commit
012aba0
·
verified ·
1 Parent(s): bab82bf

Upload 7 files

Browse files
DataPPwithspecial.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

def preprocess():
    """Load the English->Klingon corpus and build padded seq2seq arrays.

    Klingon sentences are wrapped in <BOS>/<EOS> markers, both languages are
    tokenized with a 5000-word cap (rare words map to <UNK>), and all
    sequences are post-padded/truncated to a fixed length of 50.

    Returns:
        tuple: (english_tokenizer, klingon_tokenizer, max_length,
                english_train_padded, klingon_train_input, klingon_train_target,
                english_test_padded, klingon_test_input, klingon_test_target)
    """
    # NOTE(review): path assumes the process CWD has a backend/ directory
    # containing the CSV — confirm against the deployment layout.
    frame = pd.read_csv('./backend/English_To_Klingon.csv')

    # Mark sentence boundaries so the decoder learns where a translation
    # starts and stops.
    frame['klingon'] = frame['klingon'].apply(lambda s: '<BOS> ' + s + ' <EOS>')

    src = frame['english'].values
    tgt = frame['klingon'].values

    # Reproducible 80/20 train/test split.
    src_train, src_test, tgt_train, tgt_test = train_test_split(
        src, tgt, test_size=0.2, random_state=42)

    # Vocabulary capped at 5000 words per language; out-of-vocabulary words
    # become <UNK>. Fit on the training split only.
    eng_tok = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token='<UNK>')
    kli_tok = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token='<UNK>')
    eng_tok.fit_on_texts(src_train)
    kli_tok.fit_on_texts(tgt_train)

    # Tokenize, then post-pad every split to a fixed length of 50.
    pad = tf.keras.preprocessing.sequence.pad_sequences
    src_train_pad = pad(eng_tok.texts_to_sequences(src_train), maxlen=50, padding='post')
    tgt_train_pad = pad(kli_tok.texts_to_sequences(tgt_train), maxlen=50, padding='post')
    src_test_pad = pad(eng_tok.texts_to_sequences(src_test), maxlen=50, padding='post')
    tgt_test_pad = pad(kli_tok.texts_to_sequences(tgt_test), maxlen=50, padding='post')

    # Teacher forcing: the decoder input drops the last token, the target
    # drops the first, and the target gains a trailing feature axis so it
    # matches the loss function's expected shape.
    dec_in_train = tgt_train_pad[:, :-1]
    dec_out_train = np.expand_dims(tgt_train_pad[:, 1:], -1)
    dec_in_test = tgt_test_pad[:, :-1]
    dec_out_test = np.expand_dims(tgt_test_pad[:, 1:], -1)

    return (eng_tok, kli_tok, 50,  # max_length
            src_train_pad, dec_in_train, dec_out_train,
            src_test_pad, dec_in_test, dec_out_test)
DataPreprocessing.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Not in use as it doesnt account for special tokens
3
+ """
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.model_selection import train_test_split
7
+ import tensorflow as tf
8
+
9
+ def preprocess():
10
+ # Load dataset
11
+ data = pd.read_csv('English_To_Klingon.csv')
12
+
13
+ # separate the sentences
14
+ english_sentences = data['english'].values
15
+ klingon_sentences = data['klingon'].values
16
+
17
+ # split data into training and testing tests. An 80 - 20 split is used here
18
+ english_train, english_test, klingon_train, klingon_test = train_test_split(english_sentences, klingon_sentences, test_size=0.2, random_state=42)
19
+
20
+ # Tokenize the sentences
21
+ english_tokenizer = tf.keras.preprocessing.text.Tokenizer()
22
+ klingon_tokenizer = tf.keras.preprocessing.text.Tokenizer()
23
+
24
+ english_tokenizer.fit_on_texts(english_train)
25
+ klingon_tokenizer.fit_on_texts(klingon_train)
26
+
27
+ english_train_sequences = english_tokenizer.texts_to_sequences(english_train)
28
+ klingon_train_sequences = klingon_tokenizer.texts_to_sequences(klingon_train)
29
+ english_test_sequences = english_tokenizer.texts_to_sequences(english_test)
30
+ klingon_test_sequences = klingon_tokenizer.texts_to_sequences(klingon_test)
31
+
32
+ # Padding sequences
33
+ max_english_length = max([len(seq) for seq in english_train_sequences])
34
+ max_klingon_length = max([len(seq) for seq in klingon_train_sequences])
35
+
36
+ english_train_padded = tf.keras.preprocessing.sequence.pad_sequences(english_train_sequences, maxlen=max_english_length, padding='post')
37
+ klingon_train_padded = tf.keras.preprocessing.sequence.pad_sequences(klingon_train_sequences, maxlen=max_klingon_length, padding='post')
38
+ english_test_padded = tf.keras.preprocessing.sequence.pad_sequences(english_test_sequences, maxlen=max_english_length, padding='post')
39
+ klingon_test_padded = tf.keras.preprocessing.sequence.pad_sequences(klingon_test_sequences, maxlen=max_klingon_length, padding='post')
40
+
41
+ # Prepare target data for training
42
+ klingon_train_input = klingon_train_padded[:, :-1] # The decoder input, which is the Klingon sentence shifted by one position to the right for training data.
43
+ klingon_train_target = klingon_train_padded[:, 1:] # The target output, which is the same sentence shifted by one position to the left for training data.
44
+ klingon_train_target = np.expand_dims(klingon_train_target, -1)
45
+
46
+ # Prepare target data for testing
47
+ klingon_test_input = klingon_test_padded[:, :-1] # The decoder input for testing data.
48
+ klingon_test_target = klingon_test_padded[:, 1:] # The target output for testing data.
49
+ klingon_test_target = np.expand_dims(klingon_test_target, -1)
50
+
51
+ return (english_tokenizer, klingon_tokenizer, max_english_length, max_klingon_length,
52
+ english_train_padded, klingon_train_input, klingon_train_target,
53
+ english_test_padded, klingon_test_input, klingon_test_target)
Decoder.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch.nn as nn


class Decoder(nn.Module):
    """GRU decoder for the seq2seq translator.

    Consumes the context (hidden state) produced by the encoder and predicts
    target-language tokens one step at a time.

    Parameters
    ----------
    output_dim : int
        Size of the target vocabulary.
    emb_dim : int
        Dimension of the token embedding vectors.
    hid_dim : int
        Number of features in the GRU's hidden state.
    n_layers : int
        Number of stacked GRU layers.
    dropout : float
        Dropout probability for the embedding dropout and between GRU layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        # NOTE: attribute names are load-bearing — they define the state_dict
        # keys used by saved checkpoints; do not rename.
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, hidden):
        """Run a single decoding step.

        Parameters
        ----------
        trg : Tensor
            Token indices for the current step, shape (batch_size,).
        hidden : Tensor
            GRU hidden state, shape (n_layers, batch_size, hid_dim).

        Returns
        -------
        prediction : Tensor
            Unnormalised vocabulary scores, shape (batch_size, output_dim).
        hidden : Tensor
            Updated hidden state, shape (n_layers, batch_size, hid_dim).
        """
        # Promote the single step to a length-1 sequence for the GRU.
        step = trg.unsqueeze(0)
        # (1, batch) -> (1, batch, emb_dim), with dropout on the embeddings.
        emb = self.dropout(self.embedding(step))
        # Advance the recurrent state by one step.
        out, hidden = self.rnn(emb, hidden)
        # Drop the sequence axis and project onto the vocabulary.
        prediction = self.fc_out(out.squeeze(0))
        return prediction, hidden
Encoder.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch.nn as nn


class Encoder(nn.Module):
    """GRU encoder for the seq2seq translator.

    Compresses a source-token sequence into the GRU's final hidden state,
    which serves as the decoder's initial context.

    Parameters
    ----------
    input_dim : int
        Size of the source vocabulary.
    emb_dim : int
        Dimension of the token embedding vectors.
    hid_dim : int
        Number of features in the GRU's hidden state.
    n_layers : int
        Number of stacked GRU layers.
    dropout : float
        Dropout probability for the embedding dropout and between GRU layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        # NOTE: attribute names are load-bearing — they define the state_dict
        # keys used by saved checkpoints; do not rename.
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        """Encode a batch of token sequences.

        Parameters
        ----------
        input : Tensor
            Token indices, shape (seq_len, batch_size).

        Returns
        -------
        Tensor
            Final GRU hidden state, shape (n_layers, batch_size, hid_dim).
        """
        # Look up embeddings and regularise them with dropout.
        dropped = self.dropout(self.embedding(input))
        # Per-step outputs are not needed; keep only the final hidden state.
        _, hidden = self.rnn(dropped)
        return hidden
English_To_Klingon.csv ADDED
The diff for this file is too large to render. See raw diff
 
GradioKlingonToEnglish.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import tensorflow as tf
import gradio as gr
import re

from Seq2SeqModel import Seq2SeqModel
from DataPPwithspecial import preprocess
from Decoder import Decoder
from Encoder import Encoder

# Model hyperparameters — these must match the values the checkpoint
# (Klingon_to_English.pth) was trained with, or load_state_dict will fail.
n_layers = 2
emb_dim = 256
hid_dim = 512
dropout = 0.5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Use GPU if available, otherwise use CPU

# Load preprocessed data and model parameters.
# NOTE(review): DataPPwithspecial.preprocess() returns
# (english_tokenizer, klingon_tokenizer, max_length, ...) in that order, yet
# the first two values are unpacked here as (klingon_tokenizer,
# english_tokenizer) — i.e. swapped. Presumably the checkpoint was trained
# with this same unpacking; confirm before "fixing" the order.
(klingon_tokenizer, english_tokenizer, max_klingon_length,
 _, _, _, _, _, _) = preprocess() # We don't need training data for inference
# word_index contains the full fitted vocabulary (not capped by num_words);
# +1 accounts for the reserved padding index 0.
input_dim = len(klingon_tokenizer.word_index) + 1 # Add 1 for the padding token
output_dim = len(english_tokenizer.word_index) + 1 # Add 1 for the padding token

# Initialize encoder and decoder on the selected device.
encoder = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout).to(device)
decoder = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout).to(device)

# Initialize the Seq2SeqModel wrapper around encoder + decoder.
model = Seq2SeqModel(encoder, decoder, device).to(device)

# Load the saved weights, mapping them to the CPU if no GPU is present.
model.load_state_dict(torch.load('./backend/Klingon_to_English.pth', map_location=torch.device('cpu')))
model.eval() # Set the model to evaluation mode (disables dropout)

# Tokenize the Klingon input
def preprocess_sentence(sentence, tokenizer, max_length):
    """Tokenize and pad one sentence into a (1, max_length) LongTensor on `device`."""
    # Tokenize the sentence (texts_to_sequences expects a list of strings).
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])
    # Pad (or truncate) the sequence to max_length.
    padded_sentence = tf.keras.preprocessing.sequence.pad_sequences(tokenized_sentence, maxlen=max_length, padding='post')
    return torch.tensor(padded_sentence, dtype=torch.long).to(device)

# Translation function for Gradio
def translate_klingon_to_english(klingon_sentence):
    """Translate a Klingon string to English via greedy (argmax) decoding."""
    # Preprocess the input Klingon sentence -> (1, max_len) tensor.
    input_sentence = preprocess_sentence(klingon_sentence, klingon_tokenizer, max_klingon_length)

    # Drop the batch axis; it is re-added below as the second dimension
    # because the model expects (seq_len, batch) layout.
    input_sentence = input_sentence.squeeze(0)

    # Perform inference
    with torch.no_grad():
        # The source sentence is also passed as the decoder target with
        # teacher-forcing ratio 0.
        # NOTE(review): with ratio 0 presumably only the target's
        # length/shape matters — verify against Seq2SeqModel.forward.
        output = model(input_sentence.unsqueeze(1), input_sentence.unsqueeze(1), 0)

    # Greedy decode: pick the highest-scoring vocabulary index per step.
    output_indices = torch.argmax(output, dim=-1).squeeze().tolist()
    english_sentence = ' '.join([english_tokenizer.index_word[idx] for idx in output_indices if idx != 0]) # Remove padding token
    # Strip any literal "eos" tokens the model emitted.
    english_sentence = re.sub(r'\beos\b', '', english_sentence).strip()
    if english_sentence == "":
        english_sentence = 'sorry model sucks'
    return english_sentence

# Example inputs shown in the Gradio UI.
examples = [
    ["nuqneH"],
    ["tlhIngan Hol Dajatlh'a'?"],
    ["jIyajbe'"],
    ["Heghlu'meH QaQ jajvam"],
    ["Hoch vor Dar"]
]

# Build and launch the Gradio web interface.
iface = gr.Interface(
    fn=translate_klingon_to_english,
    inputs=gr.Textbox(label="Klingon Phrase", lines=2, placeholder="Enter Klingon text here..."),
    outputs=gr.Textbox(label="English Translation", lines=2),
    title="Klingon to English Translation",
    description="Enter text in Klingon and get its translation in English. This translator helps you convert everyday Klingon phrases into English. Try one of the example sentences to see how it works!",
    examples=examples,
    theme="default"
)

iface.launch()
Klingon_to_English.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad41ca3fc34faf6e4f88902c816ae69cd80aac6b8d238aad9f116dc2ce21e94f
3
+ size 73477686