DiegoTheExplorar committed on
Commit
012aba0
·
verified ·
1 Parent(s): bab82bf

Upload 7 files

Browse files
DataPPwithspecial.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

def preprocess():
    """Load the English->Klingon corpus and build padded seq2seq arrays.

    Klingon sentences are wrapped in <BOS>/<EOS> markers, both languages are
    tokenized with a 5000-word cap (rare words map to <UNK>), and all
    sequences are post-padded/truncated to a fixed length of 50.

    Returns:
        tuple: (english_tokenizer, klingon_tokenizer, max_length,
                english_train_padded, klingon_train_input, klingon_train_target,
                english_test_padded, klingon_test_input, klingon_test_target)
    """
    # NOTE(review): path assumes the process CWD has a backend/ directory
    # containing the CSV — confirm against the deployment layout.
    frame = pd.read_csv('./backend/English_To_Klingon.csv')

    # Mark sentence boundaries so the decoder learns where a translation
    # starts and stops.
    frame['klingon'] = frame['klingon'].apply(lambda s: '<BOS> ' + s + ' <EOS>')

    src = frame['english'].values
    tgt = frame['klingon'].values

    # Reproducible 80/20 train/test split.
    src_train, src_test, tgt_train, tgt_test = train_test_split(
        src, tgt, test_size=0.2, random_state=42)

    # Vocabulary capped at 5000 words per language; out-of-vocabulary words
    # become <UNK>. Fit on the training split only.
    eng_tok = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token='<UNK>')
    kli_tok = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token='<UNK>')
    eng_tok.fit_on_texts(src_train)
    kli_tok.fit_on_texts(tgt_train)

    # Tokenize, then post-pad every split to a fixed length of 50.
    pad = tf.keras.preprocessing.sequence.pad_sequences
    src_train_pad = pad(eng_tok.texts_to_sequences(src_train), maxlen=50, padding='post')
    tgt_train_pad = pad(kli_tok.texts_to_sequences(tgt_train), maxlen=50, padding='post')
    src_test_pad = pad(eng_tok.texts_to_sequences(src_test), maxlen=50, padding='post')
    tgt_test_pad = pad(kli_tok.texts_to_sequences(tgt_test), maxlen=50, padding='post')

    # Teacher forcing: the decoder input drops the last token, the target
    # drops the first, and the target gains a trailing feature axis so it
    # matches the loss function's expected shape.
    dec_in_train = tgt_train_pad[:, :-1]
    dec_out_train = np.expand_dims(tgt_train_pad[:, 1:], -1)
    dec_in_test = tgt_test_pad[:, :-1]
    dec_out_test = np.expand_dims(tgt_test_pad[:, 1:], -1)

    return (eng_tok, kli_tok, 50,  # max_length
            src_train_pad, dec_in_train, dec_out_train,
            src_test_pad, dec_in_test, dec_out_test)
DataPreprocessing.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Not in use as it doesnt account for special tokens
3
+ """
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.model_selection import train_test_split
7
+ import tensorflow as tf
8
+
9
+ def preprocess():
10
+ # Load dataset
11
+ data = pd.read_csv('English_To_Klingon.csv')
12
+
13
+ # separate the sentences
14
+ english_sentences = data['english'].values
15
+ klingon_sentences = data['klingon'].values
16
+
17
+ # split data into training and testing tests. An 80 - 20 split is used here
18
+ english_train, english_test, klingon_train, klingon_test = train_test_split(english_sentences, klingon_sentences, test_size=0.2, random_state=42)
19
+
20
+ # Tokenize the sentences
21
+ english_tokenizer = tf.keras.preprocessing.text.Tokenizer()
22
+ klingon_tokenizer = tf.keras.preprocessing.text.Tokenizer()
23
+
24
+ english_tokenizer.fit_on_texts(english_train)
25
+ klingon_tokenizer.fit_on_texts(klingon_train)
26
+
27
+ english_train_sequences = english_tokenizer.texts_to_sequences(english_train)
28
+ klingon_train_sequences = klingon_tokenizer.texts_to_sequences(klingon_train)
29
+ english_test_sequences = english_tokenizer.texts_to_sequences(english_test)
30
+ klingon_test_sequences = klingon_tokenizer.texts_to_sequences(klingon_test)
31
+
32
+ # Padding sequences
33
+ max_english_length = max([len(seq) for seq in english_train_sequences])
34
+ max_klingon_length = max([len(seq) for seq in klingon_train_sequences])
35
+
36
+ english_train_padded = tf.keras.preprocessing.sequence.pad_sequences(english_train_sequences, maxlen=max_english_length, padding='post')
37
+ klingon_train_padded = tf.keras.preprocessing.sequence.pad_sequences(klingon_train_sequences, maxlen=max_klingon_length, padding='post')
38
+ english_test_padded = tf.keras.preprocessing.sequence.pad_sequences(english_test_sequences, maxlen=max_english_length, padding='post')
39
+ klingon_test_padded = tf.keras.preprocessing.sequence.pad_sequences(klingon_test_sequences, maxlen=max_klingon_length, padding='post')
40
+
41
+ # Prepare target data for training
42
+ klingon_train_input = klingon_train_padded[:, :-1] # The decoder input, which is the Klingon sentence shifted by one position to the right for training data.
43
+ klingon_train_target = klingon_train_padded[:, 1:] # The target output, which is the same sentence shifted by one position to the left for training data.
44
+ klingon_train_target = np.expand_dims(klingon_train_target, -1)
45
+
46
+ # Prepare target data for testing
47
+ klingon_test_input = klingon_test_padded[:, :-1] # The decoder input for testing data.
48
+ klingon_test_target = klingon_test_padded[:, 1:] # The target output for testing data.
49
+ klingon_test_target = np.expand_dims(klingon_test_target, -1)
50
+
51
+ return (english_tokenizer, klingon_tokenizer, max_english_length, max_klingon_length,
52
+ english_train_padded, klingon_train_input, klingon_train_target,
53
+ english_test_padded, klingon_test_input, klingon_test_target)
Decoder.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch.nn as nn


class Decoder(nn.Module):
    """GRU decoder for the seq2seq translator.

    Consumes the context (hidden state) produced by the encoder and predicts
    target-language tokens one step at a time.

    Parameters
    ----------
    output_dim : int
        Size of the target vocabulary.
    emb_dim : int
        Dimension of the token embedding vectors.
    hid_dim : int
        Number of features in the GRU's hidden state.
    n_layers : int
        Number of stacked GRU layers.
    dropout : float
        Dropout probability for the embedding dropout and between GRU layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        # NOTE: attribute names are load-bearing — they define the state_dict
        # keys used by saved checkpoints; do not rename.
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, hidden):
        """Run a single decoding step.

        Parameters
        ----------
        trg : Tensor
            Token indices for the current step, shape (batch_size,).
        hidden : Tensor
            GRU hidden state, shape (n_layers, batch_size, hid_dim).

        Returns
        -------
        prediction : Tensor
            Unnormalised vocabulary scores, shape (batch_size, output_dim).
        hidden : Tensor
            Updated hidden state, shape (n_layers, batch_size, hid_dim).
        """
        # Promote the single step to a length-1 sequence for the GRU.
        step = trg.unsqueeze(0)
        # (1, batch) -> (1, batch, emb_dim), with dropout on the embeddings.
        emb = self.dropout(self.embedding(step))
        # Advance the recurrent state by one step.
        out, hidden = self.rnn(emb, hidden)
        # Drop the sequence axis and project onto the vocabulary.
        prediction = self.fc_out(out.squeeze(0))
        return prediction, hidden
Encoder.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch.nn as nn


class Encoder(nn.Module):
    """GRU encoder for the seq2seq translator.

    Compresses a source-token sequence into the GRU's final hidden state,
    which serves as the decoder's initial context.

    Parameters
    ----------
    input_dim : int
        Size of the source vocabulary.
    emb_dim : int
        Dimension of the token embedding vectors.
    hid_dim : int
        Number of features in the GRU's hidden state.
    n_layers : int
        Number of stacked GRU layers.
    dropout : float
        Dropout probability for the embedding dropout and between GRU layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        # NOTE: attribute names are load-bearing — they define the state_dict
        # keys used by saved checkpoints; do not rename.
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        """Encode a batch of token sequences.

        Parameters
        ----------
        input : Tensor
            Token indices, shape (seq_len, batch_size).

        Returns
        -------
        Tensor
            Final GRU hidden state, shape (n_layers, batch_size, hid_dim).
        """
        # Look up embeddings and regularise them with dropout.
        dropped = self.dropout(self.embedding(input))
        # Per-step outputs are not needed; keep only the final hidden state.
        _, hidden = self.rnn(dropped)
        return hidden
English_To_Klingon.csv ADDED
The diff for this file is too large to render. See raw diff
 
GradioKlingonToEnglish.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import tensorflow as tf
import gradio as gr
import re

from Seq2SeqModel import Seq2SeqModel
from DataPPwithspecial import preprocess
from Decoder import Decoder
from Encoder import Encoder

# Model hyperparameters — these must match the values the checkpoint
# (Klingon_to_English.pth) was trained with, or load_state_dict will fail.
n_layers = 2
emb_dim = 256
hid_dim = 512
dropout = 0.5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Use GPU if available, otherwise use CPU

# Load preprocessed data and model parameters.
# NOTE(review): DataPPwithspecial.preprocess() returns
# (english_tokenizer, klingon_tokenizer, max_length, ...) in that order, yet
# the first two values are unpacked here as (klingon_tokenizer,
# english_tokenizer) — i.e. swapped. Presumably the checkpoint was trained
# with this same unpacking; confirm before "fixing" the order.
(klingon_tokenizer, english_tokenizer, max_klingon_length,
 _, _, _, _, _, _) = preprocess() # We don't need training data for inference
# word_index contains the full fitted vocabulary (not capped by num_words);
# +1 accounts for the reserved padding index 0.
input_dim = len(klingon_tokenizer.word_index) + 1 # Add 1 for the padding token
output_dim = len(english_tokenizer.word_index) + 1 # Add 1 for the padding token

# Initialize encoder and decoder on the selected device.
encoder = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout).to(device)
decoder = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout).to(device)

# Initialize the Seq2SeqModel wrapper around encoder + decoder.
model = Seq2SeqModel(encoder, decoder, device).to(device)

# Load the saved weights, mapping them to the CPU if no GPU is present.
model.load_state_dict(torch.load('./backend/Klingon_to_English.pth', map_location=torch.device('cpu')))
model.eval() # Set the model to evaluation mode (disables dropout)

# Tokenize the Klingon input
def preprocess_sentence(sentence, tokenizer, max_length):
    """Tokenize and pad one sentence into a (1, max_length) LongTensor on `device`."""
    # Tokenize the sentence (texts_to_sequences expects a list of strings).
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])
    # Pad (or truncate) the sequence to max_length.
    padded_sentence = tf.keras.preprocessing.sequence.pad_sequences(tokenized_sentence, maxlen=max_length, padding='post')
    return torch.tensor(padded_sentence, dtype=torch.long).to(device)

# Translation function for Gradio
def translate_klingon_to_english(klingon_sentence):
    """Translate a Klingon string to English via greedy (argmax) decoding."""
    # Preprocess the input Klingon sentence -> (1, max_len) tensor.
    input_sentence = preprocess_sentence(klingon_sentence, klingon_tokenizer, max_klingon_length)

    # Drop the batch axis; it is re-added below as the second dimension
    # because the model expects (seq_len, batch) layout.
    input_sentence = input_sentence.squeeze(0)

    # Perform inference
    with torch.no_grad():
        # The source sentence is also passed as the decoder target with
        # teacher-forcing ratio 0.
        # NOTE(review): with ratio 0 presumably only the target's
        # length/shape matters — verify against Seq2SeqModel.forward.
        output = model(input_sentence.unsqueeze(1), input_sentence.unsqueeze(1), 0)

    # Greedy decode: pick the highest-scoring vocabulary index per step.
    output_indices = torch.argmax(output, dim=-1).squeeze().tolist()
    english_sentence = ' '.join([english_tokenizer.index_word[idx] for idx in output_indices if idx != 0]) # Remove padding token
    # Strip any literal "eos" tokens the model emitted.
    english_sentence = re.sub(r'\beos\b', '', english_sentence).strip()
    if english_sentence == "":
        english_sentence = 'sorry model sucks'
    return english_sentence

# Example inputs shown in the Gradio UI.
examples = [
    ["nuqneH"],
    ["tlhIngan Hol Dajatlh'a'?"],
    ["jIyajbe'"],
    ["Heghlu'meH QaQ jajvam"],
    ["Hoch vor Dar"]
]

# Build and launch the Gradio web interface.
iface = gr.Interface(
    fn=translate_klingon_to_english,
    inputs=gr.Textbox(label="Klingon Phrase", lines=2, placeholder="Enter Klingon text here..."),
    outputs=gr.Textbox(label="English Translation", lines=2),
    title="Klingon to English Translation",
    description="Enter text in Klingon and get its translation in English. This translator helps you convert everyday Klingon phrases into English. Try one of the example sentences to see how it works!",
    examples=examples,
    theme="default"
)

iface.launch()
Klingon_to_English.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad41ca3fc34faf6e4f88902c816ae69cd80aac6b8d238aad9f116dc2ce21e94f
3
+ size 73477686