Upload 26 files
Browse files- Task 2/Eng_French.ipynb +868 -0
- Task 2/Eng_Spanish.ipynb +782 -0
- Task 2/data/english +0 -0
- Task 2/data/french +0 -0
- Task 2/eng_vectorization_config.json +1 -0
- Task 2/eng_vocab.json +0 -0
- Task 2/english_to_french_model/keras_metadata.pb +3 -0
- Task 2/english_to_french_model/saved_model.pb +3 -0
- Task 2/english_to_french_model/variables/variables.data-00000-of-00001 +3 -0
- Task 2/english_to_french_model/variables/variables.index +0 -0
- Task 2/english_tokenizer.json +1 -0
- Task 2/french_tokenizer.json +1 -0
- Task 2/gui.py +218 -0
- Task 2/images/attention.png +0 -0
- Task 2/images/bidirectional.png +0 -0
- Task 2/images/embedding-words.png +0 -0
- Task 2/images/encoder-decoder-context.png +0 -0
- Task 2/images/encoder-decoder-translation.png +0 -0
- Task 2/images/rnn.png +0 -0
- Task 2/sequence_length.json +1 -0
- Task 2/spa_vectorization_config.json +1 -0
- Task 2/spa_vocab.json +0 -0
- Task 2/transformer_model/keras_metadata.pb +3 -0
- Task 2/transformer_model/saved_model.pb +3 -0
- Task 2/transformer_model/variables/variables.data-00000-of-00001 +3 -0
- Task 2/transformer_model/variables/variables.index +0 -0
Task 2/Eng_French.ipynb
ADDED
|
@@ -0,0 +1,868 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"collapsed": true
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"# Machine Translation Project (English to French)"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": 2,
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"import collections\n",
|
| 19 |
+
"import numpy as np\n",
|
| 20 |
+
"import json\n",
|
| 21 |
+
"\n",
|
| 22 |
+
"from keras.preprocessing.text import Tokenizer\n",
|
| 23 |
+
"from keras.utils import pad_sequences\n",
|
| 24 |
+
"from keras.models import Model, Sequential\n",
|
| 25 |
+
"from keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector\n",
|
| 26 |
+
"from keras.optimizers import Adam\n",
|
| 27 |
+
"from keras.losses import sparse_categorical_crossentropy"
|
| 28 |
+
]
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"cell_type": "markdown",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"source": [
|
| 34 |
+
"### Verify access to the GPU"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"cell_type": "code",
|
| 39 |
+
"execution_count": 3,
|
| 40 |
+
"metadata": {},
|
| 41 |
+
"outputs": [
|
| 42 |
+
{
|
| 43 |
+
"name": "stdout",
|
| 44 |
+
"output_type": "stream",
|
| 45 |
+
"text": [
|
| 46 |
+
"[name: \"/device:CPU:0\"\n",
|
| 47 |
+
"device_type: \"CPU\"\n",
|
| 48 |
+
"memory_limit: 268435456\n",
|
| 49 |
+
"locality {\n",
|
| 50 |
+
"}\n",
|
| 51 |
+
"incarnation: 8951901467623568752\n",
|
| 52 |
+
"xla_global_id: -1\n",
|
| 53 |
+
", name: \"/device:GPU:0\"\n",
|
| 54 |
+
"device_type: \"GPU\"\n",
|
| 55 |
+
"memory_limit: 1733715559\n",
|
| 56 |
+
"locality {\n",
|
| 57 |
+
" bus_id: 1\n",
|
| 58 |
+
" links {\n",
|
| 59 |
+
" }\n",
|
| 60 |
+
"}\n",
|
| 61 |
+
"incarnation: 7542354691675806642\n",
|
| 62 |
+
"physical_device_desc: \"device: 0, name: NVIDIA GeForce RTX 2050, pci bus id: 0000:01:00.0, compute capability: 8.6\"\n",
|
| 63 |
+
"xla_global_id: 416903419\n",
|
| 64 |
+
"]\n"
|
| 65 |
+
]
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"source": [
|
| 69 |
+
"from tensorflow.python.client import device_lib\n",
|
| 70 |
+
"print(device_lib.list_local_devices())"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "markdown",
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"source": [
|
| 77 |
+
"## Dataset\n",
|
| 78 |
+
"For our machine translation project, we opt for a dataset featuring a limited vocabulary, specifically designed to facilitate a more manageable and efficient training process. Unlike the extensive [WMT](http://www.statmt.org/) datasets, our chosen dataset ensures a quicker training time and demands fewer computational resources. This strategic decision aims to balance the learning experience while still achieving meaningful results within practical time constraints.\n",
|
| 79 |
+
"### Load Data"
|
| 80 |
+
]
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"cell_type": "code",
|
| 84 |
+
"execution_count": 4,
|
| 85 |
+
"metadata": {},
|
| 86 |
+
"outputs": [],
|
| 87 |
+
"source": [
|
| 88 |
+
"def load_data(path):\n",
|
| 89 |
+
" input_file = path\n",
|
| 90 |
+
" with open(input_file, \"r\") as f:\n",
|
| 91 |
+
" data = f.read()\n",
|
| 92 |
+
" return data.split('\\n')\n",
|
| 93 |
+
"\n",
|
| 94 |
+
"english_sentences = load_data('data/english')\n",
|
| 95 |
+
"french_sentences = load_data('data/french')"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "markdown",
|
| 100 |
+
"metadata": {},
|
| 101 |
+
"source": [
|
| 102 |
+
"### Sample Data"
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"cell_type": "code",
|
| 107 |
+
"execution_count": 5,
|
| 108 |
+
"metadata": {},
|
| 109 |
+
"outputs": [
|
| 110 |
+
{
|
| 111 |
+
"data": {
|
| 112 |
+
"text/plain": [
|
| 113 |
+
"['new jersey is sometimes quiet during autumn , and it is snowy in april .',\n",
|
| 114 |
+
" 'the united states is usually chilly during july , and it is usually freezing in november .',\n",
|
| 115 |
+
" 'california is usually quiet during march , and it is usually hot in june .',\n",
|
| 116 |
+
" 'the united states is sometimes mild during june , and it is cold in september .',\n",
|
| 117 |
+
" 'your least liked fruit is the grape , but my least liked is the apple .']"
|
| 118 |
+
]
|
| 119 |
+
},
|
| 120 |
+
"execution_count": 5,
|
| 121 |
+
"metadata": {},
|
| 122 |
+
"output_type": "execute_result"
|
| 123 |
+
}
|
| 124 |
+
],
|
| 125 |
+
"source": [
|
| 126 |
+
"english_sentences[:5]"
|
| 127 |
+
]
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"cell_type": "markdown",
|
| 131 |
+
"metadata": {},
|
| 132 |
+
"source": [
|
| 133 |
+
"By examining the sentences, it's apparent that they have undergone preprocessing: punctuation has been delimited with spaces, and all the text has been converted to lowercase. This preprocessing serves a crucial purpose in text preparation. Firstly, delimiting punctuation with spaces ensures that each punctuation mark is treated as a separate token, aiding the model in understanding sentence structure. Secondly, converting the entire text to lowercase standardizes the input, preventing the model from distinguishing between words solely based on their casing. This uniformity facilitates more effective training and generalization, enhancing the model's ability to grasp patterns and generate accurate translations."
|
| 134 |
+
]
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"cell_type": "markdown",
|
| 138 |
+
"metadata": {},
|
| 139 |
+
"source": [
|
| 140 |
+
"Structure of the Dataset"
|
| 141 |
+
]
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"cell_type": "code",
|
| 145 |
+
"execution_count": 6,
|
| 146 |
+
"metadata": {},
|
| 147 |
+
"outputs": [
|
| 148 |
+
{
|
| 149 |
+
"name": "stdout",
|
| 150 |
+
"output_type": "stream",
|
| 151 |
+
"text": [
|
| 152 |
+
"1823250 English words.\n",
|
| 153 |
+
"227 unique English words.\n",
|
| 154 |
+
"10 Most common words in the English dataset:\n",
|
| 155 |
+
"\"is\" \",\" \".\" \"in\" \"it\" \"during\" \"the\" \"but\" \"and\" \"sometimes\"\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"1961295 French words.\n",
|
| 158 |
+
"355 unique French words.\n",
|
| 159 |
+
"10 Most common words in the French dataset:\n",
|
| 160 |
+
"\"est\" \".\" \",\" \"en\" \"il\" \"les\" \"mais\" \"et\" \"la\" \"parfois\"\n"
|
| 161 |
+
]
|
| 162 |
+
}
|
| 163 |
+
],
|
| 164 |
+
"source": [
|
| 165 |
+
"english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])\n",
|
| 166 |
+
"french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))\n",
|
| 169 |
+
"print('{} unique English words.'.format(len(english_words_counter)))\n",
|
| 170 |
+
"print('10 Most common words in the English dataset:')\n",
|
| 171 |
+
"print('\"' + '\" \"'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '\"')\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"print()\n",
|
| 174 |
+
"print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))\n",
|
| 175 |
+
"print('{} unique French words.'.format(len(french_words_counter)))\n",
|
| 176 |
+
"print('10 Most common words in the French dataset:')\n",
|
| 177 |
+
"print('\"' + '\" \"'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '\"')"
|
| 178 |
+
]
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"cell_type": "markdown",
|
| 182 |
+
"metadata": {},
|
| 183 |
+
"source": [
|
| 184 |
+
"### Preprocess\n",
|
| 185 |
+
"1. Tokenize the words into ids\n",
|
| 186 |
+
"2. Add padding to make all the sequences the same length."
|
| 187 |
+
]
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"cell_type": "code",
|
| 191 |
+
"execution_count": 7,
|
| 192 |
+
"metadata": {},
|
| 193 |
+
"outputs": [
|
| 194 |
+
{
|
| 195 |
+
"name": "stdout",
|
| 196 |
+
"output_type": "stream",
|
| 197 |
+
"text": [
|
| 198 |
+
"{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}\n",
|
| 199 |
+
"\n",
|
| 200 |
+
"Sequence 1 in x\n",
|
| 201 |
+
" Input: The quick brown fox jumps over the lazy dog .\n",
|
| 202 |
+
" Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]\n",
|
| 203 |
+
"Sequence 2 in x\n",
|
| 204 |
+
" Input: By Jove , my quick study of lexicography won a prize .\n",
|
| 205 |
+
" Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]\n",
|
| 206 |
+
"Sequence 3 in x\n",
|
| 207 |
+
" Input: This is a short sentence .\n",
|
| 208 |
+
" Output: [18, 19, 3, 20, 21]\n"
|
| 209 |
+
]
|
| 210 |
+
}
|
| 211 |
+
],
|
| 212 |
+
"source": [
|
| 213 |
+
"def tokenize(x):\n",
|
| 214 |
+
" tokenizer = Tokenizer()\n",
|
| 215 |
+
" tokenizer.fit_on_texts(x)\n",
|
| 216 |
+
" return tokenizer.texts_to_sequences(x), tokenizer\n",
|
| 217 |
+
"\n",
|
| 218 |
+
"text_sentences = [\n",
|
| 219 |
+
" 'The quick brown fox jumps over the lazy dog .',\n",
|
| 220 |
+
" 'By Jove , my quick study of lexicography won a prize .',\n",
|
| 221 |
+
" 'This is a short sentence .']\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"text_tokenized, text_tokenizer = tokenize(text_sentences)\n",
|
| 224 |
+
"print(text_tokenizer.word_index)\n",
|
| 225 |
+
"print()\n",
|
| 226 |
+
"for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):\n",
|
| 227 |
+
" print('Sequence {} in x'.format(sample_i + 1))\n",
|
| 228 |
+
" print(' Input: {}'.format(sent))\n",
|
| 229 |
+
" print(' Output: {}'.format(token_sent))"
|
| 230 |
+
]
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"cell_type": "code",
|
| 234 |
+
"execution_count": 8,
|
| 235 |
+
"metadata": {},
|
| 236 |
+
"outputs": [
|
| 237 |
+
{
|
| 238 |
+
"name": "stdout",
|
| 239 |
+
"output_type": "stream",
|
| 240 |
+
"text": [
|
| 241 |
+
"Sequence 1 in x\n",
|
| 242 |
+
" Input: [1 2 4 5 6 7 1 8 9]\n",
|
| 243 |
+
" Output: [1 2 4 5 6 7 1 8 9 0]\n",
|
| 244 |
+
"Sequence 2 in x\n",
|
| 245 |
+
" Input: [10 11 12 2 13 14 15 16 3 17]\n",
|
| 246 |
+
" Output: [10 11 12 2 13 14 15 16 3 17]\n",
|
| 247 |
+
"Sequence 3 in x\n",
|
| 248 |
+
" Input: [18 19 3 20 21]\n",
|
| 249 |
+
" Output: [18 19 3 20 21 0 0 0 0 0]\n"
|
| 250 |
+
]
|
| 251 |
+
}
|
| 252 |
+
],
|
| 253 |
+
"source": [
|
| 254 |
+
"def pad(x, length=None):\n",
|
| 255 |
+
" if length is None:\n",
|
| 256 |
+
" length = max([len(sentence) for sentence in x])\n",
|
| 257 |
+
" return pad_sequences(x, maxlen=length, padding='post')\n",
|
| 258 |
+
"\n",
|
| 259 |
+
"test_pad = pad(text_tokenized)\n",
|
| 260 |
+
"for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):\n",
|
| 261 |
+
" print('Sequence {} in x'.format(sample_i + 1))\n",
|
| 262 |
+
" print(' Input: {}'.format(np.array(token_sent)))\n",
|
| 263 |
+
" print(' Output: {}'.format(pad_sent))"
|
| 264 |
+
]
|
| 265 |
+
},
|
| 266 |
+
{
|
| 267 |
+
"cell_type": "code",
|
| 268 |
+
"execution_count": 9,
|
| 269 |
+
"metadata": {},
|
| 270 |
+
"outputs": [
|
| 271 |
+
{
|
| 272 |
+
"name": "stdout",
|
| 273 |
+
"output_type": "stream",
|
| 274 |
+
"text": [
|
| 275 |
+
"Data Preprocessed\n",
|
| 276 |
+
"Max English sentence length: 15\n",
|
| 277 |
+
"Max French sentence length: 21\n",
|
| 278 |
+
"English vocabulary size: 199\n",
|
| 279 |
+
"French vocabulary size: 344\n"
|
| 280 |
+
]
|
| 281 |
+
}
|
| 282 |
+
],
|
| 283 |
+
"source": [
|
| 284 |
+
"def preprocess(x,y):\n",
|
| 285 |
+
" preprocess_x, x_tk = tokenize(x)\n",
|
| 286 |
+
" preprocess_y, y_tk = tokenize(y)\n",
|
| 287 |
+
" \n",
|
| 288 |
+
" preprocess_x = pad(preprocess_x)\n",
|
| 289 |
+
" preprocess_y = pad(preprocess_y)\n",
|
| 290 |
+
" \n",
|
| 291 |
+
" preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)\n",
|
| 292 |
+
" \n",
|
| 293 |
+
" return preprocess_x, preprocess_y, x_tk, y_tk\n",
|
| 294 |
+
"\n",
|
| 295 |
+
"preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)\n",
|
| 296 |
+
"\n",
|
| 297 |
+
"max_english_sequence_length = preproc_english_sentences.shape[1]\n",
|
| 298 |
+
"max_french_sequence_length = preproc_french_sentences.shape[1]\n",
|
| 299 |
+
"english_vocab_size = len(english_tokenizer.word_index)\n",
|
| 300 |
+
"french_vocab_size = len(french_tokenizer.word_index)\n",
|
| 301 |
+
"\n",
|
| 302 |
+
"print('Data Preprocessed')\n",
|
| 303 |
+
"print(\"Max English sentence length:\", max_english_sequence_length)\n",
|
| 304 |
+
"print(\"Max French sentence length:\", max_french_sequence_length)\n",
|
| 305 |
+
"print(\"English vocabulary size:\", english_vocab_size)\n",
|
| 306 |
+
"print(\"French vocabulary size:\", french_vocab_size)"
|
| 307 |
+
]
|
| 308 |
+
},
|
| 309 |
+
{
|
| 310 |
+
"cell_type": "markdown",
|
| 311 |
+
"metadata": {},
|
| 312 |
+
"source": [
|
| 313 |
+
"## Models\n",
|
| 314 |
+
"- Model 1 is a simple RNN\n",
|
| 315 |
+
"- Model 2 is a Bidirectional RNN\n",
|
| 316 |
+
"- Model 3 is an Embedding RNN\n",
|
| 317 |
+
"\n",
|
| 318 |
+
"### Ids Back to Text\n",
|
| 319 |
+
"The neural network will be translating the input to words ids, which isn't the final form we want. We want the French translation. The function `logits_to_text` will bridge the gab between the logits from the neural network to the French translation. You'll be using this function to better understand the output of the neural network."
|
| 320 |
+
]
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"cell_type": "code",
|
| 324 |
+
"execution_count": 10,
|
| 325 |
+
"metadata": {},
|
| 326 |
+
"outputs": [],
|
| 327 |
+
"source": [
|
| 328 |
+
"def logits_to_text(logits, tokenizer):\n",
|
| 329 |
+
" index_to_words = {id: word for word, id in tokenizer.word_index.items()}\n",
|
| 330 |
+
" index_to_words[0] = '<PAD>'\n",
|
| 331 |
+
" \n",
|
| 332 |
+
" return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])"
|
| 333 |
+
]
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"cell_type": "markdown",
|
| 337 |
+
"metadata": {},
|
| 338 |
+
"source": [
|
| 339 |
+
"### Model 1: RNN\n",
|
| 340 |
+
"\n",
|
| 341 |
+
"A basic RNN model is a good baseline for sequence data. In this model, you'll build a RNN that translates English to French."
|
| 342 |
+
]
|
| 343 |
+
},
|
| 344 |
+
{
|
| 345 |
+
"cell_type": "code",
|
| 346 |
+
"execution_count": 11,
|
| 347 |
+
"metadata": {},
|
| 348 |
+
"outputs": [
|
| 349 |
+
{
|
| 350 |
+
"name": "stdout",
|
| 351 |
+
"output_type": "stream",
|
| 352 |
+
"text": [
|
| 353 |
+
"Epoch 1/20\n",
|
| 354 |
+
"108/108 [==============================] - 17s 90ms/step - loss: 1.9094 - accuracy: 0.5446 - val_loss: nan - val_accuracy: 0.6307\n",
|
| 355 |
+
"Epoch 2/20\n",
|
| 356 |
+
"108/108 [==============================] - 9s 84ms/step - loss: 1.2243 - accuracy: 0.6429 - val_loss: nan - val_accuracy: 0.6716\n",
|
| 357 |
+
"Epoch 3/20\n",
|
| 358 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 1.0848 - accuracy: 0.6683 - val_loss: nan - val_accuracy: 0.6864\n",
|
| 359 |
+
"Epoch 4/20\n",
|
| 360 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 1.0057 - accuracy: 0.6832 - val_loss: nan - val_accuracy: 0.6927\n",
|
| 361 |
+
"Epoch 5/20\n",
|
| 362 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.9484 - accuracy: 0.6931 - val_loss: nan - val_accuracy: 0.7108\n",
|
| 363 |
+
"Epoch 6/20\n",
|
| 364 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.9019 - accuracy: 0.7036 - val_loss: nan - val_accuracy: 0.7007\n",
|
| 365 |
+
"Epoch 7/20\n",
|
| 366 |
+
"108/108 [==============================] - 9s 84ms/step - loss: 0.8916 - accuracy: 0.6999 - val_loss: nan - val_accuracy: 0.7244\n",
|
| 367 |
+
"Epoch 8/20\n",
|
| 368 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.8407 - accuracy: 0.7178 - val_loss: nan - val_accuracy: 0.7567\n",
|
| 369 |
+
"Epoch 9/20\n",
|
| 370 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.7807 - accuracy: 0.7405 - val_loss: nan - val_accuracy: 0.7405\n",
|
| 371 |
+
"Epoch 10/20\n",
|
| 372 |
+
"108/108 [==============================] - 9s 84ms/step - loss: 0.7474 - accuracy: 0.7496 - val_loss: nan - val_accuracy: 0.7721\n",
|
| 373 |
+
"Epoch 11/20\n",
|
| 374 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.7739 - accuracy: 0.7392 - val_loss: nan - val_accuracy: 0.7392\n",
|
| 375 |
+
"Epoch 12/20\n",
|
| 376 |
+
"108/108 [==============================] - 9s 84ms/step - loss: 0.7552 - accuracy: 0.7420 - val_loss: nan - val_accuracy: 0.7851\n",
|
| 377 |
+
"Epoch 13/20\n",
|
| 378 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.7238 - accuracy: 0.7550 - val_loss: nan - val_accuracy: 0.7937\n",
|
| 379 |
+
"Epoch 14/20\n",
|
| 380 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.7126 - accuracy: 0.7568 - val_loss: nan - val_accuracy: 0.7830\n",
|
| 381 |
+
"Epoch 15/20\n",
|
| 382 |
+
"108/108 [==============================] - 9s 84ms/step - loss: 0.6838 - accuracy: 0.7650 - val_loss: nan - val_accuracy: 0.7976\n",
|
| 383 |
+
"Epoch 16/20\n",
|
| 384 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.6577 - accuracy: 0.7776 - val_loss: nan - val_accuracy: 0.7995\n",
|
| 385 |
+
"Epoch 17/20\n",
|
| 386 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.6447 - accuracy: 0.7821 - val_loss: nan - val_accuracy: 0.8072\n",
|
| 387 |
+
"Epoch 18/20\n",
|
| 388 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.6309 - accuracy: 0.7858 - val_loss: nan - val_accuracy: 0.8100\n",
|
| 389 |
+
"Epoch 19/20\n",
|
| 390 |
+
"108/108 [==============================] - 9s 84ms/step - loss: 0.6073 - accuracy: 0.7930 - val_loss: nan - val_accuracy: 0.8111\n",
|
| 391 |
+
"Epoch 20/20\n",
|
| 392 |
+
"108/108 [==============================] - 9s 83ms/step - loss: 0.6100 - accuracy: 0.7912 - val_loss: nan - val_accuracy: 0.8150\n"
|
| 393 |
+
]
|
| 394 |
+
},
|
| 395 |
+
{
|
| 396 |
+
"data": {
|
| 397 |
+
"text/plain": [
|
| 398 |
+
"<keras.callbacks.History at 0x2573cec49a0>"
|
| 399 |
+
]
|
| 400 |
+
},
|
| 401 |
+
"execution_count": 11,
|
| 402 |
+
"metadata": {},
|
| 403 |
+
"output_type": "execute_result"
|
| 404 |
+
}
|
| 405 |
+
],
|
| 406 |
+
"source": [
|
| 407 |
+
"def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):\n",
|
| 408 |
+
" \n",
|
| 409 |
+
" #Hyperparameters\n",
|
| 410 |
+
" learning_rate = 0.005\n",
|
| 411 |
+
" \n",
|
| 412 |
+
" # Build the layers\n",
|
| 413 |
+
" model = Sequential()\n",
|
| 414 |
+
" model.add(GRU(256, input_shape=input_shape[1:], return_sequences=True))\n",
|
| 415 |
+
" model.add(TimeDistributed(Dense(1024, activation='relu')))\n",
|
| 416 |
+
" model.add(Dropout(0.5))\n",
|
| 417 |
+
" model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))\n",
|
| 418 |
+
" \n",
|
| 419 |
+
" # Compile model\n",
|
| 420 |
+
" model.compile(loss = sparse_categorical_crossentropy,\n",
|
| 421 |
+
" optimizer = Adam(learning_rate),\n",
|
| 422 |
+
" metrics = ['accuracy'])\n",
|
| 423 |
+
" \n",
|
| 424 |
+
" return model\n",
|
| 425 |
+
"\n",
|
| 426 |
+
"tmp_x = pad(preproc_english_sentences, max_french_sequence_length)\n",
|
| 427 |
+
"tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))\n",
|
| 428 |
+
"\n",
|
| 429 |
+
"#Train the neural network\n",
|
| 430 |
+
"simple_rnn_model = simple_model(\n",
|
| 431 |
+
" tmp_x.shape,\n",
|
| 432 |
+
" max_french_sequence_length,\n",
|
| 433 |
+
" english_vocab_size,\n",
|
| 434 |
+
" french_vocab_size)\n",
|
| 435 |
+
"\n",
|
| 436 |
+
"simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=20, validation_split=0.2)"
|
| 437 |
+
]
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"cell_type": "code",
|
| 441 |
+
"execution_count": 12,
|
| 442 |
+
"metadata": {},
|
| 443 |
+
"outputs": [
|
| 444 |
+
{
|
| 445 |
+
"name": "stdout",
|
| 446 |
+
"output_type": "stream",
|
| 447 |
+
"text": [
|
| 448 |
+
"Prediciton:\n",
|
| 449 |
+
"1/1 [==============================] - 0s 259ms/step\n",
|
| 450 |
+
"new jersey est parfois calme en mois et il est il est en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>\n",
|
| 451 |
+
"\n",
|
| 452 |
+
"Correct Translation:\n",
|
| 453 |
+
"[\"new jersey est parfois calme pendant l' automne , et il est neigeux en avril .\"]\n",
|
| 454 |
+
"\n",
|
| 455 |
+
"Original text:\n",
|
| 456 |
+
"['new jersey is sometimes quiet during autumn , and it is snowy in april .']\n"
|
| 457 |
+
]
|
| 458 |
+
}
|
| 459 |
+
],
|
| 460 |
+
"source": [
|
| 461 |
+
"# Print prediction(s)\n",
|
| 462 |
+
"print(\"Prediciton:\")\n",
|
| 463 |
+
"print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))\n",
|
| 464 |
+
"\n",
|
| 465 |
+
"print(\"\\nCorrect Translation:\")\n",
|
| 466 |
+
"print(french_sentences[:1])\n",
|
| 467 |
+
"\n",
|
| 468 |
+
"print('\\nOriginal text:')\n",
|
| 469 |
+
"print(english_sentences[:1])"
|
| 470 |
+
]
|
| 471 |
+
},
|
| 472 |
+
{
|
| 473 |
+
"cell_type": "markdown",
|
| 474 |
+
"metadata": {},
|
| 475 |
+
"source": [
|
| 476 |
+
"### Model 2: Bidirectional RNNs\n",
|
| 477 |
+
"\n",
|
| 478 |
+
"One restriction of a RNN is that it can't see the future input, only the past. This is where bidirectional recurrent neural networks come in. They are able to see the future data."
|
| 479 |
+
]
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"cell_type": "code",
|
| 483 |
+
"execution_count": 13,
|
| 484 |
+
"metadata": {},
|
| 485 |
+
"outputs": [
|
| 486 |
+
{
|
| 487 |
+
"name": "stdout",
|
| 488 |
+
"output_type": "stream",
|
| 489 |
+
"text": [
|
| 490 |
+
"Model: \"sequential_1\"\n",
|
| 491 |
+
"_________________________________________________________________\n",
|
| 492 |
+
" Layer (type) Output Shape Param # \n",
|
| 493 |
+
"=================================================================\n",
|
| 494 |
+
" bidirectional (Bidirectiona (None, 21, 256) 100608 \n",
|
| 495 |
+
" l) \n",
|
| 496 |
+
" \n",
|
| 497 |
+
" time_distributed_2 (TimeDis (None, 21, 1024) 263168 \n",
|
| 498 |
+
" tributed) \n",
|
| 499 |
+
" \n",
|
| 500 |
+
" dropout_1 (Dropout) (None, 21, 1024) 0 \n",
|
| 501 |
+
" \n",
|
| 502 |
+
" time_distributed_3 (TimeDis (None, 21, 344) 352600 \n",
|
| 503 |
+
" tributed) \n",
|
| 504 |
+
" \n",
|
| 505 |
+
"=================================================================\n",
|
| 506 |
+
"Total params: 716,376\n",
|
| 507 |
+
"Trainable params: 716,376\n",
|
| 508 |
+
"Non-trainable params: 0\n",
|
| 509 |
+
"_________________________________________________________________\n",
|
| 510 |
+
"None\n",
|
| 511 |
+
"Epoch 1/20\n",
|
| 512 |
+
"108/108 [==============================] - 12s 90ms/step - loss: 1.7553 - accuracy: 0.5756 - val_loss: nan - val_accuracy: 0.6505\n",
|
| 513 |
+
"Epoch 2/20\n",
|
| 514 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 1.1655 - accuracy: 0.6550 - val_loss: nan - val_accuracy: 0.6802\n",
|
| 515 |
+
"Epoch 3/20\n",
|
| 516 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 1.0423 - accuracy: 0.6759 - val_loss: nan - val_accuracy: 0.6903\n",
|
| 517 |
+
"Epoch 4/20\n",
|
| 518 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.9663 - accuracy: 0.6880 - val_loss: nan - val_accuracy: 0.7003\n",
|
| 519 |
+
"Epoch 5/20\n",
|
| 520 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.9119 - accuracy: 0.6974 - val_loss: nan - val_accuracy: 0.7207\n",
|
| 521 |
+
"Epoch 6/20\n",
|
| 522 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.8700 - accuracy: 0.7059 - val_loss: nan - val_accuracy: 0.7287\n",
|
| 523 |
+
"Epoch 7/20\n",
|
| 524 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.8359 - accuracy: 0.7129 - val_loss: nan - val_accuracy: 0.7301\n",
|
| 525 |
+
"Epoch 8/20\n",
|
| 526 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.8495 - accuracy: 0.7090 - val_loss: nan - val_accuracy: 0.7300\n",
|
| 527 |
+
"Epoch 9/20\n",
|
| 528 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.8025 - accuracy: 0.7197 - val_loss: nan - val_accuracy: 0.7386\n",
|
| 529 |
+
"Epoch 10/20\n",
|
| 530 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.7839 - accuracy: 0.7228 - val_loss: nan - val_accuracy: 0.7429\n",
|
| 531 |
+
"Epoch 11/20\n",
|
| 532 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.7671 - accuracy: 0.7248 - val_loss: nan - val_accuracy: 0.7461\n",
|
| 533 |
+
"Epoch 12/20\n",
|
| 534 |
+
"108/108 [==============================] - 9s 86ms/step - loss: 0.7490 - accuracy: 0.7278 - val_loss: nan - val_accuracy: 0.7487\n",
|
| 535 |
+
"Epoch 13/20\n",
|
| 536 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.7341 - accuracy: 0.7307 - val_loss: nan - val_accuracy: 0.7473\n",
|
| 537 |
+
"Epoch 14/20\n",
|
| 538 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.7183 - accuracy: 0.7363 - val_loss: nan - val_accuracy: 0.7614\n",
|
| 539 |
+
"Epoch 15/20\n",
|
| 540 |
+
"108/108 [==============================] - 9s 86ms/step - loss: 0.6998 - accuracy: 0.7427 - val_loss: nan - val_accuracy: 0.7594\n",
|
| 541 |
+
"Epoch 16/20\n",
|
| 542 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.7086 - accuracy: 0.7361 - val_loss: nan - val_accuracy: 0.7596\n",
|
| 543 |
+
"Epoch 17/20\n",
|
| 544 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.6889 - accuracy: 0.7424 - val_loss: nan - val_accuracy: 0.7679\n",
|
| 545 |
+
"Epoch 18/20\n",
|
| 546 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.6780 - accuracy: 0.7493 - val_loss: nan - val_accuracy: 0.7763\n",
|
| 547 |
+
"Epoch 19/20\n",
|
| 548 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.6625 - accuracy: 0.7535 - val_loss: nan - val_accuracy: 0.7771\n",
|
| 549 |
+
"Epoch 20/20\n",
|
| 550 |
+
"108/108 [==============================] - 9s 85ms/step - loss: 0.6572 - accuracy: 0.7553 - val_loss: nan - val_accuracy: 0.7560\n"
|
| 551 |
+
]
|
| 552 |
+
},
|
| 553 |
+
{
|
| 554 |
+
"data": {
|
| 555 |
+
"text/plain": [
|
| 556 |
+
"<keras.callbacks.History at 0x2573f78f0a0>"
|
| 557 |
+
]
|
| 558 |
+
},
|
| 559 |
+
"execution_count": 13,
|
| 560 |
+
"metadata": {},
|
| 561 |
+
"output_type": "execute_result"
|
| 562 |
+
}
|
| 563 |
+
],
|
| 564 |
+
"source": [
|
| 565 |
+
"def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):\n",
|
| 566 |
+
" \n",
|
| 567 |
+
" #Hyperparameters\n",
|
| 568 |
+
" learning_rate = 0.005\n",
|
| 569 |
+
" \n",
|
| 570 |
+
" # Build the layers\n",
|
| 571 |
+
" model = Sequential()\n",
|
| 572 |
+
" model.add(Bidirectional(GRU(128, return_sequences=True), input_shape=input_shape[1:]))\n",
|
| 573 |
+
" model.add(TimeDistributed(Dense(1024, activation='relu')))\n",
|
| 574 |
+
" model.add(Dropout(0.5))\n",
|
| 575 |
+
" model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))\n",
|
| 576 |
+
" \n",
|
| 577 |
+
" # Compile model\n",
|
| 578 |
+
" model.compile(loss = sparse_categorical_crossentropy,\n",
|
| 579 |
+
" optimizer = Adam(learning_rate),\n",
|
| 580 |
+
" metrics = ['accuracy'])\n",
|
| 581 |
+
" \n",
|
| 582 |
+
" return model\n",
|
| 583 |
+
"\n",
|
| 584 |
+
"tmp_x = pad(preproc_english_sentences, max_french_sequence_length)\n",
|
| 585 |
+
"tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))\n",
|
| 586 |
+
"\n",
|
| 587 |
+
"# Train the neural network\n",
|
| 588 |
+
"bd_rnn_model = bd_model(\n",
|
| 589 |
+
" tmp_x.shape,\n",
|
| 590 |
+
" max_french_sequence_length,\n",
|
| 591 |
+
" english_vocab_size,\n",
|
| 592 |
+
" french_vocab_size)\n",
|
| 593 |
+
"\n",
|
| 594 |
+
"print(bd_rnn_model.summary())\n",
|
| 595 |
+
"\n",
|
| 596 |
+
"bd_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=20, validation_split=0.2)"
|
| 597 |
+
]
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"cell_type": "code",
|
| 601 |
+
"execution_count": 14,
|
| 602 |
+
"metadata": {},
|
| 603 |
+
"outputs": [
|
| 604 |
+
{
|
| 605 |
+
"name": "stdout",
|
| 606 |
+
"output_type": "stream",
|
| 607 |
+
"text": [
|
| 608 |
+
"Prediciton:\n",
|
| 609 |
+
"1/1 [==============================] - 1s 544ms/step\n",
|
| 610 |
+
"new jersey est parfois chaud en mois et il et il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>\n",
|
| 611 |
+
"\n",
|
| 612 |
+
"Correct Translation:\n",
|
| 613 |
+
"[\"new jersey est parfois calme pendant l' automne , et il est neigeux en avril .\"]\n",
|
| 614 |
+
"\n",
|
| 615 |
+
"Original text:\n",
|
| 616 |
+
"['new jersey is sometimes quiet during autumn , and it is snowy in april .']\n"
|
| 617 |
+
]
|
| 618 |
+
}
|
| 619 |
+
],
|
| 620 |
+
"source": [
|
| 621 |
+
"# Print prediction(s)\n",
|
| 622 |
+
"print(\"Prediciton:\")\n",
|
| 623 |
+
"print(logits_to_text(bd_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))\n",
|
| 624 |
+
"\n",
|
| 625 |
+
"print(\"\\nCorrect Translation:\")\n",
|
| 626 |
+
"print(french_sentences[:1])\n",
|
| 627 |
+
"\n",
|
| 628 |
+
"print('\\nOriginal text:')\n",
|
| 629 |
+
"print(english_sentences[:1])"
|
| 630 |
+
]
|
| 631 |
+
},
|
| 632 |
+
{
|
| 633 |
+
"cell_type": "markdown",
|
| 634 |
+
"metadata": {},
|
| 635 |
+
"source": [
|
| 636 |
+
"### Model 3: Embedding\n",
|
| 637 |
+
"\n",
|
| 638 |
+
"You've turned the words into ids, but there's a better representation of a word. This is called word embeddings. An embedding is a vector representation of the word that is close to similar words in n-dimensional space, where the n represents the size of the embedding vectors."
|
| 639 |
+
]
|
| 640 |
+
},
|
| 641 |
+
{
|
| 642 |
+
"cell_type": "code",
|
| 643 |
+
"execution_count": 15,
|
| 644 |
+
"metadata": {},
|
| 645 |
+
"outputs": [
|
| 646 |
+
{
|
| 647 |
+
"name": "stdout",
|
| 648 |
+
"output_type": "stream",
|
| 649 |
+
"text": [
|
| 650 |
+
"Model: \"sequential_2\"\n",
|
| 651 |
+
"_________________________________________________________________\n",
|
| 652 |
+
" Layer (type) Output Shape Param # \n",
|
| 653 |
+
"=================================================================\n",
|
| 654 |
+
" embedding (Embedding) (None, 21, 256) 50944 \n",
|
| 655 |
+
" \n",
|
| 656 |
+
" bidirectional_1 (Bidirectio (None, 21, 512) 789504 \n",
|
| 657 |
+
" nal) \n",
|
| 658 |
+
" \n",
|
| 659 |
+
" time_distributed_4 (TimeDis (None, 21, 1024) 525312 \n",
|
| 660 |
+
" tributed) \n",
|
| 661 |
+
" \n",
|
| 662 |
+
" dropout_2 (Dropout) (None, 21, 1024) 0 \n",
|
| 663 |
+
" \n",
|
| 664 |
+
" time_distributed_5 (TimeDis (None, 21, 344) 352600 \n",
|
| 665 |
+
" tributed) \n",
|
| 666 |
+
" \n",
|
| 667 |
+
"=================================================================\n",
|
| 668 |
+
"Total params: 1,718,360\n",
|
| 669 |
+
"Trainable params: 1,718,360\n",
|
| 670 |
+
"Non-trainable params: 0\n",
|
| 671 |
+
"_________________________________________________________________\n",
|
| 672 |
+
"None\n",
|
| 673 |
+
"Epoch 1/20\n",
|
| 674 |
+
"108/108 [==============================] - 17s 130ms/step - loss: 1.3473 - accuracy: 0.6924 - val_loss: nan - val_accuracy: 0.8697\n",
|
| 675 |
+
"Epoch 2/20\n",
|
| 676 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.3152 - accuracy: 0.9003 - val_loss: nan - val_accuracy: 0.9346\n",
|
| 677 |
+
"Epoch 3/20\n",
|
| 678 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.1808 - accuracy: 0.9434 - val_loss: nan - val_accuracy: 0.9578\n",
|
| 679 |
+
"Epoch 4/20\n",
|
| 680 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.1291 - accuracy: 0.9601 - val_loss: nan - val_accuracy: 0.9702\n",
|
| 681 |
+
"Epoch 5/20\n",
|
| 682 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.1022 - accuracy: 0.9688 - val_loss: nan - val_accuracy: 0.9737\n",
|
| 683 |
+
"Epoch 6/20\n",
|
| 684 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0854 - accuracy: 0.9739 - val_loss: nan - val_accuracy: 0.9772\n",
|
| 685 |
+
"Epoch 7/20\n",
|
| 686 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0763 - accuracy: 0.9767 - val_loss: nan - val_accuracy: 0.9780\n",
|
| 687 |
+
"Epoch 8/20\n",
|
| 688 |
+
"108/108 [==============================] - 14s 127ms/step - loss: 0.0658 - accuracy: 0.9798 - val_loss: nan - val_accuracy: 0.9798\n",
|
| 689 |
+
"Epoch 9/20\n",
|
| 690 |
+
"108/108 [==============================] - 14s 127ms/step - loss: 0.0604 - accuracy: 0.9815 - val_loss: nan - val_accuracy: 0.9816\n",
|
| 691 |
+
"Epoch 10/20\n",
|
| 692 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0572 - accuracy: 0.9825 - val_loss: nan - val_accuracy: 0.9823\n",
|
| 693 |
+
"Epoch 11/20\n",
|
| 694 |
+
"108/108 [==============================] - 14s 127ms/step - loss: 0.0511 - accuracy: 0.9842 - val_loss: nan - val_accuracy: 0.9836\n",
|
| 695 |
+
"Epoch 12/20\n",
|
| 696 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0519 - accuracy: 0.9840 - val_loss: nan - val_accuracy: 0.9839\n",
|
| 697 |
+
"Epoch 13/20\n",
|
| 698 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0494 - accuracy: 0.9850 - val_loss: nan - val_accuracy: 0.9813\n",
|
| 699 |
+
"Epoch 14/20\n",
|
| 700 |
+
"108/108 [==============================] - 14s 127ms/step - loss: 0.0499 - accuracy: 0.9847 - val_loss: nan - val_accuracy: 0.9837\n",
|
| 701 |
+
"Epoch 15/20\n",
|
| 702 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0471 - accuracy: 0.9856 - val_loss: nan - val_accuracy: 0.9838\n",
|
| 703 |
+
"Epoch 16/20\n",
|
| 704 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0411 - accuracy: 0.9874 - val_loss: nan - val_accuracy: 0.9843\n",
|
| 705 |
+
"Epoch 17/20\n",
|
| 706 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0378 - accuracy: 0.9883 - val_loss: nan - val_accuracy: 0.9851\n",
|
| 707 |
+
"Epoch 18/20\n",
|
| 708 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0368 - accuracy: 0.9886 - val_loss: nan - val_accuracy: 0.9853\n",
|
| 709 |
+
"Epoch 19/20\n",
|
| 710 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0386 - accuracy: 0.9883 - val_loss: nan - val_accuracy: 0.9845\n",
|
| 711 |
+
"Epoch 20/20\n",
|
| 712 |
+
"108/108 [==============================] - 14s 126ms/step - loss: 0.0459 - accuracy: 0.9863 - val_loss: nan - val_accuracy: 0.9843\n"
|
| 713 |
+
]
|
| 714 |
+
},
|
| 715 |
+
{
|
| 716 |
+
"data": {
|
| 717 |
+
"text/plain": [
|
| 718 |
+
"<keras.callbacks.History at 0x2573f340d90>"
|
| 719 |
+
]
|
| 720 |
+
},
|
| 721 |
+
"execution_count": 15,
|
| 722 |
+
"metadata": {},
|
| 723 |
+
"output_type": "execute_result"
|
| 724 |
+
}
|
| 725 |
+
],
|
| 726 |
+
"source": [
|
| 727 |
+
"def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):\n",
|
| 728 |
+
" \n",
|
| 729 |
+
" # Hyperparameters\n",
|
| 730 |
+
" learning_rate = 0.005\n",
|
| 731 |
+
" \n",
|
| 732 |
+
" # Build the layers\n",
|
| 733 |
+
" model = Sequential()\n",
|
| 734 |
+
" model.add(Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))\n",
|
| 735 |
+
" model.add(Bidirectional(GRU(256, return_sequences=True)))\n",
|
| 736 |
+
" model.add(TimeDistributed(Dense(1024, activation='relu')))\n",
|
| 737 |
+
" model.add(Dropout(0.5))\n",
|
| 738 |
+
" model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))\n",
|
| 739 |
+
" \n",
|
| 740 |
+
" # Compile model\n",
|
| 741 |
+
" model.compile(loss = sparse_categorical_crossentropy,\n",
|
| 742 |
+
" optimizer = Adam(learning_rate),\n",
|
| 743 |
+
" metrics = ['accuracy'])\n",
|
| 744 |
+
" \n",
|
| 745 |
+
" return model\n",
|
| 746 |
+
"\n",
|
| 747 |
+
"tmp_x = pad(preproc_english_sentences, max_french_sequence_length)\n",
|
| 748 |
+
"tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))\n",
|
| 749 |
+
"\n",
|
| 750 |
+
"# Build the model\n",
|
| 751 |
+
"embed_rnn_model = bidirectional_embed_model(\n",
|
| 752 |
+
" tmp_x.shape,\n",
|
| 753 |
+
" max_french_sequence_length,\n",
|
| 754 |
+
" english_vocab_size,\n",
|
| 755 |
+
" french_vocab_size)\n",
|
| 756 |
+
"\n",
|
| 757 |
+
"print(embed_rnn_model.summary())\n",
|
| 758 |
+
"\n",
|
| 759 |
+
"embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=20, validation_split=0.2)\n",
|
| 760 |
+
" "
|
| 761 |
+
]
|
| 762 |
+
},
|
| 763 |
+
{
|
| 764 |
+
"cell_type": "code",
|
| 765 |
+
"execution_count": 16,
|
| 766 |
+
"metadata": {},
|
| 767 |
+
"outputs": [
|
| 768 |
+
{
|
| 769 |
+
"name": "stdout",
|
| 770 |
+
"output_type": "stream",
|
| 771 |
+
"text": [
|
| 772 |
+
"Prediciton:\n",
|
| 773 |
+
"1/1 [==============================] - 0s 410ms/step\n",
|
| 774 |
+
"new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>\n",
|
| 775 |
+
"\n",
|
| 776 |
+
"Correct Translation:\n",
|
| 777 |
+
"[\"new jersey est parfois calme pendant l' automne , et il est neigeux en avril .\"]\n",
|
| 778 |
+
"\n",
|
| 779 |
+
"Original text:\n",
|
| 780 |
+
"['new jersey is sometimes quiet during autumn , and it is snowy in april .']\n"
|
| 781 |
+
]
|
| 782 |
+
}
|
| 783 |
+
],
|
| 784 |
+
"source": [
|
| 785 |
+
"# Print prediction(s)\n",
|
| 786 |
+
"print(\"Prediciton:\")\n",
|
| 787 |
+
"print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))\n",
|
| 788 |
+
"\n",
|
| 789 |
+
"print(\"\\nCorrect Translation:\")\n",
|
| 790 |
+
"print(french_sentences[:1])\n",
|
| 791 |
+
"\n",
|
| 792 |
+
"print('\\nOriginal text:')\n",
|
| 793 |
+
"print(english_sentences[:1])"
|
| 794 |
+
]
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"cell_type": "code",
|
| 798 |
+
"execution_count": 17,
|
| 799 |
+
"metadata": {},
|
| 800 |
+
"outputs": [
|
| 801 |
+
{
|
| 802 |
+
"name": "stderr",
|
| 803 |
+
"output_type": "stream",
|
| 804 |
+
"text": [
|
| 805 |
+
"WARNING:absl:Found untraced functions such as gru_cell_5_layer_call_fn, gru_cell_5_layer_call_and_return_conditional_losses, gru_cell_6_layer_call_fn, gru_cell_6_layer_call_and_return_conditional_losses while saving (showing 4 of 4). These functions will not be directly callable after loading.\n"
|
| 806 |
+
]
|
| 807 |
+
},
|
| 808 |
+
{
|
| 809 |
+
"name": "stdout",
|
| 810 |
+
"output_type": "stream",
|
| 811 |
+
"text": [
|
| 812 |
+
"INFO:tensorflow:Assets written to: english_to_french_model\\assets\n"
|
| 813 |
+
]
|
| 814 |
+
},
|
| 815 |
+
{
|
| 816 |
+
"name": "stderr",
|
| 817 |
+
"output_type": "stream",
|
| 818 |
+
"text": [
|
| 819 |
+
"INFO:tensorflow:Assets written to: english_to_french_model\\assets\n"
|
| 820 |
+
]
|
| 821 |
+
}
|
| 822 |
+
],
|
| 823 |
+
"source": [
|
| 824 |
+
"embed_rnn_model.save('english_to_french_model')\n",
|
| 825 |
+
"# Serialize English Tokenizer to JSON\n",
|
| 826 |
+
"with open('english_tokenizer.json', 'w', encoding='utf8') as f:\n",
|
| 827 |
+
" f.write(json.dumps(english_tokenizer.to_json(), ensure_ascii=False))\n",
|
| 828 |
+
" \n",
|
| 829 |
+
"# Serialize French Tokenizer to JSON\n",
|
| 830 |
+
"with open('french_tokenizer.json', 'w', encoding='utf8') as f:\n",
|
| 831 |
+
" f.write(json.dumps(french_tokenizer.to_json(), ensure_ascii=False))\n",
|
| 832 |
+
" \n",
|
| 833 |
+
"# Save max lengths\n",
|
| 834 |
+
"max_french_sequence_length_json = max_french_sequence_length\n",
|
| 835 |
+
"with open('sequence_length.json', 'w', encoding='utf8') as f:\n",
|
| 836 |
+
" f.write(json.dumps(max_french_sequence_length_json, ensure_ascii=False))"
|
| 837 |
+
]
|
| 838 |
+
},
|
| 839 |
+
{
|
| 840 |
+
"cell_type": "code",
|
| 841 |
+
"execution_count": null,
|
| 842 |
+
"metadata": {},
|
| 843 |
+
"outputs": [],
|
| 844 |
+
"source": []
|
| 845 |
+
}
|
| 846 |
+
],
|
| 847 |
+
"metadata": {
|
| 848 |
+
"kernelspec": {
|
| 849 |
+
"display_name": "Python 3",
|
| 850 |
+
"language": "python",
|
| 851 |
+
"name": "python3"
|
| 852 |
+
},
|
| 853 |
+
"language_info": {
|
| 854 |
+
"codemirror_mode": {
|
| 855 |
+
"name": "ipython",
|
| 856 |
+
"version": 3
|
| 857 |
+
},
|
| 858 |
+
"file_extension": ".py",
|
| 859 |
+
"mimetype": "text/x-python",
|
| 860 |
+
"name": "python",
|
| 861 |
+
"nbconvert_exporter": "python",
|
| 862 |
+
"pygments_lexer": "ipython3",
|
| 863 |
+
"version": "3.9.19"
|
| 864 |
+
}
|
| 865 |
+
},
|
| 866 |
+
"nbformat": 4,
|
| 867 |
+
"nbformat_minor": 1
|
| 868 |
+
}
|
Task 2/Eng_Spanish.ipynb
ADDED
|
@@ -0,0 +1,782 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Machine Translation Project (English to Spanish)"
|
| 8 |
+
]
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"cell_type": "code",
|
| 12 |
+
"execution_count": 1,
|
| 13 |
+
"metadata": {},
|
| 14 |
+
"outputs": [],
|
| 15 |
+
"source": [
|
| 16 |
+
"import pathlib\n",
|
| 17 |
+
"import random\n",
|
| 18 |
+
"import string\n",
|
| 19 |
+
"import tensorflow.strings as tf_strings\n",
|
| 20 |
+
"import tensorflow.data as tf_data\n",
|
| 21 |
+
"import re\n",
|
| 22 |
+
"from keras.layers import TextVectorization\n",
|
| 23 |
+
"import keras\n",
|
| 24 |
+
"import tensorflow as tf\n",
|
| 25 |
+
"from keras import layers\n",
|
| 26 |
+
"import json"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "markdown",
|
| 31 |
+
"metadata": {},
|
| 32 |
+
"source": [
|
| 33 |
+
"### Verify access to the GPU"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "code",
|
| 38 |
+
"execution_count": 2,
|
| 39 |
+
"metadata": {},
|
| 40 |
+
"outputs": [
|
| 41 |
+
{
|
| 42 |
+
"name": "stdout",
|
| 43 |
+
"output_type": "stream",
|
| 44 |
+
"text": [
|
| 45 |
+
"[name: \"/device:CPU:0\"\n",
|
| 46 |
+
"device_type: \"CPU\"\n",
|
| 47 |
+
"memory_limit: 268435456\n",
|
| 48 |
+
"locality {\n",
|
| 49 |
+
"}\n",
|
| 50 |
+
"incarnation: 16791471205212918184\n",
|
| 51 |
+
"xla_global_id: -1\n",
|
| 52 |
+
", name: \"/device:GPU:0\"\n",
|
| 53 |
+
"device_type: \"GPU\"\n",
|
| 54 |
+
"memory_limit: 1733715559\n",
|
| 55 |
+
"locality {\n",
|
| 56 |
+
" bus_id: 1\n",
|
| 57 |
+
" links {\n",
|
| 58 |
+
" }\n",
|
| 59 |
+
"}\n",
|
| 60 |
+
"incarnation: 6643307082616730570\n",
|
| 61 |
+
"physical_device_desc: \"device: 0, name: NVIDIA GeForce RTX 2050, pci bus id: 0000:01:00.0, compute capability: 8.6\"\n",
|
| 62 |
+
"xla_global_id: 416903419\n",
|
| 63 |
+
"]\n"
|
| 64 |
+
]
|
| 65 |
+
}
|
| 66 |
+
],
|
| 67 |
+
"source": [
|
| 68 |
+
"from tensorflow.python.client import device_lib\n",
|
| 69 |
+
"print(device_lib.list_local_devices())"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"cell_type": "markdown",
|
| 74 |
+
"metadata": {},
|
| 75 |
+
"source": [
|
| 76 |
+
"### Download and prepare the data\n",
|
| 77 |
+
"source :\"http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\""
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"cell_type": "code",
|
| 82 |
+
"execution_count": 3,
|
| 83 |
+
"metadata": {},
|
| 84 |
+
"outputs": [
|
| 85 |
+
{
|
| 86 |
+
"name": "stdout",
|
| 87 |
+
"output_type": "stream",
|
| 88 |
+
"text": [
|
| 89 |
+
"Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\n",
|
| 90 |
+
"2638744/2638744 [==============================] - 11s 4us/step\n"
|
| 91 |
+
]
|
| 92 |
+
}
|
| 93 |
+
],
|
| 94 |
+
"source": [
|
| 95 |
+
"text_file = keras.utils.get_file(\n",
|
| 96 |
+
" fname = \"spa-eng.zip\",\n",
|
| 97 |
+
" origin = \"http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\",\n",
|
| 98 |
+
" extract = True,\n",
|
| 99 |
+
")\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"text_file = pathlib.Path(text_file).parent / \"spa-eng\" / \"spa.txt\"\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"with open(text_file, \"r\") as f:\n",
|
| 104 |
+
" lines = f.read().split(\"\\n\")[:-1]\n",
|
| 105 |
+
" \n",
|
| 106 |
+
"text_pairs = []\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"for line in lines:\n",
|
| 109 |
+
" eng, spa = line.split(\"\\t\")\n",
|
| 110 |
+
" spa = \"[start] \" + spa + \" [end]\"\n",
|
| 111 |
+
" text_pairs.append((eng, spa))"
|
| 112 |
+
]
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"cell_type": "code",
|
| 116 |
+
"execution_count": 4,
|
| 117 |
+
"metadata": {},
|
| 118 |
+
"outputs": [],
|
| 119 |
+
"source": [
|
| 120 |
+
"random.shuffle(text_pairs)"
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": 5,
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"outputs": [
|
| 128 |
+
{
|
| 129 |
+
"name": "stdout",
|
| 130 |
+
"output_type": "stream",
|
| 131 |
+
"text": [
|
| 132 |
+
"('Please remind me to phone him tomorrow.', '[start] Por favor, recordadme que le llame mañana. [end]')\n",
|
| 133 |
+
"('These apples taste good.', '[start] Estas manzanas están buenas. [end]')\n",
|
| 134 |
+
"('Tom is on his own now.', '[start] Tom es independiente ahora. [end]')\n",
|
| 135 |
+
"('Hey, you want to have a lot of fun? Come with us.', '[start] Oye, ¿quieres entretenerte?, acompáñanos. [end]')\n",
|
| 136 |
+
"('You have to remain detached.', '[start] Tú tienes que permanecer independiente. [end]')\n"
|
| 137 |
+
]
|
| 138 |
+
}
|
| 139 |
+
],
|
| 140 |
+
"source": [
|
| 141 |
+
"for i in range(5):\n",
|
| 142 |
+
" print(text_pairs[i])"
|
| 143 |
+
]
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"cell_type": "markdown",
|
| 147 |
+
"metadata": {},
|
| 148 |
+
"source": [
|
| 149 |
+
"Structure of the Dataset"
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"cell_type": "code",
|
| 154 |
+
"execution_count": 6,
|
| 155 |
+
"metadata": {},
|
| 156 |
+
"outputs": [
|
| 157 |
+
{
|
| 158 |
+
"name": "stdout",
|
| 159 |
+
"output_type": "stream",
|
| 160 |
+
"text": [
|
| 161 |
+
"118964 total pairs\n",
|
| 162 |
+
"83276 training pairs\n",
|
| 163 |
+
"17844 validation pairs\n",
|
| 164 |
+
"17844 test pairs\n"
|
| 165 |
+
]
|
| 166 |
+
}
|
| 167 |
+
],
|
| 168 |
+
"source": [
|
| 169 |
+
"num_val_samples = int(0.15 * len(text_pairs))\n",
|
| 170 |
+
"num_train_samples = len(text_pairs) - 2 * num_val_samples\n",
|
| 171 |
+
"train_pairs = text_pairs[:num_train_samples]\n",
|
| 172 |
+
"val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]\n",
|
| 173 |
+
"test_pairs = text_pairs[num_train_samples + num_val_samples:]\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"print(f\"{len(text_pairs)} total pairs\")\n",
|
| 176 |
+
"print(f\"{len(train_pairs)} training pairs\")\n",
|
| 177 |
+
"print(f\"{len(val_pairs)} validation pairs\")\n",
|
| 178 |
+
"print(f\"{len(test_pairs)} test pairs\")"
|
| 179 |
+
]
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"cell_type": "code",
|
| 183 |
+
"execution_count": 7,
|
| 184 |
+
"metadata": {},
|
| 185 |
+
"outputs": [],
|
| 186 |
+
"source": [
|
| 187 |
+
"# parameters\n",
|
| 188 |
+
"strip_chars = string.punctuation + \"¿\"\n",
|
| 189 |
+
"strip_chars = strip_chars.replace(\"[\", \"\")\n",
|
| 190 |
+
"strip_chars = strip_chars.replace(\"]\", \"\")\n",
|
| 191 |
+
"\n",
|
| 192 |
+
"vocab_size = 15000\n",
|
| 193 |
+
"sequence_length = 20\n",
|
| 194 |
+
"batch_size = 64"
|
| 195 |
+
]
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"cell_type": "markdown",
|
| 199 |
+
"metadata": {},
|
| 200 |
+
"source": [
|
| 201 |
+
"## Vectorize the data"
|
| 202 |
+
]
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"cell_type": "code",
|
| 206 |
+
"execution_count": 8,
|
| 207 |
+
"metadata": {},
|
| 208 |
+
"outputs": [],
|
| 209 |
+
"source": [
|
| 210 |
+
"def custom_standardization(input_string):\n",
|
| 211 |
+
" lowercase = tf_strings.lower(input_string)\n",
|
| 212 |
+
" return tf_strings.regex_replace(lowercase, f\"[{re.escape(strip_chars)}]\", \"\")\n",
|
| 213 |
+
"\n",
|
| 214 |
+
"# vectorization\n",
|
| 215 |
+
"eng_vectorization = TextVectorization(\n",
|
| 216 |
+
" max_tokens = vocab_size,\n",
|
| 217 |
+
" output_mode = \"int\",\n",
|
| 218 |
+
" output_sequence_length = sequence_length,\n",
|
| 219 |
+
")\n",
|
| 220 |
+
"\n",
|
| 221 |
+
"spa_vectorization = TextVectorization(\n",
|
| 222 |
+
" max_tokens = vocab_size,\n",
|
| 223 |
+
" output_mode = \"int\",\n",
|
| 224 |
+
" output_sequence_length = sequence_length + 1,\n",
|
| 225 |
+
" standardize = custom_standardization,\n",
|
| 226 |
+
")\n",
|
| 227 |
+
"\n",
|
| 228 |
+
"train_eng_texts = [pair[0] for pair in train_pairs]\n",
|
| 229 |
+
"train_spa_texts = [pair[1] for pair in train_pairs]\n",
|
| 230 |
+
"\n",
|
| 231 |
+
"eng_vectorization.adapt(train_eng_texts)\n",
|
| 232 |
+
"spa_vectorization.adapt(train_spa_texts)\n",
|
| 233 |
+
"\n",
|
| 234 |
+
"#save the vectorization layers\n",
|
| 235 |
+
"eng_vectorization_config = eng_vectorization.get_config()\n",
|
| 236 |
+
"eng_vectorization_config.pop('standardize', None)\n",
|
| 237 |
+
"eng_vocab = eng_vectorization.get_vocabulary()\n",
|
| 238 |
+
"with open('eng_vectorization_config.json', 'w', encoding='utf-8') as f:\n",
|
| 239 |
+
" json.dump(eng_vectorization_config, f)\n",
|
| 240 |
+
" \n",
|
| 241 |
+
"with open('eng_vocab.json', 'w', encoding='utf-8') as f:\n",
|
| 242 |
+
" json.dump(eng_vocab, f)\n",
|
| 243 |
+
" \n",
|
| 244 |
+
"spa_vectorization_config = spa_vectorization.get_config()\n",
|
| 245 |
+
"spa_vectorization_config.pop('standardize', None)\n",
|
| 246 |
+
"spa_vocab = spa_vectorization.get_vocabulary()\n",
|
| 247 |
+
"with open('spa_vectorization_config.json', 'w', encoding='utf-8') as f:\n",
|
| 248 |
+
" json.dump(spa_vectorization_config, f)\n",
|
| 249 |
+
" \n",
|
| 250 |
+
"with open('spa_vocab.json', 'w', encoding='utf-8') as f:\n",
|
| 251 |
+
" json.dump(spa_vocab, f)\n",
|
| 252 |
+
" \n",
|
| 253 |
+
"\n",
|
| 254 |
+
"def format_dataset(eng, spa):\n",
|
| 255 |
+
" eng = eng_vectorization(eng)\n",
|
| 256 |
+
" spa = spa_vectorization(spa)\n",
|
| 257 |
+
" return (\n",
|
| 258 |
+
" {\n",
|
| 259 |
+
" \"encoder_inputs\": eng,\n",
|
| 260 |
+
" \"decoder_inputs\": spa[:, :-1],\n",
|
| 261 |
+
" },\n",
|
| 262 |
+
" spa[:, 1:],\n",
|
| 263 |
+
" )\n",
|
| 264 |
+
" \n",
|
| 265 |
+
"def make_dataset(pairs):\n",
|
| 266 |
+
" eng_texts, spa_texts = zip(*pairs)\n",
|
| 267 |
+
" eng_texts = list(eng_texts)\n",
|
| 268 |
+
" spa_texts = list(spa_texts)\n",
|
| 269 |
+
" dataset = tf_data.Dataset.from_tensor_slices((eng_texts, spa_texts))\n",
|
| 270 |
+
" dataset = dataset.batch(batch_size)\n",
|
| 271 |
+
" dataset = dataset.map(format_dataset)\n",
|
| 272 |
+
" return dataset.cache().shuffle(2048).prefetch(16)\n",
|
| 273 |
+
"\n",
|
| 274 |
+
"train_ds = make_dataset(train_pairs)\n",
|
| 275 |
+
"val_ds = make_dataset(val_pairs)\n",
|
| 276 |
+
" "
|
| 277 |
+
]
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"cell_type": "code",
|
| 281 |
+
"execution_count": 9,
|
| 282 |
+
"metadata": {},
|
| 283 |
+
"outputs": [
|
| 284 |
+
{
|
| 285 |
+
"name": "stdout",
|
| 286 |
+
"output_type": "stream",
|
| 287 |
+
"text": [
|
| 288 |
+
"(64, 20)\n",
|
| 289 |
+
"(64, 20)\n"
|
| 290 |
+
]
|
| 291 |
+
}
|
| 292 |
+
],
|
| 293 |
+
"source": [
|
| 294 |
+
"for inputs,targets in train_ds.take(1):\n",
|
| 295 |
+
" print(inputs[\"encoder_inputs\"].shape)\n",
|
| 296 |
+
" print(targets.shape)"
|
| 297 |
+
]
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"cell_type": "markdown",
|
| 301 |
+
"metadata": {},
|
| 302 |
+
"source": [
|
| 303 |
+
"### Model Architecture\n",
|
| 304 |
+
"\n",
|
| 305 |
+
"\n",
|
| 306 |
+
""
|
| 307 |
+
]
|
| 308 |
+
},
|
| 309 |
+
{
|
| 310 |
+
"cell_type": "code",
|
| 311 |
+
"execution_count": 10,
|
| 312 |
+
"metadata": {},
|
| 313 |
+
"outputs": [],
|
| 314 |
+
"source": [
|
| 315 |
+
"# Creating an Encoder\n",
|
| 316 |
+
"class TransformerEncoder(layers.Layer):\n",
|
| 317 |
+
" def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):\n",
|
| 318 |
+
" super().__init__(**kwargs)\n",
|
| 319 |
+
" self.embed_dim = embed_dim\n",
|
| 320 |
+
" self.dense_dim = dense_dim\n",
|
| 321 |
+
" self.num_heads = num_heads\n",
|
| 322 |
+
" self.attention = layers.MultiHeadAttention(\n",
|
| 323 |
+
" num_heads = num_heads, key_dim = embed_dim\n",
|
| 324 |
+
" )\n",
|
| 325 |
+
" self.dense_proj = keras.Sequential(\n",
|
| 326 |
+
" [\n",
|
| 327 |
+
" layers.Dense(dense_dim, activation = \"relu\"),\n",
|
| 328 |
+
" layers.Dense(embed_dim),\n",
|
| 329 |
+
" ]\n",
|
| 330 |
+
" )\n",
|
| 331 |
+
" self.layernorm_1 = layers.LayerNormalization()\n",
|
| 332 |
+
" self.layernorm_2 = layers.LayerNormalization()\n",
|
| 333 |
+
" self.supports_masking = True\n",
|
| 334 |
+
" \n",
|
| 335 |
+
" def call(self, inputs, mask=None):\n",
|
| 336 |
+
" if mask is not None:\n",
|
| 337 |
+
" padding_mask = tf.cast(mask[:, None, :], dtype = tf.int32)\n",
|
| 338 |
+
" else:\n",
|
| 339 |
+
" padding_mask = None\n",
|
| 340 |
+
" \n",
|
| 341 |
+
" attention_output = self.attention(\n",
|
| 342 |
+
" query = inputs,\n",
|
| 343 |
+
" value = inputs,\n",
|
| 344 |
+
" key = inputs,\n",
|
| 345 |
+
" attention_mask = padding_mask,\n",
|
| 346 |
+
" )\n",
|
| 347 |
+
" proj_input = self.layernorm_1(inputs + attention_output)\n",
|
| 348 |
+
" proj_output = self.dense_proj(proj_input)\n",
|
| 349 |
+
" return self.layernorm_2(proj_input + proj_output)\n",
|
| 350 |
+
" \n",
|
| 351 |
+
" def get_config(self):\n",
|
| 352 |
+
" config = super().get_config()\n",
|
| 353 |
+
" config.update({\n",
|
| 354 |
+
" \"embed_dim\": self.embed_dim,\n",
|
| 355 |
+
" \"dense_dim\": self.dense_dim,\n",
|
| 356 |
+
" \"num_heads\": self.num_heads,\n",
|
| 357 |
+
" })\n",
|
| 358 |
+
" return config\n",
|
| 359 |
+
" \n",
|
| 360 |
+
"# Creating a Positional Embedding\n",
|
| 361 |
+
"class PositionalEmbedding(layers.Layer):\n",
|
| 362 |
+
" def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):\n",
|
| 363 |
+
" super().__init__(**kwargs)\n",
|
| 364 |
+
" self.token_embeddings = layers.Embedding(\n",
|
| 365 |
+
" input_dim = vocab_size, output_dim = embed_dim\n",
|
| 366 |
+
" )\n",
|
| 367 |
+
" self.position_embeddings = layers.Embedding(\n",
|
| 368 |
+
" input_dim = sequence_length, output_dim = embed_dim\n",
|
| 369 |
+
" )\n",
|
| 370 |
+
" self.sequence_length = sequence_length\n",
|
| 371 |
+
" self.vocab_size = vocab_size\n",
|
| 372 |
+
" self.embed_dim = embed_dim\n",
|
| 373 |
+
" \n",
|
| 374 |
+
" def call(self, inputs):\n",
|
| 375 |
+
" length = tf.shape(inputs)[-1]\n",
|
| 376 |
+
" positions = tf.range(start = 0, limit = length, delta = 1)\n",
|
| 377 |
+
" embedded_tokens = self.token_embeddings(inputs)\n",
|
| 378 |
+
" embedded_positions = self.position_embeddings(positions)\n",
|
| 379 |
+
" return embedded_tokens + embedded_positions\n",
|
| 380 |
+
" \n",
|
| 381 |
+
" def compute_mask(self, inputs, mask=None):\n",
|
| 382 |
+
" if mask is not None:\n",
|
| 383 |
+
" return tf.not_equal(inputs, 0)\n",
|
| 384 |
+
" else:\n",
|
| 385 |
+
" return None\n",
|
| 386 |
+
" \n",
|
| 387 |
+
" def get_config(self):\n",
|
| 388 |
+
" config = super().get_config()\n",
|
| 389 |
+
" config.update({\n",
|
| 390 |
+
" \"vocab_size\": self.vocab_size,\n",
|
| 391 |
+
" \"sequence_length\": self.sequence_length,\n",
|
| 392 |
+
" \"embed_dim\": self.embed_dim,\n",
|
| 393 |
+
" })\n",
|
| 394 |
+
" return config\n",
|
| 395 |
+
" \n",
|
| 396 |
+
"# Creating a Decoder\n",
|
| 397 |
+
"class TransformerDecoder(layers.Layer):\n",
|
| 398 |
+
" def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):\n",
|
| 399 |
+
" super().__init__(**kwargs)\n",
|
| 400 |
+
" self.embed_dim = embed_dim\n",
|
| 401 |
+
" self.latent_dim = latent_dim\n",
|
| 402 |
+
" self.num_heads = num_heads\n",
|
| 403 |
+
" self.attention_1 = layers.MultiHeadAttention(\n",
|
| 404 |
+
" num_heads = num_heads, key_dim = embed_dim\n",
|
| 405 |
+
" )\n",
|
| 406 |
+
" self.attention_2 = layers.MultiHeadAttention(\n",
|
| 407 |
+
" num_heads = num_heads, key_dim = embed_dim\n",
|
| 408 |
+
" )\n",
|
| 409 |
+
" self.dense_proj = keras.Sequential(\n",
|
| 410 |
+
" [\n",
|
| 411 |
+
" layers.Dense(latent_dim, activation = \"relu\"),\n",
|
| 412 |
+
" layers.Dense(embed_dim),\n",
|
| 413 |
+
" ]\n",
|
| 414 |
+
" )\n",
|
| 415 |
+
" self.layernorm_1 = layers.LayerNormalization()\n",
|
| 416 |
+
" self.layernorm_2 = layers.LayerNormalization()\n",
|
| 417 |
+
" self.layernorm_3 = layers.LayerNormalization()\n",
|
| 418 |
+
" self.supports_masking = True\n",
|
| 419 |
+
" \n",
|
| 420 |
+
" def call(self, inputs, encoder_outputs, mask=None):\n",
|
| 421 |
+
" casual_mask = self.get_causal_attention_mask(inputs)\n",
|
| 422 |
+
" if mask is not None:\n",
|
| 423 |
+
" padding_mask = tf.cast(mask[:, None, :], dtype = tf.int32)\n",
|
| 424 |
+
" padding_mask = tf.minimum(padding_mask, casual_mask)\n",
|
| 425 |
+
" else:\n",
|
| 426 |
+
" padding_mask = None\n",
|
| 427 |
+
" \n",
|
| 428 |
+
" attention_output_1 = self.attention_1(\n",
|
| 429 |
+
" query = inputs,\n",
|
| 430 |
+
" value = inputs,\n",
|
| 431 |
+
" key = inputs,\n",
|
| 432 |
+
" attention_mask = casual_mask,\n",
|
| 433 |
+
" )\n",
|
| 434 |
+
" out_1 = self.layernorm_1(inputs + attention_output_1)\n",
|
| 435 |
+
" \n",
|
| 436 |
+
" attention_output_2 = self.attention_2(\n",
|
| 437 |
+
" query = out_1,\n",
|
| 438 |
+
" value = encoder_outputs,\n",
|
| 439 |
+
" key = encoder_outputs,\n",
|
| 440 |
+
" attention_mask = padding_mask,\n",
|
| 441 |
+
" )\n",
|
| 442 |
+
" \n",
|
| 443 |
+
" out_2 = self.layernorm_2(out_1 + attention_output_2)\n",
|
| 444 |
+
" proj_output = self.dense_proj(out_2)\n",
|
| 445 |
+
" \n",
|
| 446 |
+
" return self.layernorm_3(out_2 + proj_output)\n",
|
| 447 |
+
" \n",
|
| 448 |
+
" def get_causal_attention_mask(self, inputs):\n",
|
| 449 |
+
" input_shape = tf.shape(inputs)\n",
|
| 450 |
+
" batch_size, sequence_length = input_shape[0], input_shape[1]\n",
|
| 451 |
+
" i = tf.range(sequence_length)[:, None]\n",
|
| 452 |
+
" j = tf.range(sequence_length)\n",
|
| 453 |
+
" mask = tf.cast(i >= j, tf.int32)\n",
|
| 454 |
+
" mask = tf.reshape(mask,(1, input_shape[1], input_shape[1]))\n",
|
| 455 |
+
" mult = tf.concat(\n",
|
| 456 |
+
" [\n",
|
| 457 |
+
" tf.expand_dims(batch_size, -1),\n",
|
| 458 |
+
" tf.convert_to_tensor([1, 1]),\n",
|
| 459 |
+
" ],\n",
|
| 460 |
+
" axis = 0,\n",
|
| 461 |
+
" )\n",
|
| 462 |
+
" return tf.tile(mask, mult)\n",
|
| 463 |
+
" \n",
|
| 464 |
+
" def get_config(self):\n",
|
| 465 |
+
" config = super().get_config()\n",
|
| 466 |
+
" config.update({\n",
|
| 467 |
+
" \"embed_dim\": self.embed_dim,\n",
|
| 468 |
+
" \"latent_dim\": self.latent_dim,\n",
|
| 469 |
+
" \"num_heads\": self.num_heads,\n",
|
| 470 |
+
" })\n",
|
| 471 |
+
" return config\n"
|
| 472 |
+
]
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"cell_type": "code",
|
| 476 |
+
"execution_count": 11,
|
| 477 |
+
"metadata": {},
|
| 478 |
+
"outputs": [],
|
| 479 |
+
"source": [
|
| 480 |
+
"# define emmbedding dimensions, latent dimensions, and number of heads\n",
|
| 481 |
+
"embed_dim = 256\n",
|
| 482 |
+
"latent_dim = 2048\n",
|
| 483 |
+
"num_heads = 8\n",
|
| 484 |
+
"\n",
|
| 485 |
+
"#Encoder\n",
|
| 486 |
+
"encoder_inputs = keras.Input(shape = (None,), dtype = \"int64\", name = \"encoder_inputs\")\n",
|
| 487 |
+
"\n",
|
| 488 |
+
"x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)\n",
|
| 489 |
+
"\n",
|
| 490 |
+
"encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)\n",
|
| 491 |
+
"\n",
|
| 492 |
+
"encoder = keras.Model(encoder_inputs, encoder_outputs, name = \"encoder\")\n",
|
| 493 |
+
"\n",
|
| 494 |
+
"#Decoder\n",
|
| 495 |
+
"decoder_inputs = keras.Input(shape = (None,), dtype = \"int64\", name = \"decoder_inputs\")\n",
|
| 496 |
+
"encoder_seq_inputs = keras.Input(shape = (None, embed_dim), name = \"encoder_seq_inputs\")\n",
|
| 497 |
+
"\n",
|
| 498 |
+
"x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)\n",
|
| 499 |
+
"\n",
|
| 500 |
+
"x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoder_seq_inputs)\n",
|
| 501 |
+
"\n",
|
| 502 |
+
"x = layers.Dropout(0.5)(x)\n",
|
| 503 |
+
"\n",
|
| 504 |
+
"decoder_outputs = layers.Dense(vocab_size, activation = \"softmax\")(x)\n",
|
| 505 |
+
"\n",
|
| 506 |
+
"decoder = keras.Model([decoder_inputs, encoder_seq_inputs], decoder_outputs, name = \"decoder\")\n",
|
| 507 |
+
"\n",
|
| 508 |
+
"# Define the final model\n",
|
| 509 |
+
"decoder_outputs = decoder([decoder_inputs, encoder_outputs])\n",
|
| 510 |
+
"\n",
|
| 511 |
+
"transformer = keras.Model(\n",
|
| 512 |
+
" [encoder_inputs, decoder_inputs], decoder_outputs, name = \"transformer\"\n",
|
| 513 |
+
")\n"
|
| 514 |
+
]
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"cell_type": "code",
|
| 518 |
+
"execution_count": 12,
|
| 519 |
+
"metadata": {},
|
| 520 |
+
"outputs": [
|
| 521 |
+
{
|
| 522 |
+
"name": "stdout",
|
| 523 |
+
"output_type": "stream",
|
| 524 |
+
"text": [
|
| 525 |
+
"Model: \"transformer\"\n",
|
| 526 |
+
"__________________________________________________________________________________________________\n",
|
| 527 |
+
" Layer (type) Output Shape Param # Connected to \n",
|
| 528 |
+
"==================================================================================================\n",
|
| 529 |
+
" encoder_inputs (InputLayer) [(None, None)] 0 [] \n",
|
| 530 |
+
" \n",
|
| 531 |
+
" positional_embedding (Position (None, None, 256) 3845120 ['encoder_inputs[0][0]'] \n",
|
| 532 |
+
" alEmbedding) \n",
|
| 533 |
+
" \n",
|
| 534 |
+
" decoder_inputs (InputLayer) [(None, None)] 0 [] \n",
|
| 535 |
+
" \n",
|
| 536 |
+
" transformer_encoder (Transform (None, None, 256) 3155456 ['positional_embedding[0][0]'] \n",
|
| 537 |
+
" erEncoder) \n",
|
| 538 |
+
" \n",
|
| 539 |
+
" decoder (Functional) (None, None, 15000) 12959640 ['decoder_inputs[0][0]', \n",
|
| 540 |
+
" 'transformer_encoder[0][0]'] \n",
|
| 541 |
+
" \n",
|
| 542 |
+
"==================================================================================================\n",
|
| 543 |
+
"Total params: 19,960,216\n",
|
| 544 |
+
"Trainable params: 19,960,216\n",
|
| 545 |
+
"Non-trainable params: 0\n",
|
| 546 |
+
"__________________________________________________________________________________________________\n",
|
| 547 |
+
"Epoch 1/20\n",
|
| 548 |
+
"1302/1302 [==============================] - 132s 95ms/step - loss: 1.9854 - accuracy: 0.7200 - val_loss: 1.7019 - val_accuracy: 0.7395\n",
|
| 549 |
+
"Epoch 2/20\n",
|
| 550 |
+
"1302/1302 [==============================] - 123s 94ms/step - loss: 1.7172 - accuracy: 0.7499 - val_loss: 1.5479 - val_accuracy: 0.7610\n",
|
| 551 |
+
"Epoch 3/20\n",
|
| 552 |
+
"1302/1302 [==============================] - 123s 95ms/step - loss: 1.5636 - accuracy: 0.7712 - val_loss: 1.4142 - val_accuracy: 0.7838\n",
|
| 553 |
+
"Epoch 4/20\n",
|
| 554 |
+
"1302/1302 [==============================] - 124s 96ms/step - loss: 1.4229 - accuracy: 0.7906 - val_loss: 1.2981 - val_accuracy: 0.8027\n",
|
| 555 |
+
"Epoch 5/20\n",
|
| 556 |
+
"1302/1302 [==============================] - 125s 96ms/step - loss: 1.3049 - accuracy: 0.8079 - val_loss: 1.2207 - val_accuracy: 0.8173\n",
|
| 557 |
+
"Epoch 6/20\n",
|
| 558 |
+
"1302/1302 [==============================] - 125s 96ms/step - loss: 1.2307 - accuracy: 0.8232 - val_loss: 1.1703 - val_accuracy: 0.8294\n",
|
| 559 |
+
"Epoch 7/20\n",
|
| 560 |
+
"1302/1302 [==============================] - 125s 96ms/step - loss: 1.1852 - accuracy: 0.8348 - val_loss: 1.1304 - val_accuracy: 0.8376\n",
|
| 561 |
+
"Epoch 8/20\n",
|
| 562 |
+
"1302/1302 [==============================] - 124s 95ms/step - loss: 1.1455 - accuracy: 0.8431 - val_loss: 1.1064 - val_accuracy: 0.8436\n",
|
| 563 |
+
"Epoch 9/20\n",
|
| 564 |
+
"1302/1302 [==============================] - 124s 95ms/step - loss: 1.1154 - accuracy: 0.8496 - val_loss: 1.0878 - val_accuracy: 0.8461\n",
|
| 565 |
+
"Epoch 10/20\n",
|
| 566 |
+
"1302/1302 [==============================] - 125s 96ms/step - loss: 1.0901 - accuracy: 0.8545 - val_loss: 1.0737 - val_accuracy: 0.8498\n",
|
| 567 |
+
"Epoch 11/20\n",
|
| 568 |
+
"1302/1302 [==============================] - 124s 95ms/step - loss: 1.0690 - accuracy: 0.8583 - val_loss: 1.0697 - val_accuracy: 0.8472\n",
|
| 569 |
+
"Epoch 12/20\n",
|
| 570 |
+
"1302/1302 [==============================] - 124s 95ms/step - loss: 1.0495 - accuracy: 0.8616 - val_loss: 1.0458 - val_accuracy: 0.8543\n",
|
| 571 |
+
"Epoch 13/20\n",
|
| 572 |
+
"1302/1302 [==============================] - 124s 95ms/step - loss: 1.0332 - accuracy: 0.8648 - val_loss: 1.0387 - val_accuracy: 0.8548\n",
|
| 573 |
+
"Epoch 14/20\n",
|
| 574 |
+
"1302/1302 [==============================] - 123s 95ms/step - loss: 1.0180 - accuracy: 0.8673 - val_loss: 1.0458 - val_accuracy: 0.8550\n",
|
| 575 |
+
"Epoch 15/20\n",
|
| 576 |
+
"1302/1302 [==============================] - 125s 96ms/step - loss: 1.0036 - accuracy: 0.8695 - val_loss: 1.0303 - val_accuracy: 0.8569\n",
|
| 577 |
+
"Epoch 16/20\n",
|
| 578 |
+
"1302/1302 [==============================] - 125s 96ms/step - loss: 0.9891 - accuracy: 0.8720 - val_loss: 1.0184 - val_accuracy: 0.8586\n",
|
| 579 |
+
"Epoch 17/20\n",
|
| 580 |
+
"1302/1302 [==============================] - 125s 96ms/step - loss: 0.9779 - accuracy: 0.8738 - val_loss: 1.0313 - val_accuracy: 0.8567\n",
|
| 581 |
+
"Epoch 18/20\n",
|
| 582 |
+
"1302/1302 [==============================] - 126s 96ms/step - loss: 0.9668 - accuracy: 0.8754 - val_loss: 1.0106 - val_accuracy: 0.8614\n",
|
| 583 |
+
"Epoch 19/20\n",
|
| 584 |
+
"1302/1302 [==============================] - 125s 96ms/step - loss: 0.9543 - accuracy: 0.8772 - val_loss: 1.0144 - val_accuracy: 0.8598\n",
|
| 585 |
+
"Epoch 20/20\n",
|
| 586 |
+
"1302/1302 [==============================] - 124s 95ms/step - loss: 0.9415 - accuracy: 0.8791 - val_loss: 1.0139 - val_accuracy: 0.8617\n"
|
| 587 |
+
]
|
| 588 |
+
},
|
| 589 |
+
{
|
| 590 |
+
"data": {
|
| 591 |
+
"text/plain": [
|
| 592 |
+
"<keras.callbacks.History at 0x24df73cf5e0>"
|
| 593 |
+
]
|
| 594 |
+
},
|
| 595 |
+
"execution_count": 12,
|
| 596 |
+
"metadata": {},
|
| 597 |
+
"output_type": "execute_result"
|
| 598 |
+
}
|
| 599 |
+
],
|
| 600 |
+
"source": [
|
| 601 |
+
"epochs = 20\n",
|
| 602 |
+
"\n",
|
| 603 |
+
"transformer.summary()\n",
|
| 604 |
+
"\n",
|
| 605 |
+
"transformer.compile(\n",
|
| 606 |
+
" \"rmsprop\", loss = \"sparse_categorical_crossentropy\", metrics = [\"accuracy\"]\n",
|
| 607 |
+
")\n",
|
| 608 |
+
"\n",
|
| 609 |
+
"transformer.fit(train_ds, epochs = epochs, validation_data = val_ds)"
|
| 610 |
+
]
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"cell_type": "code",
|
| 614 |
+
"execution_count": 13,
|
| 615 |
+
"metadata": {},
|
| 616 |
+
"outputs": [
|
| 617 |
+
{
|
| 618 |
+
"name": "stderr",
|
| 619 |
+
"output_type": "stream",
|
| 620 |
+
"text": [
|
| 621 |
+
"WARNING:absl:Found untraced functions such as embedding_layer_call_fn, embedding_layer_call_and_return_conditional_losses, embedding_1_layer_call_fn, embedding_1_layer_call_and_return_conditional_losses, multi_head_attention_layer_call_fn while saving (showing 5 of 60). These functions will not be directly callable after loading.\n"
|
| 622 |
+
]
|
| 623 |
+
},
|
| 624 |
+
{
|
| 625 |
+
"name": "stdout",
|
| 626 |
+
"output_type": "stream",
|
| 627 |
+
"text": [
|
| 628 |
+
"INFO:tensorflow:Assets written to: transformer_model\\assets\n"
|
| 629 |
+
]
|
| 630 |
+
},
|
| 631 |
+
{
|
| 632 |
+
"name": "stderr",
|
| 633 |
+
"output_type": "stream",
|
| 634 |
+
"text": [
|
| 635 |
+
"INFO:tensorflow:Assets written to: transformer_model\\assets\n"
|
| 636 |
+
]
|
| 637 |
+
}
|
| 638 |
+
],
|
| 639 |
+
"source": [
|
| 640 |
+
"transformer.save(\"transformer_model\")"
|
| 641 |
+
]
|
| 642 |
+
},
|
| 643 |
+
{
|
| 644 |
+
"cell_type": "code",
|
| 645 |
+
"execution_count": 14,
|
| 646 |
+
"metadata": {},
|
| 647 |
+
"outputs": [
|
| 648 |
+
{
|
| 649 |
+
"name": "stdout",
|
| 650 |
+
"output_type": "stream",
|
| 651 |
+
"text": [
|
| 652 |
+
"input: a few passengers went on board the plane\n",
|
| 653 |
+
"translated: [start] unos dos te [UNK] en los años [end]\n",
|
| 654 |
+
"\n",
|
| 655 |
+
"input: i think shes an honest woman\n",
|
| 656 |
+
"translated: [start] creo que es una mujer [UNK] [end]\n",
|
| 657 |
+
"\n",
|
| 658 |
+
"input: im old enough to do that on my own\n",
|
| 659 |
+
"translated: [start] soy lo suficientemente viejo para hacer por mi [end]\n",
|
| 660 |
+
"\n",
|
| 661 |
+
"input: youre too drunk to drive\n",
|
| 662 |
+
"translated: [start] eres demasiado de conducir [end]\n",
|
| 663 |
+
"\n",
|
| 664 |
+
"input: id like to go to hawaii\n",
|
| 665 |
+
"translated: [start] quisiera ir a china [end]\n",
|
| 666 |
+
"\n"
|
| 667 |
+
]
|
| 668 |
+
}
|
| 669 |
+
],
|
| 670 |
+
"source": [
|
| 671 |
+
"spa_vocab = spa_vectorization.get_vocabulary()\n",
|
| 672 |
+
"spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))\n",
|
| 673 |
+
"max_decoded_sentence_length = sequence_length\n",
|
| 674 |
+
"\n",
|
| 675 |
+
"def decode_sentence(input_sentence):\n",
|
| 676 |
+
" tokenized_input_sentence = eng_vectorization([input_sentence])\n",
|
| 677 |
+
" decoded_sentence = \"[start]\"\n",
|
| 678 |
+
" for i in range(max_decoded_sentence_length):\n",
|
| 679 |
+
" tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1]\n",
|
| 680 |
+
" predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])\n",
|
| 681 |
+
" sampled_token_index = tf.argmax(predictions[0, i, :]).numpy().item(0)\n",
|
| 682 |
+
" sampled_token = spa_index_lookup[sampled_token_index]\n",
|
| 683 |
+
" decoded_sentence += \" \" + sampled_token\n",
|
| 684 |
+
" if sampled_token == \"[end]\":\n",
|
| 685 |
+
" break\n",
|
| 686 |
+
" return decoded_sentence\n",
|
| 687 |
+
"\n",
|
| 688 |
+
"test_eng_texts = [pair[0] for pair in test_pairs]\n",
|
| 689 |
+
"for _ in range(5):\n",
|
| 690 |
+
" input_sentence = random.choice(test_eng_texts)\n",
|
| 691 |
+
" input_sentence = input_sentence.lower()\n",
|
| 692 |
+
" input_sentence = input_sentence.translate(str.maketrans('', '', strip_chars))\n",
|
| 693 |
+
" translated = decode_sentence(input_sentence)\n",
|
| 694 |
+
" print(f\"input: {input_sentence}\")\n",
|
| 695 |
+
" print(f\"translated: {translated}\")\n",
|
| 696 |
+
" print()"
|
| 697 |
+
]
|
| 698 |
+
},
|
| 699 |
+
{
|
| 700 |
+
"cell_type": "code",
|
| 701 |
+
"execution_count": 2,
|
| 702 |
+
"metadata": {},
|
| 703 |
+
"outputs": [
|
| 704 |
+
{
|
| 705 |
+
"name": "stdout",
|
| 706 |
+
"output_type": "stream",
|
| 707 |
+
"text": [
|
| 708 |
+
"Requirement already satisfied: tensorflow in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (2.10.1)Note: you may need to restart the kernel to use updated packages.\n",
|
| 709 |
+
"\n",
|
| 710 |
+
"Requirement already satisfied: absl-py>=1.0.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (2.1.0)\n",
|
| 711 |
+
"Requirement already satisfied: astunparse>=1.6.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (1.6.3)\n",
|
| 712 |
+
"Requirement already satisfied: flatbuffers>=2.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (24.3.25)\n",
|
| 713 |
+
"Requirement already satisfied: gast<=0.4.0,>=0.2.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (0.4.0)\n",
|
| 714 |
+
"Requirement already satisfied: google-pasta>=0.1.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (0.2.0)\n",
|
| 715 |
+
"Requirement already satisfied: h5py>=2.9.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (3.11.0)\n",
|
| 716 |
+
"Requirement already satisfied: keras-preprocessing>=1.1.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (1.1.2)\n",
|
| 717 |
+
"Requirement already satisfied: libclang>=13.0.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (18.1.1)\n",
|
| 718 |
+
"Requirement already satisfied: numpy>=1.20 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (1.26.4)\n",
|
| 719 |
+
"Requirement already satisfied: opt-einsum>=2.3.2 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (3.3.0)\n",
|
| 720 |
+
"Requirement already satisfied: packaging in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (24.0)\n",
|
| 721 |
+
"Requirement already satisfied: protobuf<3.20,>=3.9.2 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (3.19.6)\n",
|
| 722 |
+
"Requirement already satisfied: setuptools in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (69.5.1)\n",
|
| 723 |
+
"Requirement already satisfied: six>=1.12.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (1.16.0)\n",
|
| 724 |
+
"Requirement already satisfied: termcolor>=1.1.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (2.4.0)\n",
|
| 725 |
+
"Requirement already satisfied: typing-extensions>=3.6.6 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (4.12.2)\n",
|
| 726 |
+
"Requirement already satisfied: wrapt>=1.11.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (1.16.0)\n",
|
| 727 |
+
"Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (0.31.0)\n",
|
| 728 |
+
"Requirement already satisfied: grpcio<2.0,>=1.24.3 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (1.64.1)\n",
|
| 729 |
+
"Requirement already satisfied: tensorboard<2.11,>=2.10 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (2.10.1)\n",
|
| 730 |
+
"Requirement already satisfied: tensorflow-estimator<2.11,>=2.10.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (2.10.0)\n",
|
| 731 |
+
"Requirement already satisfied: keras<2.11,>=2.10.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorflow) (2.10.0)\n",
|
| 732 |
+
"Requirement already satisfied: wheel<1.0,>=0.23.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from astunparse>=1.6.0->tensorflow) (0.43.0)\n",
|
| 733 |
+
"Requirement already satisfied: google-auth<3,>=1.6.3 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorboard<2.11,>=2.10->tensorflow) (2.30.0)\n",
|
| 734 |
+
"Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorboard<2.11,>=2.10->tensorflow) (0.4.6)\n",
|
| 735 |
+
"Requirement already satisfied: markdown>=2.6.8 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorboard<2.11,>=2.10->tensorflow) (3.6)\n",
|
| 736 |
+
"Requirement already satisfied: requests<3,>=2.21.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorboard<2.11,>=2.10->tensorflow) (2.32.3)\n",
|
| 737 |
+
"Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorboard<2.11,>=2.10->tensorflow) (0.6.1)\n",
|
| 738 |
+
"Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorboard<2.11,>=2.10->tensorflow) (1.8.1)\n",
|
| 739 |
+
"Requirement already satisfied: werkzeug>=1.0.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from tensorboard<2.11,>=2.10->tensorflow) (3.0.3)\n",
|
| 740 |
+
"Requirement already satisfied: cachetools<6.0,>=2.0.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow) (5.3.3)\n",
|
| 741 |
+
"Requirement already satisfied: pyasn1-modules>=0.2.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow) (0.4.0)\n",
|
| 742 |
+
"Requirement already satisfied: rsa<5,>=3.1.4 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow) (4.9)\n",
|
| 743 |
+
"Requirement already satisfied: requests-oauthlib>=0.7.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow) (2.0.0)\n",
|
| 744 |
+
"Requirement already satisfied: importlib-metadata>=4.4 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from markdown>=2.6.8->tensorboard<2.11,>=2.10->tensorflow) (7.1.0)\n",
|
| 745 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow) (3.3.2)\n",
|
| 746 |
+
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow) (3.7)\n",
|
| 747 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow) (2.2.1)\n",
|
| 748 |
+
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow) (2024.6.2)\n",
|
| 749 |
+
"Requirement already satisfied: MarkupSafe>=2.1.1 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from werkzeug>=1.0.1->tensorboard<2.11,>=2.10->tensorflow) (2.1.5)\n",
|
| 750 |
+
"Requirement already satisfied: zipp>=0.5 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<2.11,>=2.10->tensorflow) (3.19.2)\n",
|
| 751 |
+
"Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow) (0.6.0)\n",
|
| 752 |
+
"Requirement already satisfied: oauthlib>=3.0.0 in c:\\users\\prajw\\anaconda3\\envs\\testnullclass\\lib\\site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow) (3.2.2)\n"
|
| 753 |
+
]
|
| 754 |
+
}
|
| 755 |
+
],
|
| 756 |
+
"source": [
|
| 757 |
+
"pip install tensorflow\n"
|
| 758 |
+
]
|
| 759 |
+
}
|
| 760 |
+
],
|
| 761 |
+
"metadata": {
|
| 762 |
+
"kernelspec": {
|
| 763 |
+
"display_name": "base",
|
| 764 |
+
"language": "python",
|
| 765 |
+
"name": "python3"
|
| 766 |
+
},
|
| 767 |
+
"language_info": {
|
| 768 |
+
"codemirror_mode": {
|
| 769 |
+
"name": "ipython",
|
| 770 |
+
"version": 3
|
| 771 |
+
},
|
| 772 |
+
"file_extension": ".py",
|
| 773 |
+
"mimetype": "text/x-python",
|
| 774 |
+
"name": "python",
|
| 775 |
+
"nbconvert_exporter": "python",
|
| 776 |
+
"pygments_lexer": "ipython3",
|
| 777 |
+
"version": "3.9.19"
|
| 778 |
+
}
|
| 779 |
+
},
|
| 780 |
+
"nbformat": 4,
|
| 781 |
+
"nbformat_minor": 2
|
| 782 |
+
}
|
Task 2/data/english
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Task 2/data/french
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Task 2/eng_vectorization_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"name": "text_vectorization", "trainable": true, "batch_input_shape": [null], "dtype": "string", "max_tokens": 15000, "split": "whitespace", "ngrams": null, "output_mode": "int", "output_sequence_length": 20, "pad_to_max_tokens": false, "sparse": false, "ragged": false, "vocabulary": null, "idf_weights": null}
|
Task 2/eng_vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Task 2/english_to_french_model/keras_metadata.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18727f19a8f2e298e734019630e718acf71e07ef341ce0398a88208498c47dfd
|
| 3 |
+
size 21276
|
Task 2/english_to_french_model/saved_model.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0781d40fb91f9326102fd3c6574611bfc035bff26a81a4d69cedf1ef0b0aa1e5
|
| 3 |
+
size 2021585
|
Task 2/english_to_french_model/variables/variables.data-00000-of-00001
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3c3bfe842a940826fd00c874b3f94b44abdf96953f8d695a1caefbfc1ad9608
|
| 3 |
+
size 20633894
|
Task 2/english_to_french_model/variables/variables.index
ADDED
|
Binary file (2.91 kB). View file
|
|
|
Task 2/english_tokenizer.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"{\"class_name\": \"Tokenizer\", \"config\": {\"num_words\": null, \"filters\": \"!\\\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\", \"lower\": true, \"split\": \" \", \"char_level\": false, \"oov_token\": null, \"document_count\": 137861, \"word_counts\": \"{\\\"new\\\": 12197, \\\"jersey\\\": 11225, \\\"is\\\": 205858, \\\"sometimes\\\": 37746, \\\"quiet\\\": 8693, \\\"during\\\": 74933, \\\"autumn\\\": 9004, \\\"and\\\": 59850, \\\"it\\\": 75137, \\\"snowy\\\": 8898, \\\"in\\\": 75525, \\\"april\\\": 8954, \\\"the\\\": 67628, \\\"united\\\": 11270, \\\"states\\\": 11270, \\\"usually\\\": 37507, \\\"chilly\\\": 8770, \\\"july\\\": 8956, \\\"freezing\\\": 8928, \\\"november\\\": 8951, \\\"california\\\": 11250, \\\"march\\\": 9023, \\\"hot\\\": 8639, \\\"june\\\": 9133, \\\"mild\\\": 8743, \\\"cold\\\": 8878, \\\"september\\\": 8958, \\\"your\\\": 9734, \\\"least\\\": 27564, \\\"liked\\\": 14046, \\\"fruit\\\": 27192, \\\"grape\\\": 4848, \\\"but\\\": 63987, \\\"my\\\": 9700, \\\"apple\\\": 4848, \\\"his\\\": 9700, \\\"favorite\\\": 28332, \\\"orange\\\": 4848, \\\"paris\\\": 11334, \\\"relaxing\\\": 8696, \\\"december\\\": 8945, \\\"busy\\\": 8791, \\\"spring\\\": 9102, \\\"never\\\": 37500, \\\"our\\\": 8932, \\\"lemon\\\": 4848, \\\"january\\\": 9090, \\\"warm\\\": 8890, \\\"lime\\\": 4848, \\\"her\\\": 9700, \\\"banana\\\": 4848, \\\"he\\\": 10786, \\\"saw\\\": 648, \\\"a\\\": 1944, \\\"old\\\": 972, \\\"yellow\\\": 972, \\\"truck\\\": 1944, \\\"india\\\": 11277, \\\"rainy\\\": 8761, \\\"that\\\": 2712, \\\"cat\\\": 192, \\\"was\\\": 1867, \\\"most\\\": 14934, \\\"loved\\\": 14166, \\\"animal\\\": 2304, \\\"dislikes\\\": 7314, \\\"grapefruit\\\": 10692, \\\"limes\\\": 5844, \\\"lemons\\\": 5844, \\\"february\\\": 8942, \\\"china\\\": 10953, \\\"pleasant\\\": 8916, \\\"october\\\": 8910, \\\"wonderful\\\": 8808, \\\"nice\\\": 8984, \\\"summer\\\": 8948, \\\"france\\\": 11170, \\\"may\\\": 8995, \\\"grapes\\\": 5844, \\\"mangoes\\\": 5844, \\\"their\\\": 8932, 
\\\"mango\\\": 4848, \\\"pear\\\": 4848, \\\"august\\\": 8789, \\\"beautiful\\\": 8915, \\\"apples\\\": 5844, \\\"peaches\\\": 5844, \\\"feared\\\": 768, \\\"shark\\\": 192, \\\"wet\\\": 8726, \\\"dry\\\": 8794, \\\"we\\\": 2532, \\\"like\\\": 4588, \\\"oranges\\\": 5844, \\\"they\\\": 3222, \\\"pears\\\": 5844, \\\"she\\\": 10786, \\\"little\\\": 1016, \\\"red\\\": 972, \\\"winter\\\": 9038, \\\"disliked\\\": 648, \\\"rusty\\\": 972, \\\"car\\\": 1944, \\\"strawberries\\\": 5844, \\\"i\\\": 2664, \\\"strawberry\\\": 4848, \\\"bananas\\\": 5844, \\\"going\\\": 666, \\\"to\\\": 5166, \\\"next\\\": 1666, \\\"plan\\\": 714, \\\"visit\\\": 1224, \\\"elephants\\\": 64, \\\"were\\\": 384, \\\"animals\\\": 768, \\\"are\\\": 870, \\\"likes\\\": 7314, \\\"dislike\\\": 4444, \\\"fall\\\": 9134, \\\"driving\\\": 1296, \\\"peach\\\": 4848, \\\"drives\\\": 648, \\\"blue\\\": 972, \\\"you\\\": 2414, \\\"bird\\\": 192, \\\"horses\\\": 64, \\\"mouse\\\": 192, \\\"went\\\": 378, \\\"last\\\": 781, \\\"horse\\\": 192, \\\"automobile\\\": 1944, \\\"dogs\\\": 64, \\\"white\\\": 972, \\\"elephant\\\": 192, \\\"black\\\": 972, \\\"think\\\": 240, \\\"difficult\\\": 260, \\\"translate\\\": 480, \\\"between\\\": 540, \\\"spanish\\\": 312, \\\"portuguese\\\": 312, \\\"big\\\": 1016, \\\"green\\\": 972, \\\"translating\\\": 300, \\\"fun\\\": 260, \\\"where\\\": 12, \\\"dog\\\": 192, \\\"why\\\": 240, \\\"might\\\": 378, \\\"go\\\": 1386, \\\"this\\\": 768, \\\"drove\\\": 648, \\\"shiny\\\": 972, \\\"sharks\\\": 64, \\\"monkey\\\": 192, \\\"how\\\": 67, \\\"weather\\\": 33, \\\"lion\\\": 192, \\\"plans\\\": 476, \\\"bear\\\": 192, \\\"rabbit\\\": 192, \\\"it's\\\": 240, \\\"chinese\\\": 312, \\\"when\\\": 144, \\\"eiffel\\\": 57, \\\"tower\\\": 57, \\\"did\\\": 204, \\\"grocery\\\": 57, \\\"store\\\": 57, \\\"wanted\\\": 378, \\\"does\\\": 24, \\\"football\\\": 57, \\\"field\\\": 57, \\\"wants\\\": 252, \\\"didn't\\\": 60, \\\"snake\\\": 192, \\\"snakes\\\": 64, \\\"do\\\": 84, \\\"easy\\\": 
260, \\\"thinks\\\": 360, \\\"english\\\": 312, \\\"french\\\": 312, \\\"would\\\": 48, \\\"aren't\\\": 36, \\\"cats\\\": 64, \\\"rabbits\\\": 64, \\\"has\\\": 24, \\\"been\\\": 36, \\\"monkeys\\\": 64, \\\"lake\\\": 57, \\\"bears\\\": 64, \\\"school\\\": 57, \\\"birds\\\": 64, \\\"want\\\": 126, \\\"isn't\\\": 24, \\\"lions\\\": 64, \\\"am\\\": 24, \\\"mice\\\": 64, \\\"have\\\": 12}\", \"word_docs\": \"{\\\"april\\\": 8954, \\\"sometimes\\\": 32946, \\\"new\\\": 12197, \\\"autumn\\\": 9004, \\\"it\\\": 75137, \\\"snowy\\\": 8898, \\\"and\\\": 59850, \\\"in\\\": 75525, \\\"quiet\\\": 8693, \\\"is\\\": 104561, \\\"jersey\\\": 11225, \\\"during\\\": 74933, \\\"united\\\": 11270, \\\"states\\\": 11270, \\\"chilly\\\": 8770, \\\"freezing\\\": 8928, \\\"usually\\\": 32846, \\\"november\\\": 8951, \\\"the\\\": 41221, \\\"july\\\": 8956, \\\"march\\\": 9023, \\\"june\\\": 9133, \\\"california\\\": 11250, \\\"hot\\\": 8639, \\\"september\\\": 8958, \\\"mild\\\": 8743, \\\"cold\\\": 8878, \\\"liked\\\": 7446, \\\"grape\\\": 4848, \\\"but\\\": 63987, \\\"least\\\": 14364, \\\"apple\\\": 4848, \\\"my\\\": 9700, \\\"your\\\": 9734, \\\"fruit\\\": 27192, \\\"his\\\": 9700, \\\"favorite\\\": 15132, \\\"orange\\\": 4848, \\\"december\\\": 8945, \\\"relaxing\\\": 8696, \\\"paris\\\": 11334, \\\"spring\\\": 9102, \\\"never\\\": 32834, \\\"busy\\\": 8791, \\\"our\\\": 8932, \\\"lemon\\\": 4848, \\\"warm\\\": 8890, \\\"january\\\": 9090, \\\"lime\\\": 4848, \\\"her\\\": 9700, \\\"banana\\\": 4848, \\\"old\\\": 972, \\\"truck\\\": 1944, \\\"a\\\": 1944, \\\"saw\\\": 648, \\\"he\\\": 10786, \\\"yellow\\\": 972, \\\"india\\\": 11277, \\\"rainy\\\": 8761, \\\"most\\\": 8334, \\\"animal\\\": 2304, \\\"was\\\": 1867, \\\"cat\\\": 192, \\\"that\\\": 2712, \\\"loved\\\": 7566, \\\"grapefruit\\\": 10692, \\\"limes\\\": 5844, \\\"dislikes\\\": 7314, \\\"lemons\\\": 5844, \\\"february\\\": 8942, \\\"pleasant\\\": 8916, \\\"october\\\": 8910, \\\"china\\\": 10953, \\\"wonderful\\\": 8808, 
\\\"nice\\\": 8984, \\\"summer\\\": 8948, \\\"france\\\": 11170, \\\"may\\\": 8995, \\\"mangoes\\\": 5844, \\\"grapes\\\": 5844, \\\"mango\\\": 4848, \\\"pear\\\": 4848, \\\"their\\\": 8932, \\\"august\\\": 8789, \\\"beautiful\\\": 8915, \\\"apples\\\": 5844, \\\"peaches\\\": 5844, \\\"feared\\\": 768, \\\"shark\\\": 192, \\\"dry\\\": 8794, \\\"wet\\\": 8726, \\\"we\\\": 2532, \\\"like\\\": 4588, \\\"oranges\\\": 5844, \\\"they\\\": 3222, \\\"pears\\\": 5844, \\\"red\\\": 972, \\\"little\\\": 1016, \\\"she\\\": 10786, \\\"winter\\\": 9038, \\\"rusty\\\": 972, \\\"disliked\\\": 648, \\\"car\\\": 1944, \\\"strawberries\\\": 5844, \\\"i\\\": 2664, \\\"strawberry\\\": 4848, \\\"bananas\\\": 5844, \\\"to\\\": 4170, \\\"going\\\": 666, \\\"next\\\": 1666, \\\"visit\\\": 1224, \\\"plan\\\": 714, \\\"animals\\\": 768, \\\"were\\\": 384, \\\"elephants\\\": 64, \\\"are\\\": 870, \\\"likes\\\": 7314, \\\"dislike\\\": 4444, \\\"fall\\\": 9134, \\\"driving\\\": 1296, \\\"peach\\\": 4848, \\\"drives\\\": 648, \\\"blue\\\": 972, \\\"you\\\": 2414, \\\"bird\\\": 192, \\\"horses\\\": 64, \\\"mouse\\\": 192, \\\"went\\\": 378, \\\"last\\\": 781, \\\"horse\\\": 192, \\\"automobile\\\": 1944, \\\"dogs\\\": 64, \\\"white\\\": 972, \\\"elephant\\\": 192, \\\"black\\\": 972, \\\"translate\\\": 480, \\\"think\\\": 240, \\\"portuguese\\\": 312, \\\"difficult\\\": 260, \\\"spanish\\\": 312, \\\"between\\\": 540, \\\"big\\\": 1016, \\\"green\\\": 972, \\\"fun\\\": 260, \\\"translating\\\": 300, \\\"where\\\": 12, \\\"dog\\\": 192, \\\"why\\\": 240, \\\"go\\\": 1386, \\\"might\\\": 378, \\\"this\\\": 768, \\\"drove\\\": 648, \\\"shiny\\\": 972, \\\"sharks\\\": 64, \\\"monkey\\\": 192, \\\"weather\\\": 33, \\\"how\\\": 67, \\\"lion\\\": 192, \\\"plans\\\": 476, \\\"bear\\\": 192, \\\"rabbit\\\": 192, \\\"chinese\\\": 312, \\\"it's\\\": 240, \\\"tower\\\": 57, \\\"when\\\": 144, \\\"eiffel\\\": 57, \\\"grocery\\\": 57, \\\"did\\\": 204, \\\"store\\\": 57, \\\"wanted\\\": 378, \\\"football\\\": 
57, \\\"field\\\": 57, \\\"does\\\": 24, \\\"wants\\\": 252, \\\"didn't\\\": 60, \\\"snake\\\": 192, \\\"snakes\\\": 64, \\\"easy\\\": 260, \\\"do\\\": 84, \\\"french\\\": 312, \\\"thinks\\\": 360, \\\"english\\\": 312, \\\"would\\\": 48, \\\"aren't\\\": 36, \\\"cats\\\": 64, \\\"rabbits\\\": 64, \\\"has\\\": 24, \\\"been\\\": 36, \\\"monkeys\\\": 64, \\\"lake\\\": 57, \\\"bears\\\": 64, \\\"school\\\": 57, \\\"birds\\\": 64, \\\"want\\\": 126, \\\"isn't\\\": 24, \\\"lions\\\": 64, \\\"am\\\": 24, \\\"mice\\\": 64, \\\"have\\\": 12}\", \"index_docs\": \"{\\\"44\\\": 8954, \\\"8\\\": 32946, \\\"17\\\": 12197, \\\"39\\\": 9004, \\\"3\\\": 75137, \\\"55\\\": 8898, \\\"7\\\": 59850, \\\"2\\\": 75525, \\\"67\\\": 8693, \\\"1\\\": 104561, \\\"23\\\": 11225, \\\"4\\\": 74933, \\\"20\\\": 11270, \\\"21\\\": 11270, \\\"62\\\": 8770, \\\"51\\\": 8928, \\\"9\\\": 32846, \\\"45\\\": 8951, \\\"5\\\": 41221, \\\"43\\\": 8956, \\\"38\\\": 9023, \\\"34\\\": 9133, \\\"22\\\": 11250, \\\"68\\\": 8639, \\\"42\\\": 8958, \\\"64\\\": 8743, \\\"57\\\": 8878, \\\"16\\\": 7446, \\\"82\\\": 4848, \\\"6\\\": 63987, \\\"12\\\": 14364, \\\"83\\\": 4848, \\\"30\\\": 9700, \\\"29\\\": 9734, \\\"13\\\": 27192, \\\"31\\\": 9700, \\\"11\\\": 15132, \\\"84\\\": 4848, \\\"47\\\": 8945, \\\"66\\\": 8696, \\\"18\\\": 11334, \\\"35\\\": 9102, \\\"10\\\": 32834, \\\"60\\\": 8791, \\\"49\\\": 8932, \\\"85\\\": 4848, \\\"56\\\": 8890, \\\"36\\\": 9090, \\\"86\\\": 4848, \\\"32\\\": 9700, \\\"87\\\": 4848, \\\"111\\\": 972, \\\"101\\\": 1944, \\\"100\\\": 1944, \\\"127\\\": 648, \\\"26\\\": 10786, \\\"112\\\": 972, \\\"19\\\": 11277, \\\"63\\\": 8761, \\\"14\\\": 8334, \\\"99\\\": 2304, \\\"104\\\": 1867, \\\"153\\\": 192, \\\"95\\\": 2712, \\\"15\\\": 7566, \\\"28\\\": 10692, \\\"71\\\": 5844, \\\"69\\\": 7314, \\\"72\\\": 5844, \\\"48\\\": 8942, \\\"52\\\": 8916, \\\"54\\\": 8910, \\\"25\\\": 10953, \\\"58\\\": 8808, \\\"41\\\": 8984, \\\"46\\\": 8948, \\\"24\\\": 11170, \\\"40\\\": 8995, \\\"74\\\": 
5844, \\\"73\\\": 5844, \\\"88\\\": 4848, \\\"89\\\": 4848, \\\"50\\\": 8932, \\\"61\\\": 8789, \\\"53\\\": 8915, \\\"75\\\": 5844, \\\"76\\\": 5844, \\\"122\\\": 768, \\\"154\\\": 192, \\\"59\\\": 8794, \\\"65\\\": 8726, \\\"97\\\": 2532, \\\"92\\\": 4588, \\\"77\\\": 5844, \\\"94\\\": 3222, \\\"78\\\": 5844, \\\"113\\\": 972, \\\"109\\\": 1016, \\\"27\\\": 10786, \\\"37\\\": 9038, \\\"114\\\": 972, \\\"128\\\": 648, \\\"102\\\": 1944, \\\"79\\\": 5844, \\\"96\\\": 2664, \\\"90\\\": 4848, \\\"80\\\": 5844, \\\"81\\\": 4170, \\\"126\\\": 666, \\\"105\\\": 1666, \\\"108\\\": 1224, \\\"125\\\": 714, \\\"123\\\": 768, \\\"134\\\": 384, \\\"169\\\": 64, \\\"120\\\": 870, \\\"70\\\": 7314, \\\"93\\\": 4444, \\\"33\\\": 9134, \\\"107\\\": 1296, \\\"91\\\": 4848, \\\"129\\\": 648, \\\"115\\\": 972, \\\"98\\\": 2414, \\\"155\\\": 192, \\\"170\\\": 64, \\\"156\\\": 192, \\\"135\\\": 378, \\\"121\\\": 781, \\\"157\\\": 192, \\\"103\\\": 1944, \\\"171\\\": 64, \\\"116\\\": 972, \\\"158\\\": 192, \\\"117\\\": 972, \\\"132\\\": 480, \\\"149\\\": 240, \\\"140\\\": 312, \\\"145\\\": 260, \\\"139\\\": 312, \\\"131\\\": 540, \\\"110\\\": 1016, \\\"118\\\": 972, \\\"146\\\": 260, \\\"144\\\": 300, \\\"198\\\": 12, \\\"159\\\": 192, \\\"150\\\": 240, \\\"106\\\": 1386, \\\"136\\\": 378, \\\"124\\\": 768, \\\"130\\\": 648, \\\"119\\\": 972, \\\"172\\\": 64, \\\"160\\\": 192, \\\"193\\\": 33, \\\"168\\\": 67, \\\"161\\\": 192, \\\"133\\\": 476, \\\"162\\\": 192, \\\"163\\\": 192, \\\"141\\\": 312, \\\"151\\\": 240, \\\"183\\\": 57, \\\"165\\\": 144, \\\"182\\\": 57, \\\"184\\\": 57, \\\"152\\\": 204, \\\"185\\\": 57, \\\"137\\\": 378, \\\"186\\\": 57, \\\"187\\\": 57, \\\"194\\\": 24, \\\"148\\\": 252, \\\"181\\\": 60, \\\"164\\\": 192, \\\"173\\\": 64, \\\"147\\\": 260, \\\"167\\\": 84, \\\"143\\\": 312, \\\"138\\\": 360, \\\"142\\\": 312, \\\"190\\\": 48, \\\"191\\\": 36, \\\"174\\\": 64, \\\"175\\\": 64, \\\"195\\\": 24, \\\"192\\\": 36, \\\"176\\\": 64, \\\"188\\\": 57, 
\\\"177\\\": 64, \\\"189\\\": 57, \\\"178\\\": 64, \\\"166\\\": 126, \\\"196\\\": 24, \\\"179\\\": 64, \\\"197\\\": 24, \\\"180\\\": 64, \\\"199\\\": 12}\", \"index_word\": \"{\\\"1\\\": \\\"is\\\", \\\"2\\\": \\\"in\\\", \\\"3\\\": \\\"it\\\", \\\"4\\\": \\\"during\\\", \\\"5\\\": \\\"the\\\", \\\"6\\\": \\\"but\\\", \\\"7\\\": \\\"and\\\", \\\"8\\\": \\\"sometimes\\\", \\\"9\\\": \\\"usually\\\", \\\"10\\\": \\\"never\\\", \\\"11\\\": \\\"favorite\\\", \\\"12\\\": \\\"least\\\", \\\"13\\\": \\\"fruit\\\", \\\"14\\\": \\\"most\\\", \\\"15\\\": \\\"loved\\\", \\\"16\\\": \\\"liked\\\", \\\"17\\\": \\\"new\\\", \\\"18\\\": \\\"paris\\\", \\\"19\\\": \\\"india\\\", \\\"20\\\": \\\"united\\\", \\\"21\\\": \\\"states\\\", \\\"22\\\": \\\"california\\\", \\\"23\\\": \\\"jersey\\\", \\\"24\\\": \\\"france\\\", \\\"25\\\": \\\"china\\\", \\\"26\\\": \\\"he\\\", \\\"27\\\": \\\"she\\\", \\\"28\\\": \\\"grapefruit\\\", \\\"29\\\": \\\"your\\\", \\\"30\\\": \\\"my\\\", \\\"31\\\": \\\"his\\\", \\\"32\\\": \\\"her\\\", \\\"33\\\": \\\"fall\\\", \\\"34\\\": \\\"june\\\", \\\"35\\\": \\\"spring\\\", \\\"36\\\": \\\"january\\\", \\\"37\\\": \\\"winter\\\", \\\"38\\\": \\\"march\\\", \\\"39\\\": \\\"autumn\\\", \\\"40\\\": \\\"may\\\", \\\"41\\\": \\\"nice\\\", \\\"42\\\": \\\"september\\\", \\\"43\\\": \\\"july\\\", \\\"44\\\": \\\"april\\\", \\\"45\\\": \\\"november\\\", \\\"46\\\": \\\"summer\\\", \\\"47\\\": \\\"december\\\", \\\"48\\\": \\\"february\\\", \\\"49\\\": \\\"our\\\", \\\"50\\\": \\\"their\\\", \\\"51\\\": \\\"freezing\\\", \\\"52\\\": \\\"pleasant\\\", \\\"53\\\": \\\"beautiful\\\", \\\"54\\\": \\\"october\\\", \\\"55\\\": \\\"snowy\\\", \\\"56\\\": \\\"warm\\\", \\\"57\\\": \\\"cold\\\", \\\"58\\\": \\\"wonderful\\\", \\\"59\\\": \\\"dry\\\", \\\"60\\\": \\\"busy\\\", \\\"61\\\": \\\"august\\\", \\\"62\\\": \\\"chilly\\\", \\\"63\\\": \\\"rainy\\\", \\\"64\\\": \\\"mild\\\", \\\"65\\\": \\\"wet\\\", \\\"66\\\": \\\"relaxing\\\", \\\"67\\\": \\\"quiet\\\", 
\\\"68\\\": \\\"hot\\\", \\\"69\\\": \\\"dislikes\\\", \\\"70\\\": \\\"likes\\\", \\\"71\\\": \\\"limes\\\", \\\"72\\\": \\\"lemons\\\", \\\"73\\\": \\\"grapes\\\", \\\"74\\\": \\\"mangoes\\\", \\\"75\\\": \\\"apples\\\", \\\"76\\\": \\\"peaches\\\", \\\"77\\\": \\\"oranges\\\", \\\"78\\\": \\\"pears\\\", \\\"79\\\": \\\"strawberries\\\", \\\"80\\\": \\\"bananas\\\", \\\"81\\\": \\\"to\\\", \\\"82\\\": \\\"grape\\\", \\\"83\\\": \\\"apple\\\", \\\"84\\\": \\\"orange\\\", \\\"85\\\": \\\"lemon\\\", \\\"86\\\": \\\"lime\\\", \\\"87\\\": \\\"banana\\\", \\\"88\\\": \\\"mango\\\", \\\"89\\\": \\\"pear\\\", \\\"90\\\": \\\"strawberry\\\", \\\"91\\\": \\\"peach\\\", \\\"92\\\": \\\"like\\\", \\\"93\\\": \\\"dislike\\\", \\\"94\\\": \\\"they\\\", \\\"95\\\": \\\"that\\\", \\\"96\\\": \\\"i\\\", \\\"97\\\": \\\"we\\\", \\\"98\\\": \\\"you\\\", \\\"99\\\": \\\"animal\\\", \\\"100\\\": \\\"a\\\", \\\"101\\\": \\\"truck\\\", \\\"102\\\": \\\"car\\\", \\\"103\\\": \\\"automobile\\\", \\\"104\\\": \\\"was\\\", \\\"105\\\": \\\"next\\\", \\\"106\\\": \\\"go\\\", \\\"107\\\": \\\"driving\\\", \\\"108\\\": \\\"visit\\\", \\\"109\\\": \\\"little\\\", \\\"110\\\": \\\"big\\\", \\\"111\\\": \\\"old\\\", \\\"112\\\": \\\"yellow\\\", \\\"113\\\": \\\"red\\\", \\\"114\\\": \\\"rusty\\\", \\\"115\\\": \\\"blue\\\", \\\"116\\\": \\\"white\\\", \\\"117\\\": \\\"black\\\", \\\"118\\\": \\\"green\\\", \\\"119\\\": \\\"shiny\\\", \\\"120\\\": \\\"are\\\", \\\"121\\\": \\\"last\\\", \\\"122\\\": \\\"feared\\\", \\\"123\\\": \\\"animals\\\", \\\"124\\\": \\\"this\\\", \\\"125\\\": \\\"plan\\\", \\\"126\\\": \\\"going\\\", \\\"127\\\": \\\"saw\\\", \\\"128\\\": \\\"disliked\\\", \\\"129\\\": \\\"drives\\\", \\\"130\\\": \\\"drove\\\", \\\"131\\\": \\\"between\\\", \\\"132\\\": \\\"translate\\\", \\\"133\\\": \\\"plans\\\", \\\"134\\\": \\\"were\\\", \\\"135\\\": \\\"went\\\", \\\"136\\\": \\\"might\\\", \\\"137\\\": \\\"wanted\\\", \\\"138\\\": \\\"thinks\\\", \\\"139\\\": \\\"spanish\\\", 
\\\"140\\\": \\\"portuguese\\\", \\\"141\\\": \\\"chinese\\\", \\\"142\\\": \\\"english\\\", \\\"143\\\": \\\"french\\\", \\\"144\\\": \\\"translating\\\", \\\"145\\\": \\\"difficult\\\", \\\"146\\\": \\\"fun\\\", \\\"147\\\": \\\"easy\\\", \\\"148\\\": \\\"wants\\\", \\\"149\\\": \\\"think\\\", \\\"150\\\": \\\"why\\\", \\\"151\\\": \\\"it's\\\", \\\"152\\\": \\\"did\\\", \\\"153\\\": \\\"cat\\\", \\\"154\\\": \\\"shark\\\", \\\"155\\\": \\\"bird\\\", \\\"156\\\": \\\"mouse\\\", \\\"157\\\": \\\"horse\\\", \\\"158\\\": \\\"elephant\\\", \\\"159\\\": \\\"dog\\\", \\\"160\\\": \\\"monkey\\\", \\\"161\\\": \\\"lion\\\", \\\"162\\\": \\\"bear\\\", \\\"163\\\": \\\"rabbit\\\", \\\"164\\\": \\\"snake\\\", \\\"165\\\": \\\"when\\\", \\\"166\\\": \\\"want\\\", \\\"167\\\": \\\"do\\\", \\\"168\\\": \\\"how\\\", \\\"169\\\": \\\"elephants\\\", \\\"170\\\": \\\"horses\\\", \\\"171\\\": \\\"dogs\\\", \\\"172\\\": \\\"sharks\\\", \\\"173\\\": \\\"snakes\\\", \\\"174\\\": \\\"cats\\\", \\\"175\\\": \\\"rabbits\\\", \\\"176\\\": \\\"monkeys\\\", \\\"177\\\": \\\"bears\\\", \\\"178\\\": \\\"birds\\\", \\\"179\\\": \\\"lions\\\", \\\"180\\\": \\\"mice\\\", \\\"181\\\": \\\"didn't\\\", \\\"182\\\": \\\"eiffel\\\", \\\"183\\\": \\\"tower\\\", \\\"184\\\": \\\"grocery\\\", \\\"185\\\": \\\"store\\\", \\\"186\\\": \\\"football\\\", \\\"187\\\": \\\"field\\\", \\\"188\\\": \\\"lake\\\", \\\"189\\\": \\\"school\\\", \\\"190\\\": \\\"would\\\", \\\"191\\\": \\\"aren't\\\", \\\"192\\\": \\\"been\\\", \\\"193\\\": \\\"weather\\\", \\\"194\\\": \\\"does\\\", \\\"195\\\": \\\"has\\\", \\\"196\\\": \\\"isn't\\\", \\\"197\\\": \\\"am\\\", \\\"198\\\": \\\"where\\\", \\\"199\\\": \\\"have\\\"}\", \"word_index\": \"{\\\"is\\\": 1, \\\"in\\\": 2, \\\"it\\\": 3, \\\"during\\\": 4, \\\"the\\\": 5, \\\"but\\\": 6, \\\"and\\\": 7, \\\"sometimes\\\": 8, \\\"usually\\\": 9, \\\"never\\\": 10, \\\"favorite\\\": 11, \\\"least\\\": 12, \\\"fruit\\\": 13, \\\"most\\\": 14, \\\"loved\\\": 15, \\\"liked\\\": 
16, \\\"new\\\": 17, \\\"paris\\\": 18, \\\"india\\\": 19, \\\"united\\\": 20, \\\"states\\\": 21, \\\"california\\\": 22, \\\"jersey\\\": 23, \\\"france\\\": 24, \\\"china\\\": 25, \\\"he\\\": 26, \\\"she\\\": 27, \\\"grapefruit\\\": 28, \\\"your\\\": 29, \\\"my\\\": 30, \\\"his\\\": 31, \\\"her\\\": 32, \\\"fall\\\": 33, \\\"june\\\": 34, \\\"spring\\\": 35, \\\"january\\\": 36, \\\"winter\\\": 37, \\\"march\\\": 38, \\\"autumn\\\": 39, \\\"may\\\": 40, \\\"nice\\\": 41, \\\"september\\\": 42, \\\"july\\\": 43, \\\"april\\\": 44, \\\"november\\\": 45, \\\"summer\\\": 46, \\\"december\\\": 47, \\\"february\\\": 48, \\\"our\\\": 49, \\\"their\\\": 50, \\\"freezing\\\": 51, \\\"pleasant\\\": 52, \\\"beautiful\\\": 53, \\\"october\\\": 54, \\\"snowy\\\": 55, \\\"warm\\\": 56, \\\"cold\\\": 57, \\\"wonderful\\\": 58, \\\"dry\\\": 59, \\\"busy\\\": 60, \\\"august\\\": 61, \\\"chilly\\\": 62, \\\"rainy\\\": 63, \\\"mild\\\": 64, \\\"wet\\\": 65, \\\"relaxing\\\": 66, \\\"quiet\\\": 67, \\\"hot\\\": 68, \\\"dislikes\\\": 69, \\\"likes\\\": 70, \\\"limes\\\": 71, \\\"lemons\\\": 72, \\\"grapes\\\": 73, \\\"mangoes\\\": 74, \\\"apples\\\": 75, \\\"peaches\\\": 76, \\\"oranges\\\": 77, \\\"pears\\\": 78, \\\"strawberries\\\": 79, \\\"bananas\\\": 80, \\\"to\\\": 81, \\\"grape\\\": 82, \\\"apple\\\": 83, \\\"orange\\\": 84, \\\"lemon\\\": 85, \\\"lime\\\": 86, \\\"banana\\\": 87, \\\"mango\\\": 88, \\\"pear\\\": 89, \\\"strawberry\\\": 90, \\\"peach\\\": 91, \\\"like\\\": 92, \\\"dislike\\\": 93, \\\"they\\\": 94, \\\"that\\\": 95, \\\"i\\\": 96, \\\"we\\\": 97, \\\"you\\\": 98, \\\"animal\\\": 99, \\\"a\\\": 100, \\\"truck\\\": 101, \\\"car\\\": 102, \\\"automobile\\\": 103, \\\"was\\\": 104, \\\"next\\\": 105, \\\"go\\\": 106, \\\"driving\\\": 107, \\\"visit\\\": 108, \\\"little\\\": 109, \\\"big\\\": 110, \\\"old\\\": 111, \\\"yellow\\\": 112, \\\"red\\\": 113, \\\"rusty\\\": 114, \\\"blue\\\": 115, \\\"white\\\": 116, \\\"black\\\": 117, \\\"green\\\": 118, 
\\\"shiny\\\": 119, \\\"are\\\": 120, \\\"last\\\": 121, \\\"feared\\\": 122, \\\"animals\\\": 123, \\\"this\\\": 124, \\\"plan\\\": 125, \\\"going\\\": 126, \\\"saw\\\": 127, \\\"disliked\\\": 128, \\\"drives\\\": 129, \\\"drove\\\": 130, \\\"between\\\": 131, \\\"translate\\\": 132, \\\"plans\\\": 133, \\\"were\\\": 134, \\\"went\\\": 135, \\\"might\\\": 136, \\\"wanted\\\": 137, \\\"thinks\\\": 138, \\\"spanish\\\": 139, \\\"portuguese\\\": 140, \\\"chinese\\\": 141, \\\"english\\\": 142, \\\"french\\\": 143, \\\"translating\\\": 144, \\\"difficult\\\": 145, \\\"fun\\\": 146, \\\"easy\\\": 147, \\\"wants\\\": 148, \\\"think\\\": 149, \\\"why\\\": 150, \\\"it's\\\": 151, \\\"did\\\": 152, \\\"cat\\\": 153, \\\"shark\\\": 154, \\\"bird\\\": 155, \\\"mouse\\\": 156, \\\"horse\\\": 157, \\\"elephant\\\": 158, \\\"dog\\\": 159, \\\"monkey\\\": 160, \\\"lion\\\": 161, \\\"bear\\\": 162, \\\"rabbit\\\": 163, \\\"snake\\\": 164, \\\"when\\\": 165, \\\"want\\\": 166, \\\"do\\\": 167, \\\"how\\\": 168, \\\"elephants\\\": 169, \\\"horses\\\": 170, \\\"dogs\\\": 171, \\\"sharks\\\": 172, \\\"snakes\\\": 173, \\\"cats\\\": 174, \\\"rabbits\\\": 175, \\\"monkeys\\\": 176, \\\"bears\\\": 177, \\\"birds\\\": 178, \\\"lions\\\": 179, \\\"mice\\\": 180, \\\"didn't\\\": 181, \\\"eiffel\\\": 182, \\\"tower\\\": 183, \\\"grocery\\\": 184, \\\"store\\\": 185, \\\"football\\\": 186, \\\"field\\\": 187, \\\"lake\\\": 188, \\\"school\\\": 189, \\\"would\\\": 190, \\\"aren't\\\": 191, \\\"been\\\": 192, \\\"weather\\\": 193, \\\"does\\\": 194, \\\"has\\\": 195, \\\"isn't\\\": 196, \\\"am\\\": 197, \\\"where\\\": 198, \\\"have\\\": 199}\"}}"
|
Task 2/french_tokenizer.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"{\"class_name\": \"Tokenizer\", \"config\": {\"num_words\": null, \"filters\": \"!\\\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\", \"lower\": true, \"split\": \" \", \"char_level\": false, \"oov_token\": null, \"document_count\": 137861, \"word_counts\": \"{\\\"new\\\": 11047, \\\"jersey\\\": 11052, \\\"est\\\": 196821, \\\"parfois\\\": 37746, \\\"calme\\\": 7256, \\\"pendant\\\": 10741, \\\"l'\\\": 32917, \\\"automne\\\": 14727, \\\"et\\\": 59851, \\\"il\\\": 84115, \\\"neigeux\\\": 1867, \\\"en\\\": 105768, \\\"avril\\\": 8954, \\\"les\\\": 65255, \\\"\\\\u00e9tats\\\": 11267, \\\"unis\\\": 11270, \\\"g\\\\u00e9n\\\\u00e9ralement\\\": 31292, \\\"froid\\\": 16794, \\\"juillet\\\": 8956, \\\"g\\\\u00e8le\\\": 3622, \\\"habituellement\\\": 6215, \\\"novembre\\\": 8951, \\\"california\\\": 3061, \\\"mars\\\": 9023, \\\"chaud\\\": 16405, \\\"juin\\\": 9133, \\\"l\\\\u00e9g\\\\u00e8re\\\": 63, \\\"fait\\\": 2916, \\\"septembre\\\": 8958, \\\"votre\\\": 9368, \\\"moins\\\": 27557, \\\"aim\\\\u00e9\\\": 25852, \\\"fruit\\\": 23626, \\\"le\\\": 35306, \\\"raisin\\\": 4852, \\\"mais\\\": 63987, \\\"mon\\\": 9403, \\\"la\\\": 49861, \\\"pomme\\\": 4848, \\\"son\\\": 16496, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9\\\": 23305, \\\"l'orange\\\": 4848, \\\"paris\\\": 11334, \\\"relaxant\\\": 8458, \\\"d\\\\u00e9cembre\\\": 8945, \\\"occup\\\\u00e9\\\": 7782, \\\"au\\\": 25738, \\\"printemps\\\": 9100, \\\"jamais\\\": 37215, \\\"chaude\\\": 1124, \\\"notre\\\": 8319, \\\"citron\\\": 4848, \\\"janvier\\\": 9090, \\\"chaux\\\": 4848, \\\"des\\\": 2435, \\\"fruits\\\": 3566, \\\"banane\\\": 4848, \\\"a\\\": 1356, \\\"vu\\\": 645, \\\"un\\\": 698, \\\"vieux\\\": 325, \\\"camion\\\": 1944, \\\"jaune\\\": 972, \\\"inde\\\": 11277, \\\"pluvieux\\\": 7658, \\\"ce\\\": 1572, \\\"chat\\\": 192, \\\"\\\\u00e9tait\\\": 1198, \\\"animal\\\": 2248, \\\"plus\\\": 14934, \\\"n'aime\\\": 3131, \\\"pamplemousse\\\": 10140, \\\"citrons\\\": 11679, \\\"verts\\\": 5835, \\\"californie\\\": 8189, 
\\\"ne\\\": 2715, \\\"f\\\\u00e9vrier\\\": 8942, \\\"gel\\\": 4886, \\\"chine\\\": 10936, \\\"agr\\\\u00e9able\\\": 17751, \\\"octobre\\\": 8911, \\\"merveilleux\\\": 8704, \\\"doux\\\": 8458, \\\"tranquille\\\": 1437, \\\"\\\\u00e0\\\": 13870, \\\"l'automne\\\": 3411, \\\"\\\\u00e9t\\\\u00e9\\\": 8999, \\\"france\\\": 11170, \\\"mois\\\": 14350, \\\"de\\\": 15070, \\\"mai\\\": 8995, \\\"frisquet\\\": 834, \\\"d\\\\u00e9teste\\\": 3743, \\\"raisins\\\": 5780, \\\"mangues\\\": 5774, \\\"leur\\\": 7855, \\\"mangue\\\": 4899, \\\"poire\\\": 4848, \\\"ao\\\\u00fbt\\\": 8789, \\\"beau\\\": 6387, \\\"pommes\\\": 5844, \\\"p\\\\u00eaches\\\": 5844, \\\"redout\\\\u00e9\\\": 576, \\\"que\\\": 667, \\\"requin\\\": 192, \\\"humide\\\": 8446, \\\"d'\\\": 5100, \\\"sec\\\": 7957, \\\"enneig\\\\u00e9e\\\": 4008, \\\"nous\\\": 2520, \\\"aimons\\\": 1111, \\\"oranges\\\": 5844, \\\"ils\\\": 3221, \\\"aiment\\\": 1126, \\\"poires\\\": 5844, \\\"elle\\\": 12080, \\\"petit\\\": 324, \\\"rouge\\\": 972, \\\"cher\\\": 1308, \\\"aim\\\\u00e9e\\\": 105, \\\"neige\\\": 3016, \\\"trop\\\": 173, \\\"monde\\\": 173, \\\"hiver\\\": 9038, \\\"sont\\\": 1018, \\\"n'aimait\\\": 561, \\\"pas\\\": 4495, \\\"une\\\": 1278, \\\"voiture\\\": 3510, \\\"rouill\\\\u00e9e\\\": 486, \\\"fraises\\\": 5844, \\\"cours\\\": 1927, \\\"j'aime\\\": 966, \\\"fraise\\\": 4848, \\\"bananes\\\": 5844, \\\"va\\\": 355, \\\"aux\\\": 392, \\\"prochain\\\": 1666, \\\"je\\\": 1548, \\\"pr\\\\u00e9vois\\\": 233, \\\"visiter\\\": 908, \\\"belle\\\": 2726, \\\"\\\\u00e9l\\\\u00e9phants\\\": 64, \\\"\\\\u00e9taient\\\": 357, \\\"ses\\\": 402, \\\"animaux\\\": 768, \\\"redout\\\\u00e9s\\\": 190, \\\"vont\\\": 168, \\\"aime\\\": 8870, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9e\\\": 770, \\\"n'aiment\\\": 1111, \\\"i\\\": 150, \\\"comme\\\": 259, \\\"conduit\\\": 1706, \\\"p\\\\u00eache\\\": 4848, \\\"nouvelle\\\": 648, \\\"bleue\\\": 504, \\\"vous\\\": 2541, \\\"aimez\\\": 1053, \\\"cet\\\": 286, \\\"oiseau\\\": 128, 
\\\"pamplemousses\\\": 552, \\\"pleut\\\": 562, \\\"magnifique\\\": 104, \\\"favori\\\": 3857, \\\"vos\\\": 225, \\\"aim\\\\u00e9s\\\": 237, \\\"chevaux\\\": 64, \\\"n'aimez\\\": 1094, \\\"n'aimons\\\": 97, \\\"souris\\\": 256, \\\"d\\\\u00e9testons\\\": 1001, \\\"all\\\\u00e9\\\": 187, \\\"dernier\\\": 757, \\\"conduisait\\\": 673, \\\"petite\\\": 615, \\\"glaciales\\\": 307, \\\"cheval\\\": 192, \\\"vieille\\\": 647, \\\"chiens\\\": 64, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9s\\\": 383, \\\"blanche\\\": 579, \\\"occup\\\\u00e9e\\\": 836, \\\"nos\\\": 613, \\\"l'\\\\u00e9l\\\\u00e9phant\\\": 64, \\\"nouveau\\\": 502, \\\"noire\\\": 602, \\\"pluies\\\": 367, \\\"pense\\\": 540, \\\"qu'il\\\": 393, \\\"difficile\\\": 260, \\\"traduire\\\": 501, \\\"entre\\\": 540, \\\"espagnol\\\": 312, \\\"portugais\\\": 312, \\\"bleu\\\": 468, \\\"rouill\\\\u00e9\\\": 454, \\\"aimait\\\": 707, \\\"grande\\\": 459, \\\"verte\\\": 628, \\\"traduction\\\": 277, \\\"amusant\\\": 260, \\\"cette\\\": 1239, \\\"vert\\\": 344, \\\"grand\\\": 81, \\\"blanc\\\": 393, \\\"volant\\\": 165, \\\"gros\\\": 258, \\\"o\\\\u00f9\\\": 12, \\\"chien\\\": 192, \\\"leurs\\\": 1072, \\\"pourquoi\\\": 240, \\\"l'automobile\\\": 100, \\\"pourrait\\\": 252, \\\"se\\\": 461, \\\"rendre\\\": 350, \\\"pr\\\\u00e9voyons\\\": 232, \\\"maillot\\\": 173, \\\"grosse\\\": 185, \\\"brillant\\\": 587, \\\"pr\\\\u00e9voient\\\": 75, \\\"mouill\\\\u00e9e\\\": 7, \\\"lui\\\": 70, \\\"d\\\\u00e9tendre\\\": 111, \\\"automobile\\\": 278, \\\"pourraient\\\": 126, \\\"aller\\\": 1180, \\\"mes\\\": 297, \\\"s\\\\u00e8che\\\": 837, \\\"l'oiseau\\\": 64, \\\"pluie\\\": 174, \\\"requins\\\": 64, \\\"noir\\\": 370, \\\"singe\\\": 192, \\\"d\\\\u00e9testait\\\": 87, \\\"comment\\\": 67, \\\"temps\\\": 33, \\\"dans\\\": 12, \\\"lion\\\": 192, \\\"pr\\\\u00e9voit\\\": 75, \\\"ours\\\": 192, \\\"porcelaine\\\": 17, \\\"cl\\\\u00e9mentes\\\": 200, \\\"pla\\\\u00eet\\\": 13, \\\"proches\\\": 20, \\\"brillante\\\": 385, \\\"lapin\\\": 192, 
\\\"l'ours\\\": 64, \\\"chinois\\\": 312, \\\"quand\\\": 144, \\\"tour\\\": 57, \\\"eiffel\\\": 57, \\\"allons\\\": 45, \\\"l'\\\\u00e9picerie\\\": 57, \\\"voulait\\\": 252, \\\"c\\\\u00e9page\\\": 60, \\\"t\\\": 18, \\\"terrain\\\": 57, \\\"football\\\": 57, \\\"du\\\": 39, \\\"veut\\\": 252, \\\"\\\\u00e9l\\\\u00e9phant\\\": 128, \\\"gel\\\\u00e9\\\": 94, \\\"bien\\\": 77, \\\"enneig\\\\u00e9\\\": 7, \\\"gel\\\\u00e9s\\\": 5, \\\"serpent\\\": 192, \\\"all\\\\u00e9s\\\": 150, \\\"all\\\\u00e9e\\\": 150, \\\"envisage\\\": 360, \\\"peu\\\": 41, \\\"mouill\\\\u00e9\\\": 273, \\\"serpents\\\": 64, \\\"pensez\\\": 60, \\\"facile\\\": 260, \\\"anglais\\\": 312, \\\"fran\\\\u00e7ais\\\": 312, \\\"voulez\\\": 12, \\\"grandes\\\": 16, \\\"avez\\\": 162, \\\"aimeraient\\\": 12, \\\"allez\\\": 45, \\\"chats\\\": 64, \\\"lapins\\\": 64, \\\"visite\\\": 68, \\\"ont\\\": 194, \\\"intention\\\": 206, \\\"n'est\\\": 47, \\\"derni\\\\u00e8re\\\": 24, \\\"voulaient\\\": 126, \\\"singes\\\": 64, \\\"\\\\u00eates\\\": 24, \\\"qu'elle\\\": 26, \\\"vers\\\": 76, \\\"lac\\\": 57, \\\"pousse\\\": 41, \\\"d\\\\u00e9testez\\\": 17, \\\"manguiers\\\": 19, \\\"grands\\\": 9, \\\"l'\\\\u00e9cole\\\": 57, \\\"l'animal\\\": 56, \\\"at\\\": 32, \\\"oiseaux\\\": 64, \\\"ressort\\\": 2, \\\"petits\\\": 10, \\\"n'a\\\": 12, \\\"veulent\\\": 126, \\\"rouille\\\": 32, \\\"frais\\\": 20, \\\"limes\\\": 9, \\\"lions\\\": 64, \\\"douce\\\": 14, \\\"envisagent\\\": 9, \\\"petites\\\": 26, \\\"vais\\\": 24, \\\"durant\\\": 14, \\\"c'est\\\": 17, \\\"cong\\\\u00e9lation\\\": 14, \\\"allions\\\": 1, \\\"voudrait\\\": 24, \\\"d\\\\u00e9tend\\\": 2, \\\"trouv\\\\u00e9\\\": 1, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9es\\\": 16, \\\"conduite\\\": 6, \\\"grosses\\\": 8, \\\"b\\\\u00e9nigne\\\": 8, \\\"avons\\\": 19, \\\"sur\\\": 28, \\\"redout\\\\u00e9e\\\": 2, \\\"etats\\\": 3, \\\"moindres\\\": 7, \\\"n'\\\\u00eates\\\": 3, \\\"vit\\\": 3, \\\"as\\\": 1, \\\"tu\\\": 2, \\\"qui\\\": 2, \\\"faire\\\": 1, 
\\\"traduis\\\": 2, \\\"favoris\\\": 1, \\\"souvent\\\": 1, \\\"es\\\": 1, \\\"appr\\\\u00e9ci\\\\u00e9\\\": 2, \\\"moteur\\\": 1, \\\"tout\\\": 4}\", \"word_docs\": \"{\\\"l'\\\": 28828, \\\"en\\\": 72048, \\\"automne\\\": 14398, \\\"calme\\\": 7256, \\\"et\\\": 59851, \\\"neigeux\\\": 1867, \\\"est\\\": 104279, \\\"il\\\": 84114, \\\"pendant\\\": 10741, \\\"new\\\": 11047, \\\"parfois\\\": 32946, \\\"jersey\\\": 11052, \\\"avril\\\": 8954, \\\"unis\\\": 11270, \\\"froid\\\": 16323, \\\"les\\\": 33404, \\\"juillet\\\": 8956, \\\"\\\\u00e9tats\\\": 11267, \\\"habituellement\\\": 6112, \\\"g\\\\u00e8le\\\": 3622, \\\"novembre\\\": 8951, \\\"g\\\\u00e9n\\\\u00e9ralement\\\": 28101, \\\"mars\\\": 9023, \\\"chaud\\\": 15908, \\\"california\\\": 3061, \\\"juin\\\": 9133, \\\"l\\\\u00e9g\\\\u00e8re\\\": 63, \\\"fait\\\": 2916, \\\"septembre\\\": 8958, \\\"mon\\\": 9403, \\\"moins\\\": 14357, \\\"votre\\\": 9368, \\\"mais\\\": 63987, \\\"la\\\": 39038, \\\"raisin\\\": 4852, \\\"fruit\\\": 23626, \\\"aim\\\\u00e9\\\": 14038, \\\"pomme\\\": 4848, \\\"le\\\": 28877, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9\\\": 14722, \\\"l'orange\\\": 4848, \\\"son\\\": 15184, \\\"relaxant\\\": 8458, \\\"paris\\\": 11334, \\\"d\\\\u00e9cembre\\\": 8945, \\\"printemps\\\": 9100, \\\"occup\\\\u00e9\\\": 7782, \\\"au\\\": 24675, \\\"jamais\\\": 32618, \\\"chaude\\\": 1124, \\\"citron\\\": 4848, \\\"notre\\\": 8319, \\\"janvier\\\": 9090, \\\"banane\\\": 4848, \\\"fruits\\\": 3566, \\\"chaux\\\": 4848, \\\"des\\\": 2435, \\\"a\\\": 1356, \\\"jaune\\\": 972, \\\"vu\\\": 645, \\\"camion\\\": 1944, \\\"vieux\\\": 325, \\\"un\\\": 698, \\\"inde\\\": 11277, \\\"pluvieux\\\": 7658, \\\"animal\\\": 2248, \\\"\\\\u00e9tait\\\": 1198, \\\"plus\\\": 8334, \\\"chat\\\": 192, \\\"ce\\\": 1572, \\\"pamplemousse\\\": 10140, \\\"n'aime\\\": 3131, \\\"verts\\\": 5835, \\\"citrons\\\": 10568, \\\"gel\\\": 4886, \\\"f\\\\u00e9vrier\\\": 8942, \\\"californie\\\": 8189, \\\"ne\\\": 2707, \\\"chine\\\": 10936, 
\\\"agr\\\\u00e9able\\\": 17151, \\\"octobre\\\": 8910, \\\"merveilleux\\\": 8704, \\\"doux\\\": 8458, \\\"tranquille\\\": 1437, \\\"\\\\u00e0\\\": 13381, \\\"l'automne\\\": 3411, \\\"\\\\u00e9t\\\\u00e9\\\": 8999, \\\"france\\\": 11170, \\\"mois\\\": 14311, \\\"mai\\\": 8995, \\\"frisquet\\\": 834, \\\"de\\\": 14740, \\\"mangues\\\": 5774, \\\"d\\\\u00e9teste\\\": 3743, \\\"raisins\\\": 5780, \\\"mangue\\\": 4899, \\\"poire\\\": 4848, \\\"leur\\\": 7855, \\\"ao\\\\u00fbt\\\": 8789, \\\"beau\\\": 6387, \\\"pommes\\\": 5844, \\\"p\\\\u00eaches\\\": 5844, \\\"requin\\\": 192, \\\"redout\\\\u00e9\\\": 576, \\\"que\\\": 667, \\\"d'\\\": 5100, \\\"sec\\\": 7957, \\\"humide\\\": 8446, \\\"enneig\\\\u00e9e\\\": 4008, \\\"aimons\\\": 1111, \\\"nous\\\": 2520, \\\"oranges\\\": 5844, \\\"poires\\\": 5844, \\\"ils\\\": 3221, \\\"aiment\\\": 1126, \\\"petit\\\": 324, \\\"rouge\\\": 972, \\\"elle\\\": 12080, \\\"aim\\\\u00e9e\\\": 105, \\\"cher\\\": 1308, \\\"monde\\\": 173, \\\"neige\\\": 3016, \\\"trop\\\": 173, \\\"hiver\\\": 9038, \\\"sont\\\": 1018, \\\"n'aimait\\\": 561, \\\"voiture\\\": 3510, \\\"une\\\": 1278, \\\"pas\\\": 4495, \\\"rouill\\\\u00e9e\\\": 486, \\\"fraises\\\": 5844, \\\"cours\\\": 1927, \\\"j'aime\\\": 966, \\\"fraise\\\": 4848, \\\"bananes\\\": 5844, \\\"va\\\": 355, \\\"aux\\\": 392, \\\"prochain\\\": 1666, \\\"visiter\\\": 908, \\\"pr\\\\u00e9vois\\\": 233, \\\"je\\\": 1548, \\\"belle\\\": 2725, \\\"ses\\\": 402, \\\"animaux\\\": 768, \\\"redout\\\\u00e9s\\\": 190, \\\"\\\\u00e9l\\\\u00e9phants\\\": 64, \\\"\\\\u00e9taient\\\": 357, \\\"vont\\\": 168, \\\"aime\\\": 8870, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9e\\\": 770, \\\"n'aiment\\\": 1111, \\\"i\\\": 150, \\\"comme\\\": 259, \\\"conduit\\\": 1706, \\\"p\\\\u00eache\\\": 4848, \\\"bleue\\\": 504, \\\"nouvelle\\\": 648, \\\"vous\\\": 2541, \\\"aimez\\\": 1053, \\\"cet\\\": 286, \\\"oiseau\\\": 128, \\\"pamplemousses\\\": 552, \\\"pleut\\\": 562, \\\"magnifique\\\": 104, \\\"favori\\\": 3857, 
\\\"aim\\\\u00e9s\\\": 237, \\\"vos\\\": 225, \\\"chevaux\\\": 64, \\\"n'aimez\\\": 1094, \\\"n'aimons\\\": 97, \\\"souris\\\": 256, \\\"d\\\\u00e9testons\\\": 1001, \\\"dernier\\\": 757, \\\"all\\\\u00e9\\\": 187, \\\"petite\\\": 615, \\\"conduisait\\\": 673, \\\"glaciales\\\": 307, \\\"cheval\\\": 192, \\\"vieille\\\": 647, \\\"chiens\\\": 64, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9s\\\": 383, \\\"blanche\\\": 579, \\\"occup\\\\u00e9e\\\": 836, \\\"nos\\\": 613, \\\"l'\\\\u00e9l\\\\u00e9phant\\\": 64, \\\"nouveau\\\": 502, \\\"noire\\\": 602, \\\"pluies\\\": 367, \\\"traduire\\\": 501, \\\"difficile\\\": 260, \\\"pense\\\": 540, \\\"entre\\\": 540, \\\"qu'il\\\": 393, \\\"portugais\\\": 312, \\\"espagnol\\\": 312, \\\"rouill\\\\u00e9\\\": 454, \\\"bleu\\\": 468, \\\"verte\\\": 628, \\\"grande\\\": 459, \\\"aimait\\\": 707, \\\"amusant\\\": 260, \\\"traduction\\\": 277, \\\"cette\\\": 1239, \\\"vert\\\": 344, \\\"grand\\\": 81, \\\"blanc\\\": 393, \\\"volant\\\": 165, \\\"gros\\\": 258, \\\"o\\\\u00f9\\\": 12, \\\"chien\\\": 192, \\\"leurs\\\": 1072, \\\"pourquoi\\\": 240, \\\"l'automobile\\\": 100, \\\"pourrait\\\": 252, \\\"rendre\\\": 350, \\\"se\\\": 461, \\\"maillot\\\": 173, \\\"pr\\\\u00e9voyons\\\": 232, \\\"grosse\\\": 185, \\\"brillant\\\": 587, \\\"pr\\\\u00e9voient\\\": 75, \\\"mouill\\\\u00e9e\\\": 7, \\\"lui\\\": 70, \\\"d\\\\u00e9tendre\\\": 111, \\\"automobile\\\": 278, \\\"aller\\\": 1180, \\\"pourraient\\\": 126, \\\"mes\\\": 297, \\\"s\\\\u00e8che\\\": 837, \\\"l'oiseau\\\": 64, \\\"pluie\\\": 174, \\\"requins\\\": 64, \\\"noir\\\": 370, \\\"singe\\\": 192, \\\"d\\\\u00e9testait\\\": 87, \\\"dans\\\": 12, \\\"temps\\\": 33, \\\"comment\\\": 67, \\\"lion\\\": 192, \\\"pr\\\\u00e9voit\\\": 75, \\\"ours\\\": 192, \\\"porcelaine\\\": 17, \\\"cl\\\\u00e9mentes\\\": 200, \\\"pla\\\\u00eet\\\": 13, \\\"proches\\\": 20, \\\"brillante\\\": 385, \\\"lapin\\\": 192, \\\"l'ours\\\": 64, \\\"chinois\\\": 312, \\\"quand\\\": 144, \\\"tour\\\": 57, \\\"eiffel\\\": 
57, \\\"allons\\\": 45, \\\"l'\\\\u00e9picerie\\\": 57, \\\"voulait\\\": 252, \\\"c\\\\u00e9page\\\": 60, \\\"football\\\": 57, \\\"t\\\": 18, \\\"terrain\\\": 57, \\\"du\\\": 39, \\\"veut\\\": 252, \\\"\\\\u00e9l\\\\u00e9phant\\\": 128, \\\"gel\\\\u00e9\\\": 94, \\\"bien\\\": 77, \\\"enneig\\\\u00e9\\\": 7, \\\"gel\\\\u00e9s\\\": 5, \\\"serpent\\\": 192, \\\"all\\\\u00e9s\\\": 150, \\\"all\\\\u00e9e\\\": 150, \\\"envisage\\\": 360, \\\"peu\\\": 41, \\\"mouill\\\\u00e9\\\": 273, \\\"serpents\\\": 64, \\\"facile\\\": 260, \\\"pensez\\\": 60, \\\"anglais\\\": 312, \\\"fran\\\\u00e7ais\\\": 312, \\\"voulez\\\": 12, \\\"grandes\\\": 16, \\\"avez\\\": 162, \\\"aimeraient\\\": 12, \\\"allez\\\": 45, \\\"chats\\\": 64, \\\"lapins\\\": 64, \\\"visite\\\": 68, \\\"intention\\\": 206, \\\"ont\\\": 194, \\\"n'est\\\": 47, \\\"derni\\\\u00e8re\\\": 24, \\\"voulaient\\\": 126, \\\"singes\\\": 64, \\\"\\\\u00eates\\\": 24, \\\"qu'elle\\\": 26, \\\"vers\\\": 76, \\\"lac\\\": 57, \\\"pousse\\\": 41, \\\"d\\\\u00e9testez\\\": 17, \\\"manguiers\\\": 19, \\\"grands\\\": 9, \\\"l'\\\\u00e9cole\\\": 57, \\\"l'animal\\\": 56, \\\"at\\\": 32, \\\"oiseaux\\\": 64, \\\"ressort\\\": 2, \\\"petits\\\": 10, \\\"n'a\\\": 12, \\\"veulent\\\": 126, \\\"rouille\\\": 32, \\\"frais\\\": 20, \\\"limes\\\": 9, \\\"lions\\\": 64, \\\"douce\\\": 14, \\\"envisagent\\\": 9, \\\"petites\\\": 26, \\\"vais\\\": 24, \\\"durant\\\": 14, \\\"c'est\\\": 17, \\\"cong\\\\u00e9lation\\\": 14, \\\"allions\\\": 1, \\\"voudrait\\\": 24, \\\"d\\\\u00e9tend\\\": 2, \\\"trouv\\\\u00e9\\\": 1, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9es\\\": 16, \\\"conduite\\\": 6, \\\"grosses\\\": 8, \\\"b\\\\u00e9nigne\\\": 8, \\\"avons\\\": 19, \\\"sur\\\": 28, \\\"redout\\\\u00e9e\\\": 2, \\\"etats\\\": 3, \\\"moindres\\\": 7, \\\"n'\\\\u00eates\\\": 3, \\\"vit\\\": 3, \\\"as\\\": 1, \\\"tu\\\": 2, \\\"qui\\\": 2, \\\"faire\\\": 1, \\\"traduis\\\": 2, \\\"favoris\\\": 1, \\\"souvent\\\": 1, \\\"es\\\": 1, \\\"appr\\\\u00e9ci\\\\u00e9\\\": 
1, \\\"moteur\\\": 1, \\\"tout\\\": 4}\", \"index_docs\": \"{\\\"11\\\": 28828, \\\"2\\\": 72048, \\\"24\\\": 14398, \\\"67\\\": 7256, \\\"6\\\": 59851, \\\"112\\\": 1867, \\\"1\\\": 104279, \\\"3\\\": 84114, \\\"37\\\": 10741, \\\"35\\\": 11047, \\\"8\\\": 32946, \\\"34\\\": 11052, \\\"50\\\": 8954, \\\"31\\\": 11270, \\\"19\\\": 16323, \\\"4\\\": 33404, \\\"49\\\": 8956, \\\"32\\\": 11267, \\\"69\\\": 6112, \\\"95\\\": 3622, \\\"51\\\": 8951, \\\"12\\\": 28101, \\\"45\\\": 9023, \\\"21\\\": 15908, \\\"101\\\": 3061, \\\"41\\\": 9133, \\\"269\\\": 63, \\\"103\\\": 2916, \\\"48\\\": 8958, \\\"39\\\": 9403, \\\"13\\\": 14357, \\\"40\\\": 9368, \\\"5\\\": 63987, \\\"7\\\": 39038, \\\"82\\\": 4852, \\\"16\\\": 23626, \\\"14\\\": 14038, \\\"83\\\": 4848, \\\"10\\\": 28877, \\\"17\\\": 14722, \\\"84\\\": 4848, \\\"20\\\": 15184, \\\"58\\\": 8458, \\\"29\\\": 11334, \\\"52\\\": 8945, \\\"42\\\": 9100, \\\"65\\\": 7782, \\\"15\\\": 24675, \\\"9\\\": 32618, \\\"125\\\": 1124, \\\"85\\\": 4848, \\\"61\\\": 8319, \\\"43\\\": 9090, \\\"87\\\": 4848, \\\"96\\\": 3566, \\\"86\\\": 4848, \\\"108\\\": 2435, \\\"118\\\": 1356, \\\"133\\\": 972, \\\"149\\\": 645, \\\"110\\\": 1944, \\\"183\\\": 325, \\\"144\\\": 698, \\\"30\\\": 11277, \\\"66\\\": 7658, \\\"109\\\": 2248, \\\"122\\\": 1198, \\\"23\\\": 8334, \\\"213\\\": 192, \\\"115\\\": 1572, \\\"38\\\": 10140, \\\"100\\\": 3131, \\\"76\\\": 5835, \\\"28\\\": 10568, \\\"81\\\": 4886, \\\"53\\\": 8942, \\\"62\\\": 8189, \\\"105\\\": 2707, \\\"36\\\": 10936, \\\"18\\\": 17151, \\\"54\\\": 8910, \\\"57\\\": 8704, \\\"59\\\": 8458, \\\"117\\\": 1437, \\\"26\\\": 13381, \\\"98\\\": 3411, \\\"46\\\": 8999, \\\"33\\\": 11170, \\\"25\\\": 14311, \\\"47\\\": 8995, \\\"139\\\": 834, \\\"22\\\": 14740, \\\"78\\\": 5774, \\\"94\\\": 3743, \\\"77\\\": 5780, \\\"80\\\": 4899, \\\"88\\\": 4848, \\\"64\\\": 7855, \\\"56\\\": 8789, \\\"68\\\": 6387, \\\"70\\\": 5844, \\\"71\\\": 5844, \\\"214\\\": 192, \\\"156\\\": 576, \\\"146\\\": 667, 
\\\"79\\\": 5100, \\\"63\\\": 7957, \\\"60\\\": 8446, \\\"92\\\": 4008, \\\"126\\\": 1111, \\\"107\\\": 2520, \\\"72\\\": 5844, \\\"73\\\": 5844, \\\"99\\\": 3221, \\\"124\\\": 1126, \\\"184\\\": 324, \\\"134\\\": 972, \\\"27\\\": 12080, \\\"242\\\": 105, \\\"119\\\": 1308, \\\"227\\\": 173, \\\"102\\\": 3016, \\\"226\\\": 173, \\\"44\\\": 9038, \\\"131\\\": 1018, \\\"158\\\": 561, \\\"97\\\": 3510, \\\"120\\\": 1278, \\\"91\\\": 4495, \\\"165\\\": 486, \\\"74\\\": 5844, \\\"111\\\": 1927, \\\"135\\\": 966, \\\"89\\\": 4848, \\\"75\\\": 5844, \\\"180\\\": 355, \\\"173\\\": 392, \\\"114\\\": 1666, \\\"136\\\": 908, \\\"207\\\": 233, \\\"116\\\": 1548, \\\"104\\\": 2725, \\\"170\\\": 402, \\\"141\\\": 768, \\\"222\\\": 190, \\\"256\\\": 64, \\\"179\\\": 357, \\\"229\\\": 168, \\\"55\\\": 8870, \\\"140\\\": 770, \\\"127\\\": 1111, \\\"232\\\": 150, \\\"199\\\": 259, \\\"113\\\": 1706, \\\"90\\\": 4848, \\\"162\\\": 504, \\\"147\\\": 648, \\\"106\\\": 2541, \\\"130\\\": 1053, \\\"192\\\": 286, \\\"236\\\": 128, \\\"159\\\": 552, \\\"157\\\": 562, \\\"243\\\": 104, \\\"93\\\": 3857, \\\"206\\\": 237, \\\"209\\\": 225, \\\"257\\\": 64, \\\"128\\\": 1094, \\\"245\\\": 97, \\\"201\\\": 256, \\\"132\\\": 1001, \\\"142\\\": 757, \\\"223\\\": 187, \\\"151\\\": 615, \\\"145\\\": 673, \\\"190\\\": 307, \\\"215\\\": 192, \\\"148\\\": 647, \\\"258\\\": 64, \\\"175\\\": 383, \\\"155\\\": 579, \\\"138\\\": 836, \\\"152\\\": 613, \\\"259\\\": 64, \\\"163\\\": 502, \\\"153\\\": 602, \\\"177\\\": 367, \\\"164\\\": 501, \\\"196\\\": 260, \\\"160\\\": 540, \\\"161\\\": 540, \\\"171\\\": 393, \\\"186\\\": 312, \\\"185\\\": 312, \\\"169\\\": 454, \\\"166\\\": 468, \\\"150\\\": 628, \\\"168\\\": 459, \\\"143\\\": 707, \\\"197\\\": 260, \\\"194\\\": 277, \\\"121\\\": 1239, \\\"182\\\": 344, \\\"248\\\": 81, \\\"172\\\": 393, \\\"230\\\": 165, \\\"200\\\": 258, \\\"310\\\": 12, \\\"216\\\": 192, \\\"129\\\": 1072, \\\"205\\\": 240, \\\"244\\\": 100, \\\"202\\\": 252, \\\"181\\\": 350, 
\\\"167\\\": 461, \\\"228\\\": 173, \\\"208\\\": 232, \\\"224\\\": 185, \\\"154\\\": 587, \\\"251\\\": 75, \\\"321\\\": 7, \\\"253\\\": 70, \\\"241\\\": 111, \\\"193\\\": 278, \\\"123\\\": 1180, \\\"238\\\": 126, \\\"191\\\": 297, \\\"137\\\": 837, \\\"260\\\": 64, \\\"225\\\": 174, \\\"261\\\": 64, \\\"176\\\": 370, \\\"217\\\": 192, \\\"247\\\": 87, \\\"311\\\": 12, \\\"286\\\": 33, \\\"255\\\": 67, \\\"218\\\": 192, \\\"252\\\": 75, \\\"219\\\": 192, \\\"301\\\": 17, \\\"211\\\": 200, \\\"309\\\": 13, \\\"296\\\": 20, \\\"174\\\": 385, \\\"220\\\": 192, \\\"262\\\": 64, \\\"187\\\": 312, \\\"235\\\": 144, \\\"272\\\": 57, \\\"273\\\": 57, \\\"281\\\": 45, \\\"274\\\": 57, \\\"203\\\": 252, \\\"270\\\": 60, \\\"276\\\": 57, \\\"300\\\": 18, \\\"275\\\": 57, \\\"285\\\": 39, \\\"204\\\": 252, \\\"237\\\": 128, \\\"246\\\": 94, \\\"249\\\": 77, \\\"322\\\": 7, \\\"325\\\": 5, \\\"221\\\": 192, \\\"233\\\": 150, \\\"234\\\": 150, \\\"178\\\": 360, \\\"283\\\": 41, \\\"195\\\": 273, \\\"263\\\": 64, \\\"198\\\": 260, \\\"271\\\": 60, \\\"188\\\": 312, \\\"189\\\": 312, \\\"312\\\": 12, \\\"304\\\": 16, \\\"231\\\": 162, \\\"313\\\": 12, \\\"282\\\": 45, \\\"264\\\": 64, \\\"265\\\": 64, \\\"254\\\": 68, \\\"210\\\": 206, \\\"212\\\": 194, \\\"280\\\": 47, \\\"292\\\": 24, \\\"239\\\": 126, \\\"266\\\": 64, \\\"293\\\": 24, \\\"290\\\": 26, \\\"250\\\": 76, \\\"277\\\": 57, \\\"284\\\": 41, \\\"302\\\": 17, \\\"298\\\": 19, \\\"316\\\": 9, \\\"278\\\": 57, \\\"279\\\": 56, \\\"287\\\": 32, \\\"267\\\": 64, \\\"330\\\": 2, \\\"315\\\": 10, \\\"314\\\": 12, \\\"240\\\": 126, \\\"288\\\": 32, \\\"297\\\": 20, \\\"317\\\": 9, \\\"268\\\": 64, \\\"306\\\": 14, \\\"318\\\": 9, \\\"291\\\": 26, \\\"294\\\": 24, \\\"307\\\": 14, \\\"303\\\": 17, \\\"308\\\": 14, \\\"337\\\": 1, \\\"295\\\": 24, \\\"331\\\": 2, \\\"338\\\": 1, \\\"305\\\": 16, \\\"324\\\": 6, \\\"319\\\": 8, \\\"320\\\": 8, \\\"299\\\": 19, \\\"289\\\": 28, \\\"332\\\": 2, \\\"327\\\": 3, \\\"323\\\": 7, 
\\\"328\\\": 3, \\\"329\\\": 3, \\\"339\\\": 1, \\\"333\\\": 2, \\\"334\\\": 2, \\\"340\\\": 1, \\\"335\\\": 2, \\\"341\\\": 1, \\\"342\\\": 1, \\\"343\\\": 1, \\\"336\\\": 1, \\\"344\\\": 1, \\\"326\\\": 4}\", \"index_word\": \"{\\\"1\\\": \\\"est\\\", \\\"2\\\": \\\"en\\\", \\\"3\\\": \\\"il\\\", \\\"4\\\": \\\"les\\\", \\\"5\\\": \\\"mais\\\", \\\"6\\\": \\\"et\\\", \\\"7\\\": \\\"la\\\", \\\"8\\\": \\\"parfois\\\", \\\"9\\\": \\\"jamais\\\", \\\"10\\\": \\\"le\\\", \\\"11\\\": \\\"l'\\\", \\\"12\\\": \\\"g\\\\u00e9n\\\\u00e9ralement\\\", \\\"13\\\": \\\"moins\\\", \\\"14\\\": \\\"aim\\\\u00e9\\\", \\\"15\\\": \\\"au\\\", \\\"16\\\": \\\"fruit\\\", \\\"17\\\": \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9\\\", \\\"18\\\": \\\"agr\\\\u00e9able\\\", \\\"19\\\": \\\"froid\\\", \\\"20\\\": \\\"son\\\", \\\"21\\\": \\\"chaud\\\", \\\"22\\\": \\\"de\\\", \\\"23\\\": \\\"plus\\\", \\\"24\\\": \\\"automne\\\", \\\"25\\\": \\\"mois\\\", \\\"26\\\": \\\"\\\\u00e0\\\", \\\"27\\\": \\\"elle\\\", \\\"28\\\": \\\"citrons\\\", \\\"29\\\": \\\"paris\\\", \\\"30\\\": \\\"inde\\\", \\\"31\\\": \\\"unis\\\", \\\"32\\\": \\\"\\\\u00e9tats\\\", \\\"33\\\": \\\"france\\\", \\\"34\\\": \\\"jersey\\\", \\\"35\\\": \\\"new\\\", \\\"36\\\": \\\"chine\\\", \\\"37\\\": \\\"pendant\\\", \\\"38\\\": \\\"pamplemousse\\\", \\\"39\\\": \\\"mon\\\", \\\"40\\\": \\\"votre\\\", \\\"41\\\": \\\"juin\\\", \\\"42\\\": \\\"printemps\\\", \\\"43\\\": \\\"janvier\\\", \\\"44\\\": \\\"hiver\\\", \\\"45\\\": \\\"mars\\\", \\\"46\\\": \\\"\\\\u00e9t\\\\u00e9\\\", \\\"47\\\": \\\"mai\\\", \\\"48\\\": \\\"septembre\\\", \\\"49\\\": \\\"juillet\\\", \\\"50\\\": \\\"avril\\\", \\\"51\\\": \\\"novembre\\\", \\\"52\\\": \\\"d\\\\u00e9cembre\\\", \\\"53\\\": \\\"f\\\\u00e9vrier\\\", \\\"54\\\": \\\"octobre\\\", \\\"55\\\": \\\"aime\\\", \\\"56\\\": \\\"ao\\\\u00fbt\\\", \\\"57\\\": \\\"merveilleux\\\", \\\"58\\\": \\\"relaxant\\\", \\\"59\\\": \\\"doux\\\", \\\"60\\\": \\\"humide\\\", \\\"61\\\": \\\"notre\\\", \\\"62\\\": 
\\\"californie\\\", \\\"63\\\": \\\"sec\\\", \\\"64\\\": \\\"leur\\\", \\\"65\\\": \\\"occup\\\\u00e9\\\", \\\"66\\\": \\\"pluvieux\\\", \\\"67\\\": \\\"calme\\\", \\\"68\\\": \\\"beau\\\", \\\"69\\\": \\\"habituellement\\\", \\\"70\\\": \\\"pommes\\\", \\\"71\\\": \\\"p\\\\u00eaches\\\", \\\"72\\\": \\\"oranges\\\", \\\"73\\\": \\\"poires\\\", \\\"74\\\": \\\"fraises\\\", \\\"75\\\": \\\"bananes\\\", \\\"76\\\": \\\"verts\\\", \\\"77\\\": \\\"raisins\\\", \\\"78\\\": \\\"mangues\\\", \\\"79\\\": \\\"d'\\\", \\\"80\\\": \\\"mangue\\\", \\\"81\\\": \\\"gel\\\", \\\"82\\\": \\\"raisin\\\", \\\"83\\\": \\\"pomme\\\", \\\"84\\\": \\\"l'orange\\\", \\\"85\\\": \\\"citron\\\", \\\"86\\\": \\\"chaux\\\", \\\"87\\\": \\\"banane\\\", \\\"88\\\": \\\"poire\\\", \\\"89\\\": \\\"fraise\\\", \\\"90\\\": \\\"p\\\\u00eache\\\", \\\"91\\\": \\\"pas\\\", \\\"92\\\": \\\"enneig\\\\u00e9e\\\", \\\"93\\\": \\\"favori\\\", \\\"94\\\": \\\"d\\\\u00e9teste\\\", \\\"95\\\": \\\"g\\\\u00e8le\\\", \\\"96\\\": \\\"fruits\\\", \\\"97\\\": \\\"voiture\\\", \\\"98\\\": \\\"l'automne\\\", \\\"99\\\": \\\"ils\\\", \\\"100\\\": \\\"n'aime\\\", \\\"101\\\": \\\"california\\\", \\\"102\\\": \\\"neige\\\", \\\"103\\\": \\\"fait\\\", \\\"104\\\": \\\"belle\\\", \\\"105\\\": \\\"ne\\\", \\\"106\\\": \\\"vous\\\", \\\"107\\\": \\\"nous\\\", \\\"108\\\": \\\"des\\\", \\\"109\\\": \\\"animal\\\", \\\"110\\\": \\\"camion\\\", \\\"111\\\": \\\"cours\\\", \\\"112\\\": \\\"neigeux\\\", \\\"113\\\": \\\"conduit\\\", \\\"114\\\": \\\"prochain\\\", \\\"115\\\": \\\"ce\\\", \\\"116\\\": \\\"je\\\", \\\"117\\\": \\\"tranquille\\\", \\\"118\\\": \\\"a\\\", \\\"119\\\": \\\"cher\\\", \\\"120\\\": \\\"une\\\", \\\"121\\\": \\\"cette\\\", \\\"122\\\": \\\"\\\\u00e9tait\\\", \\\"123\\\": \\\"aller\\\", \\\"124\\\": \\\"aiment\\\", \\\"125\\\": \\\"chaude\\\", \\\"126\\\": \\\"aimons\\\", \\\"127\\\": \\\"n'aiment\\\", \\\"128\\\": \\\"n'aimez\\\", \\\"129\\\": \\\"leurs\\\", \\\"130\\\": \\\"aimez\\\", \\\"131\\\": 
\\\"sont\\\", \\\"132\\\": \\\"d\\\\u00e9testons\\\", \\\"133\\\": \\\"jaune\\\", \\\"134\\\": \\\"rouge\\\", \\\"135\\\": \\\"j'aime\\\", \\\"136\\\": \\\"visiter\\\", \\\"137\\\": \\\"s\\\\u00e8che\\\", \\\"138\\\": \\\"occup\\\\u00e9e\\\", \\\"139\\\": \\\"frisquet\\\", \\\"140\\\": \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9e\\\", \\\"141\\\": \\\"animaux\\\", \\\"142\\\": \\\"dernier\\\", \\\"143\\\": \\\"aimait\\\", \\\"144\\\": \\\"un\\\", \\\"145\\\": \\\"conduisait\\\", \\\"146\\\": \\\"que\\\", \\\"147\\\": \\\"nouvelle\\\", \\\"148\\\": \\\"vieille\\\", \\\"149\\\": \\\"vu\\\", \\\"150\\\": \\\"verte\\\", \\\"151\\\": \\\"petite\\\", \\\"152\\\": \\\"nos\\\", \\\"153\\\": \\\"noire\\\", \\\"154\\\": \\\"brillant\\\", \\\"155\\\": \\\"blanche\\\", \\\"156\\\": \\\"redout\\\\u00e9\\\", \\\"157\\\": \\\"pleut\\\", \\\"158\\\": \\\"n'aimait\\\", \\\"159\\\": \\\"pamplemousses\\\", \\\"160\\\": \\\"pense\\\", \\\"161\\\": \\\"entre\\\", \\\"162\\\": \\\"bleue\\\", \\\"163\\\": \\\"nouveau\\\", \\\"164\\\": \\\"traduire\\\", \\\"165\\\": \\\"rouill\\\\u00e9e\\\", \\\"166\\\": \\\"bleu\\\", \\\"167\\\": \\\"se\\\", \\\"168\\\": \\\"grande\\\", \\\"169\\\": \\\"rouill\\\\u00e9\\\", \\\"170\\\": \\\"ses\\\", \\\"171\\\": \\\"qu'il\\\", \\\"172\\\": \\\"blanc\\\", \\\"173\\\": \\\"aux\\\", \\\"174\\\": \\\"brillante\\\", \\\"175\\\": \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9s\\\", \\\"176\\\": \\\"noir\\\", \\\"177\\\": \\\"pluies\\\", \\\"178\\\": \\\"envisage\\\", \\\"179\\\": \\\"\\\\u00e9taient\\\", \\\"180\\\": \\\"va\\\", \\\"181\\\": \\\"rendre\\\", \\\"182\\\": \\\"vert\\\", \\\"183\\\": \\\"vieux\\\", \\\"184\\\": \\\"petit\\\", \\\"185\\\": \\\"espagnol\\\", \\\"186\\\": \\\"portugais\\\", \\\"187\\\": \\\"chinois\\\", \\\"188\\\": \\\"anglais\\\", \\\"189\\\": \\\"fran\\\\u00e7ais\\\", \\\"190\\\": \\\"glaciales\\\", \\\"191\\\": \\\"mes\\\", \\\"192\\\": \\\"cet\\\", \\\"193\\\": \\\"automobile\\\", \\\"194\\\": \\\"traduction\\\", \\\"195\\\": 
\\\"mouill\\\\u00e9\\\", \\\"196\\\": \\\"difficile\\\", \\\"197\\\": \\\"amusant\\\", \\\"198\\\": \\\"facile\\\", \\\"199\\\": \\\"comme\\\", \\\"200\\\": \\\"gros\\\", \\\"201\\\": \\\"souris\\\", \\\"202\\\": \\\"pourrait\\\", \\\"203\\\": \\\"voulait\\\", \\\"204\\\": \\\"veut\\\", \\\"205\\\": \\\"pourquoi\\\", \\\"206\\\": \\\"aim\\\\u00e9s\\\", \\\"207\\\": \\\"pr\\\\u00e9vois\\\", \\\"208\\\": \\\"pr\\\\u00e9voyons\\\", \\\"209\\\": \\\"vos\\\", \\\"210\\\": \\\"intention\\\", \\\"211\\\": \\\"cl\\\\u00e9mentes\\\", \\\"212\\\": \\\"ont\\\", \\\"213\\\": \\\"chat\\\", \\\"214\\\": \\\"requin\\\", \\\"215\\\": \\\"cheval\\\", \\\"216\\\": \\\"chien\\\", \\\"217\\\": \\\"singe\\\", \\\"218\\\": \\\"lion\\\", \\\"219\\\": \\\"ours\\\", \\\"220\\\": \\\"lapin\\\", \\\"221\\\": \\\"serpent\\\", \\\"222\\\": \\\"redout\\\\u00e9s\\\", \\\"223\\\": \\\"all\\\\u00e9\\\", \\\"224\\\": \\\"grosse\\\", \\\"225\\\": \\\"pluie\\\", \\\"226\\\": \\\"trop\\\", \\\"227\\\": \\\"monde\\\", \\\"228\\\": \\\"maillot\\\", \\\"229\\\": \\\"vont\\\", \\\"230\\\": \\\"volant\\\", \\\"231\\\": \\\"avez\\\", \\\"232\\\": \\\"i\\\", \\\"233\\\": \\\"all\\\\u00e9s\\\", \\\"234\\\": \\\"all\\\\u00e9e\\\", \\\"235\\\": \\\"quand\\\", \\\"236\\\": \\\"oiseau\\\", \\\"237\\\": \\\"\\\\u00e9l\\\\u00e9phant\\\", \\\"238\\\": \\\"pourraient\\\", \\\"239\\\": \\\"voulaient\\\", \\\"240\\\": \\\"veulent\\\", \\\"241\\\": \\\"d\\\\u00e9tendre\\\", \\\"242\\\": \\\"aim\\\\u00e9e\\\", \\\"243\\\": \\\"magnifique\\\", \\\"244\\\": \\\"l'automobile\\\", \\\"245\\\": \\\"n'aimons\\\", \\\"246\\\": \\\"gel\\\\u00e9\\\", \\\"247\\\": \\\"d\\\\u00e9testait\\\", \\\"248\\\": \\\"grand\\\", \\\"249\\\": \\\"bien\\\", \\\"250\\\": \\\"vers\\\", \\\"251\\\": \\\"pr\\\\u00e9voient\\\", \\\"252\\\": \\\"pr\\\\u00e9voit\\\", \\\"253\\\": \\\"lui\\\", \\\"254\\\": \\\"visite\\\", \\\"255\\\": \\\"comment\\\", \\\"256\\\": \\\"\\\\u00e9l\\\\u00e9phants\\\", \\\"257\\\": \\\"chevaux\\\", \\\"258\\\": 
\\\"chiens\\\", \\\"259\\\": \\\"l'\\\\u00e9l\\\\u00e9phant\\\", \\\"260\\\": \\\"l'oiseau\\\", \\\"261\\\": \\\"requins\\\", \\\"262\\\": \\\"l'ours\\\", \\\"263\\\": \\\"serpents\\\", \\\"264\\\": \\\"chats\\\", \\\"265\\\": \\\"lapins\\\", \\\"266\\\": \\\"singes\\\", \\\"267\\\": \\\"oiseaux\\\", \\\"268\\\": \\\"lions\\\", \\\"269\\\": \\\"l\\\\u00e9g\\\\u00e8re\\\", \\\"270\\\": \\\"c\\\\u00e9page\\\", \\\"271\\\": \\\"pensez\\\", \\\"272\\\": \\\"tour\\\", \\\"273\\\": \\\"eiffel\\\", \\\"274\\\": \\\"l'\\\\u00e9picerie\\\", \\\"275\\\": \\\"terrain\\\", \\\"276\\\": \\\"football\\\", \\\"277\\\": \\\"lac\\\", \\\"278\\\": \\\"l'\\\\u00e9cole\\\", \\\"279\\\": \\\"l'animal\\\", \\\"280\\\": \\\"n'est\\\", \\\"281\\\": \\\"allons\\\", \\\"282\\\": \\\"allez\\\", \\\"283\\\": \\\"peu\\\", \\\"284\\\": \\\"pousse\\\", \\\"285\\\": \\\"du\\\", \\\"286\\\": \\\"temps\\\", \\\"287\\\": \\\"at\\\", \\\"288\\\": \\\"rouille\\\", \\\"289\\\": \\\"sur\\\", \\\"290\\\": \\\"qu'elle\\\", \\\"291\\\": \\\"petites\\\", \\\"292\\\": \\\"derni\\\\u00e8re\\\", \\\"293\\\": \\\"\\\\u00eates\\\", \\\"294\\\": \\\"vais\\\", \\\"295\\\": \\\"voudrait\\\", \\\"296\\\": \\\"proches\\\", \\\"297\\\": \\\"frais\\\", \\\"298\\\": \\\"manguiers\\\", \\\"299\\\": \\\"avons\\\", \\\"300\\\": \\\"t\\\", \\\"301\\\": \\\"porcelaine\\\", \\\"302\\\": \\\"d\\\\u00e9testez\\\", \\\"303\\\": \\\"c'est\\\", \\\"304\\\": \\\"grandes\\\", \\\"305\\\": \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9es\\\", \\\"306\\\": \\\"douce\\\", \\\"307\\\": \\\"durant\\\", \\\"308\\\": \\\"cong\\\\u00e9lation\\\", \\\"309\\\": \\\"pla\\\\u00eet\\\", \\\"310\\\": \\\"o\\\\u00f9\\\", \\\"311\\\": \\\"dans\\\", \\\"312\\\": \\\"voulez\\\", \\\"313\\\": \\\"aimeraient\\\", \\\"314\\\": \\\"n'a\\\", \\\"315\\\": \\\"petits\\\", \\\"316\\\": \\\"grands\\\", \\\"317\\\": \\\"limes\\\", \\\"318\\\": \\\"envisagent\\\", \\\"319\\\": \\\"grosses\\\", \\\"320\\\": \\\"b\\\\u00e9nigne\\\", \\\"321\\\": \\\"mouill\\\\u00e9e\\\", 
\\\"322\\\": \\\"enneig\\\\u00e9\\\", \\\"323\\\": \\\"moindres\\\", \\\"324\\\": \\\"conduite\\\", \\\"325\\\": \\\"gel\\\\u00e9s\\\", \\\"326\\\": \\\"tout\\\", \\\"327\\\": \\\"etats\\\", \\\"328\\\": \\\"n'\\\\u00eates\\\", \\\"329\\\": \\\"vit\\\", \\\"330\\\": \\\"ressort\\\", \\\"331\\\": \\\"d\\\\u00e9tend\\\", \\\"332\\\": \\\"redout\\\\u00e9e\\\", \\\"333\\\": \\\"tu\\\", \\\"334\\\": \\\"qui\\\", \\\"335\\\": \\\"traduis\\\", \\\"336\\\": \\\"appr\\\\u00e9ci\\\\u00e9\\\", \\\"337\\\": \\\"allions\\\", \\\"338\\\": \\\"trouv\\\\u00e9\\\", \\\"339\\\": \\\"as\\\", \\\"340\\\": \\\"faire\\\", \\\"341\\\": \\\"favoris\\\", \\\"342\\\": \\\"souvent\\\", \\\"343\\\": \\\"es\\\", \\\"344\\\": \\\"moteur\\\"}\", \"word_index\": \"{\\\"est\\\": 1, \\\"en\\\": 2, \\\"il\\\": 3, \\\"les\\\": 4, \\\"mais\\\": 5, \\\"et\\\": 6, \\\"la\\\": 7, \\\"parfois\\\": 8, \\\"jamais\\\": 9, \\\"le\\\": 10, \\\"l'\\\": 11, \\\"g\\\\u00e9n\\\\u00e9ralement\\\": 12, \\\"moins\\\": 13, \\\"aim\\\\u00e9\\\": 14, \\\"au\\\": 15, \\\"fruit\\\": 16, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9\\\": 17, \\\"agr\\\\u00e9able\\\": 18, \\\"froid\\\": 19, \\\"son\\\": 20, \\\"chaud\\\": 21, \\\"de\\\": 22, \\\"plus\\\": 23, \\\"automne\\\": 24, \\\"mois\\\": 25, \\\"\\\\u00e0\\\": 26, \\\"elle\\\": 27, \\\"citrons\\\": 28, \\\"paris\\\": 29, \\\"inde\\\": 30, \\\"unis\\\": 31, \\\"\\\\u00e9tats\\\": 32, \\\"france\\\": 33, \\\"jersey\\\": 34, \\\"new\\\": 35, \\\"chine\\\": 36, \\\"pendant\\\": 37, \\\"pamplemousse\\\": 38, \\\"mon\\\": 39, \\\"votre\\\": 40, \\\"juin\\\": 41, \\\"printemps\\\": 42, \\\"janvier\\\": 43, \\\"hiver\\\": 44, \\\"mars\\\": 45, \\\"\\\\u00e9t\\\\u00e9\\\": 46, \\\"mai\\\": 47, \\\"septembre\\\": 48, \\\"juillet\\\": 49, \\\"avril\\\": 50, \\\"novembre\\\": 51, \\\"d\\\\u00e9cembre\\\": 52, \\\"f\\\\u00e9vrier\\\": 53, \\\"octobre\\\": 54, \\\"aime\\\": 55, \\\"ao\\\\u00fbt\\\": 56, \\\"merveilleux\\\": 57, \\\"relaxant\\\": 58, \\\"doux\\\": 59, \\\"humide\\\": 60, 
\\\"notre\\\": 61, \\\"californie\\\": 62, \\\"sec\\\": 63, \\\"leur\\\": 64, \\\"occup\\\\u00e9\\\": 65, \\\"pluvieux\\\": 66, \\\"calme\\\": 67, \\\"beau\\\": 68, \\\"habituellement\\\": 69, \\\"pommes\\\": 70, \\\"p\\\\u00eaches\\\": 71, \\\"oranges\\\": 72, \\\"poires\\\": 73, \\\"fraises\\\": 74, \\\"bananes\\\": 75, \\\"verts\\\": 76, \\\"raisins\\\": 77, \\\"mangues\\\": 78, \\\"d'\\\": 79, \\\"mangue\\\": 80, \\\"gel\\\": 81, \\\"raisin\\\": 82, \\\"pomme\\\": 83, \\\"l'orange\\\": 84, \\\"citron\\\": 85, \\\"chaux\\\": 86, \\\"banane\\\": 87, \\\"poire\\\": 88, \\\"fraise\\\": 89, \\\"p\\\\u00eache\\\": 90, \\\"pas\\\": 91, \\\"enneig\\\\u00e9e\\\": 92, \\\"favori\\\": 93, \\\"d\\\\u00e9teste\\\": 94, \\\"g\\\\u00e8le\\\": 95, \\\"fruits\\\": 96, \\\"voiture\\\": 97, \\\"l'automne\\\": 98, \\\"ils\\\": 99, \\\"n'aime\\\": 100, \\\"california\\\": 101, \\\"neige\\\": 102, \\\"fait\\\": 103, \\\"belle\\\": 104, \\\"ne\\\": 105, \\\"vous\\\": 106, \\\"nous\\\": 107, \\\"des\\\": 108, \\\"animal\\\": 109, \\\"camion\\\": 110, \\\"cours\\\": 111, \\\"neigeux\\\": 112, \\\"conduit\\\": 113, \\\"prochain\\\": 114, \\\"ce\\\": 115, \\\"je\\\": 116, \\\"tranquille\\\": 117, \\\"a\\\": 118, \\\"cher\\\": 119, \\\"une\\\": 120, \\\"cette\\\": 121, \\\"\\\\u00e9tait\\\": 122, \\\"aller\\\": 123, \\\"aiment\\\": 124, \\\"chaude\\\": 125, \\\"aimons\\\": 126, \\\"n'aiment\\\": 127, \\\"n'aimez\\\": 128, \\\"leurs\\\": 129, \\\"aimez\\\": 130, \\\"sont\\\": 131, \\\"d\\\\u00e9testons\\\": 132, \\\"jaune\\\": 133, \\\"rouge\\\": 134, \\\"j'aime\\\": 135, \\\"visiter\\\": 136, \\\"s\\\\u00e8che\\\": 137, \\\"occup\\\\u00e9e\\\": 138, \\\"frisquet\\\": 139, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9e\\\": 140, \\\"animaux\\\": 141, \\\"dernier\\\": 142, \\\"aimait\\\": 143, \\\"un\\\": 144, \\\"conduisait\\\": 145, \\\"que\\\": 146, \\\"nouvelle\\\": 147, \\\"vieille\\\": 148, \\\"vu\\\": 149, \\\"verte\\\": 150, \\\"petite\\\": 151, \\\"nos\\\": 152, \\\"noire\\\": 153, 
\\\"brillant\\\": 154, \\\"blanche\\\": 155, \\\"redout\\\\u00e9\\\": 156, \\\"pleut\\\": 157, \\\"n'aimait\\\": 158, \\\"pamplemousses\\\": 159, \\\"pense\\\": 160, \\\"entre\\\": 161, \\\"bleue\\\": 162, \\\"nouveau\\\": 163, \\\"traduire\\\": 164, \\\"rouill\\\\u00e9e\\\": 165, \\\"bleu\\\": 166, \\\"se\\\": 167, \\\"grande\\\": 168, \\\"rouill\\\\u00e9\\\": 169, \\\"ses\\\": 170, \\\"qu'il\\\": 171, \\\"blanc\\\": 172, \\\"aux\\\": 173, \\\"brillante\\\": 174, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9s\\\": 175, \\\"noir\\\": 176, \\\"pluies\\\": 177, \\\"envisage\\\": 178, \\\"\\\\u00e9taient\\\": 179, \\\"va\\\": 180, \\\"rendre\\\": 181, \\\"vert\\\": 182, \\\"vieux\\\": 183, \\\"petit\\\": 184, \\\"espagnol\\\": 185, \\\"portugais\\\": 186, \\\"chinois\\\": 187, \\\"anglais\\\": 188, \\\"fran\\\\u00e7ais\\\": 189, \\\"glaciales\\\": 190, \\\"mes\\\": 191, \\\"cet\\\": 192, \\\"automobile\\\": 193, \\\"traduction\\\": 194, \\\"mouill\\\\u00e9\\\": 195, \\\"difficile\\\": 196, \\\"amusant\\\": 197, \\\"facile\\\": 198, \\\"comme\\\": 199, \\\"gros\\\": 200, \\\"souris\\\": 201, \\\"pourrait\\\": 202, \\\"voulait\\\": 203, \\\"veut\\\": 204, \\\"pourquoi\\\": 205, \\\"aim\\\\u00e9s\\\": 206, \\\"pr\\\\u00e9vois\\\": 207, \\\"pr\\\\u00e9voyons\\\": 208, \\\"vos\\\": 209, \\\"intention\\\": 210, \\\"cl\\\\u00e9mentes\\\": 211, \\\"ont\\\": 212, \\\"chat\\\": 213, \\\"requin\\\": 214, \\\"cheval\\\": 215, \\\"chien\\\": 216, \\\"singe\\\": 217, \\\"lion\\\": 218, \\\"ours\\\": 219, \\\"lapin\\\": 220, \\\"serpent\\\": 221, \\\"redout\\\\u00e9s\\\": 222, \\\"all\\\\u00e9\\\": 223, \\\"grosse\\\": 224, \\\"pluie\\\": 225, \\\"trop\\\": 226, \\\"monde\\\": 227, \\\"maillot\\\": 228, \\\"vont\\\": 229, \\\"volant\\\": 230, \\\"avez\\\": 231, \\\"i\\\": 232, \\\"all\\\\u00e9s\\\": 233, \\\"all\\\\u00e9e\\\": 234, \\\"quand\\\": 235, \\\"oiseau\\\": 236, \\\"\\\\u00e9l\\\\u00e9phant\\\": 237, \\\"pourraient\\\": 238, \\\"voulaient\\\": 239, \\\"veulent\\\": 240, 
\\\"d\\\\u00e9tendre\\\": 241, \\\"aim\\\\u00e9e\\\": 242, \\\"magnifique\\\": 243, \\\"l'automobile\\\": 244, \\\"n'aimons\\\": 245, \\\"gel\\\\u00e9\\\": 246, \\\"d\\\\u00e9testait\\\": 247, \\\"grand\\\": 248, \\\"bien\\\": 249, \\\"vers\\\": 250, \\\"pr\\\\u00e9voient\\\": 251, \\\"pr\\\\u00e9voit\\\": 252, \\\"lui\\\": 253, \\\"visite\\\": 254, \\\"comment\\\": 255, \\\"\\\\u00e9l\\\\u00e9phants\\\": 256, \\\"chevaux\\\": 257, \\\"chiens\\\": 258, \\\"l'\\\\u00e9l\\\\u00e9phant\\\": 259, \\\"l'oiseau\\\": 260, \\\"requins\\\": 261, \\\"l'ours\\\": 262, \\\"serpents\\\": 263, \\\"chats\\\": 264, \\\"lapins\\\": 265, \\\"singes\\\": 266, \\\"oiseaux\\\": 267, \\\"lions\\\": 268, \\\"l\\\\u00e9g\\\\u00e8re\\\": 269, \\\"c\\\\u00e9page\\\": 270, \\\"pensez\\\": 271, \\\"tour\\\": 272, \\\"eiffel\\\": 273, \\\"l'\\\\u00e9picerie\\\": 274, \\\"terrain\\\": 275, \\\"football\\\": 276, \\\"lac\\\": 277, \\\"l'\\\\u00e9cole\\\": 278, \\\"l'animal\\\": 279, \\\"n'est\\\": 280, \\\"allons\\\": 281, \\\"allez\\\": 282, \\\"peu\\\": 283, \\\"pousse\\\": 284, \\\"du\\\": 285, \\\"temps\\\": 286, \\\"at\\\": 287, \\\"rouille\\\": 288, \\\"sur\\\": 289, \\\"qu'elle\\\": 290, \\\"petites\\\": 291, \\\"derni\\\\u00e8re\\\": 292, \\\"\\\\u00eates\\\": 293, \\\"vais\\\": 294, \\\"voudrait\\\": 295, \\\"proches\\\": 296, \\\"frais\\\": 297, \\\"manguiers\\\": 298, \\\"avons\\\": 299, \\\"t\\\": 300, \\\"porcelaine\\\": 301, \\\"d\\\\u00e9testez\\\": 302, \\\"c'est\\\": 303, \\\"grandes\\\": 304, \\\"pr\\\\u00e9f\\\\u00e9r\\\\u00e9es\\\": 305, \\\"douce\\\": 306, \\\"durant\\\": 307, \\\"cong\\\\u00e9lation\\\": 308, \\\"pla\\\\u00eet\\\": 309, \\\"o\\\\u00f9\\\": 310, \\\"dans\\\": 311, \\\"voulez\\\": 312, \\\"aimeraient\\\": 313, \\\"n'a\\\": 314, \\\"petits\\\": 315, \\\"grands\\\": 316, \\\"limes\\\": 317, \\\"envisagent\\\": 318, \\\"grosses\\\": 319, \\\"b\\\\u00e9nigne\\\": 320, \\\"mouill\\\\u00e9e\\\": 321, \\\"enneig\\\\u00e9\\\": 322, \\\"moindres\\\": 323, 
\\\"conduite\\\": 324, \\\"gel\\\\u00e9s\\\": 325, \\\"tout\\\": 326, \\\"etats\\\": 327, \\\"n'\\\\u00eates\\\": 328, \\\"vit\\\": 329, \\\"ressort\\\": 330, \\\"d\\\\u00e9tend\\\": 331, \\\"redout\\\\u00e9e\\\": 332, \\\"tu\\\": 333, \\\"qui\\\": 334, \\\"traduis\\\": 335, \\\"appr\\\\u00e9ci\\\\u00e9\\\": 336, \\\"allions\\\": 337, \\\"trouv\\\\u00e9\\\": 338, \\\"as\\\": 339, \\\"faire\\\": 340, \\\"favoris\\\": 341, \\\"souvent\\\": 342, \\\"es\\\": 343, \\\"moteur\\\": 344}\"}}"
|
Task 2/gui.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import libraries
|
| 2 |
+
import tkinter as tk
|
| 3 |
+
from tkinter import ttk, messagebox
|
| 4 |
+
|
| 5 |
+
from keras.layers import TextVectorization
|
| 6 |
+
import re
|
| 7 |
+
import tensorflow.strings as tf_strings
|
| 8 |
+
import json
|
| 9 |
+
import string
|
| 10 |
+
from keras.models import load_model
|
| 11 |
+
import tensorflow as tf
|
| 12 |
+
from keras.preprocessing.text import tokenizer_from_json
|
| 13 |
+
from keras.utils import pad_sequences
|
| 14 |
+
import numpy as np
|
| 15 |
+
import difflib
|
| 16 |
+
|
| 17 |
+
# English to Spanish translation
# Characters removed from Spanish text before vectorization: all ASCII
# punctuation plus the inverted question mark, EXCEPT the square brackets,
# which must survive so the "[start]"/"[end]" sequence markers stay intact.
strip_chars = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")
|
| 21 |
+
|
| 22 |
+
def custom_standardization(input_string):
    """Lowercase the input and delete every character in ``strip_chars``.

    Used as the standardization step of the Spanish TextVectorization
    layer, so it operates on (batches of) TF string tensors.
    """
    lowered = tf_strings.lower(input_string)
    punctuation_pattern = f"[{re.escape(strip_chars)}]"
    return tf_strings.regex_replace(lowered, punctuation_pattern, "")
|
| 25 |
+
|
| 26 |
+
# Load the English vectorization layer configuration
with open('eng_vectorization_config.json') as json_file:
    eng_vectorization_config = json.load(json_file)

# Recreate the English vectorization layer.  The custom standardization is
# passed to the constructor (exactly as done for the Spanish layer below):
# assigning `.standardize` after construction has no effect, because Keras
# captures the standardize callable at __init__ time.
eng_vectorization = TextVectorization(
    max_tokens=eng_vectorization_config['max_tokens'],
    output_mode=eng_vectorization_config['output_mode'],
    output_sequence_length=eng_vectorization_config['output_sequence_length'],
    standardize=custom_standardization
)

# Load the Spanish vectorization layer configuration
with open('spa_vectorization_config.json') as json_file:
    spa_vectorization_config = json.load(json_file)

# Recreate the Spanish vectorization layer with the same standardization
spa_vectorization = TextVectorization(
    max_tokens=spa_vectorization_config['max_tokens'],
    output_mode=spa_vectorization_config['output_mode'],
    output_sequence_length=spa_vectorization_config['output_sequence_length'],
    standardize=custom_standardization
)

# Load and set the English vocabulary
with open('eng_vocab.json') as json_file:
    eng_vocab = json.load(json_file)
eng_vectorization.set_vocabulary(eng_vocab)

# Load and set the Spanish vocabulary
with open('spa_vocab.json') as json_file:
    spa_vocab = json.load(json_file)
spa_vectorization.set_vocabulary(spa_vocab)

# Load the English->Spanish transformer (Keras SavedModel directory)
transformer = load_model('transformer_model')

# Token index -> Spanish token, used to turn predictions back into text.
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

# Module-level list of (word, suggestions) pairs collected by
# check_and_correct_sentence for the most recent input sentence.
incorrect_words = []
|
| 70 |
+
|
| 71 |
+
def beam_search_decode(input_sentence, beam_width=3):
    """Translate an English sentence to Spanish with beam search.

    Keeps the `beam_width` highest-scoring partial translations at each
    step, scoring a hypothesis by the sum of its token log-probabilities.
    (The previous version summed raw probabilities, which does not
    correspond to sequence likelihood, and kept extending hypotheses
    that had already produced "[end]".)

    NOTE(review): assumes `transformer` outputs per-position probability
    distributions (softmax) — confirm against the training notebook.

    Returns the best decoded sentence, still containing the
    "[start]"/"[end]" markers; callers strip them.
    """
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentences = [("[start]", 0.0)]

    for i in range(max_decoded_sentence_length):
        all_candidates = []
        for decoded_sentence, score in decoded_sentences:
            # A finished hypothesis competes as-is; do not extend it.
            if decoded_sentence.endswith("[end]"):
                all_candidates.append((decoded_sentence, score))
                continue

            tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1]
            predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
            top_k = tf.math.top_k(predictions[0, i, :], k=beam_width)

            for j in range(beam_width):
                predicted_token_index = top_k.indices[j].numpy()
                predicted_token = spa_index_lookup[predicted_token_index]
                # Accumulate log-probabilities; the epsilon guards log(0).
                token_log_prob = np.log(top_k.values[j].numpy() + 1e-12)
                candidate = (decoded_sentence + " " + predicted_token,
                             score + token_log_prob)
                all_candidates.append(candidate)

        # Keep the beam_width best-scoring hypotheses.
        ordered = sorted(all_candidates, key=lambda x: x[1], reverse=True)
        decoded_sentences = ordered[:beam_width]

        if all(sentence[0].endswith("[end]") for sentence in decoded_sentences):
            break

    return decoded_sentences[0][0]
|
| 95 |
+
|
| 96 |
+
# English to French translation
# Load the trained English->French model and its fitted tokenizers.
model = load_model('english_to_french_model')

# Rebuild the tokenizers from their JSON exports.
with open('english_tokenizer.json') as f:
    english_tokenizer = tokenizer_from_json(json.load(f))

with open('french_tokenizer.json') as f:
    french_tokenizer = tokenizer_from_json(json.load(f))

# Fixed sequence length the French model was trained with.
with open('sequence_length.json') as f:
    max_length = json.load(f)
|
| 112 |
+
|
| 113 |
+
def pad(x, length=None):
    """Post-pad a batch of token-id sequences to `length` columns.

    When `length` is None, pad_sequences pads to the longest sequence
    in the batch.
    """
    padded = pad_sequences(x, maxlen=length, padding='post')
    return padded
|
| 115 |
+
|
| 116 |
+
def translate_to_french(english_sentence):
    """Translate a single English sentence to French with the RNN model.

    The input is lowercased, stripped of . ? ! , punctuation, tokenized,
    padded to the training sequence length, then decoded greedily by
    taking the argmax token at every output position.
    """
    cleaned = re.sub(r'[.?!,]', '', english_sentence.lower())
    sequences = english_tokenizer.texts_to_sequences([cleaned])
    model_input = pad(sequences, max_length).reshape((-1, max_length))

    # Greedy decode: one argmax per output time step.
    predictions = model.predict(model_input)[0]
    token_ids = [np.argmax(step) for step in predictions]
    return french_tokenizer.sequences_to_texts([token_ids])[0]
|
| 128 |
+
|
| 129 |
+
def get_word_suggestions(word, vocab):
    """Return up to three vocabulary entries most similar to ``word``.

    Uses difflib's similarity ratio with a 0.6 cutoff, so a word with no
    close match yields an empty list.
    """
    matches = difflib.get_close_matches(word, vocab, n=3, cutoff=0.6)
    return matches
|
| 131 |
+
|
| 132 |
+
def check_and_correct_sentence(sentence, vocab):
    """Validate that every word of `sentence` appears in `vocab`.

    Side effects: repopulates the module-level `incorrect_words` list
    with (word, suggestions) pairs for each out-of-vocabulary word, and
    shows a single error dialog listing them all.

    Returns True when every word is known, False otherwise.

    (Cleanup vs. earlier version: the `corrected_sentence` accumulator
    was dead code — built but never used — and the inner
    `len(incorrect_words) >= 1` check was redundant inside
    `if incorrect_words`.)
    """
    incorrect_words.clear()
    for word in sentence.split():
        if word not in vocab:
            # Offer close spellings so the user can fix the typo.
            incorrect_words.append((word, get_word_suggestions(word, vocab)))

    if incorrect_words:
        message = f"Incorrect word(s) detected: {', '.join([w[0] for w in incorrect_words])}\n"
        for word, suggestions in incorrect_words:
            message += f"Suggestions for '{word}': {', '.join(suggestions) if suggestions else 'No suggestions available'}\n"
        messagebox.showerror("Error", message)
        return False
    return True
|
| 151 |
+
|
| 152 |
+
def translate_to_spanish(english_sentence):
    """Translate English to Spanish via the transformer beam search.

    Returns "" (after showing an error dialog) when the sentence
    contains words outside the English vocabulary.
    """
    if not check_and_correct_sentence(english_sentence, eng_vocab):
        return ""
    decoded = beam_search_decode(english_sentence)
    # Strip the decoder's sequence markers before showing the result.
    return decoded.replace("[start]", "").replace("[end]", "").strip()
|
| 157 |
+
|
| 158 |
+
# Function to handle translation request based on selected language
|
| 159 |
+
# Function to handle translation request based on selected language
def handle_translate():
    """GUI callback: translate the typed sentence into the selected language.

    Reads the input text and language combobox, warns on empty input or
    missing language selection, and writes the result into the output box.
    """
    selected_language = language_var.get()
    english_sentence = text_input.get("1.0", "end-1c").strip()

    if not english_sentence:
        messagebox.showwarning("Warning", "Please enter a sentence to translate.")
        return

    if selected_language == "French":
        translation = translate_to_french(english_sentence)
    elif selected_language == "Spanish":
        translation = translate_to_spanish(english_sentence)
    else:
        # The combobox starts empty: without this guard, `translation`
        # would be unbound below and raise NameError.
        messagebox.showwarning("Warning", "Please select a language to translate to.")
        return

    translation_output.delete("1.0", "end")
    translation_output.insert("end", f"{selected_language} translation: {translation}")
|
| 174 |
+
|
| 175 |
+
# Setting up the main window
root = tk.Tk()
root.title("Language Translator")
root.geometry("550x600")

# Font configuration (shared by every widget below)
font_style = "Times New Roman"
font_size = 14

# Frame for input
input_frame = tk.Frame(root)
input_frame.pack(pady=10)

# Heading for input
input_heading = tk.Label(input_frame, text="Enter the text to be translated", font=(font_style, font_size, 'bold'))
input_heading.pack()
# Text input for English sentence (read by handle_translate)
text_input = tk.Text(input_frame, height=5, width=50, font=(font_style, font_size))
text_input.pack()

# Language selection; "readonly" restricts choices to French/Spanish,
# matching the branches in handle_translate
language_var = tk.StringVar()
language_label = tk.Label(root, text="Select the language to translate to", font=(font_style, font_size, 'bold'))
language_label.pack()
language_select = ttk.Combobox(root, textvariable=language_var, values=["French", "Spanish"], font=(font_style, font_size), state="readonly")
language_select.pack()

# Submit button
submit_button = ttk.Button(root, text="Translate", command=handle_translate)
submit_button.pack(pady=10)

# Frame for output
output_frame = tk.Frame(root)
output_frame.pack(pady=10)
# Heading for output
output_heading = tk.Label(output_frame, text="Translation: ", font=(font_style, font_size, 'bold'))
output_heading.pack()

# Text output for translations (written by handle_translate)
translation_output = tk.Text(output_frame, height=10, width=50, font=(font_style, font_size))
translation_output.pack()

# Running the application (blocks until the window is closed)
root.mainloop()
|
Task 2/images/attention.png
ADDED
|
Task 2/images/bidirectional.png
ADDED
|
Task 2/images/embedding-words.png
ADDED
|
Task 2/images/encoder-decoder-context.png
ADDED
|
Task 2/images/encoder-decoder-translation.png
ADDED
|
Task 2/images/rnn.png
ADDED
|
Task 2/sequence_length.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
21
|
Task 2/spa_vectorization_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"name": "text_vectorization_1", "trainable": true, "batch_input_shape": [null], "dtype": "string", "max_tokens": 15000, "split": "whitespace", "ngrams": null, "output_mode": "int", "output_sequence_length": 21, "pad_to_max_tokens": false, "sparse": false, "ragged": false, "vocabulary": null, "idf_weights": null}
|
Task 2/spa_vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Task 2/transformer_model/keras_metadata.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c57e5b1c49a163881bb486d7ab440406af67981fb4d43c1b293d4f3e8be60b6b
|
| 3 |
+
size 58739
|
Task 2/transformer_model/saved_model.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08e90ad65260d2a5f6b86eb2222070bfff801829a65b959df096fdf5609f4979
|
| 3 |
+
size 1305169
|
Task 2/transformer_model/variables/variables.data-00000-of-00001
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1dee90376f3c6db9eac3b062c85fb1490b06871eaf8b078b2b312b0071e20033
|
| 3 |
+
size 159724466
|
Task 2/transformer_model/variables/variables.index
ADDED
|
Binary file (7.6 kB). View file
|
|
|