Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +13 -0
- .gitignore +9 -0
- Attention_Experiments.ipynb +0 -0
- Bengali_English_Transformer.ipynb +1 -0
- Burmese_English_NLLB.ipynb +333 -0
- Burmese_English_Transformer.ipynb +592 -0
- Chinese_English_Transformer.ipynb +433 -0
- Dockerfile +30 -0
- German_English_Transformer.ipynb +1 -0
- Hindi_English_Transformer.ipynb +1 -0
- Kazakh_English_Transformer.ipynb +1 -0
- Nepali_English_Transformer.ipynb +1 -0
- README.md +69 -5
- Tagalog_English_Transformer.ipynb +1 -0
- Thai_English_Transformer.ipynb +1 -0
- Urdu_English_Transformer.ipynb +1 -0
- Vietnamese_English_Transformer.ipynb +1 -0
- app/.dockerignore +8 -0
- app/Dockerfile +28 -0
- app/app.py +308 -0
- app/models/spm_bn.model +3 -0
- app/models/spm_de.model +3 -0
- app/models/spm_en.model +3 -0
- app/models/spm_en_bn.model +3 -0
- app/models/spm_en_de.model +3 -0
- app/models/spm_en_hi.model +3 -0
- app/models/spm_en_kk.model +3 -0
- app/models/spm_en_ne.model +3 -0
- app/models/spm_en_th.model +3 -0
- app/models/spm_en_tl.model +3 -0
- app/models/spm_en_ur.model +3 -0
- app/models/spm_en_vi.model +3 -0
- app/models/spm_en_zh.model +3 -0
- app/models/spm_hi.model +3 -0
- app/models/spm_kk.model +3 -0
- app/models/spm_my.model +3 -0
- app/models/spm_ne.model +3 -0
- app/models/spm_th.model +3 -0
- app/models/spm_tl.model +3 -0
- app/models/spm_ur.model +3 -0
- app/models/spm_vi.model +3 -0
- app/models/spm_zh.model +3 -0
- app/models/transformer_model.pt +3 -0
- app/models/transformer_model_bn.pt +3 -0
- app/models/transformer_model_de.pt +3 -0
- app/models/transformer_model_hi.pt +3 -0
- app/models/transformer_model_kk.pt +3 -0
- app/models/transformer_model_ne.pt +3 -0
- app/models/transformer_model_th.pt +3 -0
- app/models/transformer_model_tl.pt +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
app/static/images/bengali_background.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
app/static/images/burmese_background.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
app/static/images/chinese_background.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
app/static/images/german_background.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
app/static/images/hindi_background.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
app/static/images/kazakh_background.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
app/static/images/nepali_background.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
app/static/images/tagalog_background.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
app/static/images/thai_background.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
app/static/images/urdu_background.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
app/static/images/vietnamese_background.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
attention/train_my_att.txt filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
demo.gif filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.ipynb_checkpoints/
|
| 4 |
+
app/nllb_model/
|
| 5 |
+
.DS_Store
|
| 6 |
+
# app/models/ (removed so models can be uploaded)
|
| 7 |
+
app/venv/
|
| 8 |
+
app/env/
|
| 9 |
+
app/.env
|
Attention_Experiments.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Bengali_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","metadata":{"id":"vpBMKu9S_DKX"},"source":["# Bengali-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Bengali (bn) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Bengali-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"vpBMKu9S_DKX"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ZTpz4Y0K_DKZ","executionInfo":{"status":"ok","timestamp":1770449632993,"user_tz":-420,"elapsed":5884,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"84eef5c7-2b97-492e-a89f-84a73053f460"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"ZTpz4Y0K_DKZ"},{"cell_type":"markdown","metadata":{"id":"-y16NO8G_DKb"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Bengali-English pairs from Opus-100."],"id":"-y16NO8G_DKb"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":475,"referenced_widgets":["63800baa7f3342fea580c1754d23a187","8e5f81d65723424997ac3de79a7104c2","9412b9b7abd642818a3b5d7c1a7a59d4","51d24b76a33742cf9c61997e19666fb2","71cbb7ffb31e4663b669459808744e91","3d6f71b8889d4431aea09446a05a3cc7","401623c48fec466590876f0e27dac857","74ace8a9c3a04f86bae7ca8899b86fcc","2bbab3affa6244b6ac95974dc009ce15","3e1beecc6600411692f811cc12f35627","f1500e9b0d48434c9bacd5899a56b1b1","50e9c865bf914984b82021b7939f1047","096dfefc18d04d049a75376a18e30e6f","e277f9f3079b4cd4960d9e6f80e13834","c9da49cc397645fb9ae591d204fa9084","40f5a7f2c3214965a8924dcee95154f0","a9ffc0417cfd41f8a554b18690d7d7b3","91e4cff8fdce4d9e98e249f5d53a8338","9265830e2f0147bb823ac2de0ba3300e","2fc08ee9109a4349ba6ffb00c41b842e","e9e425a232384a0a8524e52f6be6a275","86cf30f6e84b43829b7e1d8a6e6e7446","3419947ed2d14de291532f4a13c41992","1019ec6a91cc400d9d79419090be471f","13d4c4cb6e634caa8975cc81911e658d","59a8d28e81804653b0525c7b88d417c5","b529ab421acb4cfe9a0e456f81b8fb10","8c63057306c5441390bee4d65dcdb830","8e8c9e37c21c492184c3be8cb82ffd0b","a3400c3db06645d79d1764881a265c55","d76a77c5e45843549b88f216aecfbbef","a1ce122dd98b44348ccbaf18e574009b","bcebc445abc440cc9efea5135cbe3b27","1c8327669a5b4bbfb8b0eb2a4b814e6d","baa12bef6fc540c1a1aff0c8835cfe5e","d86a597b23e14abd87f9567664b0fd47","b8b3cf1e411840f592c9a9509c31da7b","a6fad26675f14ccaabdae96ccfd07430","a6e95f0632ec49df90be34d6e6f3db38","1276d4a6f6af4d1ca75a9e6e64ddcca1","a61ba76440c84ae686b401b70bf1c379","3d903e048f6d4e618b2a27d61f124931","91281030b7164090841b0846e4f020bf","752d688c557a43bca08e2aa7d7f72331","8c9b8dcbd89645a7a955c06109cadfd7","0c5f2d21f42a473cb4b150ce1cf5ef42","eb9afc65cdf44f008990ec8c1082dea0","c27ce174a6814361bcdbc163ea79a85f","213d90a2119b46219902f0b01ead5521","8a72fd6282c645f4906d817db92c20c0","57024c3d9ead4c14b2bb
25a1ac58a392","f572059b9f584008bfe67987e9610844","d6b43ff1450a4d57a09cde163557b732","40723bace09f404087f04f5d3b5da910","7930dcae389240128f87909b8091c838","9885cc8eb23a4bd5979b00fe729ab50a","503e5a8eea7647c791e75c93262869d3","812e5f5b21d74dc992b0a6d318578be6","559291f54b354a7190d9bf3b6c5916f0","bc3b447abfa34bc4baf747de9aa11286","362e85ad032f45f2911ed2233cf29d1e","69f0040fe8fc479b9fa64d4a9c0c7cdc","122d8bc238484956b5e93402a23becf9","41a80cc9f2834727b83c25bdf17fb789","6de63976604e42e8acef1b437c645700","5345097ef89145b7837dad5deb68cc3f","4f887be0d0434ed486c40c3d203c556f","079bd8aa81ad460593477de7a1beb27c","ac05ba3b0f1d4fd0b479bd309c74e805","aa6578ad022d418d8c14b59d8c303363","47e8d29853384b4cb6fdfbbb355b5044","cf6c5979b59c4e7b8c9041dd58b0970c","7b9da41a85d14b5ebe849a1a870b8b1a","1391beb1571349cebdb8200a9e5ee20a","983ede9cd17f464c933db0203a09cb3d","57e6475668804afd89b1559fae888165","c9b8876bbdc34f7b9bdcec235b05c6ef"]},"id":"PbQklUu0_DKb","executionInfo":{"status":"ok","timestamp":1770449672296,"user_tz":-420,"elapsed":39277,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"3b67ebbb-0e6a-4722-8c8e-0a62396d8346"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Bengali-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"63800baa7f3342fea580c1754d23a187"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["bn-en/test-00000-of-00001.parquet: 0%| | 0.00/279k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"50e9c865bf914984b82021b7939f1047"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["bn-en/train-00000-of-00001.parquet: 0%| | 0.00/134M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3419947ed2d14de291532f4a13c41992"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["bn-en/validation-00000-of-00001.parquet: 0%| | 0.00/272k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1c8327669a5b4bbfb8b0eb2a4b814e6d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8c9b8dcbd89645a7a955c06109cadfd7"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/1000000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9885cc8eb23a4bd5979b00fe729ab50a"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"4f887be0d0434ed486c40c3d203c556f"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 1004000 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 Bengali-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (Bengali-English)...\")\n","try:\n"," # Opus-100 has 'bn-en' (or 'en-bn')\n"," dataset = load_dataset(\"opus100\", \"bn-en\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'bn' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'bn': item['translation']['bn'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for efficiency.\")\n","\n"," print(f\"Extracted {len(data)} Bengali-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"PbQklUu0_DKb"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"41o9Zc-K_DKc","executionInfo":{"status":"ok","timestamp":1770449672512,"user_tz":-420,"elapsed":218,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"8167a35f-6e43-41e7-9d71-7efb5dad305c"},"outputs":[{"output_type":"stream","name":"stdout","text":[" bn \\\n","0 তোমায় ঐ হৃদপিন্ড খেতে হবে না। \n","1 আমি দরজা সামান্য খোলা রেখে যাচ্ছি. \n","2 এই ঘটনার ক্ষেত্রে, গণপ্রচার মাধ্যম ঠিক মতই কাজ... \n","3 মিথ্যা বলবো? \n","4 এ বছর পাকিস্তানে তার প্রত্যাবর্তন খুব অপয়া ভাব... \n","\n"," en \n","0 You don't have to fucking eat his heart. \n","1 I'll leave the door open a little bit. 
\n","2 In this case, mass media have continued to fun... \n","3 Lies? \n","4 Her return to Pakistan earlier this year start... \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['bn', 'en'])\n","df['bn'] = df['bn'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['bn'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"41o9Zc-K_DKc"},{"cell_type":"markdown","metadata":{"id":"vU0esOr-_DKc"},"source":["## 3. Tokenization"],"id":"vU0esOr-_DKc"},{"cell_type":"code","execution_count":4,"metadata":{"id":"w739sfKC_DKc","executionInfo":{"status":"ok","timestamp":1770449678700,"user_tz":-420,"elapsed":6188,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"6835514e-07cf-441f-9600-46cdf745862a"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Bengali Tokenizer...\n","Training English Tokenizer (for Bengali pair)...\n"]}],"source":["# Save texts to files\n","with open('train_bn.txt', 'w', encoding='utf-8') as f:\n"," for line in df['bn']: f.write(line + '\\n')\n","\n","with open('train_en_bn.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Bengali Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_bn.txt',\n"," model_prefix='spm_bn',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Bengali pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_bn.txt',\n"," model_prefix='spm_en_bn',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_bn.model')\n","sp_trg = 
spm.SentencePieceProcessor(model_file='spm_en_bn.model')"],"id":"w739sfKC_DKc"},{"cell_type":"markdown","metadata":{"id":"mPL7piMQ_DKd"},"source":["## 4. Dataset & Model"],"id":"mPL7piMQ_DKd"},{"cell_type":"code","execution_count":5,"metadata":{"id":"TcClWCSz_DKd","executionInfo":{"status":"ok","timestamp":1770449678705,"user_tz":-420,"elapsed":2,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['bn']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"TcClWCSz_DKd"},{"cell_type":"code","execution_count":6,"metadata":{"id":"kgLpC6aG_DKd","executionInfo":{"status":"ok","timestamp":1770449678714,"user_tz":-420,"elapsed":3,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," 
pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"kgLpC6aG_DKd"},{"cell_type":"code","execution_count":7,"metadata":{"id":"aWpT2aj1_DKe","executionInfo":{"status":"ok","timestamp":1770450066061,"user_tz":-420,"elapsed":387345,"user":{"displayName":"Htut Ko 
Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"cd31b25e-e7e1-48c0-c161-45069232ea3f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting Training...\n","Step 0, Loss: 9.192\n","Step 100, Loss: 6.546\n","Step 200, Loss: 6.175\n","Step 300, Loss: 5.916\n","Step 400, Loss: 5.874\n","Step 500, Loss: 5.361\n","Step 600, Loss: 5.727\n","Step 700, Loss: 5.421\n","Epoch 1 Loss: 6.014\n","Step 0, Loss: 5.319\n","Step 100, Loss: 5.217\n","Step 200, Loss: 5.392\n","Step 300, Loss: 5.172\n","Step 400, Loss: 5.023\n","Step 500, Loss: 5.232\n","Step 600, Loss: 5.307\n","Step 700, Loss: 5.209\n","Epoch 2 Loss: 5.263\n","Step 0, Loss: 4.776\n","Step 100, Loss: 5.007\n","Step 200, Loss: 5.070\n","Step 300, Loss: 4.992\n","Step 400, Loss: 4.958\n","Step 500, Loss: 4.863\n","Step 600, Loss: 5.025\n","Step 700, Loss: 5.010\n","Epoch 3 Loss: 4.886\n","Step 0, Loss: 4.940\n","Step 100, Loss: 4.741\n","Step 200, Loss: 4.769\n","Step 300, Loss: 4.715\n","Step 400, Loss: 4.508\n","Step 500, Loss: 4.680\n","Step 600, Loss: 4.605\n","Step 700, Loss: 4.755\n","Epoch 4 Loss: 4.629\n","Step 0, Loss: 4.324\n","Step 100, Loss: 4.510\n","Step 200, Loss: 4.466\n","Step 300, Loss: 4.252\n","Step 400, Loss: 4.540\n","Step 500, Loss: 4.343\n","Step 600, Loss: 4.285\n","Step 700, Loss: 4.335\n","Epoch 5 Loss: 4.443\n","Step 0, Loss: 4.265\n","Step 100, Loss: 4.288\n","Step 200, Loss: 4.463\n","Step 300, Loss: 4.301\n","Step 400, Loss: 4.595\n","Step 500, Loss: 4.464\n","Step 600, Loss: 4.206\n","Step 700, Loss: 4.423\n","Epoch 6 Loss: 4.295\n","Step 0, Loss: 3.883\n","Step 100, Loss: 4.193\n","Step 200, Loss: 4.195\n","Step 300, Loss: 3.978\n","Step 400, Loss: 4.358\n","Step 500, Loss: 4.160\n","Step 600, Loss: 4.146\n","Step 700, Loss: 4.027\n","Epoch 7 Loss: 4.171\n","Step 0, Loss: 4.029\n","Step 100, Loss: 4.170\n","Step 200, Loss: 4.145\n","Step 300, Loss: 4.106\n","Step 400, Loss: 3.941\n","Step 500, Loss: 4.163\n","Step 600, 
Loss: 4.277\n","Step 700, Loss: 4.172\n","Epoch 8 Loss: 4.063\n","Step 0, Loss: 3.674\n","Step 100, Loss: 3.885\n","Step 200, Loss: 4.137\n","Step 300, Loss: 3.860\n","Step 400, Loss: 4.117\n","Step 500, Loss: 4.026\n","Step 600, Loss: 4.033\n","Step 700, Loss: 3.988\n","Epoch 9 Loss: 3.970\n","Step 0, Loss: 3.889\n","Step 100, Loss: 3.525\n","Step 200, Loss: 3.580\n","Step 300, Loss: 3.651\n","Step 400, Loss: 3.900\n","Step 500, Loss: 3.773\n","Step 600, Loss: 3.872\n","Step 700, Loss: 4.015\n","Epoch 10 Loss: 3.883\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_bn.pt')"],"id":"aWpT2aj1_DKe"},{"cell_type":"code","execution_count":8,"metadata":{"id":"RHGxIZb-_DKf","executionInfo":{"status":"ok","timestamp":1770450066093,"user_tz":-420,"elapsed":18,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"e8611f94-0727-4ad0-e07b-a0c1d374b834"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', 
exist_ok=True)\n","shutil.copy('transformer_model_bn.pt', 'app/models/transformer_model_bn.pt')\n","shutil.copy('spm_bn.model', 'app/models/spm_bn.model')\n","shutil.copy('spm_en_bn.model', 'app/models/spm_en_bn.model')\n","print(\"Models copied to app/models/\")"],"id":"RHGxIZb-_DKf"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"L4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"63800baa7f3342fea580c1754d23a187":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_8e5f81d65723424997ac3de79a7104c2","IPY_MODEL_9412b9b7abd642818a3b5d7c1a7a59d4","IPY_MODEL_51d24b76a33742cf9c61997e19666fb2"],"layout":"IPY_MODEL_71cbb7ffb31e4663b669459808744e91"}},"8e5f81d65723424997ac3de79a7104c2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3d6f71b8889d4431aea09446a05a3cc7","placeholder":"","style":"IPY_MODEL_401623c48fec466590876f0e27dac857","value":"README.md: 
"}},"9412b9b7abd642818a3b5d7c1a7a59d4":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_74ace8a9c3a04f86bae7ca8899b86fcc","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_2bbab3affa6244b6ac95974dc009ce15","value":1}},"51d24b76a33742cf9c61997e19666fb2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3e1beecc6600411692f811cc12f35627","placeholder":"","style":"IPY_MODEL_f1500e9b0d48434c9bacd5899a56b1b1","value":" 65.4k/? 
[00:00<00:00, 6.69MB/s]"}},"71cbb7ffb31e4663b669459808744e91":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3d6f71b8889d4431aea09446a05a3cc7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"over
flow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"401623c48fec466590876f0e27dac857":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"74ace8a9c3a04f86bae7ca8899b86fcc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"2bbab3affa6244b6ac95974dc009ce15":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"3e1beecc6600411692f811cc12f
35627":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f1500e9b0d48434c9bacd5899a56b1b1":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"50e9c865bf914984b82021b7939f1047":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_096dfefc18d04d049a75376a18e30e6f","IPY_MODEL_e277f9f3079b4cd4960d9e6f80e13834","IPY_MODEL_c9da49cc397645fb9ae591d204fa9084"],"layout":"IPY_MODEL_40f5a7f2c3214965a
8924dcee95154f0"}},"096dfefc18d04d049a75376a18e30e6f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a9ffc0417cfd41f8a554b18690d7d7b3","placeholder":"","style":"IPY_MODEL_91e4cff8fdce4d9e98e249f5d53a8338","value":"bn-en/test-00000-of-00001.parquet: 100%"}},"e277f9f3079b4cd4960d9e6f80e13834":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_9265830e2f0147bb823ac2de0ba3300e","max":279391,"min":0,"orientation":"horizontal","style":"IPY_MODEL_2fc08ee9109a4349ba6ffb00c41b842e","value":279391}},"c9da49cc397645fb9ae591d204fa9084":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e9e425a232384a0a8524e52f6be6a275","placeholder":"","style":"IPY_MODEL_86cf30f6e84b43829b7e1d8a6e6e7446","value":" 279k/279k [00:01<00:00, 
46.7kB/s]"}},"40f5a7f2c3214965a8924dcee95154f0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a9ffc0417cfd41f8a554b18690d7d7b3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"91e4cff8fdce4d9e98e249f5d53a8338":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9265830e2f0147bb823ac2de0ba3300e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2fc08ee9109a4349ba6ffb00c41b842e":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e9e425a232384a0a8524e52f6be6a275":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"86cf30f6e84b43829b7e1d8a6e6e7446":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3419947ed2d14de291532f4a13c41992":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_1019ec6a91cc400d9d79419090be471f","IPY_MODEL_13d4c4cb6e634caa8975cc81911e658d","IPY_MODEL_59a8d28e81804653b0525c7b88d417c5"],"layout":"IPY_MODEL_b529ab421acb4cfe9a0e456f81b8fb10"
}},"1019ec6a91cc400d9d79419090be471f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8c63057306c5441390bee4d65dcdb830","placeholder":"","style":"IPY_MODEL_8e8c9e37c21c492184c3be8cb82ffd0b","value":"bn-en/train-00000-of-00001.parquet: 100%"}},"13d4c4cb6e634caa8975cc81911e658d":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a3400c3db06645d79d1764881a265c55","max":133525065,"min":0,"orientation":"horizontal","style":"IPY_MODEL_d76a77c5e45843549b88f216aecfbbef","value":133525065}},"59a8d28e81804653b0525c7b88d417c5":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a1ce122dd98b44348ccbaf18e574009b","placeholder":"","style":"IPY_MODEL_bcebc445abc440cc9efea5135cbe3b27","value":" 134M/134M [00:01<00:00, 
44.1MB/s]"}},"b529ab421acb4cfe9a0e456f81b8fb10":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8c63057306c5441390bee4d65dcdb830":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8e8c9e37c21c492184c3be8cb82ffd0b":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a3400c3db06645d79d1764881a265c55":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d76a77c5e45843549b88f216aecfbbef":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a1ce122dd98b44348ccbaf18e574009b":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bcebc445abc440cc9efea5135cbe3b27":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1c8327669a5b4bbfb8b0eb2a4b814e6d":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_baa12bef6fc540c1a1aff0c8835cfe5e","IPY_MODEL_d86a597b23e14abd87f9567664b0fd47","IPY_MODEL_b8b3cf1e411840f592c9a9509c31da7b"],"layout":"IPY_MODEL_a6fad26675f14ccaabdae96ccfd07430"
}},"baa12bef6fc540c1a1aff0c8835cfe5e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a6e95f0632ec49df90be34d6e6f3db38","placeholder":"","style":"IPY_MODEL_1276d4a6f6af4d1ca75a9e6e64ddcca1","value":"bn-en/validation-00000-of-00001.parquet: 100%"}},"d86a597b23e14abd87f9567664b0fd47":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a61ba76440c84ae686b401b70bf1c379","max":272140,"min":0,"orientation":"horizontal","style":"IPY_MODEL_3d903e048f6d4e618b2a27d61f124931","value":272140}},"b8b3cf1e411840f592c9a9509c31da7b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_91281030b7164090841b0846e4f020bf","placeholder":"","style":"IPY_MODEL_752d688c557a43bca08e2aa7d7f72331","value":" 272k/272k [00:00<00:00, 
430kB/s]"}},"a6fad26675f14ccaabdae96ccfd07430":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a6e95f0632ec49df90be34d6e6f3db38":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1276d4a6f6af4d1ca75a9e6e64ddcca1":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a61ba76440c84ae686b401b70bf1c379":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3d903e048f6d4e618b2a27d61f124931":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"91281030b7164090841b0846e4f020bf":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"752d688c557a43bca08e2aa7d7f72331":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"8c9b8dcbd89645a7a955c06109cadfd7":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0c5f2d21f42a473cb4b150ce1cf5ef42","IPY_MODEL_eb9afc65cdf44f008990ec8c1082dea0","IPY_MODEL_c27ce174a6814361bcdbc163ea79a85f"],"layout":"IPY_MODEL_213d90a2119b46219902f0b01ead5521"}
},"0c5f2d21f42a473cb4b150ce1cf5ef42":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8a72fd6282c645f4906d817db92c20c0","placeholder":"","style":"IPY_MODEL_57024c3d9ead4c14b2bb25a1ac58a392","value":"Generating test split: 100%"}},"eb9afc65cdf44f008990ec8c1082dea0":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f572059b9f584008bfe67987e9610844","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_d6b43ff1450a4d57a09cde163557b732","value":2000}},"c27ce174a6814361bcdbc163ea79a85f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_40723bace09f404087f04f5d3b5da910","placeholder":"","style":"IPY_MODEL_7930dcae389240128f87909b8091c838","value":" 2000/2000 [00:00<00:00, 51153.17 
examples/s]"}},"213d90a2119b46219902f0b01ead5521":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8a72fd6282c645f4906d817db92c20c0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"57024c3d9ead4c14b2bb25a1ac58a392":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f572059b9f584008bfe67987e9610844":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d6b43ff1450a4d57a09cde163557b732":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"40723bace09f404087f04f5d3b5da910":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7930dcae389240128f87909b8091c838":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9885cc8eb23a4bd5979b00fe729ab50a":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_503e5a8eea7647c791e75c93262869d3","IPY_MODEL_812e5f5b21d74dc992b0a6d318578be6","IPY_MODEL_559291f54b354a7190d9bf3b6c5916f0"],"layout":"IPY_MODEL_bc3b447abfa34bc4baf747de9aa1128
6"}},"503e5a8eea7647c791e75c93262869d3":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_362e85ad032f45f2911ed2233cf29d1e","placeholder":"","style":"IPY_MODEL_69f0040fe8fc479b9fa64d4a9c0c7cdc","value":"Generating train split: 100%"}},"812e5f5b21d74dc992b0a6d318578be6":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_122d8bc238484956b5e93402a23becf9","max":1000000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_41a80cc9f2834727b83c25bdf17fb789","value":1000000}},"559291f54b354a7190d9bf3b6c5916f0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_6de63976604e42e8acef1b437c645700","placeholder":"","style":"IPY_MODEL_5345097ef89145b7837dad5deb68cc3f","value":" 1000000/1000000 [00:01<00:00, 988617.74 
examples/s]"}},"bc3b447abfa34bc4baf747de9aa11286":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"362e85ad032f45f2911ed2233cf29d1e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"69f0040fe8fc479b9fa64d4a9c0c7cdc":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"122d8bc238484956b5e93402a23becf9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"41a80cc9f2834727b83c25bdf17fb789":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"6de63976604e42e8acef1b437c645700":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5345097ef89145b7837dad5deb68cc3f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"4f887be0d0434ed486c40c3d203c556f":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_079bd8aa81ad460593477de7a1beb27c","IPY_MODEL_ac05ba3b0f1d4fd0b479bd309c74e805","IPY_MODEL_aa6578ad022d418d8c14b59d8c303363"],"layout":"IPY_MODEL_47e8d29853384b4cb6fdfbbb355b504
4"}},"079bd8aa81ad460593477de7a1beb27c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cf6c5979b59c4e7b8c9041dd58b0970c","placeholder":"","style":"IPY_MODEL_7b9da41a85d14b5ebe849a1a870b8b1a","value":"Generating validation split: 100%"}},"ac05ba3b0f1d4fd0b479bd309c74e805":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_1391beb1571349cebdb8200a9e5ee20a","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_983ede9cd17f464c933db0203a09cb3d","value":2000}},"aa6578ad022d418d8c14b59d8c303363":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_57e6475668804afd89b1559fae888165","placeholder":"","style":"IPY_MODEL_c9b8876bbdc34f7b9bdcec235b05c6ef","value":" 2000/2000 [00:00<00:00, 146595.04 
examples/s]"}},"47e8d29853384b4cb6fdfbbb355b5044":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cf6c5979b59c4e7b8c9041dd58b0970c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7b9da41a85d14b5ebe849a1a870b8b1a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1391beb1571349cebdb8200a9e5ee20a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"983ede9cd17f464c933db0203a09cb3d":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"57e6475668804afd89b1559fae888165":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c9b8876bbdc34f7b9bdcec235b05c6ef":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
|
Burmese_English_NLLB.ipynb
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Burmese-English NMT with NLLB-200 (Fine-Tuning)\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"**Student**: Htut Ko Ko (st126010) \n",
|
| 10 |
+
"**Course**: NLP Project A3 \n",
|
| 11 |
+
"**Task**: High-Quality Machine Translation (Web App Integration)\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"## 1. Introduction & Motivation\n",
|
| 14 |
+
"In this notebook, I implement a **Neural Machine Translation (NMT)** system to translate between **Burmese** and **English**.\n",
|
| 15 |
+
"\n",
|
| 16 |
+
"For the assignment's \"Task 4: Web Application\", my goal was to achieve **production-quality** translation that users would actually find useful.\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"Training a Transformer from scratch (as done in my other notebook) on the small **ALT dataset (20k pairs)** resulted in poor fluency because deep learning models require massive amounts of data. To solve this, I chose to **fine-tune** a state-of-the-art pre-trained model: **NLLB-200 (No Language Left Behind)** by Meta.\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"This approach allows me to leverage the model's existing knowledge of Burmese and English while adapting it specifically to the ALT dataset style."
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "markdown",
|
| 25 |
+
"metadata": {},
|
| 26 |
+
"source": [
|
| 27 |
+
"## 2. Setup & Dependencies\n",
|
| 28 |
+
"First, I install the necessary libraries from HuggingFace (`transformers`, `datasets`, `accelerate`) together with the tokenization packages `sentencepiece` and `sacremoses`. I also mount my Google Drive so that I can save the fine-tuned model safely and use it later in my Web App."
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "code",
|
| 33 |
+
"execution_count": null,
|
| 34 |
+
"metadata": {},
|
| 35 |
+
"outputs": [],
|
| 36 |
+
"source": [
|
| 37 |
+
"!pip install transformers datasets sentencepiece sacremoses accelerate"
|
| 38 |
+
]
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"cell_type": "code",
|
| 42 |
+
"execution_count": null,
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"outputs": [],
|
| 45 |
+
"source": [
|
| 46 |
+
"import os\n",
|
| 47 |
+
"import torch\n",
|
| 48 |
+
"import numpy as np\n",
|
| 49 |
+
"import pandas as pd\n",
|
| 50 |
+
"from datasets import load_dataset, Dataset, DatasetDict\n",
|
| 51 |
+
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
|
| 52 |
+
"from google.colab import drive\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"# I mount Google Drive to ensure my model is saved persistently.\n",
|
| 55 |
+
"drive.mount('/content/drive')\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"# I define the save path in my Drive so I can download it later for the Web App.\n",
|
| 58 |
+
"DRIVE_SAVE_PATH = \"/content/drive/MyDrive/NLP/Project_A3/nllb_model\"\n",
|
| 59 |
+
"os.makedirs(DRIVE_SAVE_PATH, exist_ok=True)\n",
|
| 60 |
+
"\n",
|
| 61 |
+
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
| 62 |
+
"print(f\"Using device: {device}\")"
|
| 63 |
+
]
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"cell_type": "markdown",
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"source": [
|
| 69 |
+
"## 3. Data Preparation (ALT Dataset)\n",
|
| 70 |
+
"I am using the **Asian Language Treebank (ALT)** dataset as required. The raw dataset contains multiple languages, so I filter it to extract only the **Burmese ('my')** and **English ('en')** pairs.\n",
|
| 71 |
+
"\n",
|
| 72 |
+
"I then split the data into:\n",
|
| 73 |
+
"- **Train (81%)**: For teaching the model.\n",
|
| 74 |
+
"- **Validation (9%)**: For checking improvements during training.\n",
|
| 75 |
+
"- **Test (10%)**: For final evaluation."
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"cell_type": "code",
|
| 80 |
+
"execution_count": null,
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"outputs": [],
|
| 83 |
+
"source": [
|
| 84 |
+
"print(\"Loading ALT Dataset...\")\n",
|
| 85 |
+
"try:\n",
|
| 86 |
+
" raw_dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n",
|
| 87 |
+
" \n",
|
| 88 |
+
" data = []\n",
|
| 89 |
+
" for item in raw_dataset:\n",
|
| 90 |
+
" if 'translation' in item:\n",
|
| 91 |
+
" if 'my' in item['translation'] and 'en' in item['translation']:\n",
|
| 92 |
+
" data.append({\n",
|
| 93 |
+
" 'my': item['translation']['my'],\n",
|
| 94 |
+
" 'en': item['translation']['en']\n",
|
| 95 |
+
" })\n",
|
| 96 |
+
" \n",
|
| 97 |
+
" df = pd.DataFrame(data)\n",
|
| 98 |
+
" df = df.dropna()\n",
|
| 99 |
+
" print(f\"Total Pairs Extracted: {len(df)}\")\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"except Exception as e:\n",
|
| 102 |
+
" print(f\"Error: {e}\")"
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"cell_type": "code",
|
| 107 |
+
"execution_count": null,
|
| 108 |
+
"metadata": {},
|
| 109 |
+
"outputs": [],
|
| 110 |
+
"source": [
|
| 111 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 112 |
+
"\n",
|
| 113 |
+
"# Splitting: 90% Train+Val, 10% Test\n",
|
| 114 |
+
"train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)\n",
|
| 115 |
+
"# Splitting Train+Val: 90% Train, 10% Val\n",
|
| 116 |
+
"train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)\n",
|
| 117 |
+
"\n",
|
| 118 |
+
"print(f\"Train Size: {len(train_df)}\")\n",
|
| 119 |
+
"print(f\"Results Validation Size: {len(val_df)}\")\n",
|
| 120 |
+
"print(f\"Test Size: {len(test_df)}\")\n",
|
| 121 |
+
"\n",
|
| 122 |
+
"# Convert back to HuggingFace Dataset format for easier processing\n",
|
| 123 |
+
"train_dataset = Dataset.from_pandas(train_df)\n",
|
| 124 |
+
"val_dataset = Dataset.from_pandas(val_df)\n",
|
| 125 |
+
"test_dataset = Dataset.from_pandas(test_df)\n",
|
| 126 |
+
"\n",
|
| 127 |
+
"dataset = DatasetDict({\n",
|
| 128 |
+
" 'train': train_dataset,\n",
|
| 129 |
+
" 'validation': val_dataset,\n",
|
| 130 |
+
" 'test': test_dataset\n",
|
| 131 |
+
"})"
|
| 132 |
+
]
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"cell_type": "markdown",
|
| 136 |
+
"metadata": {},
|
| 137 |
+
"source": [
|
| 138 |
+
"## 4. Model Loading & Tokenization\n",
|
| 139 |
+
"Here I load the **NLLB-200-distilled-600M** model. This is a distilled version of the massive 54B parameter model, making it efficient enough to fine-tune on Colab while retaining high performance.\n",
|
| 140 |
+
"\n",
|
| 141 |
+
"**Important**: NLLB requires specific language codes:\n",
|
| 142 |
+
"- Burmese: `mya_Mymr`\n",
|
| 143 |
+
"- English: `eng_Latn`\n",
|
| 144 |
+
"\n",
|
| 145 |
+
"I create a preprocessing function that tokenizes the inputs (Burmese) and the targets (English) in the same pass, storing the target token IDs under `labels`."
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"cell_type": "code",
|
| 150 |
+
"execution_count": null,
|
| 151 |
+
"metadata": {},
|
| 152 |
+
"outputs": [],
|
| 153 |
+
"source": [
|
| 154 |
+
"model_checkpoint = \"facebook/nllb-200-distilled-600M\"\n",
|
| 155 |
+
"\n",
|
| 156 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang=\"mya_Mymr\", tgt_lang=\"eng_Latn\")\n",
|
| 157 |
+
"model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)"
|
| 158 |
+
]
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"cell_type": "code",
|
| 162 |
+
"execution_count": null,
|
| 163 |
+
"metadata": {},
|
| 164 |
+
"outputs": [],
|
| 165 |
+
"source": [
|
| 166 |
+
"max_input_length = 128\n",
|
| 167 |
+
"max_target_length = 128\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"def preprocess_function(examples):\n",
|
| 170 |
+
" inputs = [ex for ex in examples['my']]\n",
|
| 171 |
+
" targets = [ex for ex in examples['en']]\n",
|
| 172 |
+
" \n",
|
| 173 |
+
" # We tokenize the input (Burmese)\n",
|
| 174 |
+
" model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)\n",
|
| 175 |
+
" # We tokenize the target (English) as labels\n",
|
| 176 |
+
" labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)\n",
|
| 177 |
+
"\n",
|
| 178 |
+
" model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
|
| 179 |
+
" return model_inputs\n",
|
| 180 |
+
"\n",
|
| 181 |
+
"tokenized_datasets = dataset.map(preprocess_function, batched=True)"
|
| 182 |
+
]
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"cell_type": "markdown",
|
| 186 |
+
"metadata": {},
|
| 187 |
+
"source": [
|
| 188 |
+
"## 5. Fine-Tuning (Training)\n",
|
| 189 |
+
"I use the `Seq2SeqTrainer` to fine-tune the model.\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"**Hyperparameters:**\n",
|
| 192 |
+
"- **Batch Size**: 16 (fits in Colab GPU memory).\n",
|
| 193 |
+
"- **Learning Rate**: 2e-5 (low learning rate to gently adjust pre-trained weights).\n",
|
| 194 |
+
"- **Epochs**: 3 (Since the model is already pre-trained, it converges very quickly. 3 epochs is sufficient to adapt to the ALT dataset style without overfitting)."
|
| 195 |
+
]
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"cell_type": "code",
|
| 199 |
+
"execution_count": null,
|
| 200 |
+
"metadata": {},
|
| 201 |
+
"outputs": [],
|
| 202 |
+
"source": [
|
| 203 |
+
"batch_size = 16\n",
|
| 204 |
+
"learning_rate = 2e-5\n",
|
| 205 |
+
"weight_decay = 0.01\n",
|
| 206 |
+
"num_train_epochs = 3\n",
|
| 207 |
+
"\n",
|
| 208 |
+
"args = Seq2SeqTrainingArguments(\n",
|
| 209 |
+
" DRIVE_SAVE_PATH,\n",
|
| 210 |
+
" eval_strategy = \"epoch\",\n",
|
| 211 |
+
" learning_rate=learning_rate,\n",
|
| 212 |
+
" per_device_train_batch_size=batch_size,\n",
|
| 213 |
+
" per_device_eval_batch_size=batch_size,\n",
|
| 214 |
+
" weight_decay=weight_decay,\n",
|
| 215 |
+
" save_total_limit=1,\n",
|
| 216 |
+
" num_train_epochs=num_train_epochs,\n",
|
| 217 |
+
" predict_with_generate=True,\n",
|
| 218 |
+
" fp16=True if torch.cuda.is_available() else False,\n",
|
| 219 |
+
")\n",
|
| 220 |
+
"\n",
|
| 221 |
+
"data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"trainer = Seq2SeqTrainer(\n",
|
| 224 |
+
" model=model,\n",
|
| 225 |
+
" args=args,\n",
|
| 226 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
| 227 |
+
" eval_dataset=tokenized_datasets[\"validation\"],\n",
|
| 228 |
+
" data_collator=data_collator,\n",
|
| 229 |
+
")"
|
| 230 |
+
]
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"cell_type": "code",
|
| 234 |
+
"execution_count": null,
|
| 235 |
+
"metadata": {},
|
| 236 |
+
"outputs": [],
|
| 237 |
+
"source": [
|
| 238 |
+
"print(\"Starting Training...\")\n",
|
| 239 |
+
"trainer.train()"
|
| 240 |
+
]
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"cell_type": "markdown",
|
| 244 |
+
"metadata": {},
|
| 245 |
+
"source": [
|
| 246 |
+
"## 6. Saving the Model\n",
|
| 247 |
+
"After training is complete, I save the model and the tokenizer to Google Drive. This is the crucial step that allows me to download the model folder later and use it in my local Flask web application."
|
| 248 |
+
]
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"cell_type": "code",
|
| 252 |
+
"execution_count": null,
|
| 253 |
+
"metadata": {},
|
| 254 |
+
"outputs": [],
|
| 255 |
+
"source": [
|
| 256 |
+
"trainer.save_model(DRIVE_SAVE_PATH)\n",
|
| 257 |
+
"tokenizer.save_pretrained(DRIVE_SAVE_PATH)\n",
|
| 258 |
+
"print(f\"Model and Tokenizer saved safely to '{DRIVE_SAVE_PATH}'\")"
|
| 259 |
+
]
|
| 260 |
+
},
|
| 261 |
+
{
|
| 262 |
+
"cell_type": "markdown",
|
| 263 |
+
"metadata": {},
|
| 264 |
+
"source": [
|
| 265 |
+
"## 7. Verification & Inference\n",
|
| 266 |
+
"Finally, I verify that the model works by loading it back from the drive and running a translation test. I use `model.generate()` directly for robustness, ensuring the correct language codes are sent to the model."
|
| 267 |
+
]
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
"cell_type": "code",
|
| 271 |
+
"execution_count": null,
|
| 272 |
+
"metadata": {},
|
| 273 |
+
"outputs": [],
|
| 274 |
+
"source": [
|
| 275 |
+
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
|
| 276 |
+
"import torch\n",
|
| 277 |
+
"\n",
|
| 278 |
+
"# Reload from Drive to verify consistency\n",
|
| 279 |
+
"print(f\"Reloading model from {DRIVE_SAVE_PATH}...\")\n",
|
| 280 |
+
"tokenizer = AutoTokenizer.from_pretrained(DRIVE_SAVE_PATH)\n",
|
| 281 |
+
"model = AutoModelForSeq2SeqLM.from_pretrained(DRIVE_SAVE_PATH).to(device)\n",
|
| 282 |
+
"\n",
|
| 283 |
+
"def translate(text):\n",
|
| 284 |
+
" # Set source language explicitly\n",
|
| 285 |
+
" tokenizer.src_lang = \"mya_Mymr\"\n",
|
| 286 |
+
" inputs = tokenizer(text, return_tensors=\"pt\").to(device)\n",
|
| 287 |
+
" \n",
|
| 288 |
+
" with torch.no_grad():\n",
|
| 289 |
+
" # Generate encoded output\n",
|
| 290 |
+
" translated_tokens = model.generate(\n",
|
| 291 |
+
" **inputs, \n",
|
| 292 |
+
" # Force the target language to be English\n",
|
| 293 |
+
" forced_bos_token_id=tokenizer.convert_tokens_to_ids(\"eng_Latn\"), \n",
|
| 294 |
+
" max_length=128\n",
|
| 295 |
+
" )\n",
|
| 296 |
+
" # Decode tokens back to text\n",
|
| 297 |
+
" return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]\n",
|
| 298 |
+
"\n",
|
| 299 |
+
"# Manual Test\n",
|
| 300 |
+
"text = \"မင်္ဂလာပါ\"\n",
|
| 301 |
+
"print(f\"Source: {text}\")\n",
|
| 302 |
+
"print(f\"Prediction: {translate(text)}\")\n",
|
| 303 |
+
"\n",
|
| 304 |
+
"# Random Test from Test Set\n",
|
| 305 |
+
"sample = test_df.sample(1).iloc[0]\n",
|
| 306 |
+
"print(f\"\\nTest Sample Source: {sample['my']}\")\n",
|
| 307 |
+
"print(f\"Test Sample Target: {sample['en']}\")\n",
|
| 308 |
+
"print(f\"Model Prediction: {translate(sample['my'])}\")"
|
| 309 |
+
]
|
| 310 |
+
}
|
| 311 |
+
],
|
| 312 |
+
"metadata": {
|
| 313 |
+
"kernelspec": {
|
| 314 |
+
"display_name": "Python 3",
|
| 315 |
+
"language": "python",
|
| 316 |
+
"name": "python3"
|
| 317 |
+
},
|
| 318 |
+
"language_info": {
|
| 319 |
+
"codemirror_mode": {
|
| 320 |
+
"name": "ipython",
|
| 321 |
+
"version": 3
|
| 322 |
+
},
|
| 323 |
+
"file_extension": ".py",
|
| 324 |
+
"mimetype": "text/x-python",
|
| 325 |
+
"name": "python",
|
| 326 |
+
"nbconvert_exporter": "python",
|
| 327 |
+
"pygments_lexer": "ipython3",
|
| 328 |
+
"version": "3.8.10"
|
| 329 |
+
}
|
| 330 |
+
},
|
| 331 |
+
"nbformat": 4,
|
| 332 |
+
"nbformat_minor": 5
|
| 333 |
+
}
|
Burmese_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1,592 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Burmese-English Machine Translation (A3 Project)\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"**Student**: Htut Ko Ko \n",
|
| 10 |
+
"**Course**: Natural Language Understanding \n",
|
| 11 |
+
"**Task**: Burmese (my) <-> English (en) Translation using Transformer\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"## Project Overview\n",
|
| 14 |
+
"This notebook implements a Neural Machine Translation system using a **Transformer** architecture. \n",
|
| 15 |
+
"We use the **ALT (Asian Language Treebank)** dataset for Burmese-English parallel data.\n",
|
| 16 |
+
"We use **SentencePiece** for subword tokenization to handle the Burmese script effectively.\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"## Pipeline\n",
|
| 19 |
+
"1. **Setup**: Install/Import dependencies.\n",
|
| 20 |
+
"2. **Data Loading**: Load the ALT dataset.\n",
|
| 21 |
+
"3. **Tokenization**: Train SentencePiece model on the corpus.\n",
|
| 22 |
+
"4. **Data Processing**: Create PyTorch Datasets and DataLoaders.\n",
|
| 23 |
+
"5. **Model**: Implement Transformer (using `nn.Transformer`).\n",
|
| 24 |
+
"6. **Training**: Train the model and log performance.\n",
|
| 25 |
+
"7. **Evaluation**: Calculate BLEU score on Test set.\n",
|
| 26 |
+
"8. **Inference**: Demo function and save model for Web App."
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "markdown",
|
| 31 |
+
"metadata": {},
|
| 32 |
+
"source": [
|
| 33 |
+
"## 1. Setup and Imports"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "code",
|
| 38 |
+
"execution_count": null,
|
| 39 |
+
"metadata": {},
|
| 40 |
+
"outputs": [],
|
| 41 |
+
"source": [
|
| 42 |
+
"import os\n",
|
| 43 |
+
"import math\n",
|
| 44 |
+
"import time\n",
|
| 45 |
+
"import random\n",
|
| 46 |
+
"import numpy as np\n",
|
| 47 |
+
"import pandas as pd\n",
|
| 48 |
+
"import matplotlib.pyplot as plt\n",
|
| 49 |
+
"import seaborn as sns\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"import torch\n",
|
| 52 |
+
"import torch.nn as nn\n",
|
| 53 |
+
"import torch.optim as optim\n",
|
| 54 |
+
"from torch.utils.data import Dataset, DataLoader\n",
|
| 55 |
+
"from torch.nn.utils.rnn import pad_sequence\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"# Check for GPU\n",
|
| 58 |
+
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
| 59 |
+
"print(f\"Using device: {device}\")\n",
|
| 60 |
+
"\n",
|
| 61 |
+
"# Set seeds\n",
|
| 62 |
+
"SEED = 1234\n",
|
| 63 |
+
"random.seed(SEED)\n",
|
| 64 |
+
"np.random.seed(SEED)\n",
|
| 65 |
+
"torch.manual_seed(SEED)\n",
|
| 66 |
+
"torch.cuda.manual_seed(SEED)\n",
|
| 67 |
+
"torch.backends.cudnn.deterministic = True"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"cell_type": "code",
|
| 72 |
+
"execution_count": null,
|
| 73 |
+
"metadata": {},
|
| 74 |
+
"outputs": [],
|
| 75 |
+
"source": [
|
| 76 |
+
"# Install dependencies if missing (uncomment if needed)\n",
|
| 77 |
+
"# !pip install sentencepiece datasets portalocker"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"cell_type": "markdown",
|
| 82 |
+
"metadata": {},
|
| 83 |
+
"source": [
|
| 84 |
+
"## 2. Data Loading (ALT Dataset)\n",
|
| 85 |
+
"We will use the **ALT (Asian Language Treebank)** dataset via the HuggingFace `datasets` library."
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"cell_type": "code",
|
| 90 |
+
"execution_count": null,
|
| 91 |
+
"metadata": {},
|
| 92 |
+
"outputs": [],
|
| 93 |
+
"source": [
|
| 94 |
+
"from datasets import load_dataset\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"print(\"Loading ALT Dataset (Burmese-English)...\")\n",
|
| 97 |
+
"try:\n",
|
| 98 |
+
" # Load ALT dataset from HuggingFace (my-en pair)\n",
|
| 99 |
+
"    # Note: the 'alt' dataset on HF might need a specific config name, or we can use 'bs-modeling-metadata/alt-burmese-english-parallel'\n",
|
| 100 |
+
" # For reliability, we'll try to load a known good source or fallback to manual download if needed.\n",
|
| 101 |
+
" # Here we use 'larryvrh/alt-my-en' or similar if available, else we process raw files if local.\n",
|
| 102 |
+
" # Let's try loading 'alt' configuration directly if supported, otherwise 'Helsinki-NLP/alt' does not exist.\n",
|
| 103 |
+
" # Using a generic approach: Loading from a known reliable HF path or url if standard 'alt' fails.\n",
|
| 104 |
+
" \n",
|
| 105 |
+
" # Let's use 'my_alt' from 'Asian-Language-Treebank' if available, but for now we'll assume the user has internet access.\n",
|
| 106 |
+
" # We will use 'alt' script if available or a direct parquet/csv link if we were doing custom.\n",
|
| 107 |
+
" # Actually, let's use the 'alt' dataset provided by 'my_en' config if possible.\n",
|
| 108 |
+
" \n",
|
| 109 |
+
" dataset = load_dataset(\"alt\", split=\"train+validation+test\") # Load all for custom splitting\n",
|
| 110 |
+
" print(f\"Loaded {len(dataset)} sentences from ALT dataset.\")\n",
|
| 111 |
+
" \n",
|
| 112 |
+
" # Filter/Extract only Burmese and English\n",
|
| 113 |
+
" data = []\n",
|
| 114 |
+
" for item in dataset:\n",
|
| 115 |
+
" # ALT structure usually: {'translation': {'bg': '...', 'en': '...', 'my': '...'}}\n",
|
| 116 |
+
" # The HF 'alt' dataset structure check:\n",
|
| 117 |
+
" if 'translation' in item:\n",
|
| 118 |
+
" if 'my' in item['translation'] and 'en' in item['translation']:\n",
|
| 119 |
+
" data.append({\n",
|
| 120 |
+
" 'my': item['translation']['my'],\n",
|
| 121 |
+
" 'en': item['translation']['en']\n",
|
| 122 |
+
" })\n",
|
| 123 |
+
" \n",
|
| 124 |
+
" print(f\"Extracted {len(data)} Burmese-English pairs.\")\n",
|
| 125 |
+
" \n",
|
| 126 |
+
"except Exception as e:\n",
|
| 127 |
+
" print(f\"Error loading from HF: {e}\")\n",
|
| 128 |
+
" print(\"Attempting fallback or assuming local file 'alt_my_en.csv' exists...\")\n",
|
| 129 |
+
" # fallback code would go here\n"
|
| 130 |
+
]
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"cell_type": "code",
|
| 134 |
+
"execution_count": null,
|
| 135 |
+
"metadata": {},
|
| 136 |
+
"outputs": [],
|
| 137 |
+
"source": [
|
| 138 |
+
"# Convert to DataFrame for easier handling\n",
|
| 139 |
+
"df = pd.DataFrame(data)\n",
|
| 140 |
+
"print(df.head())\n",
|
| 141 |
+
"\n",
|
| 142 |
+
"# Basic Cleaning\n",
|
| 143 |
+
"# 1. Drop NaN/None\n",
|
| 144 |
+
"df = df.dropna(subset=['my', 'en'])\n",
|
| 145 |
+
"# 2. Ensure they are strings\n",
|
| 146 |
+
"df['my'] = df['my'].astype(str)\n",
|
| 147 |
+
"df['en'] = df['en'].astype(str)\n",
|
| 148 |
+
"\n",
|
| 149 |
+
"# 3. Remove empty strings\n",
|
| 150 |
+
"df = df[df['my'].str.strip() != '']\n",
|
| 151 |
+
"df = df[df['en'].str.strip() != '']\n",
|
| 152 |
+
"print(f\"After cleaning: {len(df)} pairs\")\n",
|
| 153 |
+
"\n",
|
| 154 |
+
"print(\"\\n--- Data Alignment Check ---\")\n",
|
| 155 |
+
"for i in range(5):\n",
|
| 156 |
+
" sample = df.sample(1).iloc[0]\n",
|
| 157 |
+
" print(f\"Source (my): {sample['my']}\")\n",
|
| 158 |
+
" print(f\"Target (en): {sample['en']}\")\n",
|
| 159 |
+
" print(\"-\" * 20)"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"cell_type": "markdown",
|
| 164 |
+
"metadata": {},
|
| 165 |
+
"source": [
|
| 166 |
+
"## 3. Tokenization (SentencePiece)\n",
|
| 167 |
+
"Burmese script does not consistently mark word boundaries with spaces. **SentencePiece** suits this well: it builds a subword vocabulary from frequency statistics over the raw text, so it handles rare words and unsegmented languages without an external word segmenter."
|
| 168 |
+
]
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"cell_type": "code",
|
| 172 |
+
"execution_count": null,
|
| 173 |
+
"metadata": {},
|
| 174 |
+
"outputs": [],
|
| 175 |
+
"source": [
|
| 176 |
+
"import sentencepiece as spm\n",
|
| 177 |
+
"\n",
|
| 178 |
+
"# 1. Save texts to files to train tokenizer\n",
|
| 179 |
+
"with open('train_my.txt', 'w', encoding='utf-8') as f:\n",
|
| 180 |
+
" for line in df['my']:\n",
|
| 181 |
+
" f.write(line + '\\n')\n",
|
| 182 |
+
"\n",
|
| 183 |
+
"with open('train_en.txt', 'w', encoding='utf-8') as f:\n",
|
| 184 |
+
" for line in df['en']:\n",
|
| 185 |
+
" f.write(line + '\\n')\n",
|
| 186 |
+
"\n",
|
| 187 |
+
"# 2. Train SentencePiece models\n",
|
| 188 |
+
"vocab_size = 4000 # Reduced for small dataset (~20k sentences) to learn better representations\n",
|
| 189 |
+
"model_type = 'bpe' # Byte-Pair Encoding\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"print(\"Training Burmese Tokenizer...\")\n",
|
| 192 |
+
"spm.SentencePieceTrainer.train(\n",
|
| 193 |
+
" input='train_my.txt', \n",
|
| 194 |
+
" model_prefix='spm_my', \n",
|
| 195 |
+
" vocab_size=vocab_size, \n",
|
| 196 |
+
" model_type=model_type,\n",
|
| 197 |
+
" pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",
|
| 198 |
+
")\n",
|
| 199 |
+
"\n",
|
| 200 |
+
"print(\"Training English Tokenizer...\")\n",
|
| 201 |
+
"spm.SentencePieceTrainer.train(\n",
|
| 202 |
+
" input='train_en.txt', \n",
|
| 203 |
+
" model_prefix='spm_en', \n",
|
| 204 |
+
" vocab_size=vocab_size, \n",
|
| 205 |
+
" model_type=model_type,\n",
|
| 206 |
+
" pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",
|
| 207 |
+
")\n",
|
| 208 |
+
"\n",
|
| 209 |
+
"print(\"Tokenizer training complete!\")"
|
| 210 |
+
]
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"cell_type": "code",
|
| 214 |
+
"execution_count": null,
|
| 215 |
+
"metadata": {},
|
| 216 |
+
"outputs": [],
|
| 217 |
+
"source": [
|
| 218 |
+
"# Load the processors\n",
|
| 219 |
+
"sp_my = spm.SentencePieceProcessor(model_file='spm_my.model')\n",
|
| 220 |
+
"sp_en = spm.SentencePieceProcessor(model_file='spm_en.model')\n",
|
| 221 |
+
"\n",
|
| 222 |
+
"# Test Tokenization\n",
|
| 223 |
+
"idx = 0\n",
|
| 224 |
+
"print(f\"Original my: {df.iloc[idx]['my']}\")\n",
|
| 225 |
+
"print(f\"Tokens: {sp_my.encode(df.iloc[idx]['my'], out_type=str)}\")\n",
|
| 226 |
+
"print(f\"IDs: {sp_my.encode(df.iloc[idx]['my'], out_type=int)}\")\n",
|
| 227 |
+
"\n",
|
| 228 |
+
"print(f\"\\nOriginal en: {df.iloc[idx]['en']}\")\n",
|
| 229 |
+
"print(f\"Tokens: {sp_en.encode(df.iloc[idx]['en'], out_type=str)}\")\n",
|
| 230 |
+
"print(f\"IDs: {sp_en.encode(df.iloc[idx]['en'], out_type=int)}\")"
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
"cell_type": "markdown",
|
| 235 |
+
"metadata": {},
|
| 236 |
+
"source": [
|
| 237 |
+
"## 4. PyTorch Dataset and DataLoader"
|
| 238 |
+
]
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
"cell_type": "code",
|
| 242 |
+
"execution_count": null,
|
| 243 |
+
"id": "9377dc67",
|
| 244 |
+
"metadata": {},
|
| 245 |
+
"outputs": [],
|
| 246 |
+
"source": [
|
| 247 |
+
"class TranslationDataset(Dataset):\n",
|
| 248 |
+
" def __init__(self, df, sp_src, sp_trg):\n",
|
| 249 |
+
" self.data = df\n",
|
| 250 |
+
" self.sp_src = sp_src\n",
|
| 251 |
+
" self.sp_trg = sp_trg\n",
|
| 252 |
+
" \n",
|
| 253 |
+
" def __len__(self):\n",
|
| 254 |
+
" return len(self.data)\n",
|
| 255 |
+
" \n",
|
| 256 |
+
" def __getitem__(self, idx):\n",
|
| 257 |
+
" src_text = self.data.iloc[idx]['my']\n",
|
| 258 |
+
" trg_text = self.data.iloc[idx]['en']\n",
|
| 259 |
+
" \n",
|
| 260 |
+
"        # Encode with BOS and EOS\n",
|
| 261 |
+
"        # spm doesn't add bos/eos by default unless configured, so we add them manually for use in the model\n",
|
| 262 |
+
" # Use bos_id() for beginning of sentence\n",
|
| 263 |
+
" src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n",
|
| 264 |
+
" trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n",
|
| 265 |
+
" \n",
|
| 266 |
+
" return torch.tensor(src_ids), torch.tensor(trg_ids)\n",
|
| 267 |
+
"\n",
|
| 268 |
+
"def collate_fn(batch):\n",
|
| 269 |
+
" src_batch, trg_batch = [], []\n",
|
| 270 |
+
" for src, trg in batch:\n",
|
| 271 |
+
" src_batch.append(src)\n",
|
| 272 |
+
" trg_batch.append(trg)\n",
|
| 273 |
+
" \n",
|
| 274 |
+
" # Pad sequences\n",
|
| 275 |
+
" # PAD ID is 0 for our spm models\n",
|
| 276 |
+
" src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n",
|
| 277 |
+
" trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n",
|
| 278 |
+
" \n",
|
| 279 |
+
" return src_pad, trg_pad\n",
|
| 280 |
+
"\n",
|
| 281 |
+
"# Split Data\n",
|
| 282 |
+
"train_df = df.sample(frac=0.8, random_state=SEED)\n",
|
| 283 |
+
"val_test_df = df.drop(train_df.index)\n",
|
| 284 |
+
"val_df = val_test_df.sample(frac=0.5, random_state=SEED)\n",
|
| 285 |
+
"test_df = val_test_df.drop(val_df.index)\n",
|
| 286 |
+
"\n",
|
| 287 |
+
"print(f\"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}\")\n",
|
| 288 |
+
"\n",
|
| 289 |
+
"train_dataset = TranslationDataset(train_df, sp_my, sp_en)\n",
|
| 290 |
+
"val_dataset = TranslationDataset(val_df, sp_my, sp_en)\n",
|
| 291 |
+
"test_dataset = TranslationDataset(test_df, sp_my, sp_en)\n",
|
| 292 |
+
"\n",
|
| 293 |
+
"BATCH_SIZE = 64 # Increased to stabilize gradients\n",
|
| 294 |
+
"train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)\n",
|
| 295 |
+
"val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)\n",
|
| 296 |
+
"test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)"
|
| 297 |
+
]
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"cell_type": "markdown",
|
| 301 |
+
"metadata": {},
|
| 302 |
+
"source": [
|
| 303 |
+
"## 5. Transformer Model\n",
|
| 304 |
+
"Using PyTorch's `nn.Transformer`."
|
| 305 |
+
]
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"cell_type": "code",
|
| 309 |
+
"execution_count": null,
|
| 310 |
+
"metadata": {},
|
| 311 |
+
"outputs": [],
|
| 312 |
+
"source": [
|
| 313 |
+
"class TransformerModel(nn.Module):\n",
|
| 314 |
+
" def __init__(self, src_vocab_size, trg_vocab_size, \n",
|
| 315 |
+
" d_model=512, nhead=8, num_encoder_layers=3, \n",
|
| 316 |
+
" num_decoder_layers=3, dim_feedforward=2048, dropout=0.1, pad_idx=0):\n",
|
| 317 |
+
" super(TransformerModel, self).__init__()\n",
|
| 318 |
+
" \n",
|
| 319 |
+
" self.d_model = d_model\n",
|
| 320 |
+
" self.pad_idx = pad_idx\n",
|
| 321 |
+
" \n",
|
| 322 |
+
" # Embeddings\n",
|
| 323 |
+
" self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n",
|
| 324 |
+
" self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n",
|
| 325 |
+
" \n",
|
| 326 |
+
" # Positional Encoding\n",
|
| 327 |
+
" self.pos_encoder = PositionalEncoding(d_model, dropout)\n",
|
| 328 |
+
" \n",
|
| 329 |
+
" # Transformer\n",
|
| 330 |
+
" self.transformer = nn.Transformer(\n",
|
| 331 |
+
" d_model=d_model, \n",
|
| 332 |
+
" nhead=nhead, \n",
|
| 333 |
+
" num_encoder_layers=num_encoder_layers, \n",
|
| 334 |
+
" num_decoder_layers=num_decoder_layers, \n",
|
| 335 |
+
" dim_feedforward=dim_feedforward, \n",
|
| 336 |
+
" dropout=dropout,\n",
|
| 337 |
+
" batch_first=True\n",
|
| 338 |
+
" )\n",
|
| 339 |
+
" \n",
|
| 340 |
+
" # Output Layer\n",
|
| 341 |
+
" self.fc_out = nn.Linear(d_model, trg_vocab_size)\n",
|
| 342 |
+
" \n",
|
| 343 |
+
" self.init_weights()\n",
|
| 344 |
+
" \n",
|
| 345 |
+
" def init_weights(self):\n",
|
| 346 |
+
" for p in self.parameters():\n",
|
| 347 |
+
" if p.dim() > 1:\n",
|
| 348 |
+
" nn.init.xavier_uniform_(p)\n",
|
| 349 |
+
" \n",
|
| 350 |
+
" def forward(self, src, trg):\n",
|
| 351 |
+
" # src: [batch_size, src_len]\n",
|
| 352 |
+
" # trg: [batch_size, trg_len]\n",
|
| 353 |
+
" \n",
|
| 354 |
+
" # Create masks\n",
|
| 355 |
+
" src_key_padding_mask = (src == self.pad_idx)\n",
|
| 356 |
+
" trg_key_padding_mask = (trg == self.pad_idx)\n",
|
| 357 |
+
" \n",
|
| 358 |
+
" # Target mask for autoregressive decoding (prevent peeking future)\n",
|
| 359 |
+
" trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n",
|
| 360 |
+
" \n",
|
| 361 |
+
" # Embed + Positional Encoding\n",
|
| 362 |
+
" src_emb = self.src_embedding(src) * math.sqrt(self.d_model)\n",
|
| 363 |
+
" trg_emb = self.trg_embedding(trg) * math.sqrt(self.d_model)\n",
|
| 364 |
+
" \n",
|
| 365 |
+
" src_emb = self.pos_encoder(src_emb)\n",
|
| 366 |
+
" trg_emb = self.pos_encoder(trg_emb)\n",
|
| 367 |
+
" \n",
|
| 368 |
+
" # Transformer Forward\n",
|
| 369 |
+
" output = self.transformer(\n",
|
| 370 |
+
" src=src_emb, \n",
|
| 371 |
+
" tgt=trg_emb, \n",
|
| 372 |
+
" tgt_mask=trg_mask,\n",
|
| 373 |
+
" src_key_padding_mask=src_key_padding_mask,\n",
|
| 374 |
+
" tgt_key_padding_mask=trg_key_padding_mask\n",
|
| 375 |
+
" )\n",
|
| 376 |
+
" \n",
|
| 377 |
+
" prediction = self.fc_out(output)\n",
|
| 378 |
+
" return prediction\n",
|
| 379 |
+
"\n",
|
| 380 |
+
"class PositionalEncoding(nn.Module):\n",
|
| 381 |
+
" def __init__(self, d_model, dropout=0.1, max_len=5000):\n",
|
| 382 |
+
" super(PositionalEncoding, self).__init__()\n",
|
| 383 |
+
" self.dropout = nn.Dropout(p=dropout)\n",
|
| 384 |
+
"\n",
|
| 385 |
+
" pe = torch.zeros(max_len, d_model)\n",
|
| 386 |
+
" position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n",
|
| 387 |
+
" div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n",
|
| 388 |
+
" pe[:, 0::2] = torch.sin(position * div_term)\n",
|
| 389 |
+
" pe[:, 1::2] = torch.cos(position * div_term)\n",
|
| 390 |
+
" self.register_buffer('pe', pe)\n",
|
| 391 |
+
"\n",
|
| 392 |
+
" def forward(self, x):\n",
|
| 393 |
+
" # x: [batch_size, seq_len, d_model]\n",
|
| 394 |
+
" x = x + self.pe[:x.size(1), :]\n",
|
| 395 |
+
" return self.dropout(x)"
|
| 396 |
+
]
|
| 397 |
+
},
|
| 398 |
+
{
|
| 399 |
+
"cell_type": "markdown",
|
| 400 |
+
"metadata": {},
|
| 401 |
+
"source": [
|
| 402 |
+
"## 6. Training Loop"
|
| 403 |
+
]
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"cell_type": "code",
|
| 407 |
+
"execution_count": null,
|
| 408 |
+
"metadata": {},
|
| 409 |
+
"outputs": [],
|
| 410 |
+
"source": [
|
| 411 |
+
"# Config\n",
|
| 412 |
+
"SRC_VOCAB_SIZE = vocab_size\n",
|
| 413 |
+
"TRG_VOCAB_SIZE = vocab_size\n",
|
| 414 |
+
"D_MODEL = 256\n",
|
| 415 |
+
"N_HEAD = 4 # Reduced for small dataset\n",
|
| 416 |
+
"NUM_LAYERS = 2 # Reduced layers\n",
|
| 417 |
+
"FF_DIM = 512\n",
|
| 418 |
+
"DROPOUT = 0.4 # Increased for regularization\n",
|
| 419 |
+
"LR = 0.0005\n",
|
| 420 |
+
"EPOCHS = 100 # Increased to allow convergence\n",
|
| 421 |
+
"\n",
|
| 422 |
+
"model = TransformerModel(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, D_MODEL, N_HEAD, NUM_LAYERS, NUM_LAYERS, FF_DIM, DROPOUT, pad_idx=0).to(device)\n",
|
| 423 |
+
"optimizer = optim.Adam(model.parameters(), lr=LR)\n",
|
| 424 |
+
"scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)\n",
|
| 425 |
+
"criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1) # Label smoothing helps with generation\n",
|
| 426 |
+
"\n",
|
| 427 |
+
"def train(model, iterator, optimizer, criterion, clip):\n",
|
| 428 |
+
" model.train()\n",
|
| 429 |
+
" epoch_loss = 0\n",
|
| 430 |
+
" \n",
|
| 431 |
+
" for i, (src, trg) in enumerate(iterator):\n",
|
| 432 |
+
" src, trg = src.to(device), trg.to(device)\n",
|
| 433 |
+
" \n",
|
| 434 |
+
" optimizer.zero_grad()\n",
|
| 435 |
+
" \n",
|
| 436 |
+
" # trg input = trg[:, :-1] (all except last)\n",
|
| 437 |
+
" # trg output = trg[:, 1:] (all except first - predicted next token)\n",
|
| 438 |
+
" output = model(src, trg[:, :-1])\n",
|
| 439 |
+
" \n",
|
| 440 |
+
" output_dim = output.shape[-1]\n",
|
| 441 |
+
" \n",
|
| 442 |
+
" # Flatten for loss calculation\n",
|
| 443 |
+
" output = output.contiguous().view(-1, output_dim)\n",
|
| 444 |
+
" trg = trg[:, 1:].contiguous().view(-1)\n",
|
| 445 |
+
" \n",
|
| 446 |
+
" loss = criterion(output, trg)\n",
|
| 447 |
+
" loss.backward()\n",
|
| 448 |
+
" torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
|
| 449 |
+
" optimizer.step()\n",
|
| 450 |
+
" \n",
|
| 451 |
+
" epoch_loss += loss.item()\n",
|
| 452 |
+
" \n",
|
| 453 |
+
" return epoch_loss / len(iterator)\n",
|
| 454 |
+
"\n",
|
| 455 |
+
"def evaluate(model, iterator, criterion):\n",
|
| 456 |
+
" model.eval()\n",
|
| 457 |
+
" epoch_loss = 0\n",
|
| 458 |
+
" \n",
|
| 459 |
+
" with torch.no_grad():\n",
|
| 460 |
+
" for i, (src, trg) in enumerate(iterator):\n",
|
| 461 |
+
" src, trg = src.to(device), trg.to(device)\n",
|
| 462 |
+
" output = model(src, trg[:, :-1])\n",
|
| 463 |
+
" \n",
|
| 464 |
+
" output_dim = output.shape[-1]\n",
|
| 465 |
+
" output = output.contiguous().view(-1, output_dim)\n",
|
| 466 |
+
" trg = trg[:, 1:].contiguous().view(-1)\n",
|
| 467 |
+
" \n",
|
| 468 |
+
" loss = criterion(output, trg)\n",
|
| 469 |
+
" epoch_loss += loss.item()\n",
|
| 470 |
+
" \n",
|
| 471 |
+
" return epoch_loss / len(iterator)\n",
|
| 472 |
+
"\n",
|
| 473 |
+
"print(\"Starting training...\")\n",
|
| 474 |
+
"best_valid_loss = float('inf')\n",
|
| 475 |
+
"\n",
|
| 476 |
+
"for epoch in range(EPOCHS):\n",
|
| 477 |
+
" start_time = time.time()\n",
|
| 478 |
+
" \n",
|
| 479 |
+
" train_loss = train(model, train_loader, optimizer, criterion, 1.0)\n",
|
| 480 |
+
" valid_loss = evaluate(model, val_loader, criterion)\n",
|
| 481 |
+
" \n",
|
| 482 |
+
" end_time = time.time()\n",
|
| 483 |
+
" \n",
|
| 484 |
+
" # Step the scheduler\n",
|
| 485 |
+
" scheduler.step(valid_loss)\n",
|
| 486 |
+
" \n",
|
| 487 |
+
" if valid_loss < best_valid_loss:\n",
|
| 488 |
+
" best_valid_loss = valid_loss\n",
|
| 489 |
+
" torch.save(model.state_dict(), 'transformer_model.pt')\n",
|
| 490 |
+
" \n",
|
| 491 |
+
" print(f'Epoch: {epoch+1:02} | Time: {end_time-start_time:.0f}s')\n",
|
| 492 |
+
" print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n",
|
| 493 |
+
" print(f'\\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')\n",
|
| 494 |
+
" print(f'\\t LR: {optimizer.param_groups[0][\"lr\"]:.6f}')"
|
| 495 |
+
]
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"cell_type": "markdown",
|
| 499 |
+
"metadata": {},
|
| 500 |
+
"source": [
|
| 501 |
+
"## 7. Inference and Verification"
|
| 502 |
+
]
|
| 503 |
+
},
|
| 504 |
+
{
|
| 505 |
+
"cell_type": "code",
|
| 506 |
+
"execution_count": null,
|
| 507 |
+
"metadata": {},
|
| 508 |
+
"outputs": [],
|
| 509 |
+
"source": [
|
| 510 |
+
"# Load Best Model\n",
|
| 511 |
+
"model.load_state_dict(torch.load('transformer_model.pt', map_location=device))\n",
|
| 512 |
+
"\n",
|
| 513 |
+
"def translate_sentence(sentence, model, sp_src, sp_trg, max_len=50, device=device):\n",
|
| 514 |
+
" model.eval()\n",
|
| 515 |
+
" \n",
|
| 516 |
+
" # Tokenize src\n",
|
| 517 |
+
" tokens = [sp_src.bos_id()] + sp_src.encode(sentence, out_type=int) + [sp_src.eos_id()]\n",
|
| 518 |
+
" print(f\"Debug - Source tokens: {sp_src.encode(sentence, out_type=str)}\")\n",
|
| 519 |
+
" src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)\n",
|
| 520 |
+
" \n",
|
| 521 |
+
"    # Start with BOS\n",
|
| 522 |
+
" trg_indices = [sp_trg.bos_id()]\n",
|
| 523 |
+
" \n",
|
| 524 |
+
" for i in range(max_len):\n",
|
| 525 |
+
" trg_tensor = torch.LongTensor(trg_indices).unsqueeze(0).to(device)\n",
|
| 526 |
+
" \n",
|
| 527 |
+
" with torch.no_grad():\n",
|
| 528 |
+
" output = model(src_tensor, trg_tensor)\n",
|
| 529 |
+
" \n",
|
| 530 |
+
" # Get last predicted token\n",
|
| 531 |
+
" pred_token = output.argmax(2)[:, -1].item()\n",
|
| 532 |
+
" \n",
|
| 533 |
+
" trg_indices.append(pred_token)\n",
|
| 534 |
+
" \n",
|
| 535 |
+
" if pred_token == sp_trg.eos_id():\n",
|
| 536 |
+
" break\n",
|
| 537 |
+
" \n",
|
| 538 |
+
" # Decode\n",
|
| 539 |
+
" translated_text = sp_trg.decode(trg_indices)\n",
|
| 540 |
+
" return translated_text\n",
|
| 541 |
+
"\n",
|
| 542 |
+
"# Test Translation\n",
|
| 543 |
+
"idx = random.randint(0, len(test_df)-1)\n",
|
| 544 |
+
"src_sent = test_df.iloc[idx]['my']\n",
|
| 545 |
+
"trg_sent = test_df.iloc[idx]['en']\n",
|
| 546 |
+
"\n",
|
| 547 |
+
"print(f\"Source: {src_sent}\")\n",
|
| 548 |
+
"print(f\"Target: {trg_sent}\")\n",
|
| 549 |
+
"print(f\"Pred: {translate_sentence(src_sent, model, sp_my, sp_en)}\")"
|
| 550 |
+
]
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"cell_type": "code",
|
| 554 |
+
"execution_count": null,
|
| 555 |
+
"metadata": {},
|
| 556 |
+
"outputs": [],
|
| 557 |
+
"source": [
|
| 558 |
+
"# Save artifacts for Web App\n",
|
| 559 |
+
"# Already saved: 'transformer_model.pt', 'spm_my.model', 'spm_en.model'\n",
|
| 560 |
+
"# The web app will need these files.\n",
|
| 561 |
+
"import shutil\n",
|
| 562 |
+
"\n",
|
| 563 |
+
"os.makedirs('app/models', exist_ok=True)\n",
|
| 564 |
+
"shutil.copy('transformer_model.pt', 'app/models/transformer_model.pt')\n",
|
| 565 |
+
"shutil.copy('spm_my.model', 'app/models/spm_my.model')\n",
|
| 566 |
+
"shutil.copy('spm_en.model', 'app/models/spm_en.model')\n",
|
| 567 |
+
"print(\"Models copied to app/models/\")"
|
| 568 |
+
]
|
| 569 |
+
}
|
| 570 |
+
],
|
| 571 |
+
"metadata": {
|
| 572 |
+
"kernelspec": {
|
| 573 |
+
"display_name": "Python 3",
|
| 574 |
+
"language": "python",
|
| 575 |
+
"name": "python3"
|
| 576 |
+
},
|
| 577 |
+
"language_info": {
|
| 578 |
+
"codemirror_mode": {
|
| 579 |
+
"name": "ipython",
|
| 580 |
+
"version": 3
|
| 581 |
+
},
|
| 582 |
+
"file_extension": ".py",
|
| 583 |
+
"mimetype": "text/x-python",
|
| 584 |
+
"name": "python",
|
| 585 |
+
"nbconvert_exporter": "python",
|
| 586 |
+
"pygments_lexer": "ipython3",
|
| 587 |
+
"version": "3.8.10"
|
| 588 |
+
}
|
| 589 |
+
},
|
| 590 |
+
"nbformat": 4,
|
| 591 |
+
"nbformat_minor": 5
|
| 592 |
+
}
|
Chinese_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Chinese-English Machine Translation (A3 Project)\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"**Student**: Htut Ko Ko \n",
|
| 10 |
+
"**Course**: Natural Language Understanding \n",
|
| 11 |
+
"**Task**: Chinese (zh) <-> English (en) Translation using Transformer\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"## Project Overview\n",
|
| 14 |
+
"This notebook implements a Neural Machine Translation system using a **Transformer** architecture. \n",
|
| 15 |
+
"We use the **ALT (Asian Language Treebank)** dataset for Chinese-English parallel data.\n",
|
| 16 |
+
"We use **SentencePiece** for subword tokenization.\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"## Pipeline\n",
|
| 19 |
+
"1. **Setup**: Install/Import dependencies.\n",
|
| 20 |
+
"2. **Data Loading**: Load the ALT dataset (Chinese-English).\n",
|
| 21 |
+
"3. **Tokenization**: Train SentencePiece models (`spm_zh`, `spm_en_zh`).\n",
|
| 22 |
+
"4. **Data Processing**: Create PyTorch Datasets and DataLoaders.\n",
|
| 23 |
+
"5. **Model**: Implement Transformer.\n",
|
| 24 |
+
"6. **Training**: Train the model.\n",
|
| 25 |
+
"7. **Evaluation**: Calculate BLEU score.\n",
|
| 26 |
+
"8. **Inference**: Demo function and save model for Web App."
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "markdown",
|
| 31 |
+
"metadata": {},
|
| 32 |
+
"source": [
|
| 33 |
+
"## 1. Setup and Imports"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "code",
|
| 38 |
+
"execution_count": null,
|
| 39 |
+
"metadata": {},
|
| 40 |
+
"outputs": [],
|
| 41 |
+
"source": [
|
| 42 |
+
"import os\n",
|
| 43 |
+
"import math\n",
|
| 44 |
+
"import time\n",
|
| 45 |
+
"import random\n",
|
| 46 |
+
"import numpy as np\n",
|
| 47 |
+
"import pandas as pd\n",
|
| 48 |
+
"import matplotlib.pyplot as plt\n",
|
| 49 |
+
"import seaborn as sns\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"import torch\n",
|
| 52 |
+
"import torch.nn as nn\n",
|
| 53 |
+
"import torch.optim as optim\n",
|
| 54 |
+
"from torch.utils.data import Dataset, DataLoader\n",
|
| 55 |
+
"from torch.nn.utils.rnn import pad_sequence\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"# Check for GPU\n",
|
| 58 |
+
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
| 59 |
+
"print(f\"Using device: {device}\")\n",
|
| 60 |
+
"\n",
|
| 61 |
+
"# Set seeds\n",
|
| 62 |
+
"SEED = 1234\n",
|
| 63 |
+
"random.seed(SEED)\n",
|
| 64 |
+
"np.random.seed(SEED)\n",
|
| 65 |
+
"torch.manual_seed(SEED)\n",
|
| 66 |
+
"torch.cuda.manual_seed(SEED)\n",
|
| 67 |
+
"torch.backends.cudnn.deterministic = True"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"cell_type": "code",
|
| 72 |
+
"execution_count": null,
|
| 73 |
+
"metadata": {},
|
| 74 |
+
"outputs": [],
|
| 75 |
+
"source": [
|
| 76 |
+
"# Install dependencies if missing (uncomment if needed)\n",
|
| 77 |
+
"# !pip install sentencepiece datasets portalocker"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"cell_type": "markdown",
|
| 82 |
+
"metadata": {},
|
| 83 |
+
"source": [
|
| 84 |
+
"## 2. Data Loading (ALT Dataset)\n",
|
| 85 |
+
"Loading Chinese-English pairs from ALT."
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"cell_type": "code",
|
| 90 |
+
"execution_count": null,
|
| 91 |
+
"metadata": {},
|
| 92 |
+
"outputs": [],
|
| 93 |
+
"source": [
|
| 94 |
+
"from datasets import load_dataset\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"print(\"Loading ALT Dataset (Chinese-English)...\")\n",
|
| 97 |
+
"try:\n",
|
| 98 |
+
" dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n",
|
| 99 |
+
" print(f\"Loaded {len(dataset)} sentences from ALT dataset.\")\n",
|
| 100 |
+
" \n",
|
| 101 |
+
" # Filter/Extract only Chinese and English\n",
|
| 102 |
+
" data = []\n",
|
| 103 |
+
" for item in dataset:\n",
|
| 104 |
+
" if 'translation' in item:\n",
|
| 105 |
+
" if 'zh' in item['translation'] and 'en' in item['translation']:\n",
|
| 106 |
+
" data.append({\n",
|
| 107 |
+
" 'zh': item['translation']['zh'],\n",
|
| 108 |
+
" 'en': item['translation']['en']\n",
|
| 109 |
+
" })\n",
|
| 110 |
+
" \n",
|
| 111 |
+
" print(f\"Extracted {len(data)} Chinese-English pairs.\")\n",
|
| 112 |
+
" \n",
|
| 113 |
+
"except Exception as e:\n",
|
| 114 |
+
" print(f\"Error loading from HF: {e}\")\n"
|
| 115 |
+
]
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"cell_type": "code",
|
| 119 |
+
"execution_count": null,
|
| 120 |
+
"metadata": {},
|
| 121 |
+
"outputs": [],
|
| 122 |
+
"source": [
|
| 123 |
+
"# Convert to DataFrame\n",
|
| 124 |
+
"df = pd.DataFrame(data)\n",
|
| 125 |
+
"print(df.head())\n",
|
| 126 |
+
"\n",
|
| 127 |
+
"# Basic Cleaning\n",
|
| 128 |
+
"df = df.dropna(subset=['zh', 'en'])\n",
|
| 129 |
+
"df['zh'] = df['zh'].astype(str)\n",
|
| 130 |
+
"df['en'] = df['en'].astype(str)\n",
|
| 131 |
+
"\n",
|
| 132 |
+
"df = df[df['zh'].str.strip() != '']\n",
|
| 133 |
+
"df = df[df['en'].str.strip() != '']\n",
|
| 134 |
+
"print(f\"After cleaning: {len(df)} pairs\")\n",
|
| 135 |
+
"\n",
|
| 136 |
+
"print(\"\\n--- Data Alignment Check ---\")\n",
|
| 137 |
+
"for i in range(5):\n",
|
| 138 |
+
" sample = df.sample(1).iloc[0]\n",
|
| 139 |
+
" print(f\"Source (zh): {sample['zh']}\")\n",
|
| 140 |
+
" print(f\"Target (en): {sample['en']}\")\n",
|
| 141 |
+
" print(\"-\" * 20)"
|
| 142 |
+
]
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"cell_type": "markdown",
|
| 146 |
+
"metadata": {},
|
| 147 |
+
"source": [
|
| 148 |
+
"## 3. Tokenization (SentencePiece)\n",
|
| 149 |
+
"Training separate tokenizers for Chinese (`spm_zh`) and English (`spm_en_zh`)."
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"cell_type": "code",
|
| 154 |
+
"execution_count": null,
|
| 155 |
+
"metadata": {},
|
| 156 |
+
"outputs": [],
|
| 157 |
+
"source": [
|
| 158 |
+
"import sentencepiece as spm\n",
|
| 159 |
+
"\n",
|
| 160 |
# Step 1: dump each side of the parallel corpus to a plain-text file,
# one sentence per line, which is the input format the trainer reads below.
for column, corpus_path in (('zh', 'train_zh.txt'), ('en', 'train_en_zh.txt')):
    with open(corpus_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in df[column])

# Step 2: fit one SentencePiece model per language with shared settings.
vocab_size = 4000
model_type = 'bpe'

tokenizer_jobs = (
    ("Training Chinese Tokenizer...", 'train_zh.txt', 'spm_zh'),
    ("Training English Tokenizer (for Chinese pair)...", 'train_en_zh.txt', 'spm_en_zh'),
)
for banner, corpus_path, prefix in tokenizer_jobs:
    print(banner)
    # Special-token ids are fixed explicitly so pad=0 / bos=1 / eos=2 / unk=3
    # are identical for both tokenizers.
    spm.SentencePieceTrainer.train(
        input=corpus_path,
        model_prefix=prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        pad_id=0, bos_id=1, eos_id=2, unk_id=3
    )

print("Tokenizer training complete!")
|
| 192 |
+
]
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"cell_type": "code",
|
| 196 |
+
"execution_count": null,
|
| 197 |
+
"metadata": {},
|
| 198 |
+
"outputs": [],
|
| 199 |
+
"source": [
|
| 200 |
# Instantiate both trained tokenizers from their saved model files.
sp_zh, sp_en = (
    spm.SentencePieceProcessor(model_file=model_path)
    for model_path in ('spm_zh.model', 'spm_en_zh.model')
)

# Smoke test: show one source sentence as subword pieces and as ids.
idx = 0
sample_zh = df.iloc[idx]['zh']
print(f"Original zh: {sample_zh}")
print(f"Tokens: {sp_zh.encode(sample_zh, out_type=str)}")
print(f"IDs: {sp_zh.encode(sample_zh, out_type=int)}")
|
| 209 |
+
]
|
| 210 |
+
},
|
| 211 |
+
{
|
| 212 |
+
"cell_type": "markdown",
|
| 213 |
+
"metadata": {},
|
| 214 |
+
"source": [
|
| 215 |
+
"## 4. PyTorch Dataset and DataLoader"
|
| 216 |
+
]
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"cell_type": "code",
|
| 220 |
+
"execution_count": null,
|
| 221 |
+
"metadata": {},
|
| 222 |
+
"outputs": [],
|
| 223 |
+
"source": [
|
| 224 |
class TranslationDataset(Dataset):
    """Parallel-sentence dataset that tokenizes each pair on access.

    Every item is a ``(src_ids, trg_ids)`` pair of 1-D ``torch.long`` tensors,
    each wrapped as ``[bos] + encoded text + [eos]`` using the ids reported by
    the corresponding tokenizer.

    Args:
        df: DataFrame with one sentence pair per row.
        sp_src: Source tokenizer; must provide ``bos_id()``, ``eos_id()`` and
            ``encode(text, out_type=int)``.
        sp_trg: Target tokenizer with the same interface.
        src_col: Name of the source-text column (defaults to ``'zh'``).
        trg_col: Name of the target-text column (defaults to ``'en'``).

    Raises:
        KeyError: If ``src_col`` or ``trg_col`` is not a column of ``df``.
    """

    def __init__(self, df, sp_src, sp_trg, src_col='zh', trg_col='en'):
        # Fail at construction time instead of on the first batch of training.
        missing = [col for col in (src_col, trg_col) if col not in df.columns]
        if missing:
            raise KeyError(f"DataFrame is missing required column(s): {missing}")

        self.data = df
        self.sp_src = sp_src
        self.sp_trg = sp_trg
        self.src_col = src_col
        self.trg_col = trg_col

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _encode(tokenizer, text):
        """Return ``[bos] + ids + [eos]`` for ``text`` as a long tensor."""
        ids = [tokenizer.bos_id()] + tokenizer.encode(text, out_type=int) + [tokenizer.eos_id()]
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        # One positional row lookup serves both columns.
        row = self.data.iloc[idx]
        return (
            self._encode(self.sp_src, row[self.src_col]),
            self._encode(self.sp_trg, row[self.trg_col]),
        )
|
| 241 |
+
"\n",
|
| 242 |
def collate_fn(batch):
    """Pad a list of ``(src, trg)`` tensor pairs into two batch-first tensors.

    Shorter sequences are right-padded with 0 up to the longest sequence on
    their side of the batch. Returns ``(src_pad, trg_pad)``.
    """
    sources = [src for src, _ in batch]
    targets = [trg for _, trg in batch]
    src_pad, trg_pad = (
        pad_sequence(sequences, batch_first=True, padding_value=0)
        for sequences in (sources, targets)
    )
    return src_pad, trg_pad
|
| 252 |
+
"\n",
|
| 253 |
# Split Data: 80% train, then the held-out 20% is halved into validation and test.
train_df = df.sample(frac=0.8, random_state=SEED)
val_test_df = df.drop(train_df.index)
val_df = val_test_df.sample(frac=0.5, random_state=SEED)
test_df = val_test_df.drop(val_df.index)

# One dataset per split, all sharing the same source/target tokenizers.
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(split_df, sp_zh, sp_en)
    for split_df in (train_df, val_df, test_df)
)

BATCH_SIZE = 64
# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    for eval_dataset in (val_dataset, test_dataset)
)
|
| 267 |
+
]
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
"cell_type": "markdown",
|
| 271 |
+
"metadata": {},
|
| 272 |
+
"source": [
|
| 273 |
+
"## 5. Transformer Model"
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"cell_type": "code",
|
| 278 |
+
"execution_count": null,
|
| 279 |
+
"metadata": {},
|
| 280 |
+
"outputs": [],
|
| 281 |
+
"source": [
|
| 282 |
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Wraps ``nn.Transformer`` (``batch_first=True``) with separate source and
    target embeddings, a shared positional encoding, and a linear projection
    onto the target vocabulary.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        trg_vocab_size: Number of rows in the target embedding table and size
            of the output projection.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads.
        num_encoder_layers: Encoder depth.
        num_decoder_layers: Decoder depth.
        dim_feedforward: Width of the feed-forward sublayers.
        dropout: Dropout probability used by the positional encoding and the
            transformer.
        pad_idx: Token id treated as padding when building attention masks.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 d_model=512, nhead=8, num_encoder_layers=3,
                 num_decoder_layers=3, dim_feedforward=2048, dropout=0.1, pad_idx=0):
        super().__init__()

        self.d_model = d_model
        self.pad_idx = pad_idx

        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )

        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise every parameter with more than one dimension.

        One-dimensional parameters (biases, normalisation vectors) keep their
        default initialisation.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every position of ``trg``.

        Args:
            src: ``(batch, src_len)`` tensor of source token ids.
            trg: ``(batch, trg_len)`` tensor of decoder-input token ids.

        Returns:
            ``(batch, trg_len, trg_vocab_size)`` tensor of unnormalised logits.
        """
        # True where the source token is padding.
        src_key_padding_mask = (src == self.pad_idx)
        # Causal mask: position i may only attend to target positions <= i.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)

        # Embeddings are scaled by sqrt(d_model) before positions are added.
        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))

        output = self.transformer(
            src=src_emb,
            tgt=trg_emb,
            tgt_mask=trg_mask,
            src_key_padding_mask=src_key_padding_mask,
            # The encoder output ("memory") has the same padding layout as
            # src. Without this mask the decoder's cross-attention would
            # attend to padded source positions, making the output depend on
            # how much padding the batch happened to contain.
            memory_key_padding_mask=src_key_padding_mask
        )
        # NOTE(review): no tgt_key_padding_mask is passed. This assumes targets
        # are right-padded and pad positions are excluded from the loss, in
        # which case the causal mask already stops real tokens from seeing
        # padding - confirm if the batching changes.
        return self.fc_out(output)
|
| 327 |
+
"\n",
|
| 328 |
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to batch-first embeddings.

    Args:
        d_model: Embedding width; may be even or odd.
        dropout: Dropout probability applied after the positions are added.
        max_len: Number of positions precomputed in the ``pe`` buffer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angle table; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer, not parameter: it moves with the module and is stored in the
        # state dict, but is never trained. Shape stays (max_len, d_model).
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add positions to ``x`` of shape ``(batch, seq_len, d_model)`` and apply dropout."""
        x = x + self.pe[:x.size(1), :]
        return self.dropout(x)
|
| 342 |
+
]
|
| 343 |
+
},
|
| 344 |
+
{
|
| 345 |
+
"cell_type": "markdown",
|
| 346 |
+
"metadata": {},
|
| 347 |
+
"source": [
|
| 348 |
+
"## 6. Training"
|
| 349 |
+
]
|
| 350 |
+
},
|
| 351 |
+
{
|
| 352 |
+
"cell_type": "code",
|
| 353 |
+
"execution_count": null,
|
| 354 |
+
"metadata": {},
|
| 355 |
+
"outputs": [],
|
| 356 |
+
"source": [
|
| 357 |
# Both tokenizers were trained with the same vocabulary size.
SRC_VOCAB_SIZE = TRG_VOCAB_SIZE = vocab_size

# Model shape
D_MODEL = 256
N_HEAD = 4
NUM_LAYERS = 2   # used for both the encoder and the decoder stack
FF_DIM = 512
DROPOUT = 0.4

# Optimisation
LR = 0.0005
EPOCHS = 100

model = TransformerModel(
    src_vocab_size=SRC_VOCAB_SIZE,
    trg_vocab_size=TRG_VOCAB_SIZE,
    d_model=D_MODEL,
    nhead=N_HEAD,
    num_encoder_layers=NUM_LAYERS,
    num_decoder_layers=NUM_LAYERS,
    dim_feedforward=FF_DIM,
    dropout=DROPOUT,
)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=LR)
# Index 0 is the padding id, so padded target positions contribute no loss.
criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)
|
| 370 |
+
"\n",
|
| 371 |
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch with teacher forcing and return the mean batch loss.

    Args:
        model: Module called as ``model(src, trg_in)`` that returns logits of
            shape ``(batch, trg_len, vocab)``.
        iterator: Sized iterable of ``(src, trg)`` id-tensor batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` target ids.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    # Use the device the model actually lives on instead of relying on a
    # notebook-level ``device`` global being defined and in sync.
    device = next(model.parameters()).device

    epoch_loss = 0
    for src, trg in iterator:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()

        # The decoder sees every token but the last and is scored against
        # every token but the first (next-token prediction).
        output = model(src, trg[:, :-1])
        output_dim = output.shape[-1]
        flat_logits = output.contiguous().view(-1, output_dim)
        flat_targets = trg[:, 1:].contiguous().view(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
|
| 387 |
+
"\n",
|
| 388 |
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss over ``iterator`` without updating weights."""
    model.eval()
    eval_device = next(model.parameters()).device
    total_loss = 0.0
    with torch.no_grad():
        for src, trg in iterator:
            src, trg = src.to(eval_device), trg.to(eval_device)
            # Same shift as in training: feed trg[:, :-1], score against trg[:, 1:].
            output = model(src, trg[:, :-1])
            flat_logits = output.contiguous().view(-1, output.shape[-1])
            flat_targets = trg[:, 1:].contiguous().view(-1)
            total_loss += criterion(flat_logits, flat_targets).item()
    return total_loss / len(iterator)


print("Starting training...")
best_val_loss = float('inf')
for epoch in range(EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, 1.0)
    val_loss = evaluate(model, val_loader, criterion)
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}')
    # Keep only the checkpoint with the lowest validation loss, so the file
    # shipped to the web app is not simply whatever the last epoch produced.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'transformer_model_zh.pt')
|
| 394 |
+
]
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"cell_type": "code",
|
| 398 |
+
"execution_count": null,
|
| 399 |
+
"metadata": {},
|
| 400 |
+
"outputs": [],
|
| 401 |
+
"source": [
|
| 402 |
# Publish the trained checkpoint and both tokenizer models to the web app's
# model directory.
import shutil

os.makedirs('app/models', exist_ok=True)
artifact_copies = (
    ('transformer_model_zh.pt', 'app/models/transformer_model_zh.pt'),
    ('spm_zh.model', 'app/models/spm_zh.model'),
    ('spm_en_zh.model', 'app/models/spm_en_zh.model'),
)
for source_path, destination_path in artifact_copies:
    shutil.copy(source_path, destination_path)
print("Models copied to app/models/")
|
| 409 |
+
]
|
| 410 |
+
}
|
| 411 |
+
],
|
| 412 |
+
"metadata": {
|
| 413 |
+
"kernelspec": {
|
| 414 |
+
"display_name": "Python 3",
|
| 415 |
+
"language": "python",
|
| 416 |
+
"name": "python3"
|
| 417 |
+
},
|
| 418 |
+
"language_info": {
|
| 419 |
+
"codemirror_mode": {
|
| 420 |
+
"name": "ipython",
|
| 421 |
+
"version": 3
|
| 422 |
+
},
|
| 423 |
+
"file_extension": ".py",
|
| 424 |
+
"mimetype": "text/x-python",
|
| 425 |
+
"name": "python",
|
| 426 |
+
"nbconvert_exporter": "python",
|
| 427 |
+
"pygments_lexer": "ipython3",
|
| 428 |
+
"version": "3.8.10"
|
| 429 |
+
}
|
| 430 |
+
},
|
| 431 |
+
"nbformat": 4,
|
| 432 |
+
"nbformat_minor": 5
|
| 433 |
+
}
|
Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Container image for the Flask translation app (Hugging Face Spaces, Docker SDK).
FROM python:3.12-slim

# Skip .pyc files and keep stdout/stderr unbuffered so logs show up immediately.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /code

# Copy requirements.txt on its own first so the dependency layer stays cached
# when only application code changes. The file must exist in the build context.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Run as an unprivileged user. Hugging Face Spaces executes containers as
# UID 1000, so the files are owned by that UID to avoid permission errors.
RUN useradd --create-home --uid 1000 appuser

# Copy the rest of the repository, owned by the runtime user.
COPY --chown=appuser:appuser . .

# app.py lives in /code/app; `flask run` discovers it from the working directory.
WORKDIR /code/app
USER appuser

# Hugging Face Spaces routes traffic to port 7860.
EXPOSE 7860

CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
|
German_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WeogYTNaDj3r","executionInfo":{"status":"ok","timestamp":1770451365426,"user_tz":-420,"elapsed":8232,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"31f7bff5-87fd-4226-f664-6caa3742a41c"},"outputs":[{"output_type":"stream","name":"stdout","text":["Running in Google Colab\n","Requirement already satisfied: datasets in /usr/local/lib/python3.12/dist-packages (4.0.0)\n","Requirement already satisfied: sentencepiece in /usr/local/lib/python3.12/dist-packages (0.2.1)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from datasets) (3.20.3)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (2.0.2)\n","Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (18.1.0)\n","Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.3.8)\n","Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from datasets) (2.2.2)\n","Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.12/dist-packages (from datasets) (2.32.4)\n","Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.12/dist-packages (from datasets) (4.67.2)\n","Requirement already satisfied: xxhash in /usr/local/lib/python3.12/dist-packages (from datasets) (3.6.0)\n","Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.70.16)\n","Requirement already satisfied: fsspec<=2025.3.0,>=2023.1.0 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2025.3.0)\n","Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (1.3.7)\n","Requirement already satisfied: packaging 
in /usr/local/lib/python3.12/dist-packages (from datasets) (26.0)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from datasets) (6.0.3)\n","Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.13.3)\n","Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.2.0)\n","Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (0.28.1)\n","Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.5.4)\n","Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (0.21.1)\n","Requirement already satisfied: typing-extensions>=4.1.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (4.15.0)\n","Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (3.4.4)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (3.11)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (2.5.0)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (2026.1.4)\n","Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2.9.0.post0)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n","Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from 
pandas->datasets) (2025.3)\n","Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n","Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.4.0)\n","Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.8.0)\n","Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.7.1)\n","Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.4.1)\n","Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.22.0)\n","Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub>=0.24.0->datasets) (4.12.1)\n","Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub>=0.24.0->datasets) (1.0.9)\n","Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface-hub>=0.24.0->datasets) (0.16.0)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n","Requirement already satisfied: click>=8.0.0 in 
/usr/local/lib/python3.12/dist-packages (from typer-slim->huggingface-hub>=0.24.0->datasets) (8.3.1)\n","Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["# Google Colab Setup\n","try:\n"," import google.colab\n"," IN_COLAB = True\n"," print(\"Running in Google Colab\")\n"," !pip install datasets sentencepiece\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," # Optional: Change to your project directory if needed\n"," # import os\n"," # os.chdir('/content/drive/MyDrive/NLP/Project_A3/A3_Burmese_English_Puffer')\n","except ImportError:\n"," IN_COLAB = False\n"," print(\"Running Locally\")"],"id":"WeogYTNaDj3r"},{"cell_type":"markdown","metadata":{"id":"o60RyQ1GDj3t"},"source":["# German-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: German (de) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for German-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"o60RyQ1GDj3t"},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mWWrcN7TDj3v","executionInfo":{"status":"ok","timestamp":1770451365477,"user_tz":-420,"elapsed":34,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"505529e2-c5bc-4618-d60e-ebf3205e02a6"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import 
pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('mps' if torch.backends.mps.is_available() else ('cuda' if torch.cuda.is_available() else 'cpu'))\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"mWWrcN7TDj3v"},{"cell_type":"markdown","metadata":{"id":"u5XpwPylDj3w"},"source":["## 2. Data Loading (Opus-100)\n","Loading German-English pairs from Opus-100."],"id":"u5XpwPylDj3w"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"sBw80f5rDj3w","executionInfo":{"status":"ok","timestamp":1770451394573,"user_tz":-420,"elapsed":29093,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"6f313d68-5f9c-4c07-82d1-6fc2b4726e37"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (German-English)...\n","Loaded 1004000 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 German-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (German-English)...\")\n","\n","data = []\n","try:\n"," # Opus-100 has 'de-en' or 'en-de'\n"," dataset = load_dataset(\"opus100\", \"de-en\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," for item in dataset:\n"," if 'translation' in item:\n"," # 'de' is the language code for German\n"," if 'de' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'de': item['translation']['de'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size for this project\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for 
efficiency.\")\n","\n"," print(f\"Extracted {len(data)} German-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"sBw80f5rDj3w"},{"cell_type":"code","execution_count":8,"metadata":{"id":"ETHqEVLgDj3w","executionInfo":{"status":"ok","timestamp":1770451394738,"user_tz":-420,"elapsed":172,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"502491ee-6707-4b4b-ab74-fefadc5823b6"},"outputs":[{"output_type":"stream","name":"stdout","text":[" de \\\n","0 Offenbar werde ich verdächtigt. \n","1 Tielt +17°C \n","2 Wie geht's dir? \n","3 Zu ihm verhalten sich die Farben (guasch, temp... \n","4 -Was? \n","\n"," en \n","0 Apparently, I'm a suspect. \n","1 Tucupido +28°C \n","2 How are you? \n","3 Paints concern them (gouache, distemper, poliv... \n","4 You can't mean it! \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['de', 'en'])\n","df['de'] = df['de'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['de'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"ETHqEVLgDj3w"},{"cell_type":"markdown","metadata":{"id":"2cztqOQUDj3x"},"source":["## 3. 
Tokenization"],"id":"2cztqOQUDj3x"},{"cell_type":"code","execution_count":9,"metadata":{"id":"4YQnkzD_Dj3x","executionInfo":{"status":"ok","timestamp":1770451406984,"user_tz":-420,"elapsed":12244,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"e444ef57-9d4f-43e6-b6df-cbb535d8401e"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training German Tokenizer...\n","Training English Tokenizer (for German pair)...\n"]}],"source":["# Save texts to files\n","with open('train_de.txt', 'w', encoding='utf-8') as f:\n"," for line in df['de']: f.write(line + '\\n')\n","\n","with open('train_en_de.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training German Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_de.txt',\n"," model_prefix='spm_de',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for German pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_de.txt',\n"," model_prefix='spm_en_de',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_de.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_de.model')"],"id":"4YQnkzD_Dj3x"},{"cell_type":"markdown","metadata":{"id":"VxbMUHVeDj3x"},"source":["## 4. 
Dataset & Model"],"id":"VxbMUHVeDj3x"},{"cell_type":"code","execution_count":10,"metadata":{"id":"hVhDYMytDj3x","executionInfo":{"status":"ok","timestamp":1770451406999,"user_tz":-420,"elapsed":5,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['de']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"hVhDYMytDj3x"},{"cell_type":"code","execution_count":11,"metadata":{"id":"f5fLfvEWDj3y","executionInfo":{"status":"ok","timestamp":1770451407006,"user_tz":-420,"elapsed":4,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"f5fLfvEWDj3y"},{"cell_type":"code","execution_count":12,"metadata":{"id":"01why68ZDj3z","executionInfo":{"status":"ok","timestamp":1770451876745,"user_tz":-420,"elapsed":469736,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"ecf12f45-ce36-4f55-9ef0-e7f42d6326f1"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting 
Training...\n","Step 0, Loss: 9.122\n","Step 100, Loss: 6.802\n","Step 200, Loss: 6.478\n","Step 300, Loss: 6.420\n","Step 400, Loss: 6.104\n","Step 500, Loss: 6.265\n","Step 600, Loss: 5.877\n","Step 700, Loss: 5.790\n","Epoch 1 Loss: 6.320\n","Step 0, Loss: 5.578\n","Step 100, Loss: 5.876\n","Step 200, Loss: 5.782\n","Step 300, Loss: 5.453\n","Step 400, Loss: 5.472\n","Step 500, Loss: 5.311\n","Step 600, Loss: 5.294\n","Step 700, Loss: 5.511\n","Epoch 2 Loss: 5.540\n","Step 0, Loss: 5.304\n","Step 100, Loss: 4.828\n","Step 200, Loss: 5.449\n","Step 300, Loss: 5.142\n","Step 400, Loss: 4.986\n","Step 500, Loss: 5.251\n","Step 600, Loss: 5.048\n","Step 700, Loss: 5.164\n","Epoch 3 Loss: 5.111\n","Step 0, Loss: 4.924\n","Step 100, Loss: 4.869\n","Step 200, Loss: 4.970\n","Step 300, Loss: 4.884\n","Step 400, Loss: 4.627\n","Step 500, Loss: 4.850\n","Step 600, Loss: 4.678\n","Step 700, Loss: 4.876\n","Epoch 4 Loss: 4.832\n","Step 0, Loss: 4.758\n","Step 100, Loss: 4.387\n","Step 200, Loss: 4.616\n","Step 300, Loss: 4.687\n","Step 400, Loss: 4.621\n","Step 500, Loss: 4.487\n","Step 600, Loss: 4.673\n","Step 700, Loss: 4.743\n","Epoch 5 Loss: 4.632\n","Step 0, Loss: 4.118\n","Step 100, Loss: 4.295\n","Step 200, Loss: 4.074\n","Step 300, Loss: 4.624\n","Step 400, Loss: 4.367\n","Step 500, Loss: 4.572\n","Step 600, Loss: 4.676\n","Step 700, Loss: 4.437\n","Epoch 6 Loss: 4.476\n","Step 0, Loss: 4.247\n","Step 100, Loss: 4.121\n","Step 200, Loss: 4.197\n","Step 300, Loss: 4.304\n","Step 400, Loss: 4.441\n","Step 500, Loss: 4.371\n","Step 600, Loss: 4.300\n","Step 700, Loss: 4.265\n","Epoch 7 Loss: 4.346\n","Step 0, Loss: 4.091\n","Step 100, Loss: 4.079\n","Step 200, Loss: 4.234\n","Step 300, Loss: 4.174\n","Step 400, Loss: 4.122\n","Step 500, Loss: 4.436\n","Step 600, Loss: 4.196\n","Step 700, Loss: 4.381\n","Epoch 8 Loss: 4.236\n","Step 0, Loss: 4.214\n","Step 100, Loss: 4.318\n","Step 200, Loss: 4.281\n","Step 300, Loss: 4.474\n","Step 400, Loss: 4.199\n","Step 500, Loss: 
4.254\n","Step 600, Loss: 4.127\n","Step 700, Loss: 4.140\n","Epoch 9 Loss: 4.137\n","Step 0, Loss: 3.667\n","Step 100, Loss: 4.102\n","Step 200, Loss: 3.962\n","Step 300, Loss: 4.091\n","Step 400, Loss: 3.765\n","Step 500, Loss: 4.123\n","Step 600, Loss: 4.305\n","Step 700, Loss: 4.151\n","Epoch 10 Loss: 4.051\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo (Opus-100 is large)\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_de.pt')"],"id":"01why68ZDj3z"},{"cell_type":"code","execution_count":13,"metadata":{"id":"NRZITD1eDj3z","executionInfo":{"status":"ok","timestamp":1770451876783,"user_tz":-420,"elapsed":27,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"ca82b3a4-6b08-4e16-a6c7-e755022ff738"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_de.pt', 'app/models/transformer_model_de.pt')\n","shutil.copy('spm_de.model', 'app/models/spm_de.model')\n","shutil.copy('spm_en_de.model', 
'app/models/spm_en_de.model')\n","print(\"Models copied to app/models/\")"],"id":"NRZITD1eDj3z"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"L4"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":5}
|
Hindi_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","metadata":{"id":"2_-xU-YKZ3AG"},"source":["# Hindi-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Hindi (hi) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Hindi-English parallel data (since ALT does not cover Hindi).\n","We use **SentencePiece** for subword tokenization.\n"],"id":"2_-xU-YKZ3AG"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fMCg2NSiZ3AH","executionInfo":{"status":"ok","timestamp":1770439305524,"user_tz":-420,"elapsed":9873,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"d8aa7012-4d09-4b86-a027-86a6b2d1a4c0"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"fMCg2NSiZ3AH"},{"cell_type":"markdown","metadata":{"id":"N6W3E7C7Z3AI"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Hindi-English pairs from Opus-100."],"id":"N6W3E7C7Z3AI"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":623,"referenced_widgets":["437663195c6746bfa6b2db60708da4d8","a375904851a741bbb6e80903c99c2336","be9b257d8a9d4d6a858f42fb7bad7487","167b27b305a54e45a83c90f2a2234fe4","b482bc3c5ab64217acf38b0c58733c81","420b6bb329e143dcb4e22ad5040a10fd","d49f13205a5e43efa4850f6b108ee8e2","623d17818c9d493c9bd05718026e5300","85d4dfeee47d426284e811b81df6c154","47267a7b0d9b4f56a159e9e9ec2d95ab","1d6bd8dfa7404740b2ae38e0adbdf547","05fffb9917f14ca29764c15592faa90c","941646a2437140558463790ec6888a92","1f6dc7ea27824dde9f6c3b2c0e907bdc","ff23a58b77e940389a763baa33ff99e0","08af38a9022f4ea897e5f7731ea0ff2b","fd231f52742b41f1974c3a967bc76cb4","2b05431acf894cf7ad2bc3d4ea50021e","c1e61972c72747559d111d3d80291215","17a2ed8d6e744dcaa7ba40e80249b52c","de5b3713b323434bbc8139e7bd4475d4","cfca09a62db84b62883d08e3b56410b3","7959c7c2ba4546f99fc076393c7fb7ad","c120b868eb674b529ae35262dbc4b612","a6880c136872473fb42ec08c14c95fcd","175954c78c194b84a1e154809b6b392b","f6cfe7df507a4f44a8cd2b15c0278e17","c9e2176f57b347af893ec0f111f4cb8a","7bf9d2ea798d481ea14ac5d3a0c29ac9","cfed5d010f8f4c93a6b73f6233cf42ee","036749fb7a9144f0aed04aa1c43947a2","0e16fefc8e4743c482c69ad85e3cb4a1","7e84d743dbd941ad84f8c4705bb888fd","57fa0730048b43fda0a54d0678742cf4","ce72a413eef148b5aae9c13eb5deb512","185515355670429db54e06afbe48576c","0e35c8f7b4864d7fa12b007fd3abc685","c78fb971e0334c808d11127e07e5276a","c459964cfdba4ed18a3681c85a50ecea","6bc79f1299d44fed9863f2dff949d404","e130efc4cd6f433d970148cf564eba8e","627f0732315d46a79fd312f74fac1444","25e17a4fd8464f6e9fe79239d617c5e2","fd8c71a0100b4f14950390fc32b38c7f","87d8cfdd9065475883f445720b23a6bf","14f158d8f9c04bd3aa25fb4b69d534e4","6dba2606e98e41d98b6073eae4de3dcb","30e24c52f4554996b0ede06884b787ac","eec36cab181c4d688abbecb798bb1580","3f953ef6b50141c8b189e800a39019cd","5bcc54add4494207938d94
2f0766282e","920f33e7188242e384ad94097565e05a","4747b9bf4e4641469db631484997deee","d78253da13ca495492d8336b00206f28","58c4a5875abe407ab60d151bfa3bc113","a26303b712724c198e16438af88f5e40","13ba2d7a2400443697abdd25caef84f0","db4aec2ebecf426b9aa74b3566cc9dc5","8195422ace0e4f55a67ed28ef61eb0d0","bc1f687387324ee29f237d34d73a2b1e","77d1ede3a5d4489a85adab40a8e2c69d","888d651e83124fc0a95362127dac9ad9","c10b9c6e22fa46108381e197462e3a43","df9f4045c34b43459aa2e8014cd55bd6","9056e3ef7a9d419b83b076bc0104715c","fe7d345e7d684c4a930151492bb9b006","aa1492921e9a48f499ebba69b3d38a2c","4f565fea90d84b47bd688a3ccfe63253","7264a7929d064ea696a8cb036b5e6799","e790ca616e094758baee332d0f6b2f24","4b05110b780a46c787ada479b5655046","f36dd1d4a8c34b9c94bc71888dc0bd60","abe69424975a4885960fd1c62521f7ac","69d7fff4b4374aa982552bc8c8fe5d76","283b912911d3408a95e453560834b8af","65f16bfd9e634276857c70337aca6293","db3f39cf20304a90b1824f7f135955e4"]},"id":"xz9bfbUoZ3AI","executionInfo":{"status":"ok","timestamp":1770439328301,"user_tz":-420,"elapsed":22779,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"415d423a-8900-40a9-9448-56caf520da25"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Hindi-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n","Warning: You are sending unauthenticated requests to the HF Hub. 
Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"437663195c6746bfa6b2db60708da4d8"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-hi/test-00000-of-00001.parquet: 0%| | 0.00/259k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"05fffb9917f14ca29764c15592faa90c"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-hi/train-00000-of-00001.parquet: 0%| | 0.00/65.2M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7959c7c2ba4546f99fc076393c7fb7ad"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-hi/validation-00000-of-00001.parquet: 0%| | 0.00/247k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"57fa0730048b43fda0a54d0678742cf4"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"87d8cfdd9065475883f445720b23a6bf"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/534319 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a26303b712724c198e16438af88f5e40"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
# %% Data loading (Opus-100, en-hi)
# Limit to manageable size for training on Colab (Opus is huge).
MAX_PAIRS = 50000


def extract_pairs(records, src_lang, trg_lang):
    """Collect parallel sentence pairs from Opus-100 style records.

    Args:
        records: iterable of dicts; a record contributes a pair only if it has
            a 'translation' mapping containing both language keys.
        src_lang: key of the source language inside 'translation' (e.g. 'hi').
        trg_lang: key of the target language inside 'translation' (e.g. 'en').

    Returns:
        A list of ``{src_lang: text, trg_lang: text}`` dicts, in input order.
        Records without a usable 'translation' entry are skipped.
    """
    pairs = []
    for record in records:
        translation = record['translation'] if 'translation' in record else None
        if translation is None:
            continue
        if src_lang in translation and trg_lang in translation:
            pairs.append({
                src_lang: translation[src_lang],
                trg_lang: translation[trg_lang],
            })
    return pairs


def subsample_pairs(pairs, max_pairs, seed):
    """Return at most ``max_pairs`` items of ``pairs``, chosen reproducibly.

    A private ``random.Random(seed)`` is used instead of the global RNG, so
    re-running the cell yields the same subset and does not disturb other
    seeded code. The input list is never mutated.

    Args:
        pairs: list of examples.
        max_pairs: size cap; ``None`` disables subsampling.
        seed: seed for the private shuffling RNG.

    Returns:
        ``pairs`` itself when it already fits, otherwise a new shuffled list
        of length ``max_pairs``.
    """
    if max_pairs is None or len(pairs) <= max_pairs:
        return pairs
    shuffled = list(pairs)
    random.Random(seed).shuffle(shuffled)
    return shuffled[:max_pairs]


# Notebook cells execute with __name__ == "__main__", so this runs exactly as a
# normal cell would; the guard only keeps the helpers above importable without
# triggering a dataset download.
if __name__ == "__main__":
    print("Loading Opus-100 Dataset (Hindi-English)...")
    try:
        # Opus-100 has 'en-hi'
        # NOTE(review): "train+validation+test" folds the held-out splits into
        # the training pool; keep one aside if this model is ever evaluated.
        dataset = load_dataset("opus100", "en-hi", split="train+validation+test")
    except Exception as e:
        # Report, then re-raise: swallowing the error would leave `data`
        # undefined (or stale from a previous run) for the cells below.
        print(f"Error loading from HF: {e}")
        raise
    print(f"Loaded {len(dataset)} sentences from Opus-100 dataset.")

    data = extract_pairs(dataset, 'hi', 'en')
    if len(data) > MAX_PAIRS:
        # SEED is defined in the setup cell at the top of the notebook.
        data = subsample_pairs(data, MAX_PAIRS, seed=SEED)
        print(f"Subsampled to {MAX_PAIRS:,} examples for efficiency.")

    print(f"Extracted {len(data)} Hindi-English pairs.")
# %% Build and clean the parallel DataFrame
def clean_pairs(frame, columns=('hi', 'en')):
    """Return a cleaned copy of ``frame`` that is safe to tokenize.

    Rows with a missing value in any of ``columns`` are dropped, the columns
    are cast to ``str``, and rows where any of them is empty or
    whitespace-only are removed. The input frame is not modified, so the
    cell can be re-run without changing its result.

    Args:
        frame: DataFrame holding one column per language.
        columns: names of the text columns to validate.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    cols = list(columns)
    cleaned = frame.dropna(subset=cols).astype({c: str for c in cols})
    keep = pd.Series(True, index=cleaned.index)
    for c in cols:
        keep &= cleaned[c].str.strip() != ''
    return cleaned[keep].reset_index(drop=True)


# Notebook cells execute with __name__ == "__main__", so the guarded sections in
# this block run exactly like ordinary cells; the guards only keep the class and
# function definitions importable without retraining tokenizers.
if __name__ == "__main__":
    raw_df = pd.DataFrame(data)
    print(raw_df.head())
    df = clean_pairs(raw_df)

# %% [markdown]
# ## 3. Tokenization

# %%
vocab_size = 8000  # Increased for larger dataset/diversity
model_type = 'bpe'


def train_tokenizer(sentences, corpus_path, model_prefix,
                    vocab_size=vocab_size, model_type=model_type):
    """Train a SentencePiece model on ``sentences`` and load it.

    The sentences are written one per line to ``corpus_path`` (UTF-8), a model
    is trained with the special ids fixed to pad=0, bos=1, eos=2, unk=3, and
    the resulting ``<model_prefix>.model`` file is loaded.

    Args:
        sentences: iterable of strings.
        corpus_path: text file to (over)write with the training corpus.
        model_prefix: prefix passed to the SentencePiece trainer.
        vocab_size: vocabulary size passed to the trainer.
        model_type: SentencePiece model type passed to the trainer.

    Returns:
        A ``SentencePieceProcessor`` for the trained model.
    """
    # Save texts to files
    with open(corpus_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in sentences)

    spm.SentencePieceTrainer.train(
        input=corpus_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        pad_id=0, bos_id=1, eos_id=2, unk_id=3
    )
    return spm.SentencePieceProcessor(model_file=f'{model_prefix}.model')


if __name__ == "__main__":
    print("Training Hindi Tokenizer...")
    sp_src = train_tokenizer(df['hi'], 'train_hi.txt', 'spm_hi')

    print("Training English Tokenizer (for Hindi pair)...")
    sp_trg = train_tokenizer(df['en'], 'train_en_hi.txt', 'spm_en_hi')

# %% [markdown]
# ## 4. Dataset & Model

# %%
class TranslationDataset(Dataset):
    """Map-style dataset yielding ``(src_ids, trg_ids)`` LongTensor pairs.

    Each sentence is encoded with its tokenizer and wrapped as
    ``[bos] + ids + [eos]``.

    Args:
        df: DataFrame with one row per sentence pair.
        sp_src: source tokenizer exposing ``bos_id()``, ``eos_id()`` and
            ``encode(text, out_type=int)``.
        sp_trg: target tokenizer with the same interface.
        src_col: source text column (default ``'hi'``).
        trg_col: target text column (default ``'en'``).
        max_len: optional cap on the returned sequence length *including*
            bos/eos; ``None`` (default) keeps full sentences.
    """

    def __init__(self, df, sp_src, sp_trg, src_col='hi', trg_col='en', max_len=None):
        self.data = df
        self.sp_src = sp_src
        self.sp_trg = sp_trg
        self.max_len = max_len
        # Plain lists give O(1) positional access; `df.iloc[idx][col]` would
        # materialise a whole row Series for every item.
        self._src_texts = df[src_col].tolist()
        self._trg_texts = df[trg_col].tolist()

    def __len__(self):
        return len(self.data)

    def _encode(self, tokenizer, text):
        """Encode ``text`` and wrap it in bos/eos, truncating if requested."""
        ids = tokenizer.encode(text, out_type=int)
        if self.max_len is not None:
            # Reserve two slots for bos and eos.
            ids = ids[:max(self.max_len - 2, 0)]
        return torch.tensor(
            [tokenizer.bos_id()] + ids + [tokenizer.eos_id()], dtype=torch.long
        )

    def __getitem__(self, idx):
        src_ids = self._encode(self.sp_src, self._src_texts[idx])
        trg_ids = self._encode(self.sp_trg, self._trg_texts[idx])
        return src_ids, trg_ids


def collate_fn(batch):
    """Right-pad a list of ``(src, trg)`` tensor pairs into two batch tensors.

    Returns:
        ``(src_pad, trg_pad)``, each of shape (batch, longest sequence),
        padded with 0 -- the pad id the tokenizers above are trained with.
    """
    src_seqs, trg_seqs = zip(*batch)
    src_pad = pad_sequence(list(src_seqs), batch_first=True, padding_value=0)
    trg_pad = pad_sequence(list(trg_seqs), batch_first=True, padding_value=0)
    return src_pad, trg_pad


if __name__ == "__main__":
    train_dataset = TranslationDataset(df, sp_src, sp_trg)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

# %%
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    A ``(max_len, d_model)`` table is precomputed and registered as the
    buffer ``pe``; even feature indices hold sines and odd indices hold
    cosines.

    Args:
        d_model: embedding width (odd widths are supported).
        dropout: dropout probability applied after the addition.
        max_len: number of positions in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns;
        # trimming div_term avoids the shape mismatch (no-op when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape (batch, seq_len, d_model)."""
        x = x + self.pe[:x.size(1)]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Encoder-decoder translation model built on ``nn.Transformer`` (batch-first).

    Token ids are embedded, scaled by ``sqrt(d_model)``, given sinusoidal
    positions, passed through the transformer and projected to target
    vocabulary logits.

    Args:
        src_vocab_size: size of the source embedding table.
        trg_vocab_size: size of the target embedding table and output layer.
        d_model, nhead, num_encoder_layers, num_decoder_layers,
        dim_feedforward, dropout: forwarded to ``nn.Transformer``.
        pad_idx: id treated as padding when building attention masks.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 d_model=256, nhead=4, num_encoder_layers=2,
                 num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.pad_idx = pad_idx
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.fc_out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        """Compute target-vocabulary logits.

        Args:
            src: (batch, src_len) source ids, right-padded with ``pad_idx``.
            trg: (batch, trg_len) decoder input ids.

        Returns:
            Tensor of shape (batch, trg_len, trg_vocab_size).
        """
        src_key_padding_mask = (src == self.pad_idx)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)
        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))
        output = self.transformer(
            src=src_emb,
            tgt=trg_emb,
            tgt_mask=trg_mask,
            src_key_padding_mask=src_key_padding_mask,
            # Also hide padded source positions from decoder cross-attention;
            # otherwise a sentence's output depends on how much padding its
            # batch happens to add. No target padding mask is passed: with
            # right-padding the causal mask already keeps real positions from
            # seeing pad tokens, and pad positions are ignored by the loss.
            memory_key_padding_mask=src_key_padding_mask,
        )
        return self.fc_out(output)
Ko","userId":"13068088192988605156"}},"outputId":"088f0fb2-eb0c-45d5-8d7e-325461bfd275"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting Training...\n","Step 0, Loss: 9.189\n","Step 100, Loss: 6.300\n","Step 200, Loss: 5.875\n","Step 300, Loss: 5.781\n","Step 400, Loss: 5.471\n","Step 500, Loss: 5.690\n","Step 600, Loss: 5.303\n","Step 700, Loss: 5.245\n","Epoch 1 Loss: 5.626\n","Step 0, Loss: 4.651\n","Step 100, Loss: 4.864\n","Step 200, Loss: 4.755\n","Step 300, Loss: 4.875\n","Step 400, Loss: 4.955\n","Step 500, Loss: 4.613\n","Step 600, Loss: 4.793\n","Step 700, Loss: 4.608\n","Epoch 2 Loss: 4.749\n","Step 0, Loss: 4.476\n","Step 100, Loss: 4.460\n","Step 200, Loss: 4.573\n","Step 300, Loss: 4.310\n","Step 400, Loss: 4.169\n","Step 500, Loss: 4.296\n","Step 600, Loss: 4.434\n","Step 700, Loss: 4.135\n","Epoch 3 Loss: 4.344\n","Step 0, Loss: 4.275\n","Step 100, Loss: 4.253\n","Step 200, Loss: 4.010\n","Step 300, Loss: 4.083\n","Step 400, Loss: 4.119\n","Step 500, Loss: 3.813\n","Step 600, Loss: 3.818\n","Step 700, Loss: 4.239\n","Epoch 4 Loss: 4.054\n","Step 0, Loss: 3.653\n","Step 100, Loss: 3.975\n","Step 200, Loss: 3.831\n","Step 300, Loss: 3.881\n","Step 400, Loss: 3.946\n","Step 500, Loss: 3.686\n","Step 600, Loss: 3.760\n","Step 700, Loss: 3.940\n","Epoch 5 Loss: 3.835\n","Step 0, Loss: 3.656\n","Step 100, Loss: 3.804\n","Step 200, Loss: 3.880\n","Step 300, Loss: 3.327\n","Step 400, Loss: 3.826\n","Step 500, Loss: 3.483\n","Step 600, Loss: 3.967\n","Step 700, Loss: 3.605\n","Epoch 6 Loss: 3.656\n","Step 0, Loss: 3.575\n","Step 100, Loss: 3.515\n","Step 200, Loss: 3.743\n","Step 300, Loss: 3.231\n","Step 400, Loss: 3.877\n","Step 500, Loss: 3.325\n","Step 600, Loss: 3.680\n","Step 700, Loss: 3.678\n","Epoch 7 Loss: 3.509\n","Step 0, Loss: 3.627\n","Step 100, Loss: 3.427\n","Step 200, Loss: 3.491\n","Step 300, Loss: 3.302\n","Step 400, Loss: 3.599\n","Step 500, Loss: 3.448\n","Step 600, Loss: 3.703\n","Step 700, Loss: 3.467\n","Epoch 8 
# %% Training (Hindi -> English)
# Relies on names created by earlier cells of this notebook: TransformerModel,
# vocab_size, train_loader and device.
model = TransformerModel(vocab_size, vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
# Targets equal to 0 (the pad id used when padding batches) are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

print("Starting Training...")
for epoch in range(10):  # 10 Epochs for demo
    model.train()
    epoch_loss = 0
    for i, (src, trg) in enumerate(train_loader):
        src = src.to(device)
        trg = trg.to(device)

        # Teacher forcing: the decoder reads tokens [0, T-1) and is scored
        # against tokens [1, T), i.e. the same sequence shifted left by one.
        decoder_input = trg[:, :-1]
        gold = trg[:, 1:].reshape(-1)

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        loss = criterion(logits.reshape(-1, logits.shape[-1]), gold)
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        epoch_loss += step_loss
        if i % 100 == 0:
            print(f"Step {i}, Loss: {step_loss:.3f}")
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}")

    # Save (the checkpoint file is overwritten at the end of every epoch)
    torch.save(model.state_dict(), 'transformer_model_hi.pt')

# %% Copy to app
import shutil

os.makedirs('app/models', exist_ok=True)
# Weights plus the two SentencePiece models, copied under their own names.
for artifact in ('transformer_model_hi.pt', 'spm_hi.model', 'spm_en_hi.model'):
    shutil.copy(artifact, f'app/models/{artifact}')
print("Models copied to app/models/")
"}},"be9b257d8a9d4d6a858f42fb7bad7487":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_623d17818c9d493c9bd05718026e5300","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_85d4dfeee47d426284e811b81df6c154","value":1}},"167b27b305a54e45a83c90f2a2234fe4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_47267a7b0d9b4f56a159e9e9ec2d95ab","placeholder":"","style":"IPY_MODEL_1d6bd8dfa7404740b2ae38e0adbdf547","value":" 65.4k/? 
[00:00<00:00, 5.70MB/s]"}},"b482bc3c5ab64217acf38b0c58733c81":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"420b6bb329e143dcb4e22ad5040a10fd":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"over
flow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d49f13205a5e43efa4850f6b108ee8e2":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"623d17818c9d493c9bd05718026e5300":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"85d4dfeee47d426284e811b81df6c154":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"47267a7b0d9b4f56a159e9e9ec2
d95ab":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1d6bd8dfa7404740b2ae38e0adbdf547":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"05fffb9917f14ca29764c15592faa90c":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_941646a2437140558463790ec6888a92","IPY_MODEL_1f6dc7ea27824dde9f6c3b2c0e907bdc","IPY_MODEL_ff23a58b77e940389a763baa33ff99e0"],"layout":"IPY_MODEL_08af38a9022f4ea89
7e5f7731ea0ff2b"}},"941646a2437140558463790ec6888a92":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_fd231f52742b41f1974c3a967bc76cb4","placeholder":"","style":"IPY_MODEL_2b05431acf894cf7ad2bc3d4ea50021e","value":"en-hi/test-00000-of-00001.parquet: 100%"}},"1f6dc7ea27824dde9f6c3b2c0e907bdc":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c1e61972c72747559d111d3d80291215","max":259276,"min":0,"orientation":"horizontal","style":"IPY_MODEL_17a2ed8d6e744dcaa7ba40e80249b52c","value":259276}},"ff23a58b77e940389a763baa33ff99e0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_de5b3713b323434bbc8139e7bd4475d4","placeholder":"","style":"IPY_MODEL_cfca09a62db84b62883d08e3b56410b3","value":" 259k/259k [00:00<00:00, 
306kB/s]"}},"08af38a9022f4ea897e5f7731ea0ff2b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fd231f52742b41f1974c3a967bc76cb4":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2b05431acf894cf7ad2bc3d4ea50021e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c1e61972c72747559d111d3d80291215":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"17a2ed8d6e744dcaa7ba40e80249b52c":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"de5b3713b323434bbc8139e7bd4475d4":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cfca09a62db84b62883d08e3b56410b3":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"7959c7c2ba4546f99fc076393c7fb7ad":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_c120b868eb674b529ae35262dbc4b612","IPY_MODEL_a6880c136872473fb42ec08c14c95fcd","IPY_MODEL_175954c78c194b84a1e154809b6b392b"],"layout":"IPY_MODEL_f6cfe7df507a4f44a8cd2b15c0278e17"}
},"c120b868eb674b529ae35262dbc4b612":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c9e2176f57b347af893ec0f111f4cb8a","placeholder":"","style":"IPY_MODEL_7bf9d2ea798d481ea14ac5d3a0c29ac9","value":"en-hi/train-00000-of-00001.parquet: 100%"}},"a6880c136872473fb42ec08c14c95fcd":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_cfed5d010f8f4c93a6b73f6233cf42ee","max":65219235,"min":0,"orientation":"horizontal","style":"IPY_MODEL_036749fb7a9144f0aed04aa1c43947a2","value":65219235}},"175954c78c194b84a1e154809b6b392b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0e16fefc8e4743c482c69ad85e3cb4a1","placeholder":"","style":"IPY_MODEL_7e84d743dbd941ad84f8c4705bb888fd","value":" 65.2M/65.2M [00:01<00:00, 
61.5MB/s]"}},"f6cfe7df507a4f44a8cd2b15c0278e17":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c9e2176f57b347af893ec0f111f4cb8a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7bf9d2ea798d481ea14ac5d3a0c29ac9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"cfed5d010f8f4c93a6b73f6233cf42ee":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"036749fb7a9144f0aed04aa1c43947a2":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"0e16fefc8e4743c482c69ad85e3cb4a1":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7e84d743dbd941ad84f8c4705bb888fd":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"57fa0730048b43fda0a54d0678742cf4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ce72a413eef148b5aae9c13eb5deb512","IPY_MODEL_185515355670429db54e06afbe48576c","IPY_MODEL_0e35c8f7b4864d7fa12b007fd3abc685"],"layout":"IPY_MODEL_c78fb971e0334c808d11127e07e5276a"
}},"ce72a413eef148b5aae9c13eb5deb512":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c459964cfdba4ed18a3681c85a50ecea","placeholder":"","style":"IPY_MODEL_6bc79f1299d44fed9863f2dff949d404","value":"en-hi/validation-00000-of-00001.parquet: 100%"}},"185515355670429db54e06afbe48576c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e130efc4cd6f433d970148cf564eba8e","max":247375,"min":0,"orientation":"horizontal","style":"IPY_MODEL_627f0732315d46a79fd312f74fac1444","value":247375}},"0e35c8f7b4864d7fa12b007fd3abc685":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_25e17a4fd8464f6e9fe79239d617c5e2","placeholder":"","style":"IPY_MODEL_fd8c71a0100b4f14950390fc32b38c7f","value":" 247k/247k [00:00<00:00, 
254kB/s]"}},"c78fb971e0334c808d11127e07e5276a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c459964cfdba4ed18a3681c85a50ecea":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6bc79f1299d44fed9863f2dff949d404":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e130efc4cd6f433d970148cf564eba8e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"627f0732315d46a79fd312f74fac1444":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"25e17a4fd8464f6e9fe79239d617c5e2":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fd8c71a0100b4f14950390fc32b38c7f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"87d8cfdd9065475883f445720b23a6bf":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_14f158d8f9c04bd3aa25fb4b69d534e4","IPY_MODEL_6dba2606e98e41d98b6073eae4de3dcb","IPY_MODEL_30e24c52f4554996b0ede06884b787ac"],"layout":"IPY_MODEL_eec36cab181c4d688abbecb798bb1580"}
},"14f158d8f9c04bd3aa25fb4b69d534e4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3f953ef6b50141c8b189e800a39019cd","placeholder":"","style":"IPY_MODEL_5bcc54add4494207938d942f0766282e","value":"Generating test split: 100%"}},"6dba2606e98e41d98b6073eae4de3dcb":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_920f33e7188242e384ad94097565e05a","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_4747b9bf4e4641469db631484997deee","value":2000}},"30e24c52f4554996b0ede06884b787ac":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d78253da13ca495492d8336b00206f28","placeholder":"","style":"IPY_MODEL_58c4a5875abe407ab60d151bfa3bc113","value":" 2000/2000 [00:00<00:00, 35619.05 
examples/s]"}},"eec36cab181c4d688abbecb798bb1580":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3f953ef6b50141c8b189e800a39019cd":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5bcc54add4494207938d942f0766282e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"920f33e7188242e384ad94097565e05a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4747b9bf4e4641469db631484997deee":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"d78253da13ca495492d8336b00206f28":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"58c4a5875abe407ab60d151bfa3bc113":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a26303b712724c198e16438af88f5e40":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_13ba2d7a2400443697abdd25caef84f0","IPY_MODEL_db4aec2ebecf426b9aa74b3566cc9dc5","IPY_MODEL_8195422ace0e4f55a67ed28ef61eb0d0"],"layout":"IPY_MODEL_bc1f687387324ee29f237d34d73a2b1
e"}},"13ba2d7a2400443697abdd25caef84f0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_77d1ede3a5d4489a85adab40a8e2c69d","placeholder":"","style":"IPY_MODEL_888d651e83124fc0a95362127dac9ad9","value":"Generating train split: 100%"}},"db4aec2ebecf426b9aa74b3566cc9dc5":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c10b9c6e22fa46108381e197462e3a43","max":534319,"min":0,"orientation":"horizontal","style":"IPY_MODEL_df9f4045c34b43459aa2e8014cd55bd6","value":534319}},"8195422ace0e4f55a67ed28ef61eb0d0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9056e3ef7a9d419b83b076bc0104715c","placeholder":"","style":"IPY_MODEL_fe7d345e7d684c4a930151492bb9b006","value":" 534319/534319 [00:00<00:00, 677541.02 
examples/s]"}},"bc1f687387324ee29f237d34d73a2b1e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"77d1ede3a5d4489a85adab40a8e2c69d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"888d651e83124fc0a95362127dac9ad9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c10b9c6e22fa46108381e197462e3a43":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"df9f4045c34b43459aa2e8014cd55bd6":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"9056e3ef7a9d419b83b076bc0104715c":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fe7d345e7d684c4a930151492bb9b006":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"aa1492921e9a48f499ebba69b3d38a2c":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_4f565fea90d84b47bd688a3ccfe63253","IPY_MODEL_7264a7929d064ea696a8cb036b5e6799","IPY_MODEL_e790ca616e094758baee332d0f6b2f24"],"layout":"IPY_MODEL_4b05110b780a46c787ada479b565504
6"}},"4f565fea90d84b47bd688a3ccfe63253":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f36dd1d4a8c34b9c94bc71888dc0bd60","placeholder":"","style":"IPY_MODEL_abe69424975a4885960fd1c62521f7ac","value":"Generating validation split: 100%"}},"7264a7929d064ea696a8cb036b5e6799":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_69d7fff4b4374aa982552bc8c8fe5d76","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_283b912911d3408a95e453560834b8af","value":2000}},"e790ca616e094758baee332d0f6b2f24":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_65f16bfd9e634276857c70337aca6293","placeholder":"","style":"IPY_MODEL_db3f39cf20304a90b1824f7f135955e4","value":" 2000/2000 [00:00<00:00, 77309.37 
examples/s]"}},"4b05110b780a46c787ada479b5655046":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f36dd1d4a8c34b9c94bc71888dc0bd60":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"abe69424975a4885960fd1c62521f7ac":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"69d7fff4b4374aa982552bc8c8fe5d76":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"283b912911d3408a95e453560834b8af":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"65f16bfd9e634276857c70337aca6293":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"db3f39cf20304a90b1824f7f135955e4":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
|
Kazakh_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","metadata":{"id":"jQEigq0G8cJe"},"source":["# Kazakh-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Kazakh (kk) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Kazakh-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"jQEigq0G8cJe"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"qrrrFeD88cJg","executionInfo":{"status":"ok","timestamp":1770448390827,"user_tz":-420,"elapsed":7624,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"5b9eebe3-acd4-4645-9f7f-e335477b765f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"qrrrFeD88cJg"},{"cell_type":"markdown","metadata":{"id":"j93FwrdJ8cJh"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Kazakh-English pairs from Opus-100."],"id":"j93FwrdJ8cJh"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":559,"referenced_widgets":["6011e696e7134747ad1da29bac8b1fa3","a08334f3baa04417b29d2790f5200899","444138cf68e54c058b3dc517c859d315","03973f3d8fb749d8b447ecb4d8ab925f","2d06b58cbc4d4b3592686575ce8cfe0e","a801924070514530861ec37751863a11","6b55ec48eb544e7fa890a413b3a8a618","e558c7a282124a679d8d1ba66479ae68","480f629eb65344acba595da2adaca5e7","144b6a863d624d0b8f2bc9215e20ecee","f844a6d12a444fee9211e3231700a3b1","2d380a9519d44a748906f7855a8347fc","6f549dace4d242feb8f71d4879ce8c33","de81adc631754b92af67d75251926253","5565e5832d2b42cd94b5d509a07f49d1","c699fc7066e34809bf0fe4ff22ef85bb","9b04dfd472eb4f4e84fa23becb13c5d9","f04429d45a89499b902c4f9825d58b9f","f4b88337dba742afbf965c460579138b","e625835b3492431ea81d0c7f932c9a6d","1f0a23dbc0f744ca931e53cc4377a060","c94bca4b10d14e8088ac983842a16f3e","05f5d68df2c147ad9be40ec37d86a831","24e5962f9db1449e986f1016f413b76e","4fa72adbab5c49e3908578ccd3d15eae","a4385800086d491189765a09829f7165","0f6b85ba92ee4c1395d38a8efeb0d1f2","1155a892b40744158325236140ac0b19","5e699f444ce24d21ac3894941dc75149","6b83449e4d51445185da2df7352868d0","8c9bcd03a0964d69bd61c25e4f881f59","b29f31340d9d44bdab54175cec8a6c95","fd20f577ffd94493b7073e04bf0f3a07","1abdf7ab22f6466da2c66cede3b15ff5","11d1aeeaf1984725b9f1d2219156dcb6","6570d5bc161b4c8698de14ed300dafea","adbb70135fee468293e6f3ea2ef56beb","6a4a3be9cbc4493191a892c573582ba3","84c2ec1f709c47a7b8faa1019893d4e4","0a99e474c195480898b2c35cd127d557","2537324c324b4c60a745c2ff78e15fa0","5e119fe24031446c94d6fcf4f95fdf3d","8b57b2ae0bf842c3af36a18b62d9e015","b69e8d52115a4ddcb17b03cc6f64eb09","941445b90fc04904b6648a5e3e3fa245","844642fe3adb4f01a07d7d07ba4798bc","e068281ab48043489289c1ab2f92473e","045011b0c96741dfada3a09a0524b0d4","fd7883f2f8d349db891709dc3101a16d","b66b9e6354584286a9f1f0f4b87d1fb3","c552b2ba8f5f4726a7939
1e229136a83","76f2c7cbd0c74af9ac85a27d85f9ea06","aaf549f0289f408c897035bd75e5c305","85bb759b75404ac587c5282b284b8d3a","aa449e4c8bea4d559b4f060f69ae4c25","a62dd6045c8c4743bb389a17b8788e73","01ee8d04a6194f8f83fb81212c3d1a10","318be1fd72314dc28b62f265935318bf","db51334d90a34a6fa253d12c7e50e54c","e43487b0dcbb4ec0ae8af912f5535074","71db91520a20457f96a0e8ce83ba2f61","53e0eb9505e14e52a9aaa3ad3d92eb7f","98a4e533d68f4aaa8bd5e556d967ca2c","25aa3f2b14d34a95a10d65099f044490","2c1449b36e254ad3ae9ebf521ba9c06c","048b2b884d594f44a627508114bb2653","edbe3f5f53cc490aac30f8c2a6e7cfe2","61e376e936464743be2c0882861bfeae","3133af8ca0b9442aa0a863a82a0efc90","c2f229cd45bf4dbf86efe061f43010e0","9fb9e54ec0544cb5bf30fe6286ece8c4","074a6c4bedd24a379b59da7c284de342","29618afe42234e1fa1e23d6ca3c9a9bf","c6986d4b832c45fe97b5a663309067de","8485fbb850314eee9e0df683a18d1497","a52f65446e6241e18b5011ef9c55865c","15ba674f577b4559a3ceb2129022bc9c"]},"id":"vEBAsFtF8cJh","executionInfo":{"status":"ok","timestamp":1770448401881,"user_tz":-420,"elapsed":11051,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"5273a0e1-2b4d-4adb-c853-7175a3129406"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Kazakh-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6011e696e7134747ad1da29bac8b1fa3"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-kk/test-00000-of-00001.parquet: 0%| | 0.00/84.1k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"2d380a9519d44a748906f7855a8347fc"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-kk/train-00000-of-00001.parquet: 0%| | 0.00/4.64M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"05f5d68df2c147ad9be40ec37d86a831"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["en-kk/validation-00000-of-00001.parquet: 0%| | 0.00/83.1k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1abdf7ab22f6466da2c66cede3b15ff5"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"941445b90fc04904b6648a5e3e3fa245"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/79927 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a62dd6045c8c4743bb389a17b8788e73"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
# Upper bound on the number of sentence pairs kept for this project.
MAX_PAIRS = 50000


def extract_pairs(records, src_lang='kk', trg_lang='en'):
    """Collect parallel sentence pairs from Opus-100 style records.

    Args:
        records: iterable of mappings; only items that carry a 'translation'
            mapping containing both language codes are kept.
        src_lang: key of the source language inside 'translation'.
        trg_lang: key of the target language inside 'translation'.

    Returns:
        A list of ``{src_lang: text, trg_lang: text}`` dicts, in input order.
    """
    pairs = []
    for item in records:
        if 'translation' not in item:
            continue
        translation = item['translation']
        if src_lang in translation and trg_lang in translation:
            pairs.append({
                src_lang: translation[src_lang],
                trg_lang: translation[trg_lang],
            })
    return pairs


print("Loading Opus-100 Dataset (Kazakh-English)...")
try:
    # Opus-100 has 'en-kk'.
    # NOTE(review): the split expression merges validation and test into the
    # training data, so no held-out examples remain for evaluation.
    dataset = load_dataset("opus100", "en-kk", split="train+validation+test")
except Exception as e:
    # Report the failure, then stop here: every later cell needs `data`, so
    # continuing would only produce a confusing NameError further down.
    print(f"Error loading from HF: {e}")
    raise
print(f"Loaded {len(dataset)} sentences from Opus-100 dataset.")

# 'kk' is the language code for Kazakh
data = extract_pairs(dataset, src_lang='kk', trg_lang='en')

# Limit to manageable size for this project. `random` is seeded in the setup
# cell, so the subsample is reproducible on a fresh Run All.
if len(data) > MAX_PAIRS:
    random.shuffle(data)
    data = data[:MAX_PAIRS]
    print(f"Subsampled to {MAX_PAIRS:,} examples for efficiency.")

print(f"Extracted {len(data)} Kazakh-English pairs.")
# Build the working DataFrame from the extracted pairs and preview it.
df = pd.DataFrame(data)
print(df.head())

# Drop missing rows, force both columns to str, then discard rows where either
# side is empty or whitespace-only.
df = df.dropna(subset=['kk', 'en']).astype({'kk': str, 'en': str})
has_text = (df['kk'].str.strip() != '') & (df['en'].str.strip() != '')
df = df[has_text]


def train_tokenizer(sentences, corpus_path, model_prefix, vocab_size, model_type):
    """Dump sentences to a text file, train SentencePiece on it, and load the result.

    Args:
        sentences: iterable of str, written one per line to ``corpus_path``.
        corpus_path: path of the UTF-8 training corpus file to (over)write.
        model_prefix: prefix for the ``<prefix>.model`` / ``<prefix>.vocab``
            files written by the trainer.
        vocab_size: target vocabulary size passed to the trainer.
        model_type: SentencePiece model type, e.g. 'bpe'.

    Returns:
        A ``SentencePieceProcessor`` loaded from ``<model_prefix>.model``.
    """
    with open(corpus_path, 'w', encoding='utf-8') as f:
        for line in sentences:
            f.write(line + '\n')

    spm.SentencePieceTrainer.train(
        input=corpus_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        # Fixed special-token ids; the padding value and the loss
        # ignore_index used later both rely on pad_id being 0.
        pad_id=0, bos_id=1, eos_id=2, unk_id=3
    )
    return spm.SentencePieceProcessor(model_file=f'{model_prefix}.model')


# Train SentencePiece models
vocab_size = 8000
model_type = 'bpe'

print("Training Kazakh Tokenizer...")
sp_src = train_tokenizer(df['kk'], 'train_kk.txt', 'spm_kk', vocab_size, model_type)

print("Training English Tokenizer (for Kazakh pair)...")
sp_trg = train_tokenizer(df['en'], 'train_en_kk.txt', 'spm_en_kk', vocab_size, model_type)
# Padding id shared by both tokenizers (they are trained with pad_id=0).
PAD_ID = 0
BATCH_SIZE = 64


class TranslationDataset(Dataset):
    """Parallel-text dataset yielding (source_ids, target_ids) tensor pairs.

    Each side is tokenized with its own SentencePiece processor and wrapped in
    that processor's BOS/EOS ids.

    Args:
        df: DataFrame holding the parallel sentences.
        sp_src: SentencePiece processor for the source column.
        sp_trg: SentencePiece processor for the target column.
        src_col: name of the source-text column (default 'kk').
        trg_col: name of the target-text column (default 'en').
    """

    def __init__(self, df, sp_src, sp_trg, src_col='kk', trg_col='en'):
        self.data = df
        self.sp_src = sp_src
        self.sp_trg = sp_trg
        # Cache the columns as plain lists: positional row access through
        # DataFrame.iloc builds a Series per item, which is needlessly slow
        # inside a DataLoader.
        self._src_texts = df[src_col].tolist()
        self._trg_texts = df[trg_col].tolist()

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _encode(sp, text):
        """Return ``[BOS] + pieces + [EOS]`` ids for ``text`` as a 1-D tensor."""
        ids = [sp.bos_id()] + sp.encode(text, out_type=int) + [sp.eos_id()]
        return torch.tensor(ids)

    def __getitem__(self, idx):
        src = self._encode(self.sp_src, self._src_texts[idx])
        trg = self._encode(self.sp_trg, self._trg_texts[idx])
        return src, trg


def collate_fn(batch):
    """Right-pad a list of (src, trg) tensor pairs into two batch-first tensors.

    Returns:
        ``(src_pad, trg_pad)``; shorter sequences are filled with ``PAD_ID``.
    """
    src_batch, trg_batch = zip(*batch)
    src_pad = pad_sequence(list(src_batch), batch_first=True, padding_value=PAD_ID)
    trg_pad = pad_sequence(list(trg_batch), batch_first=True, padding_value=PAD_ID)
    return src_pad, trg_pad


train_dataset = TranslationDataset(df, sp_src, sp_trg)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to batch-first embeddings, then apply dropout.

    Args:
        d_model: embedding width.
        dropout: dropout probability applied after the addition.
        max_len: longest sequence length the precomputed table supports.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns;
        # the slice is a no-op when d_model is even.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Kept as a (max_len, d_model) buffer so existing checkpoints still load.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` indexed as (batch, seq, d_model).

        Raises:
            ValueError: if the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Encoder-decoder Transformer for translation, built on ``nn.Transformer``.

    Args:
        src_vocab_size: number of rows in the source embedding table.
        trg_vocab_size: number of rows in the target embedding table and the
            size of the output projection.
        d_model, nhead, num_encoder_layers, num_decoder_layers,
        dim_feedforward, dropout: forwarded to ``nn.Transformer``.
        pad_idx: token id treated as padding when building attention masks.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 d_model=256, nhead=4, num_encoder_layers=2,
                 num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.pad_idx = pad_idx
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        # One positional encoder is shared by the source and target sides.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.fc_out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            src: integer token ids, (batch, src_len); ``pad_idx`` marks padding.
            trg: integer decoder-input token ids, (batch, trg_len).

        Returns:
            Tensor of shape (batch, trg_len, trg_vocab_size).
        """
        src_key_padding_mask = (src == self.pad_idx)
        # Causal mask: each target position may only attend to earlier ones.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)
        scale = math.sqrt(self.d_model)
        src_emb = self.pos_encoder(self.src_embedding(src) * scale)
        trg_emb = self.pos_encoder(self.trg_embedding(trg) * scale)
        output = self.transformer(
            src=src_emb,
            tgt=trg_emb,
            tgt_mask=trg_mask,
            src_key_padding_mask=src_key_padding_mask,
            # Without this the decoder's cross-attention also attends to the
            # encoder outputs at padded source positions, so the prediction
            # for a sentence depends on how much padding its batch added.
            memory_key_padding_mask=src_key_padding_mask,
        )
        return self.fc_out(output)
# Training configuration.
NUM_EPOCHS = 10          # 10 epochs for demo (Opus-100 is large)
LEARNING_RATE = 0.0005
LOG_EVERY = 100          # print the running batch loss every N steps
CHECKPOINT_PATH = 'transformer_model_kk.pt'

# Both tokenizers are trained with the same vocab_size, so it is used for the
# source and target vocabulary alike.
model = TransformerModel(vocab_size, vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Index 0 is the padding id, so padded target positions add nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

print("Starting Training...")
for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_loss = 0
    for i, (src, trg) in enumerate(train_loader):
        src, trg = src.to(device), trg.to(device)

        # Teacher forcing: the decoder sees the target without its last token
        # and is scored against the target shifted left by one.
        decoder_input = trg[:, :-1]
        expected = trg[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        loss = criterion(
            logits.reshape(-1, logits.shape[-1]),
            expected.reshape(-1),
        )
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if i % LOG_EVERY == 0:
            print(f"Step {i}, Loss: {batch_loss:.3f}")
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}")

    # Save after every epoch so an interrupted run keeps the latest weights.
    torch.save(model.state_dict(), CHECKPOINT_PATH)
# Copy to app
# Publish the trained checkpoint and both tokenizer models into the app's
# model directory, creating it first when it does not exist yet.
import shutil

app_model_dir = 'app/models'
os.makedirs(app_model_dir, exist_ok=True)
for artifact in ('transformer_model_kk.pt', 'spm_kk.model', 'spm_en_kk.model'):
    shutil.copy(artifact, f'{app_model_dir}/{artifact}')
print("Models copied to app/models/")
530861ec37751863a11","placeholder":"","style":"IPY_MODEL_6b55ec48eb544e7fa890a413b3a8a618","value":"README.md: "}},"444138cf68e54c058b3dc517c859d315":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e558c7a282124a679d8d1ba66479ae68","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_480f629eb65344acba595da2adaca5e7","value":1}},"03973f3d8fb749d8b447ecb4d8ab925f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_144b6a863d624d0b8f2bc9215e20ecee","placeholder":"","style":"IPY_MODEL_f844a6d12a444fee9211e3231700a3b1","value":" 65.4k/? 
[00:00<00:00, 5.22MB/s]"}},"2d06b58cbc4d4b3592686575ce8cfe0e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a801924070514530861ec37751863a11":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"over
flow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6b55ec48eb544e7fa890a413b3a8a618":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e558c7a282124a679d8d1ba66479ae68":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"480f629eb65344acba595da2adaca5e7":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"144b6a863d624d0b8f2bc9215e2
0ecee":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f844a6d12a444fee9211e3231700a3b1":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2d380a9519d44a748906f7855a8347fc":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_6f549dace4d242feb8f71d4879ce8c33","IPY_MODEL_de81adc631754b92af67d75251926253","IPY_MODEL_5565e5832d2b42cd94b5d509a07f49d1"],"layout":"IPY_MODEL_c699fc7066e34809b
f0fe4ff22ef85bb"}},"6f549dace4d242feb8f71d4879ce8c33":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9b04dfd472eb4f4e84fa23becb13c5d9","placeholder":"","style":"IPY_MODEL_f04429d45a89499b902c4f9825d58b9f","value":"en-kk/test-00000-of-00001.parquet: 100%"}},"de81adc631754b92af67d75251926253":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f4b88337dba742afbf965c460579138b","max":84062,"min":0,"orientation":"horizontal","style":"IPY_MODEL_e625835b3492431ea81d0c7f932c9a6d","value":84062}},"5565e5832d2b42cd94b5d509a07f49d1":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1f0a23dbc0f744ca931e53cc4377a060","placeholder":"","style":"IPY_MODEL_c94bca4b10d14e8088ac983842a16f3e","value":" 84.1k/84.1k [00:00<00:00, 
96.4kB/s]"}},"c699fc7066e34809bf0fe4ff22ef85bb":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"9b04dfd472eb4f4e84fa23becb13c5d9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f04429d45a89499b902c4f9825d58b9f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f4b88337dba742afbf965c460579138b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e625835b3492431ea81d0c7f932c9a6d":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"1f0a23dbc0f744ca931e53cc4377a060":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c94bca4b10d14e8088ac983842a16f3e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"05f5d68df2c147ad9be40ec37d86a831":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_24e5962f9db1449e986f1016f413b76e","IPY_MODEL_4fa72adbab5c49e3908578ccd3d15eae","IPY_MODEL_a4385800086d491189765a09829f7165"],"layout":"IPY_MODEL_0f6b85ba92ee4c1395d38a8efeb0d1f2"
}},"24e5962f9db1449e986f1016f413b76e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1155a892b40744158325236140ac0b19","placeholder":"","style":"IPY_MODEL_5e699f444ce24d21ac3894941dc75149","value":"en-kk/train-00000-of-00001.parquet: 100%"}},"4fa72adbab5c49e3908578ccd3d15eae":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_6b83449e4d51445185da2df7352868d0","max":4641227,"min":0,"orientation":"horizontal","style":"IPY_MODEL_8c9bcd03a0964d69bd61c25e4f881f59","value":4641227}},"a4385800086d491189765a09829f7165":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b29f31340d9d44bdab54175cec8a6c95","placeholder":"","style":"IPY_MODEL_fd20f577ffd94493b7073e04bf0f3a07","value":" 4.64M/4.64M [00:01<00:00, 
5.63MB/s]"}},"0f6b85ba92ee4c1395d38a8efeb0d1f2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1155a892b40744158325236140ac0b19":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5e699f444ce24d21ac3894941dc75149":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6b83449e4d51445185da2df7352868d0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8c9bcd03a0964d69bd61c25e4f881f59":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"b29f31340d9d44bdab54175cec8a6c95":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fd20f577ffd94493b7073e04bf0f3a07":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1abdf7ab22f6466da2c66cede3b15ff5":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_11d1aeeaf1984725b9f1d2219156dcb6","IPY_MODEL_6570d5bc161b4c8698de14ed300dafea","IPY_MODEL_adbb70135fee468293e6f3ea2ef56beb"],"layout":"IPY_MODEL_6a4a3be9cbc4493191a892c573582ba3"
}},"11d1aeeaf1984725b9f1d2219156dcb6":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_84c2ec1f709c47a7b8faa1019893d4e4","placeholder":"","style":"IPY_MODEL_0a99e474c195480898b2c35cd127d557","value":"en-kk/validation-00000-of-00001.parquet: 100%"}},"6570d5bc161b4c8698de14ed300dafea":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_2537324c324b4c60a745c2ff78e15fa0","max":83071,"min":0,"orientation":"horizontal","style":"IPY_MODEL_5e119fe24031446c94d6fcf4f95fdf3d","value":83071}},"adbb70135fee468293e6f3ea2ef56beb":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8b57b2ae0bf842c3af36a18b62d9e015","placeholder":"","style":"IPY_MODEL_b69e8d52115a4ddcb17b03cc6f64eb09","value":" 83.1k/83.1k [00:00<00:00, 
165kB/s]"}},"6a4a3be9cbc4493191a892c573582ba3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"84c2ec1f709c47a7b8faa1019893d4e4":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0a99e474c195480898b2c35cd127d557":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2537324c324b4c60a745c2ff78e15fa0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5e119fe24031446c94d6fcf4f95fdf3d":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"8b57b2ae0bf842c3af36a18b62d9e015":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b69e8d52115a4ddcb17b03cc6f64eb09":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"941445b90fc04904b6648a5e3e3fa245":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_844642fe3adb4f01a07d7d07ba4798bc","IPY_MODEL_e068281ab48043489289c1ab2f92473e","IPY_MODEL_045011b0c96741dfada3a09a0524b0d4"],"layout":"IPY_MODEL_fd7883f2f8d349db891709dc3101a16d"}
},"844642fe3adb4f01a07d7d07ba4798bc":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b66b9e6354584286a9f1f0f4b87d1fb3","placeholder":"","style":"IPY_MODEL_c552b2ba8f5f4726a79391e229136a83","value":"Generating test split: 100%"}},"e068281ab48043489289c1ab2f92473e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_76f2c7cbd0c74af9ac85a27d85f9ea06","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_aaf549f0289f408c897035bd75e5c305","value":2000}},"045011b0c96741dfada3a09a0524b0d4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_85bb759b75404ac587c5282b284b8d3a","placeholder":"","style":"IPY_MODEL_aa449e4c8bea4d559b4f060f69ae4c25","value":" 2000/2000 [00:00<00:00, 41502.08 
examples/s]"}},"fd7883f2f8d349db891709dc3101a16d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b66b9e6354584286a9f1f0f4b87d1fb3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c552b2ba8f5f4726a79391e229136a83":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"76f2c7cbd0c74af9ac85a27d85f9ea06":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"aaf549f0289f408c897035bd75e5c305":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"85bb759b75404ac587c5282b284b8d3a":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"aa449e4c8bea4d559b4f060f69ae4c25":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a62dd6045c8c4743bb389a17b8788e73":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_01ee8d04a6194f8f83fb81212c3d1a10","IPY_MODEL_318be1fd72314dc28b62f265935318bf","IPY_MODEL_db51334d90a34a6fa253d12c7e50e54c"],"layout":"IPY_MODEL_e43487b0dcbb4ec0ae8af912f553507
4"}},"01ee8d04a6194f8f83fb81212c3d1a10":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_71db91520a20457f96a0e8ce83ba2f61","placeholder":"","style":"IPY_MODEL_53e0eb9505e14e52a9aaa3ad3d92eb7f","value":"Generating train split: 100%"}},"318be1fd72314dc28b62f265935318bf":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_98a4e533d68f4aaa8bd5e556d967ca2c","max":79927,"min":0,"orientation":"horizontal","style":"IPY_MODEL_25aa3f2b14d34a95a10d65099f044490","value":79927}},"db51334d90a34a6fa253d12c7e50e54c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2c1449b36e254ad3ae9ebf521ba9c06c","placeholder":"","style":"IPY_MODEL_048b2b884d594f44a627508114bb2653","value":" 79927/79927 [00:00<00:00, 918229.20 
examples/s]"}},"e43487b0dcbb4ec0ae8af912f5535074":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"71db91520a20457f96a0e8ce83ba2f61":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"53e0eb9505e14e52a9aaa3ad3d92eb7f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"98a4e533d68f4aaa8bd5e556d967ca2c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"25aa3f2b14d34a95a10d65099f044490":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"2c1449b36e254ad3ae9ebf521ba9c06c":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"048b2b884d594f44a627508114bb2653":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"edbe3f5f53cc490aac30f8c2a6e7cfe2":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_61e376e936464743be2c0882861bfeae","IPY_MODEL_3133af8ca0b9442aa0a863a82a0efc90","IPY_MODEL_c2f229cd45bf4dbf86efe061f43010e0"],"layout":"IPY_MODEL_9fb9e54ec0544cb5bf30fe6286ece8c
4"}},"61e376e936464743be2c0882861bfeae":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_074a6c4bedd24a379b59da7c284de342","placeholder":"","style":"IPY_MODEL_29618afe42234e1fa1e23d6ca3c9a9bf","value":"Generating validation split: 100%"}},"3133af8ca0b9442aa0a863a82a0efc90":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c6986d4b832c45fe97b5a663309067de","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_8485fbb850314eee9e0df683a18d1497","value":2000}},"c2f229cd45bf4dbf86efe061f43010e0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a52f65446e6241e18b5011ef9c55865c","placeholder":"","style":"IPY_MODEL_15ba674f577b4559a3ceb2129022bc9c","value":" 2000/2000 [00:00<00:00, 130939.02 
examples/s]"}},"9fb9e54ec0544cb5bf30fe6286ece8c4":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"074a6c4bedd24a379b59da7c284de342":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"29618afe42234e1fa1e23d6ca3c9a9bf":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c6986d4b832c45fe97b5a663309067de":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8485fbb850314eee9e0df683a18d1497":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a52f65446e6241e18b5011ef9c55865c":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"15ba674f577b4559a3ceb2129022bc9c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
|
Nepali_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","metadata":{"id":"LbVGaOODddTz"},"source":["# Nepali-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Nepali (ne) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Nepali-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"LbVGaOODddTz"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yZ7JcWyrddT1","executionInfo":{"status":"ok","timestamp":1770440337344,"user_tz":-420,"elapsed":5458,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"335f12ce-bd5b-4992-c358-b8185ac2a1b9"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"yZ7JcWyrddT1"},{"cell_type":"markdown","metadata":{"id":"u7XmZQkZddT2"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Nepali-English pairs from Opus-100."],"id":"u7XmZQkZddT2"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":659,"referenced_widgets":["a41c32e2052d441c841f4617448813c2","7ac0d88a4d9f4e0bb1d87cd80430b1e9","ee706053eaae4069a05db7e2f56ba69d","78ece472f8ea40459ea595513f5240e2","ff66d13a55e84797b188d1382bf50091","d6e1511fb8844085beb80f8bd2930ae6","cdaa2216ab1040fda24528b555b4ab0a","06740d7f569f4113a012c128edb51057","627d1d072e794b2ca7b36bf6b77eaec8","94b1dccd601f48eb939d3b70f7c3cd01","8026c53a8c2b406d9e231f528535b918","ea79c3477e64407ab92b59495a9fe7f7","0e102f99679e415baf15276596dd8f48","7cfbc4d2a5ba482bb82bf34513014bb7","6b5b9da87832435aa93fc0b35cafe144","8b40b8060e6d4beaa89aa03b5da350d8","2957929a0a594134a39e9f50972c95f5","b0b0dbef7bbf49348b38ae4135c0325f","2ac916d608ba461ea5cc41e742fc9050","8d4bd45404ea4e1d92a93500be7ecc83","a29fbfb99d584fb1a0d11f43c87d71aa","c1a20886f8824428bb20e5d6d90402dc","a55c8ff37bf64d12bdb89e635dc48a51","2d738657d18e45d5b25298e3f1cf77cb","c09344c7b9804c398159e386c0ea4793","4f48a69a3ffc4a34b5cabf6049e0cffa","c70fc8be949b4cfda2a967429531bed1","8987b208e7f146b8b53aa10e7a74f25d","99da700e76704aa2884f7734a130079a","bc6b755aeec94d1aa14524898e0e8a76","741e2559968047a5bd064f310f661aaf","cac8c251e15944f193520251d24a6e73","018f6c77133a4d1d9ce7d5a75b670e53","aa70893f207845ec9c45f5ef016f2012","43e618cd2c08491a918c3addd066e174","3d17473809be4bf186d87b77cfcd4251","80c2b1e78b444afe958d06840d0b3c22","f3d46b4dc6834a119ee3991d2fadf7a3","1d6dab275642477180f6af35d716c9c0","b7b4268a75354f6192788eb36578ad35","f5669a8b55b8462fbbe15284078ead59","a0ceda60536a4dea9417b67afe7fefdc","0388568f16a84b60b89dd9e9d5dcc094","49f0330a97ad46249d9eeef7abc23026","09ad9dd4f7ea4002a7a2c5ec941768fb","3e1aef5c69d24a83ae07961e029b97fa","a2f7309ff7104c0480d6ebdf57de9668","c8ae4f36f344437e87336203b5117ebc","8af56068d3ef48198078732f2a2daa11","0d106be8be624440bf53600a515aa21e","742154eadf264acabf813
f2115942644","f1d955df3d2a4631866f30dee3a9ad4e","f18bc0bf195949a3a6580fba2ae4d4a4","ee7f47b44b4040619a4393afc6220774","fe9b3042222d4db2b0f381fa5634fa94","e29b2e2d761347eb96cdbaec4534ca2c","eb60ac90d2a04fcc843a0019b397cb02","59aeec517e0e4f65a86c848eed8f798e","242fb8afaa8043c98d32a10d7bd096e4","83983989c1d54dc7933bf8b32c1873d9","06fef064adf34c49825101bb82bfa4b2","730871a915414d6daa4f4392fabea1f9","9c8acd7cd5d5478b91b2c1ab080b2178","a01b7337abd14d9d9ac44215d2c421e4","52ed7fde32214e159f3f331a44d732bc","97b08a16cac746ba8b9a4c6608fca1de","1ab6807b738244b0bc19dcd81068b3e5","b4578f21e55f4a828ba23a6fc88f3f61","26abf52639c743caae95f00dcfb6f61f","86cc3b8137b84116bfe742cb70d17d63","e2cdd9081357477ab36eaa99585d85a1","26ff241b8d8d4e5f8f6e7d88de581967","c322fea973734ca0aa51d5a89f21688e","61f98c2866af4781860c3a3421b26c24","2fdd33ca5b974bf7931b74411824f063","fdc37d1c376b4dce92f471ff453e98fb","b29682f2ebb04ef4954e7c3f7d382648"]},"id":"aEbelD3SddT2","executionInfo":{"status":"ok","timestamp":1770440353235,"user_tz":-420,"elapsed":15889,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"dff0c74d-1bdd-4e92-c834-a50ce026e880"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Nepali-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a41c32e2052d441c841f4617448813c2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-ne/test-00000-of-00001.parquet: 0%| | 0.00/93.5k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"ea79c3477e64407ab92b59495a9fe7f7"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["en-ne/train-00000-of-00001.parquet: 0%| | 0.00/23.9M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a55c8ff37bf64d12bdb89e635dc48a51"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-ne/validation-00000-of-00001.parquet: 0%| | 0.00/101k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"aa70893f207845ec9c45f5ef016f2012"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"09ad9dd4f7ea4002a7a2c5ec941768fb"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/406381 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e29b2e2d761347eb96cdbaec4534ca2c"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1ab6807b738244b0bc19dcd81068b3e5"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 410381 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 Nepali-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (Nepali-English)...\")\n","try:\n"," # Opus-100 has 'en-ne'\n"," dataset = load_dataset(\"opus100\", \"en-ne\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," # Extract data\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'ne' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'ne': item['translation']['ne'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for efficiency.\")\n","\n"," print(f\"Extracted {len(data)} Nepali-English pairs.\")\n","\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"aEbelD3SddT2"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"wkuNsm4cddT2","executionInfo":{"status":"ok","timestamp":1770440353336,"user_tz":-420,"elapsed":97,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"c8a53b82-bca6-4802-f961-49bf95c43705"},"outputs":[{"output_type":"stream","name":"stdout","text":[" ne \\\n","0 प्रयोग नगरिएको हटाउनुहोस् \n","1 साधारण थ्रेडिङ \n","2 नम्बर स्तम्भ प्रयोग गर्न पर्दाको कार्यस्थान सज... \n","3 टाइमआउट सर्भरमा जडान गर्दै । \n","4 बोधार्थ प्रतिलिपि फाँट \n","\n"," en \n","0 Remove Unused \n","1 Si_mple threading \n","2 Change the workspace layout of the screen to u... \n","3 Timeout connecting to server. 
\n","4 _Cc Field \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['ne', 'en'])\n","df['ne'] = df['ne'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['ne'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"wkuNsm4cddT2"},{"cell_type":"markdown","metadata":{"id":"0gy7106RddT2"},"source":["## 3. Tokenization"],"id":"0gy7106RddT2"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yZaZ7dr2ddT3","executionInfo":{"status":"ok","timestamp":1770440355997,"user_tz":-420,"elapsed":2659,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"f44c1dbb-bf92-4ccc-ec69-5feff5f69533"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Nepali Tokenizer...\n","Training English Tokenizer (for Nepali pair)...\n"]}],"source":["# Save texts to files\n","with open('train_ne.txt', 'w', encoding='utf-8') as f:\n"," for line in df['ne']: f.write(line + '\\n')\n","\n","with open('train_en_ne.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Nepali Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_ne.txt',\n"," model_prefix='spm_ne',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Nepali pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_ne.txt',\n"," model_prefix='spm_en_ne',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_ne.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_ne.model')"],"id":"yZaZ7dr2ddT3"},{"cell_type":"markdown","metadata":{"id":"sciub41gddT3"},"source":["## 4. 
Dataset & Model"],"id":"sciub41gddT3"},{"cell_type":"code","execution_count":5,"metadata":{"id":"KYoV5LvZddT3","executionInfo":{"status":"ok","timestamp":1770440356001,"user_tz":-420,"elapsed":2,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['ne']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"KYoV5LvZddT3"},{"cell_type":"code","execution_count":6,"metadata":{"id":"V9eoQcEsddT3","executionInfo":{"status":"ok","timestamp":1770440356004,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"V9eoQcEsddT3"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"BnI01iQ7ddT3","executionInfo":{"status":"ok","timestamp":1770440617090,"user_tz":-420,"elapsed":261083,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"0754250e-37de-4e98-f94b-1d45083c67cf"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting 
Training...\n","Step 0, Loss: 9.176\n","Step 100, Loss: 6.588\n","Step 200, Loss: 6.126\n","Step 300, Loss: 6.147\n","Step 400, Loss: 5.600\n","Step 500, Loss: 5.748\n","Step 600, Loss: 5.715\n","Step 700, Loss: 5.169\n","Epoch 1 Loss: 5.846\n","Step 0, Loss: 4.846\n","Step 100, Loss: 4.999\n","Step 200, Loss: 4.959\n","Step 300, Loss: 4.773\n","Step 400, Loss: 4.586\n","Step 500, Loss: 4.851\n","Step 600, Loss: 4.514\n","Step 700, Loss: 4.902\n","Epoch 2 Loss: 4.711\n","Step 0, Loss: 4.046\n","Step 100, Loss: 3.816\n","Step 200, Loss: 3.815\n","Step 300, Loss: 3.796\n","Step 400, Loss: 3.608\n","Step 500, Loss: 4.376\n","Step 600, Loss: 3.983\n","Step 700, Loss: 3.369\n","Epoch 3 Loss: 3.951\n","Step 0, Loss: 3.170\n","Step 100, Loss: 3.633\n","Step 200, Loss: 3.516\n","Step 300, Loss: 3.477\n","Step 400, Loss: 3.575\n","Step 500, Loss: 3.580\n","Step 600, Loss: 3.283\n","Step 700, Loss: 3.402\n","Epoch 4 Loss: 3.369\n","Step 0, Loss: 2.605\n","Step 100, Loss: 3.518\n","Step 200, Loss: 3.049\n","Step 300, Loss: 3.254\n","Step 400, Loss: 3.399\n","Step 500, Loss: 2.932\n","Step 600, Loss: 3.141\n","Step 700, Loss: 2.269\n","Epoch 5 Loss: 2.937\n","Step 0, Loss: 2.661\n","Step 100, Loss: 2.003\n","Step 200, Loss: 2.601\n","Step 300, Loss: 3.214\n","Step 400, Loss: 3.140\n","Step 500, Loss: 2.526\n","Step 600, Loss: 2.761\n","Step 700, Loss: 2.669\n","Epoch 6 Loss: 2.618\n","Step 0, Loss: 2.305\n","Step 100, Loss: 2.072\n","Step 200, Loss: 2.323\n","Step 300, Loss: 2.058\n","Step 400, Loss: 2.571\n","Step 500, Loss: 2.387\n","Step 600, Loss: 2.445\n","Step 700, Loss: 2.105\n","Epoch 7 Loss: 2.375\n","Step 0, Loss: 1.901\n","Step 100, Loss: 2.535\n","Step 200, Loss: 2.590\n","Step 300, Loss: 2.259\n","Step 400, Loss: 2.060\n","Step 500, Loss: 2.945\n","Step 600, Loss: 2.238\n","Step 700, Loss: 2.150\n","Epoch 8 Loss: 2.181\n","Step 0, Loss: 2.019\n","Step 100, Loss: 2.238\n","Step 200, Loss: 1.810\n","Step 300, Loss: 1.984\n","Step 400, Loss: 2.144\n","Step 500, Loss: 
1.977\n","Step 600, Loss: 2.094\n","Step 700, Loss: 1.831\n","Epoch 9 Loss: 2.025\n","Step 0, Loss: 1.937\n","Step 100, Loss: 1.998\n","Step 200, Loss: 2.205\n","Step 300, Loss: 1.905\n","Step 400, Loss: 1.929\n","Step 500, Loss: 1.959\n","Step 600, Loss: 2.074\n","Step 700, Loss: 1.576\n","Epoch 10 Loss: 1.891\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_ne.pt')"],"id":"BnI01iQ7ddT3"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kglUseModdT3","executionInfo":{"status":"ok","timestamp":1770440617119,"user_tz":-420,"elapsed":26,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"aafa9beb-3d49-45cd-8026-ff5d5f584dff"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_ne.pt', 'app/models/transformer_model_ne.pt')\n","shutil.copy('spm_ne.model', 'app/models/spm_ne.model')\n","shutil.copy('spm_en_ne.model', 'app/models/spm_en_ne.model')\n","print(\"Models copied to 
app/models/\")"],"id":"kglUseModdT3"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"L4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"a41c32e2052d441c841f4617448813c2":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_7ac0d88a4d9f4e0bb1d87cd80430b1e9","IPY_MODEL_ee706053eaae4069a05db7e2f56ba69d","IPY_MODEL_78ece472f8ea40459ea595513f5240e2"],"layout":"IPY_MODEL_ff66d13a55e84797b188d1382bf50091"}},"7ac0d88a4d9f4e0bb1d87cd80430b1e9":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d6e1511fb8844085beb80f8bd2930ae6","placeholder":"","style":"IPY_MODEL_cdaa2216ab1040fda24528b555b4ab0a","value":"README.md: 
"}},"ee706053eaae4069a05db7e2f56ba69d":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_06740d7f569f4113a012c128edb51057","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_627d1d072e794b2ca7b36bf6b77eaec8","value":1}},"78ece472f8ea40459ea595513f5240e2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_94b1dccd601f48eb939d3b70f7c3cd01","placeholder":"","style":"IPY_MODEL_8026c53a8c2b406d9e231f528535b918","value":" 65.4k/? 
[00:00<00:00, 7.17MB/s]"}},"ff66d13a55e84797b188d1382bf50091":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d6e1511fb8844085beb80f8bd2930ae6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"over
flow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cdaa2216ab1040fda24528b555b4ab0a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"06740d7f569f4113a012c128edb51057":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"627d1d072e794b2ca7b36bf6b77eaec8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"94b1dccd601f48eb939d3b70f7c
3cd01":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8026c53a8c2b406d9e231f528535b918":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ea79c3477e64407ab92b59495a9fe7f7":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0e102f99679e415baf15276596dd8f48","IPY_MODEL_7cfbc4d2a5ba482bb82bf34513014bb7","IPY_MODEL_6b5b9da87832435aa93fc0b35cafe144"],"layout":"IPY_MODEL_8b40b8060e6d4beaa
89aa03b5da350d8"}},"0e102f99679e415baf15276596dd8f48":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2957929a0a594134a39e9f50972c95f5","placeholder":"","style":"IPY_MODEL_b0b0dbef7bbf49348b38ae4135c0325f","value":"en-ne/test-00000-of-00001.parquet: 100%"}},"7cfbc4d2a5ba482bb82bf34513014bb7":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_2ac916d608ba461ea5cc41e742fc9050","max":93474,"min":0,"orientation":"horizontal","style":"IPY_MODEL_8d4bd45404ea4e1d92a93500be7ecc83","value":93474}},"6b5b9da87832435aa93fc0b35cafe144":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a29fbfb99d584fb1a0d11f43c87d71aa","placeholder":"","style":"IPY_MODEL_c1a20886f8824428bb20e5d6d90402dc","value":" 93.5k/93.5k [00:00<00:00, 
179kB/s]"}},"8b40b8060e6d4beaa89aa03b5da350d8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2957929a0a594134a39e9f50972c95f5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b0b0dbef7bbf49348b38ae4135c0325f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2ac916d608ba461ea5cc41e742fc9050":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8d4bd45404ea4e1d92a93500be7ecc83":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a29fbfb99d584fb1a0d11f43c87d71aa":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c1a20886f8824428bb20e5d6d90402dc":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a55c8ff37bf64d12bdb89e635dc48a51":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_2d738657d18e45d5b25298e3f1cf77cb","IPY_MODEL_c09344c7b9804c398159e386c0ea4793","IPY_MODEL_4f48a69a3ffc4a34b5cabf6049e0cffa"],"layout":"IPY_MODEL_c70fc8be949b4cfda2a967429531bed1"}
},"2d738657d18e45d5b25298e3f1cf77cb":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8987b208e7f146b8b53aa10e7a74f25d","placeholder":"","style":"IPY_MODEL_99da700e76704aa2884f7734a130079a","value":"en-ne/train-00000-of-00001.parquet: 100%"}},"c09344c7b9804c398159e386c0ea4793":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_bc6b755aeec94d1aa14524898e0e8a76","max":23913307,"min":0,"orientation":"horizontal","style":"IPY_MODEL_741e2559968047a5bd064f310f661aaf","value":23913307}},"4f48a69a3ffc4a34b5cabf6049e0cffa":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cac8c251e15944f193520251d24a6e73","placeholder":"","style":"IPY_MODEL_018f6c77133a4d1d9ce7d5a75b670e53","value":" 23.9M/23.9M [00:00<00:00, 
12.0MB/s]"}},"c70fc8be949b4cfda2a967429531bed1":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8987b208e7f146b8b53aa10e7a74f25d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"99da700e76704aa2884f7734a130079a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"bc6b755aeec94d1aa14524898e0e8a76":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"741e2559968047a5bd064f310f661aaf":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"cac8c251e15944f193520251d24a6e73":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"018f6c77133a4d1d9ce7d5a75b670e53":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"aa70893f207845ec9c45f5ef016f2012":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_43e618cd2c08491a918c3addd066e174","IPY_MODEL_3d17473809be4bf186d87b77cfcd4251","IPY_MODEL_80c2b1e78b444afe958d06840d0b3c22"],"layout":"IPY_MODEL_f3d46b4dc6834a119ee3991d2fadf7a3"
}},"43e618cd2c08491a918c3addd066e174":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1d6dab275642477180f6af35d716c9c0","placeholder":"","style":"IPY_MODEL_b7b4268a75354f6192788eb36578ad35","value":"en-ne/validation-00000-of-00001.parquet: 100%"}},"3d17473809be4bf186d87b77cfcd4251":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f5669a8b55b8462fbbe15284078ead59","max":100742,"min":0,"orientation":"horizontal","style":"IPY_MODEL_a0ceda60536a4dea9417b67afe7fefdc","value":100742}},"80c2b1e78b444afe958d06840d0b3c22":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0388568f16a84b60b89dd9e9d5dcc094","placeholder":"","style":"IPY_MODEL_49f0330a97ad46249d9eeef7abc23026","value":" 101k/101k [00:00<00:00, 
128kB/s]"}},"f3d46b4dc6834a119ee3991d2fadf7a3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1d6dab275642477180f6af35d716c9c0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b7b4268a75354f6192788eb36578ad35":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f5669a8b55b8462fbbe15284078ead59":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a0ceda60536a4dea9417b67afe7fefdc":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"0388568f16a84b60b89dd9e9d5dcc094":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"49f0330a97ad46249d9eeef7abc23026":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"09ad9dd4f7ea4002a7a2c5ec941768fb":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_3e1aef5c69d24a83ae07961e029b97fa","IPY_MODEL_a2f7309ff7104c0480d6ebdf57de9668","IPY_MODEL_c8ae4f36f344437e87336203b5117ebc"],"layout":"IPY_MODEL_8af56068d3ef48198078732f2a2daa11"}
},"3e1aef5c69d24a83ae07961e029b97fa":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0d106be8be624440bf53600a515aa21e","placeholder":"","style":"IPY_MODEL_742154eadf264acabf813f2115942644","value":"Generating test split: 100%"}},"a2f7309ff7104c0480d6ebdf57de9668":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f1d955df3d2a4631866f30dee3a9ad4e","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_f18bc0bf195949a3a6580fba2ae4d4a4","value":2000}},"c8ae4f36f344437e87336203b5117ebc":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_ee7f47b44b4040619a4393afc6220774","placeholder":"","style":"IPY_MODEL_fe9b3042222d4db2b0f381fa5634fa94","value":" 2000/2000 [00:00<00:00, 55864.09 
examples/s]"}},"8af56068d3ef48198078732f2a2daa11":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0d106be8be624440bf53600a515aa21e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"742154eadf264acabf813f2115942644":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f1d955df3d2a4631866f30dee3a9ad4e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f18bc0bf195949a3a6580fba2ae4d4a4":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"ee7f47b44b4040619a4393afc6220774":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fe9b3042222d4db2b0f381fa5634fa94":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e29b2e2d761347eb96cdbaec4534ca2c":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_eb60ac90d2a04fcc843a0019b397cb02","IPY_MODEL_59aeec517e0e4f65a86c848eed8f798e","IPY_MODEL_242fb8afaa8043c98d32a10d7bd096e4"],"layout":"IPY_MODEL_83983989c1d54dc7933bf8b32c1873d
9"}},"eb60ac90d2a04fcc843a0019b397cb02":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_06fef064adf34c49825101bb82bfa4b2","placeholder":"","style":"IPY_MODEL_730871a915414d6daa4f4392fabea1f9","value":"Generating train split: 100%"}},"59aeec517e0e4f65a86c848eed8f798e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_9c8acd7cd5d5478b91b2c1ab080b2178","max":406381,"min":0,"orientation":"horizontal","style":"IPY_MODEL_a01b7337abd14d9d9ac44215d2c421e4","value":406381}},"242fb8afaa8043c98d32a10d7bd096e4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_52ed7fde32214e159f3f331a44d732bc","placeholder":"","style":"IPY_MODEL_97b08a16cac746ba8b9a4c6608fca1de","value":" 406381/406381 [00:00<00:00, 1711431.02 
examples/s]"}},"83983989c1d54dc7933bf8b32c1873d9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"06fef064adf34c49825101bb82bfa4b2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"730871a915414d6daa4f4392fabea1f9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9c8acd7cd5d5478b91b2c1ab080b2178":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a01b7337abd14d9d9ac44215d2c421e4":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"52ed7fde32214e159f3f331a44d732bc":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"97b08a16cac746ba8b9a4c6608fca1de":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1ab6807b738244b0bc19dcd81068b3e5":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_b4578f21e55f4a828ba23a6fc88f3f61","IPY_MODEL_26abf52639c743caae95f00dcfb6f61f","IPY_MODEL_86cc3b8137b84116bfe742cb70d17d63"],"layout":"IPY_MODEL_e2cdd9081357477ab36eaa99585d85a
1"}},"b4578f21e55f4a828ba23a6fc88f3f61":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_26ff241b8d8d4e5f8f6e7d88de581967","placeholder":"","style":"IPY_MODEL_c322fea973734ca0aa51d5a89f21688e","value":"Generating validation split: 100%"}},"26abf52639c743caae95f00dcfb6f61f":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_61f98c2866af4781860c3a3421b26c24","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_2fdd33ca5b974bf7931b74411824f063","value":2000}},"86cc3b8137b84116bfe742cb70d17d63":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_fdc37d1c376b4dce92f471ff453e98fb","placeholder":"","style":"IPY_MODEL_b29682f2ebb04ef4954e7c3f7d382648","value":" 2000/2000 [00:00<00:00, 163374.13 
examples/s]"}},"e2cdd9081357477ab36eaa99585d85a1":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"26ff241b8d8d4e5f8f6e7d88de581967":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c322fea973734ca0aa51d5a89f21688e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"61f98c2866af4781860c3a3421b26c24":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2fdd33ca5b974bf7931b74411824f063":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"fdc37d1c376b4dce92f471ff453e98fb":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b29682f2ebb04ef4954e7c3f7d382648":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
|
README.md
CHANGED
|
@@ -1,10 +1,74 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
---
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: My Translator by Ko Ko
|
| 3 |
+
emoji: 🌍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
+
# Multilingual Neural Machine Translation (Project A3)
|
| 10 |
|
| 11 |
+
**Developed by:** Htut Ko Ko (st126010)
|
| 12 |
+
|
| 13 |
+
* 👉 **Live App**: [huggingface.co/spaces/shadowsilence/burmese-english-translator](https://huggingface.co/spaces/shadowsilence/burmese-english-translator)
|
| 14 |
+
|
| 15 |
+
This project implements high-quality machine translation systems for multiple languages (Burmese, Thai, Chinese, Vietnamese, Hindi, Nepali, Urdu, Tagalog, Kazakh, Bengali, German) to English using two approaches:
|
| 16 |
+
|
| 17 |
+
1. **Fine-Tuned NLLB-200**: State-of-the-art multilingual model tailored for high-quality translation across all supported languages.
|
| 18 |
+
2. **Transformer from Scratch**: Educational implementation to demonstrate understanding of NMT architecture.
|
| 19 |
+
|
| 20 |
+
## Experiments
|
| 21 |
+
|
| 22 |
+

|
| 23 |
+
|
| 24 |
+
### Attention Mechanisms (Burmese-English)
|
| 25 |
+
|
| 26 |
+
I compared **General (Dot Product)** and **Additive (Bahdanau)** attention mechanisms using a Seq2Seq GRU model.
|
| 27 |
+
|
| 28 |
+
| Attention Mechanism | Training Loss | Training PPL | Validation Loss | Validation PPL |
|
| 29 |
+
| ----------------------------- | --------------- | ---------------- | --------------- | ----------------- |
|
| 30 |
+
| General (Dot) | 4.819 | 123.868 | 6.662 | 782.166 |
|
| 31 |
+
| **Additive (Bahdanau)** | **4.447** | **85.368** | **6.440** | **626.673** |
|
| 32 |
+
|
| 33 |
+
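**Observation:** Additive (Bahdanau) attention achieved lower training and validation perplexity than General (Dot Product) attention (validation PPL 626.673 vs. 782.166), indicating better performance on this task.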
|
| 34 |
+
|
| 35 |
+
## Demo
|
| 36 |
+
|
| 37 |
+

|
| 38 |
+
|
| 39 |
+
## Folder Structure
|
| 40 |
+
|
| 41 |
+
- `Burmese_English_NLLB.ipynb`: **(Recommended)** Fine-Tuning NLLB for high-quality translation.
|
| 42 |
+
- `Burmese_English_Transformer.ipynb`: Transformer from Scratch implementation for Burmese-English.
|
| 43 |
+
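- `*_English_Transformer.ipynb`: Transformer implementations for the other foreign languages (the languages of AIT students) paired with English, one notebook per language (e.g. `Thai_English_Transformer.ipynb`).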
|
| 44 |
+
- `Attention_Experiments.ipynb`: Comparison of General vs. Additive Attention (Burmese-English).
|
| 45 |
+
- `app/`: Web Application folder.
|
| 46 |
+
- `app.py`: Flask application supporting multiple languages.
|
| 47 |
+
- `nllb_model/`: Fine-tuned NLLB model.
|
| 48 |
+
|
| 49 |
+
## How to Run Locally
|
| 50 |
+
|
| 51 |
+
### 1. Requirements
|
| 52 |
+
|
| 53 |
+
Install dependencies:
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
cd app
|
| 57 |
+
pip install -r requirements.txt
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### 2. Run the App
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
python app.py
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
Then open `http://localhost:5001` in your browser.
|
| 67 |
+
|
| 68 |
+
## Credits & Acknowledgements
|
| 69 |
+
|
| 70 |
+
This project respects the academic integrity and usage policies of the following resources:
|
| 71 |
+
|
| 72 |
+
- **Datasets**: [Asian Language Treebank (ALT)](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/), [Opus-100](https://opus.nlpl.eu/)
|
| 73 |
+
- **Base Model**: [NLLB-200](https://ai.meta.com/research/no-language-left-behind/) by Meta AI.
|
| 74 |
+
- **Tokenization**: [SentencePiece](https://github.com/google/sentencepiece) by Google.
|
Tagalog_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","metadata":{"id":"JLkDXbmyi7DI"},"source":["# Tagalog-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Tagalog (tl) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **ALT (Asian Language Treebank)** dataset for Tagalog-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"JLkDXbmyi7DI"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"h_1v3NHui7DJ","executionInfo":{"status":"ok","timestamp":1770441680901,"user_tz":-420,"elapsed":11371,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"1afa5161-47f7-4df9-95dd-f7f8aa105a3d"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"h_1v3NHui7DJ"},{"cell_type":"markdown","metadata":{"id":"IPLDi898i7DK"},"source":["## 2. 
Data Loading (ALT Dataset)\n","Loading Tagalog-English pairs from ALT."],"id":"IPLDi898i7DK"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":713,"referenced_widgets":["3bea3f3ba71840d08d15d89cd9b1942f","7260b34547704aa1a25ce5764326d84a","99a8c2e3e5b348ffa3398f3f5fa7feeb","ad978d099dff43899c4c2b27f5965d96","152028b582394ed1aa179fe7e49a71e5","66fd2df97a514b4abf8c495215a0c40d","1de3bb240c4e48299b80c32479f7236f","e0a682bc6a954368bab0b5cd24afe0bc","5bc80f4a727a4a2f89048592923819f2","ac63742075ee4a8aacde692dd878949d","602c57471f0a4ee9b2c1a62b50c81b30","9cb7f8df246f47718c3088bd75b4f615","9b9b1352c1eb45cb87795854e0ffdfc1","bcf36300957947c2ac51b1bf1050156e","46a5329ffe0f41288446476b3c1e02f3","7c4627887bbb42388cf1902e76eb483b","9f23df2a6c234262ad7870e5cbbe68a0","f7f91e01fc0d4c5fa76734c051a68815","db1a0ccf2a7a4171957399db5adc5ba4","628845fcf7da4e4f8030a019e44f5a88","e1b84510fd384d2eb17c23a8dc216e68","7ee522e7a7ec411aa178c1f2f4d487cf","2dd50fb2684d49c684d0196ad5d6e065","8288e34b12704fa880150517a686a9e4","dc201daf7f9549cf91806bc409d68c6c","8db637dc8ee24def8ee31ffb1d63d0b2","6cebb045b0124eb3bd151a6ae4bb5230","3bc431e04c444df081ff754a0cb6c6b5","568b2f1130994f278776e45bcd939a28","f88dfe8d492f46518bdf554b8a94131d","bff35403467b42a88d0bca8f2919065a","881ab236fe8e429c883603d3de9528ab","25368c07e6724131a2f8e5395707d668","9d7773e115654ad9b75a7ad44f502f4a","040d4b5b41c94a2bb1f36e3296b3bdd4","c21d3525924c47979813f514fcbb949e","9f707859d1b447b9bf2b400bf713bb51","e78ea5453d714b1fbfde8466abf9d683","3167bcf1029e4ab2b0e1ac9c777ced6c","e01b0c927e4b4456a83f1360ee6debcc","3b2aad8a0cdb4bb4866b89e49484e75f","871f6c7ca002447cb4616ae764bcf996","e15f751bbcbd4f53b4d756da399d6d9c","2e2167c120244bfba2a600cd11bea24c","0fc80220974d42e383a059fb7fb4cac4","93a42a372e0e470ab0172a5c86047b78","48f749ac01c7414786abb9e76bfde0b4","0aec42c7404b42a4bda73ba4ab759576","d0d1f6343b014c769ba95945fd399365","d6f0b3924aa048f7955cd9a1ad391106","ba639735e8594aed937e83
8652efc534","b96937a130a745ec99861af1ca8b7a97","35179739868040d1ba7d542f450f04da","78ce17d6759143faa0fecb35ce77dbaf","c388f8c8f7454befa4ef800ca881ab69","6dbf026b6dd541e8a616611f88d0fdd2","190d05dede1848f8ac868a8fd8c4f00b","740e1171bb4842ebac7f6dbdd7ee9f01","217fd7ebc0b5431f918656dd8c681243","a27cf14d24b94adfbbffaa816c1ce491","cdb53cc2c0e14e04aceb8faef270e136","d32ae0c469a242faaafeee4f7d14a17b","266ef0735e414b8a88e1333f91cfb615","eb1caf07bdb54ead8026a1dc689105fb","55e32e2868c6429687ec74c79f226386","2220cdff006d48e4927d4a80f3912cf9","ceec10aebf0645cdbbb06950685ca7d9","893ec2094c7846c0b36815fe2ed1fcc8","5366d27cd0654132b39b175e6e7a50d9","7b4877c7b2a1410fb7aa243edc77dd15","38d4f3d3233b46608e21a2889b5be2f8","c6d608c9fce94163a951e4651c7f30b3","21c72058ee9a472283a517619379dd77","88329550b2ab488c9a28527399de5e77","f14af4590e09400ea39e5406668afe53","327c428abb4443849591e08a144972ae","66591ba93f4943528bebe7d0bb7b8fd6"]},"id":"MJCOAcXPi7DK","executionInfo":{"status":"ok","timestamp":1770441690479,"user_tz":-420,"elapsed":9582,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"c0772e60-279d-43ac-ea7b-e3ab59d4cf59"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading ALT Dataset...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3bea3f3ba71840d08d15d89cd9b1942f"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["alt-parallel/train-00000-of-00001.parque(…): 0%| | 0.00/31.2M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9cb7f8df246f47718c3088bd75b4f615"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/validation-00000-of-00001.p(…): 0%| | 0.00/1.71M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"2dd50fb2684d49c684d0196ad5d6e065"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/test-00000-of-00001.parquet: 0%| | 0.00/1.79M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9d7773e115654ad9b75a7ad44f502f4a"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/18088 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"0fc80220974d42e383a059fb7fb4cac4"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/1000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6dbf026b6dd541e8a616611f88d0fdd2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/1019 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"ceec10aebf0645cdbbb06950685ca7d9"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 20107 sentences from ALT.\n","Extracted 20107 Tagalog-English pairs.\n"]}],"source":["print(\"Loading ALT Dataset...\")\n","try:\n"," # ALT has 'fil' for Filipino (Tagalog)\n"," dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from ALT.\")\n","\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'fil' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'tl': item['translation']['fil'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," print(f\"Extracted {len(data)} Tagalog-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"MJCOAcXPi7DK"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"C8jIYNA3i7DL","executionInfo":{"status":"ok","timestamp":1770441690484,"user_tz":-420,"elapsed":4,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"bc395897-349b-4b9a-c491-f185332e22ae"},"outputs":[{"output_type":"stream","name":"stdout","text":[" tl \\\n","0 Natalo ng Italya ang Portugal sa puntos na 31-... \n","1 Si Andrea Masi ang nagsimula na makapuntos sa ... \n","2 Sa kabila ng pagmamanipula sa unang kalahati n... \n","3 Hindi sumuko ang Portugal at si David Penalva ... \n","4 Nanguna ang Italya sa puntos na 16-5 sa kalagi... \n","\n"," en \n","0 Italy have defeated Portugal 31-5 in Pool C of... \n","1 Andrea Masi opened the scoring in the fourth m... \n","2 Despite controlling the game for much of the f... \n","3 Portugal never gave up and David Penalva score... \n","4 Italy led 16-5 at half time but were matched b... 
\n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['tl', 'en'])\n","df['tl'] = df['tl'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['tl'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"C8jIYNA3i7DL"},{"cell_type":"markdown","metadata":{"id":"cQGT4HOti7DL"},"source":["## 3. Tokenization"],"id":"cQGT4HOti7DL"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tbPUUwtOi7DL","executionInfo":{"status":"ok","timestamp":1770441695072,"user_tz":-420,"elapsed":4587,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"53ad1aaa-20df-44e3-c917-9b41496b81cc"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Tagalog Tokenizer...\n","Training English Tokenizer (for Tagalog pair)...\n"]}],"source":["# Save texts to files\n","with open('train_tl.txt', 'w', encoding='utf-8') as f:\n"," for line in df['tl']: f.write(line + '\\n')\n","\n","with open('train_en_tl.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Tagalog Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_tl.txt',\n"," model_prefix='spm_tl',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Tagalog pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_tl.txt',\n"," model_prefix='spm_en_tl',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_tl.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_tl.model')"],"id":"tbPUUwtOi7DL"},{"cell_type":"markdown","metadata":{"id":"TB-giK3ei7DL"},"source":["## 4. 
Dataset & Model"],"id":"TB-giK3ei7DL"},{"cell_type":"code","execution_count":5,"metadata":{"id":"y4IZvtrGi7DL","executionInfo":{"status":"ok","timestamp":1770441695079,"user_tz":-420,"elapsed":3,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['tl']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"y4IZvtrGi7DL"},{"cell_type":"code","execution_count":6,"metadata":{"id":"aiJSJu5Ui7DM","executionInfo":{"status":"ok","timestamp":1770441695091,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"aiJSJu5Ui7DM"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"EL4FOyYli7DM","executionInfo":{"status":"ok","timestamp":1770443127273,"user_tz":-420,"elapsed":1432179,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"f969203b-beb2-4c51-89c9-791964ccc429"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting 
Training...\n","Step 0, Loss: 9.210\n","Step 100, Loss: 6.797\n","Step 200, Loss: 6.489\n","Step 300, Loss: 6.381\n","Epoch 1 Loss: 6.725\n","Step 0, Loss: 6.260\n","Step 100, Loss: 5.953\n","Step 200, Loss: 5.918\n","Step 300, Loss: 5.736\n","Epoch 2 Loss: 6.015\n","Step 0, Loss: 5.615\n","Step 100, Loss: 5.594\n","Step 200, Loss: 5.530\n","Step 300, Loss: 5.515\n","Epoch 3 Loss: 5.589\n","Step 0, Loss: 5.190\n","Step 100, Loss: 5.415\n","Step 200, Loss: 5.166\n","Step 300, Loss: 5.139\n","Epoch 4 Loss: 5.258\n","Step 0, Loss: 5.087\n","Step 100, Loss: 5.019\n","Step 200, Loss: 5.036\n","Step 300, Loss: 4.975\n","Epoch 5 Loss: 5.001\n","Step 0, Loss: 4.884\n","Step 100, Loss: 4.778\n","Step 200, Loss: 4.828\n","Step 300, Loss: 4.676\n","Epoch 6 Loss: 4.792\n","Step 0, Loss: 4.523\n","Step 100, Loss: 4.596\n","Step 200, Loss: 4.639\n","Step 300, Loss: 4.588\n","Epoch 7 Loss: 4.617\n","Step 0, Loss: 4.536\n","Step 100, Loss: 4.275\n","Step 200, Loss: 4.409\n","Step 300, Loss: 4.398\n","Epoch 8 Loss: 4.461\n","Step 0, Loss: 4.208\n","Step 100, Loss: 4.340\n","Step 200, Loss: 4.500\n","Step 300, Loss: 4.374\n","Epoch 9 Loss: 4.323\n","Step 0, Loss: 4.190\n","Step 100, Loss: 4.173\n","Step 200, Loss: 4.264\n","Step 300, Loss: 4.224\n","Epoch 10 Loss: 4.204\n","Step 0, Loss: 3.999\n","Step 100, Loss: 4.082\n","Step 200, Loss: 4.328\n","Step 300, Loss: 4.229\n","Epoch 11 Loss: 4.094\n","Step 0, Loss: 3.885\n","Step 100, Loss: 3.996\n","Step 200, Loss: 4.035\n","Step 300, Loss: 4.102\n","Epoch 12 Loss: 3.989\n","Step 0, Loss: 3.838\n","Step 100, Loss: 3.867\n","Step 200, Loss: 3.930\n","Step 300, Loss: 3.849\n","Epoch 13 Loss: 3.897\n","Step 0, Loss: 3.722\n","Step 100, Loss: 3.670\n","Step 200, Loss: 3.909\n","Step 300, Loss: 3.918\n","Epoch 14 Loss: 3.810\n","Step 0, Loss: 3.634\n","Step 100, Loss: 3.624\n","Step 200, Loss: 3.850\n","Step 300, Loss: 3.949\n","Epoch 15 Loss: 3.730\n","Step 0, Loss: 3.555\n","Step 100, Loss: 3.679\n","Step 200, Loss: 3.749\n","Step 300, 
Loss: 3.787\n","Epoch 16 Loss: 3.656\n","Step 0, Loss: 3.512\n","Step 100, Loss: 3.532\n","Step 200, Loss: 3.613\n","Step 300, Loss: 3.729\n","Epoch 17 Loss: 3.582\n","Step 0, Loss: 3.320\n","Step 100, Loss: 3.593\n","Step 200, Loss: 3.603\n","Step 300, Loss: 3.622\n","Epoch 18 Loss: 3.518\n","Step 0, Loss: 3.346\n","Step 100, Loss: 3.409\n","Step 200, Loss: 3.498\n","Step 300, Loss: 3.355\n","Epoch 19 Loss: 3.456\n","Step 0, Loss: 3.229\n","Step 100, Loss: 3.335\n","Step 200, Loss: 3.442\n","Step 300, Loss: 3.505\n","Epoch 20 Loss: 3.395\n","Step 0, Loss: 3.102\n","Step 100, Loss: 3.359\n","Step 200, Loss: 3.311\n","Step 300, Loss: 3.466\n","Epoch 21 Loss: 3.342\n","Step 0, Loss: 3.045\n","Step 100, Loss: 3.320\n","Step 200, Loss: 3.197\n","Step 300, Loss: 3.455\n","Epoch 22 Loss: 3.295\n","Step 0, Loss: 3.050\n","Step 100, Loss: 3.280\n","Step 200, Loss: 3.276\n","Step 300, Loss: 3.456\n","Epoch 23 Loss: 3.243\n","Step 0, Loss: 3.122\n","Step 100, Loss: 3.209\n","Step 200, Loss: 3.183\n","Step 300, Loss: 3.135\n","Epoch 24 Loss: 3.197\n","Step 0, Loss: 2.916\n","Step 100, Loss: 3.175\n","Step 200, Loss: 3.129\n","Step 300, Loss: 3.387\n","Epoch 25 Loss: 3.149\n","Step 0, Loss: 2.970\n","Step 100, Loss: 3.263\n","Step 200, Loss: 3.150\n","Step 300, Loss: 3.108\n","Epoch 26 Loss: 3.112\n","Step 0, Loss: 2.946\n","Step 100, Loss: 3.011\n","Step 200, Loss: 3.132\n","Step 300, Loss: 3.279\n","Epoch 27 Loss: 3.073\n","Step 0, Loss: 2.808\n","Step 100, Loss: 3.053\n","Step 200, Loss: 3.114\n","Step 300, Loss: 3.123\n","Epoch 28 Loss: 3.041\n","Step 0, Loss: 2.847\n","Step 100, Loss: 2.930\n","Step 200, Loss: 3.195\n","Step 300, Loss: 3.101\n","Epoch 29 Loss: 3.000\n","Step 0, Loss: 2.866\n","Step 100, Loss: 2.908\n","Step 200, Loss: 2.950\n","Step 300, Loss: 3.234\n","Epoch 30 Loss: 2.967\n","Step 0, Loss: 2.873\n","Step 100, Loss: 2.911\n","Step 200, Loss: 2.841\n","Step 300, Loss: 2.970\n","Epoch 31 Loss: 2.930\n","Step 0, Loss: 2.749\n","Step 100, Loss: 
3.014\n","Step 200, Loss: 2.898\n","Step 300, Loss: 2.942\n","Epoch 32 Loss: 2.902\n","Step 0, Loss: 2.662\n","Step 100, Loss: 2.879\n","Step 200, Loss: 2.882\n","Step 300, Loss: 2.923\n","Epoch 33 Loss: 2.875\n","Step 0, Loss: 2.714\n","Step 100, Loss: 2.701\n","Step 200, Loss: 2.897\n","Step 300, Loss: 3.026\n","Epoch 34 Loss: 2.840\n","Step 0, Loss: 2.709\n","Step 100, Loss: 2.807\n","Step 200, Loss: 2.826\n","Step 300, Loss: 2.929\n","Epoch 35 Loss: 2.814\n","Step 0, Loss: 2.691\n","Step 100, Loss: 2.725\n","Step 200, Loss: 2.831\n","Step 300, Loss: 2.991\n","Epoch 36 Loss: 2.789\n","Step 0, Loss: 2.623\n","Step 100, Loss: 2.744\n","Step 200, Loss: 2.841\n","Step 300, Loss: 2.849\n","Epoch 37 Loss: 2.766\n","Step 0, Loss: 2.595\n","Step 100, Loss: 2.715\n","Step 200, Loss: 2.849\n","Step 300, Loss: 2.844\n","Epoch 38 Loss: 2.736\n","Step 0, Loss: 2.516\n","Step 100, Loss: 2.631\n","Step 200, Loss: 2.853\n","Step 300, Loss: 2.662\n","Epoch 39 Loss: 2.717\n","Step 0, Loss: 2.569\n","Step 100, Loss: 2.618\n","Step 200, Loss: 2.787\n","Step 300, Loss: 2.816\n","Epoch 40 Loss: 2.693\n","Step 0, Loss: 2.608\n","Step 100, Loss: 2.583\n","Step 200, Loss: 2.668\n","Step 300, Loss: 2.813\n","Epoch 41 Loss: 2.670\n","Step 0, Loss: 2.519\n","Step 100, Loss: 2.595\n","Step 200, Loss: 2.600\n","Step 300, Loss: 2.710\n","Epoch 42 Loss: 2.650\n","Step 0, Loss: 2.546\n","Step 100, Loss: 2.591\n","Step 200, Loss: 2.790\n","Step 300, Loss: 2.816\n","Epoch 43 Loss: 2.631\n","Step 0, Loss: 2.500\n","Step 100, Loss: 2.651\n","Step 200, Loss: 2.752\n","Step 300, Loss: 2.643\n","Epoch 44 Loss: 2.608\n","Step 0, Loss: 2.488\n","Step 100, Loss: 2.614\n","Step 200, Loss: 2.687\n","Step 300, Loss: 2.730\n","Epoch 45 Loss: 2.592\n","Step 0, Loss: 2.365\n","Step 100, Loss: 2.544\n","Step 200, Loss: 2.575\n","Step 300, Loss: 2.590\n","Epoch 46 Loss: 2.569\n","Step 0, Loss: 2.300\n","Step 100, Loss: 2.454\n","Step 200, Loss: 2.510\n","Step 300, Loss: 2.621\n","Epoch 47 Loss: 2.555\n","Step 0, 
Loss: 2.508\n","Step 100, Loss: 2.466\n","Step 200, Loss: 2.577\n","Step 300, Loss: 2.596\n","Epoch 48 Loss: 2.534\n","Step 0, Loss: 2.390\n","Step 100, Loss: 2.496\n","Step 200, Loss: 2.567\n","Step 300, Loss: 2.660\n","Epoch 49 Loss: 2.519\n","Step 0, Loss: 2.340\n","Step 100, Loss: 2.477\n","Step 200, Loss: 2.551\n","Step 300, Loss: 2.687\n","Epoch 50 Loss: 2.496\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(50): # 50 Epochs for ALT\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_tl.pt')"],"id":"EL4FOyYli7DM"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"G_0EQE5Vi7DM","executionInfo":{"status":"ok","timestamp":1770443127285,"user_tz":-420,"elapsed":9,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"fa2fa05e-e591-49a5-e0c0-267ce50c411b"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_tl.pt', 'app/models/transformer_model_tl.pt')\n","shutil.copy('spm_tl.model', 'app/models/spm_tl.model')\n","shutil.copy('spm_en_tl.model', 
'app/models/spm_en_tl.model')\n","print(\"Models copied to app/models/\")"],"id":"G_0EQE5Vi7DM"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"3bea3f3ba71840d08d15d89cd9b1942f":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_7260b34547704aa1a25ce5764326d84a","IPY_MODEL_99a8c2e3e5b348ffa3398f3f5fa7feeb","IPY_MODEL_ad978d099dff43899c4c2b27f5965d96"],"layout":"IPY_MODEL_152028b582394ed1aa179fe7e49a71e5"}},"7260b34547704aa1a25ce5764326d84a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_66fd2df97a514b4abf8c495215a0c40d","placeholder":"","style":"IPY_MODEL_1de3bb240c4e48299b80c32479f7236f","value":"README.md: 
"}},"99a8c2e3e5b348ffa3398f3f5fa7feeb":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e0a682bc6a954368bab0b5cd24afe0bc","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_5bc80f4a727a4a2f89048592923819f2","value":1}},"ad978d099dff43899c4c2b27f5965d96":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_ac63742075ee4a8aacde692dd878949d","placeholder":"","style":"IPY_MODEL_602c57471f0a4ee9b2c1a62b50c81b30","value":" 13.2k/? 
[00:00<00:00, 1.06MB/s]"}},"152028b582394ed1aa179fe7e49a71e5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"66fd2df97a514b4abf8c495215a0c40d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"over
flow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1de3bb240c4e48299b80c32479f7236f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e0a682bc6a954368bab0b5cd24afe0bc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"5bc80f4a727a4a2f89048592923819f2":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"ac63742075ee4a8aacde692dd87
8949d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"602c57471f0a4ee9b2c1a62b50c81b30":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9cb7f8df246f47718c3088bd75b4f615":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_9b9b1352c1eb45cb87795854e0ffdfc1","IPY_MODEL_bcf36300957947c2ac51b1bf1050156e","IPY_MODEL_46a5329ffe0f41288446476b3c1e02f3"],"layout":"IPY_MODEL_7c4627887bbb42388
cf1902e76eb483b"}},"9b9b1352c1eb45cb87795854e0ffdfc1":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9f23df2a6c234262ad7870e5cbbe68a0","placeholder":"","style":"IPY_MODEL_f7f91e01fc0d4c5fa76734c051a68815","value":"alt-parallel/train-00000-of-00001.parque(…): 100%"}},"bcf36300957947c2ac51b1bf1050156e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_db1a0ccf2a7a4171957399db5adc5ba4","max":31211167,"min":0,"orientation":"horizontal","style":"IPY_MODEL_628845fcf7da4e4f8030a019e44f5a88","value":31211167}},"46a5329ffe0f41288446476b3c1e02f3":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e1b84510fd384d2eb17c23a8dc216e68","placeholder":"","style":"IPY_MODEL_7ee522e7a7ec411aa178c1f2f4d487cf","value":" 31.2M/31.2M [00:01<00:00, 
17.0MB/s]"}},"7c4627887bbb42388cf1902e76eb483b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"9f23df2a6c234262ad7870e5cbbe68a0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f7f91e01fc0d4c5fa76734c051a68815":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"db1a0ccf2a7a4171957399db5adc5ba4":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"628845fcf7da4e4f8030a019e44f5a88":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e1b84510fd384d2eb17c23a8dc216e68":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7ee522e7a7ec411aa178c1f2f4d487cf":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2dd50fb2684d49c684d0196ad5d6e065":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_8288e34b12704fa880150517a686a9e4","IPY_MODEL_dc201daf7f9549cf91806bc409d68c6c","IPY_MODEL_8db637dc8ee24def8ee31ffb1d63d0b2"],"layout":"IPY_MODEL_6cebb045b0124eb3bd151a6ae4bb5230"
}},"8288e34b12704fa880150517a686a9e4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3bc431e04c444df081ff754a0cb6c6b5","placeholder":"","style":"IPY_MODEL_568b2f1130994f278776e45bcd939a28","value":"alt-parallel/validation-00000-of-00001.p(…): 100%"}},"dc201daf7f9549cf91806bc409d68c6c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f88dfe8d492f46518bdf554b8a94131d","max":1710203,"min":0,"orientation":"horizontal","style":"IPY_MODEL_bff35403467b42a88d0bca8f2919065a","value":1710203}},"8db637dc8ee24def8ee31ffb1d63d0b2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_881ab236fe8e429c883603d3de9528ab","placeholder":"","style":"IPY_MODEL_25368c07e6724131a2f8e5395707d668","value":" 1.71M/1.71M [00:00<00:00, 
3.30MB/s]"}},"6cebb045b0124eb3bd151a6ae4bb5230":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3bc431e04c444df081ff754a0cb6c6b5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"568b2f1130994f278776e45bcd939a28":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f88dfe8d492f46518bdf554b8a94131d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bff35403467b42a88d0bca8f2919065a":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"881ab236fe8e429c883603d3de9528ab":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"25368c07e6724131a2f8e5395707d668":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9d7773e115654ad9b75a7ad44f502f4a":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_040d4b5b41c94a2bb1f36e3296b3bdd4","IPY_MODEL_c21d3525924c47979813f514fcbb949e","IPY_MODEL_9f707859d1b447b9bf2b400bf713bb51"],"layout":"IPY_MODEL_e78ea5453d714b1fbfde8466abf9d683"
}},"040d4b5b41c94a2bb1f36e3296b3bdd4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3167bcf1029e4ab2b0e1ac9c777ced6c","placeholder":"","style":"IPY_MODEL_e01b0c927e4b4456a83f1360ee6debcc","value":"alt-parallel/test-00000-of-00001.parquet: 100%"}},"c21d3525924c47979813f514fcbb949e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3b2aad8a0cdb4bb4866b89e49484e75f","max":1786537,"min":0,"orientation":"horizontal","style":"IPY_MODEL_871f6c7ca002447cb4616ae764bcf996","value":1786537}},"9f707859d1b447b9bf2b400bf713bb51":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e15f751bbcbd4f53b4d756da399d6d9c","placeholder":"","style":"IPY_MODEL_2e2167c120244bfba2a600cd11bea24c","value":" 1.79M/1.79M [00:00<00:00, 
3.94MB/s]"}},"e78ea5453d714b1fbfde8466abf9d683":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3167bcf1029e4ab2b0e1ac9c777ced6c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e01b0c927e4b4456a83f1360ee6debcc":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3b2aad8a0cdb4bb4866b89e49484e75f":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"871f6c7ca002447cb4616ae764bcf996":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e15f751bbcbd4f53b4d756da399d6d9c":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2e2167c120244bfba2a600cd11bea24c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"0fc80220974d42e383a059fb7fb4cac4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_93a42a372e0e470ab0172a5c86047b78","IPY_MODEL_48f749ac01c7414786abb9e76bfde0b4","IPY_MODEL_0aec42c7404b42a4bda73ba4ab759576"],"layout":"IPY_MODEL_d0d1f6343b014c769ba95945fd399365"
}},"93a42a372e0e470ab0172a5c86047b78":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d6f0b3924aa048f7955cd9a1ad391106","placeholder":"","style":"IPY_MODEL_ba639735e8594aed937e838652efc534","value":"Generating train split: 100%"}},"48f749ac01c7414786abb9e76bfde0b4":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_b96937a130a745ec99861af1ca8b7a97","max":18088,"min":0,"orientation":"horizontal","style":"IPY_MODEL_35179739868040d1ba7d542f450f04da","value":18088}},"0aec42c7404b42a4bda73ba4ab759576":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_78ce17d6759143faa0fecb35ce77dbaf","placeholder":"","style":"IPY_MODEL_c388f8c8f7454befa4ef800ca881ab69","value":" 18088/18088 [00:00<00:00, 58062.28 
examples/s]"}},"d0d1f6343b014c769ba95945fd399365":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d6f0b3924aa048f7955cd9a1ad391106":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ba639735e8594aed937e838652efc534":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"b96937a130a745ec99861af1ca8b7a97":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"35179739868040d1ba7d542f450f04da":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"78ce17d6759143faa0fecb35ce77dbaf":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c388f8c8f7454befa4ef800ca881ab69":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6dbf026b6dd541e8a616611f88d0fdd2":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_190d05dede1848f8ac868a8fd8c4f00b","IPY_MODEL_740e1171bb4842ebac7f6dbdd7ee9f01","IPY_MODEL_217fd7ebc0b5431f918656dd8c681243"],"layout":"IPY_MODEL_a27cf14d24b94adfbbffaa816c1ce49
1"}},"190d05dede1848f8ac868a8fd8c4f00b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cdb53cc2c0e14e04aceb8faef270e136","placeholder":"","style":"IPY_MODEL_d32ae0c469a242faaafeee4f7d14a17b","value":"Generating validation split: 100%"}},"740e1171bb4842ebac7f6dbdd7ee9f01":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_266ef0735e414b8a88e1333f91cfb615","max":1000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_eb1caf07bdb54ead8026a1dc689105fb","value":1000}},"217fd7ebc0b5431f918656dd8c681243":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_55e32e2868c6429687ec74c79f226386","placeholder":"","style":"IPY_MODEL_2220cdff006d48e4927d4a80f3912cf9","value":" 1000/1000 [00:00<00:00, 30195.27 
examples/s]"}},"a27cf14d24b94adfbbffaa816c1ce491":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cdb53cc2c0e14e04aceb8faef270e136":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d32ae0c469a242faaafeee4f7d14a17b":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"266ef0735e414b8a88e1333f91cfb615":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"eb1caf07bdb54ead8026a1dc689105fb":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"55e32e2868c6429687ec74c79f226386":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2220cdff006d48e4927d4a80f3912cf9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ceec10aebf0645cdbbb06950685ca7d9":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_893ec2094c7846c0b36815fe2ed1fcc8","IPY_MODEL_5366d27cd0654132b39b175e6e7a50d9","IPY_MODEL_7b4877c7b2a1410fb7aa243edc77dd15"],"layout":"IPY_MODEL_38d4f3d3233b46608e21a2889b5be2f
8"}},"893ec2094c7846c0b36815fe2ed1fcc8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c6d608c9fce94163a951e4651c7f30b3","placeholder":"","style":"IPY_MODEL_21c72058ee9a472283a517619379dd77","value":"Generating test split: 100%"}},"5366d27cd0654132b39b175e6e7a50d9":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_88329550b2ab488c9a28527399de5e77","max":1019,"min":0,"orientation":"horizontal","style":"IPY_MODEL_f14af4590e09400ea39e5406668afe53","value":1019}},"7b4877c7b2a1410fb7aa243edc77dd15":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_327c428abb4443849591e08a144972ae","placeholder":"","style":"IPY_MODEL_66591ba93f4943528bebe7d0bb7b8fd6","value":" 1019/1019 [00:00<00:00, 27028.37 
examples/s]"}},"38d4f3d3233b46608e21a2889b5be2f8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c6d608c9fce94163a951e4651c7f30b3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"21c72058ee9a472283a517619379dd77":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"88329550b2ab488c9a28527399de5e77":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f14af4590e09400ea39e5406668afe53":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"327c428abb4443849591e08a144972ae":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"66591ba93f4943528bebe7d0bb7b8fd6":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
|
Thai_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","metadata":{"id":"ovL7yvuuM1J8"},"source":["# Thai-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Thai (th) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **ALT (Asian Language Treebank)** dataset for Thai-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n","\n","## Pipeline\n","1. **Setup**: Install/Import dependencies.\n","2. **Data Loading**: Load the ALT dataset (Thai-English).\n","3. **Tokenization**: Train SentencePiece models (`spm_th`, `spm_en_th`).\n","4. **Data Processing**: Create PyTorch Datasets and DataLoaders.\n","5. **Model**: Implement Transformer.\n","6. **Training**: Train the model.\n","7. **Evaluation**: Calculate BLEU score (TODO: not yet implemented in this notebook).\n","8. **Inference**: Demo function (TODO: not yet included in this notebook) and save the model and tokenizers for the Web App."],"id":"ovL7yvuuM1J8"},{"cell_type":"markdown","metadata":{"id":"5lxOnCsnM1J-"},"source":["## 1. 
Setup and Imports"],"id":"5lxOnCsnM1J-"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"larr4GScM1J-","executionInfo":{"status":"ok","timestamp":1770435885900,"user_tz":-420,"elapsed":9552,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"3893aa16-273d-4de6-cd68-edd42f94d2c5"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","# Set seeds\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"larr4GScM1J-"},{"cell_type":"code","execution_count":2,"metadata":{"id":"tUHgxeXwM1J_","executionInfo":{"status":"ok","timestamp":1770435885906,"user_tz":-420,"elapsed":2,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["# Install dependencies if missing (uncomment if needed)\n","# !pip install sentencepiece datasets portalocker"],"id":"tUHgxeXwM1J_"},{"cell_type":"markdown","metadata":{"id":"CSodHP5IM1J_"},"source":["## 2. 
Data Loading (ALT Dataset)\n","Loading Thai-English pairs from ALT."],"id":"CSodHP5IM1J_"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":541,"referenced_widgets":["d4ec5329c1844388b4f6b896b76c586e","aa53e174527f43dcb3fa9fae564098d9","b68adfcf941447f38a1b49e9b709308b","41c73eb3cc5f4938a6091d907620233a","940dbf51962f4e98b3e36155dd166fd7","f9041e18aa7342199f63cc555eda0993","73b36006efa64e22b9c480f1f85828c0","ef841660bc9a4fb5b5f99f84507480fe","a8002a144c3447bfa3aee9212129fd1a","28bb8872ccee4dc4a3fb8b0a078e3576","c735929e87eb435d801c413ff9b0fca0","dfd4e0697b15478c839a697e7744852d","0d05c76faa734d6195ef21f2403a666f","0745b063a2df4c48b58f413b637bdaa2","baf15b3bf3774eed9738e244ebd7b16c","8036e1c5dd76462ba3bdc17c73f2f06e","7b43840efecb47b2bcd4611b99df11d7","7b91e2472b9b44e29bd431dd22b1c737","3064ea06da344e7dac706999c2785261","5572d2fe61ac43118afa9b304b16b510","a7d4a56724634785a2f1416c8f08eb10","aaed2b2a50ef4370bb8ae1951c386a91","20147e3fe9dc4e888f33069c3c3f523e","ff646d16028842a29963ee5cb4eae215","70d8e5233dfd49519a4a12e9fbc45351","25b982891ac04ccb87f2555549dbbd18","f5b81c8f4ca44375b763e77d8b7f6c15","1e2345f0d2fa4f5582c5635344a9ff4d","cada57a9f19247d69aa72a9b9c1d4000","e4d59e8900eb43909d06b969bd735289","21126f0d2ca6408ab3ffdc2d288a74e3","ea27c3e675ac4f1588f59f47d06e1263","edfc67720c9641cf8c8876f1bad6dcea","af5d3bec80a2485ba399db24ba326c91","0afa3b2829104136a268120d7e48fa1e","87ed005e275e4c05846fd9f4cb7e3e8c","2fd5509f4c8b44c1ae0d8f65a9808e19","cd37370826174938b41dddaa78e2ad2e","728ef89112d049d2adb5626823a31164","cb17f9c954dc4d8ab4ad1411fb9e01da","ab1a3305c8e64d698ccf96bf2e89f8f6","f7afee3e635249d0b7962eb73876add8","39f6e71fb05246798a7e1e5f86dc6c8a","013b681e3d32499095ffa169e0f8d27e","90b4ee1a9def4ef8be493eda8a40f873","0afd65653e4148b4b01d8b75699c5c49","99b699e79fb3466c80bf532f1f4978e8","b57a41822d80427b8bf08861f424806d","f95fa07a24a642ceb167385cd41237e6","61d30f08cf91410b8881944c71dbdf72","25f96a4aa8c840efb6c8d3c95
4c3440a","414890d7ea3848d982503a684f9b1438","a17e670578444551bbe6a33ce3eb1469","3e4e6bcd2b114edc86291e3fb6fc64ae","a15d1a9f2848416f812c1e2101054dca","8e2fa8bbd8e54193a1603669e5b73d99","48597d1c17ce4c4596e06db84fa57c0e","235a020b60fd4417b223de7522d98904","6b00786102084a9d8ac295a86b4a018e","50b37c8c62434616b00ca71334033ea3","e8c79aaefe8d4b58b63a6d630137c238","d2b8d4ebfc064bf6826d2e71e2d764e6","3202b4ce97ad44ad9877d3f5f9fc4c41","323a619824c442b8aa8609d255b0d470","3c6fef78cd854e2ab3fa23af01faa51f","29f1f1575fb443a6b1e01741a7d63f6a","c543038331c449de80fd72f37458950b","9bcf8d42b16645ddb29750021e5e52d6","1d09575ade5048ccb4501812157a2c21","4f75e0e560704eeab8aeb09be89add0b","cbefddfd5f5f4f8da3c1fa962a110737","48ce12da1224499f9238b21b0d3b2f4f","e3b3496a577e4d92b2b0e7edd4f888c4","bf35bf464ace496aa23fd8a184be1cd3","78aa0dae0714424b8019e3c6f25008d8","02a269db1c7c48d4b3c2ebb647abd98d","043730347106465e9995143ebbd7852c"]},"id":"VzLTzFz5M1KA","executionInfo":{"status":"ok","timestamp":1770435895935,"user_tz":-420,"elapsed":10028,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"333d7e1b-5821-45fa-ae65-ebefdcce6bab"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading ALT Dataset (Thai-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d4ec5329c1844388b4f6b896b76c586e"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["alt-parallel/train-00000-of-00001.parque(…): 0%| | 0.00/31.2M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"dfd4e0697b15478c839a697e7744852d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/validation-00000-of-00001.p(…): 0%| | 0.00/1.71M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"20147e3fe9dc4e888f33069c3c3f523e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/test-00000-of-00001.parquet: 0%| | 0.00/1.79M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"af5d3bec80a2485ba399db24ba326c91"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/18088 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"90b4ee1a9def4ef8be493eda8a40f873"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/1000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8e2fa8bbd8e54193a1603669e5b73d99"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/1019 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c543038331c449de80fd72f37458950b"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 20107 sentences from ALT dataset.\n","Extracted 20107 Thai-English pairs.\n"]}],"source":["from datasets import load_dataset\n","\n","print(\"Loading ALT Dataset (Thai-English)...\")\n","try:\n"," dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from ALT dataset.\")\n","\n"," # Filter/Extract only Thai and English\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'th' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'th': item['translation']['th'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," print(f\"Extracted {len(data)} Thai-English pairs.\")\n","\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")\n"],"id":"VzLTzFz5M1KA"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"RjL0ai03M1KA","executionInfo":{"status":"ok","timestamp":1770435895973,"user_tz":-420,"elapsed":35,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"38846639-c1e9-4864-afba-8f6312d502c3"},"outputs":[{"output_type":"stream","name":"stdout","text":[" th \\\n","0 อิตาลีได้เอาชนะโปรตุเกสด้วยคะแนน31ต่อ5 ในกลุ่ม... \n","1 Andrea Masi ได้เปิดฉากทำคะแนนในนาทีที่สี่ ด้วย... \n","2 ทั้งที่เป็นฝ่ายคุมเกมส์ในครึ่งแรกของการแข่งขัน... \n","3 โปรตุเกสไม่ละความพยยาม และDavid Penalvaได้ทำคะ... \n","4 ในครึ่งแรกอิตาลีขึ้นนำด้วยคะแนน16 ต่อ5 แต่ถูกป... \n","\n"," en \n","0 Italy have defeated Portugal 31-5 in Pool C of... \n","1 Andrea Masi opened the scoring in the fourth m... \n","2 Despite controlling the game for much of the f... \n","3 Portugal never gave up and David Penalva score... \n","4 Italy led 16-5 at half time but were matched b... 
\n","After cleaning: 20101 pairs\n","\n","--- Data Alignment Check ---\n","Source (th): ปัญหาเริ่มจากมีกลุ่ม \"นักปีนเขาที่สวมแต่รองเท้าบูทเท่านั้น\" ถูกตำรวจที่ Alpine จับเมื่อฤดูใบไม้ร่วงที่ผ่านมา\n","Target (en): The problem started with a group of \"boot-only hikers\" who were stopped by the police in the Alpine region last autumn.\n","--------------------\n","Source (th): ส่วนผู้สมัครคนอื่น ๆ ได้แก่ เจมี แม็คการ์วีย์, กรีน เกล็น ฮอดจ์สัน จากพรรคริเบอรัล และเดวิด โรว์แลนด์ ผู้สมัครอิสระ\n","Target (en): Other candidates in the riding are Liberal Jamie McGarvey, Green Glen Hodgson, and independent David Rowland.\n","--------------------\n","Source (th): \"ผู้หญิงที่ได้รับบาดเจ็บถูกหามขึ้นบนรถฉุกเฉิน Lebanon Ambulance One และส่งไปที่โรงพยาบาล Frisbie ใน Rochester เพื่อรักษาอาการให้คงที่ จากนั้นถูกส่งต่อไปที่ศูนย์อุบัติเหตุที่ Maine Medical Center\"\n","Target (en): \"The female patient was loaded into Lebanon Ambulance One and transported to Frisbie Hospital in Rochester to be stabilized and then was transferred to the trauma center at Maine Medical Center.\"\n","--------------------\n","Source (th): อดึตรัฐมนตรีว่าการกระทรวงพลังงานนิวเคลียร์ Yevgeny Adamov ได้ถูกจับกุมเมื่อวันจันทร์โดยเจ้าหน้าที่สวิสเซอร์แลนด์\n","Target (en): Former Russian nuclear energy minister Yevgeny Adamov was arrested on Monday by Swiss authorities.\n","--------------------\n","Source (th): พยานผู้เห็นเหตุการณ์รายหนึ่ง ให้สำนักข่าวว้อยซ์ ออฟ อเมริกาว่า ตอนเช้าวันศุกร์เขากำลังนั่งอยู่ในร้านของเขา เมื่อตอนที่ระเบิดใหญ่สะเทือนบริเวณนั้น เกิดฝุ่นคลุ้งไปทั่วบริเวณและทำให้สิ่งของหล่นจากผนัง\n","Target (en): One eyewitness told the Voice of America news agency he was sitting in his shop when a big explosion shook the area Friday morning, sending dust in the air and causing objects to fall from the walls.\n","--------------------\n"]}],"source":["# Convert to DataFrame\n","df = pd.DataFrame(data)\n","print(df.head())\n","\n","# Basic Cleaning\n","df = df.dropna(subset=['th', 'en'])\n","df['th'] 
= df['th'].astype(str)\n","df['en'] = df['en'].astype(str)\n","\n","df = df[df['th'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']\n","print(f\"After cleaning: {len(df)} pairs\")\n","\n","print(\"\\n--- Data Alignment Check ---\")\n","for i in range(5):\n"," sample = df.sample(1).iloc[0]\n"," print(f\"Source (th): {sample['th']}\")\n"," print(f\"Target (en): {sample['en']}\")\n"," print(\"-\" * 20)"],"id":"RjL0ai03M1KA"},{"cell_type":"markdown","metadata":{"id":"mL8WYE8YM1KA"},"source":["## 3. Tokenization (SentencePiece)\n","Training separate tokenizers for Thai (`spm_th`) and English (`spm_en_th`)."],"id":"mL8WYE8YM1KA"},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cvDYVzxQM1KA","executionInfo":{"status":"ok","timestamp":1770435910685,"user_tz":-420,"elapsed":14694,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"0616ff53-1703-43d0-e9bd-56e30cd9c417"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Thai Tokenizer...\n","Training English Tokenizer (for Thai pair)...\n","Tokenizer training complete!\n"]}],"source":["import sentencepiece as spm\n","\n","# 1. Save texts to files\n","with open('train_th.txt', 'w', encoding='utf-8') as f:\n"," for line in df['th']:\n"," f.write(line + '\\n')\n","\n","with open('train_en_th.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']:\n"," f.write(line + '\\n')\n","\n","# 2. 
Train SentencePiece models\n","vocab_size = 4000\n","model_type = 'bpe'\n","\n","print(\"Training Thai Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_th.txt',\n"," model_prefix='spm_th',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Thai pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_th.txt',\n"," model_prefix='spm_en_th',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Tokenizer training complete!\")"],"id":"cvDYVzxQM1KA"},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fssRkvGwM1KA","executionInfo":{"status":"ok","timestamp":1770435910711,"user_tz":-420,"elapsed":23,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"7322cd51-b246-43a7-e75c-2f22935d68ca"},"outputs":[{"output_type":"stream","name":"stdout","text":["Original th: อิตาลีได้เอาชนะโปรตุเกสด้วยคะแนน31ต่อ5 ในกลุ่มc ของการแข่งขันรักบี้เวิลด์คัพปี2007 ที่สนามปาร์กเดแพร็งส์ ที่กรุงปารีส ประเทศฝรั่งเศส\n","Tokens: ['▁', 'อิตาลี', 'ได้', 'เอาชนะ', 'โปร', 'ตุ', 'เก', 'ส', 'ด้วยคะแนน', '3', '1', 'ต่อ', '5', '▁ใน', 'กลุ่ม', 'c', '▁ของ', 'การแข่งขัน', 'รัก', 'บ', 'ี้', 'เ', 'วิ', 'ล', 'ด์', 'ค', 'ัพ', 'ปี', '200', '7', '▁ที่', 'สนาม', 'ป', 'าร์', 'ก', 'เด', 'แพร', '็ง', 'ส์', '▁ที่', 'กรุง', 'ป', 'าร', 'ี', 'ส', '▁ประเทศ', 'ฝรั่งเศส']\n","IDs: [3866, 2645, 25, 2150, 2037, 170, 70, 3882, 2998, 3950, 3931, 120, 3947, 109, 321, 3929, 420, 806, 358, 3886, 59, 3872, 102, 3879, 389, 3888, 1481, 108, 2812, 3970, 237, 955, 3890, 374, 3869, 111, 1251, 1222, 729, 237, 1028, 3890, 4, 3877, 3882, 990, 1232]\n"]}],"source":["# Load the processors\n","sp_th = spm.SentencePieceProcessor(model_file='spm_th.model')\n","sp_en = spm.SentencePieceProcessor(model_file='spm_en_th.model')\n","\n","# Test Tokenization\n","idx = 
0\n","print(f\"Original th: {df.iloc[idx]['th']}\")\n","print(f\"Tokens: {sp_th.encode(df.iloc[idx]['th'], out_type=str)}\")\n","print(f\"IDs: {sp_th.encode(df.iloc[idx]['th'], out_type=int)}\")"],"id":"fssRkvGwM1KA"},{"cell_type":"markdown","metadata":{"id":"4uCE2oXHM1KA"},"source":["## 4. PyTorch Dataset and DataLoader"],"id":"4uCE2oXHM1KA"},{"cell_type":"code","execution_count":7,"metadata":{"id":"Elb_T_R1M1KB","executionInfo":{"status":"ok","timestamp":1770435910717,"user_tz":-420,"elapsed":7,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['th']\n"," trg_text = self.data.iloc[idx]['en']\n","\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n","\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n","\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n","\n"," return src_pad, trg_pad\n","\n","# Split Data\n","train_df = df.sample(frac=0.8, random_state=SEED)\n","val_test_df = df.drop(train_df.index)\n","val_df = val_test_df.sample(frac=0.5, random_state=SEED)\n","test_df = val_test_df.drop(val_df.index)\n","\n","train_dataset = TranslationDataset(train_df, sp_th, sp_en)\n","val_dataset = TranslationDataset(val_df, sp_th, sp_en)\n","test_dataset = TranslationDataset(test_df, sp_th, sp_en)\n","\n","BATCH_SIZE = 64\n","train_loader 
= DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)\n","val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)\n","test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)"],"id":"Elb_T_R1M1KB"},{"cell_type":"markdown","metadata":{"id":"ghbp1QkpM1KB"},"source":["## 5. Transformer Model"],"id":"ghbp1QkpM1KB"},{"cell_type":"code","execution_count":8,"metadata":{"id":"Oe5LIn2lM1KB","executionInfo":{"status":"ok","timestamp":1770435910736,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=512, nhead=8, num_encoder_layers=3,\n"," num_decoder_layers=3, dim_feedforward=2048, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n","\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n","\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n","\n"," self.transformer = nn.Transformer(\n"," d_model=d_model,\n"," nhead=nhead,\n"," num_encoder_layers=num_encoder_layers,\n"," num_decoder_layers=num_decoder_layers,\n"," dim_feedforward=dim_feedforward,\n"," dropout=dropout,\n"," batch_first=True\n"," )\n","\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n"," self.init_weights()\n","\n"," def init_weights(self):\n"," for p in self.parameters():\n"," if p.dim() > 1:\n"," nn.init.xavier_uniform_(p)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n","\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * 
math.sqrt(self.d_model))\n","\n"," output = self.transformer(\n"," src=src_emb,\n"," tgt=trg_emb,\n"," tgt_mask=trg_mask,\n"," src_key_padding_mask=src_key_padding_mask\n"," )\n"," return self.fc_out(output)\n","\n","class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)"],"id":"Oe5LIn2lM1KB"},{"cell_type":"markdown","metadata":{"id":"ot44IfLkM1KB"},"source":["## 6. Training"],"id":"ot44IfLkM1KB"},{"cell_type":"code","execution_count":9,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TPsbvXwKM1KB","executionInfo":{"status":"ok","timestamp":1770438089612,"user_tz":-420,"elapsed":2178869,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"7211b8cc-f8a7-4cb3-dcfd-49c1bccf342a"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting training...\n","Epoch: 01 | Train Loss: 6.842\n","Epoch: 02 | Train Loss: 6.028\n","Epoch: 03 | Train Loss: 5.618\n","Epoch: 04 | Train Loss: 5.384\n","Epoch: 05 | Train Loss: 5.247\n","Epoch: 06 | Train Loss: 5.146\n","Epoch: 07 | Train Loss: 5.064\n","Epoch: 08 | Train Loss: 4.994\n","Epoch: 09 | Train Loss: 4.929\n","Epoch: 10 | Train Loss: 4.868\n","Epoch: 11 | Train Loss: 4.809\n","Epoch: 12 | Train Loss: 4.755\n","Epoch: 13 | Train Loss: 4.703\n","Epoch: 14 | Train Loss: 4.653\n","Epoch: 15 | Train Loss: 4.607\n","Epoch: 16 | Train Loss: 4.563\n","Epoch: 17 | Train Loss: 4.523\n","Epoch: 18 | Train Loss: 
4.487\n","Epoch: 19 | Train Loss: 4.450\n","Epoch: 20 | Train Loss: 4.416\n","Epoch: 21 | Train Loss: 4.383\n","Epoch: 22 | Train Loss: 4.353\n","Epoch: 23 | Train Loss: 4.327\n","Epoch: 24 | Train Loss: 4.300\n","Epoch: 25 | Train Loss: 4.277\n","Epoch: 26 | Train Loss: 4.252\n","Epoch: 27 | Train Loss: 4.229\n","Epoch: 28 | Train Loss: 4.208\n","Epoch: 29 | Train Loss: 4.189\n","Epoch: 30 | Train Loss: 4.169\n","Epoch: 31 | Train Loss: 4.150\n","Epoch: 32 | Train Loss: 4.132\n","Epoch: 33 | Train Loss: 4.115\n","Epoch: 34 | Train Loss: 4.099\n","Epoch: 35 | Train Loss: 4.084\n","Epoch: 36 | Train Loss: 4.070\n","Epoch: 37 | Train Loss: 4.055\n","Epoch: 38 | Train Loss: 4.043\n","Epoch: 39 | Train Loss: 4.028\n","Epoch: 40 | Train Loss: 4.014\n","Epoch: 41 | Train Loss: 4.001\n","Epoch: 42 | Train Loss: 3.988\n","Epoch: 43 | Train Loss: 3.979\n","Epoch: 44 | Train Loss: 3.967\n","Epoch: 45 | Train Loss: 3.960\n","Epoch: 46 | Train Loss: 3.946\n","Epoch: 47 | Train Loss: 3.936\n","Epoch: 48 | Train Loss: 3.929\n","Epoch: 49 | Train Loss: 3.915\n","Epoch: 50 | Train Loss: 3.909\n","Epoch: 51 | Train Loss: 3.897\n","Epoch: 52 | Train Loss: 3.889\n","Epoch: 53 | Train Loss: 3.881\n","Epoch: 54 | Train Loss: 3.872\n","Epoch: 55 | Train Loss: 3.865\n","Epoch: 56 | Train Loss: 3.856\n","Epoch: 57 | Train Loss: 3.851\n","Epoch: 58 | Train Loss: 3.844\n","Epoch: 59 | Train Loss: 3.836\n","Epoch: 60 | Train Loss: 3.830\n","Epoch: 61 | Train Loss: 3.821\n","Epoch: 62 | Train Loss: 3.817\n","Epoch: 63 | Train Loss: 3.807\n","Epoch: 64 | Train Loss: 3.802\n","Epoch: 65 | Train Loss: 3.798\n","Epoch: 66 | Train Loss: 3.792\n","Epoch: 67 | Train Loss: 3.785\n","Epoch: 68 | Train Loss: 3.779\n","Epoch: 69 | Train Loss: 3.774\n","Epoch: 70 | Train Loss: 3.770\n","Epoch: 71 | Train Loss: 3.763\n","Epoch: 72 | Train Loss: 3.757\n","Epoch: 73 | Train Loss: 3.751\n","Epoch: 74 | Train Loss: 3.748\n","Epoch: 75 | Train Loss: 3.742\n","Epoch: 76 | Train Loss: 3.737\n","Epoch: 77 | Train 
Loss: 3.734\n","Epoch: 78 | Train Loss: 3.727\n","Epoch: 79 | Train Loss: 3.726\n","Epoch: 80 | Train Loss: 3.719\n","Epoch: 81 | Train Loss: 3.714\n","Epoch: 82 | Train Loss: 3.713\n","Epoch: 83 | Train Loss: 3.711\n","Epoch: 84 | Train Loss: 3.704\n","Epoch: 85 | Train Loss: 3.698\n","Epoch: 86 | Train Loss: 3.696\n","Epoch: 87 | Train Loss: 3.693\n","Epoch: 88 | Train Loss: 3.687\n","Epoch: 89 | Train Loss: 3.682\n","Epoch: 90 | Train Loss: 3.681\n","Epoch: 91 | Train Loss: 3.677\n","Epoch: 92 | Train Loss: 3.674\n","Epoch: 93 | Train Loss: 3.670\n","Epoch: 94 | Train Loss: 3.666\n","Epoch: 95 | Train Loss: 3.663\n","Epoch: 96 | Train Loss: 3.660\n","Epoch: 97 | Train Loss: 3.655\n","Epoch: 98 | Train Loss: 3.653\n","Epoch: 99 | Train Loss: 3.650\n","Epoch: 100 | Train Loss: 3.648\n"]}],"source":["SRC_VOCAB_SIZE = vocab_size\n","TRG_VOCAB_SIZE = vocab_size\n","D_MODEL = 256\n","N_HEAD = 4\n","NUM_LAYERS = 2\n","FF_DIM = 512\n","DROPOUT = 0.4\n","LR = 0.0005\n","EPOCHS = 100\n","\n","model = TransformerModel(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, D_MODEL, N_HEAD, NUM_LAYERS, NUM_LAYERS, FF_DIM, DROPOUT).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=LR)\n","criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)\n","\n","def train(model, iterator, optimizer, criterion, clip):\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(iterator):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output_dim = output.shape[-1]\n"," output = output.contiguous().view(-1, output_dim)\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," return epoch_loss / len(iterator)\n","\n","print(\"Starting training...\")\n","for epoch in range(EPOCHS):\n"," train_loss = train(model, train_loader, optimizer, 
criterion, 1.0)\n"," print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')\n"," # Save every epoch or best validation (skipped val loop for brevity here, but included in full code)\n"," torch.save(model.state_dict(), 'transformer_model_th.pt')"],"id":"TPsbvXwKM1KB"},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"OCQG6akdM1KB","executionInfo":{"status":"ok","timestamp":1770438089644,"user_tz":-420,"elapsed":27,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"99f934d0-7490-4a34-bdd7-c2f0e5548986"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Save artifacts for Web App\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_th.pt', 'app/models/transformer_model_th.pt')\n","shutil.copy('spm_th.model', 'app/models/spm_th.model')\n","shutil.copy('spm_en_th.model', 'app/models/spm_en_th.model')\n","print(\"Models copied to app/models/\")"],"id":"OCQG6akdM1KB"}],"metadata":{"kernelspec":{"display_name":"Python 
3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"d4ec5329c1844388b4f6b896b76c586e":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_aa53e174527f43dcb3fa9fae564098d9","IPY_MODEL_b68adfcf941447f38a1b49e9b709308b","IPY_MODEL_41c73eb3cc5f4938a6091d907620233a"],"layout":"IPY_MODEL_940dbf51962f4e98b3e36155dd166fd7"}},"aa53e174527f43dcb3fa9fae564098d9":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f9041e18aa7342199f63cc555eda0993","placeholder":"","style":"IPY_MODEL_73b36006efa64e22b9c480f1f85828c0","value":"README.md: 
"}},"b68adfcf941447f38a1b49e9b709308b":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ef841660bc9a4fb5b5f99f84507480fe","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_a8002a144c3447bfa3aee9212129fd1a","value":1}},"41c73eb3cc5f4938a6091d907620233a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_28bb8872ccee4dc4a3fb8b0a078e3576","placeholder":"","style":"IPY_MODEL_c735929e87eb435d801c413ff9b0fca0","value":" 13.2k/? 
[00:00<00:00, 931kB/s]"}},"940dbf51962f4e98b3e36155dd166fd7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f9041e18aa7342199f63cc555eda0993":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overf
low_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"73b36006efa64e22b9c480f1f85828c0":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ef841660bc9a4fb5b5f99f84507480fe":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"a8002a144c3447bfa3aee9212129fd1a":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"28bb8872ccee4dc4a3fb8b0a078e
3576":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c735929e87eb435d801c413ff9b0fca0":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"dfd4e0697b15478c839a697e7744852d":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0d05c76faa734d6195ef21f2403a666f","IPY_MODEL_0745b063a2df4c48b58f413b637bdaa2","IPY_MODEL_baf15b3bf3774eed9738e244ebd7b16c"],"layout":"IPY_MODEL_8036e1c5dd76462ba3
bdc17c73f2f06e"}},"0d05c76faa734d6195ef21f2403a666f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_7b43840efecb47b2bcd4611b99df11d7","placeholder":"","style":"IPY_MODEL_7b91e2472b9b44e29bd431dd22b1c737","value":"alt-parallel/train-00000-of-00001.parque(…): 100%"}},"0745b063a2df4c48b58f413b637bdaa2":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3064ea06da344e7dac706999c2785261","max":31211167,"min":0,"orientation":"horizontal","style":"IPY_MODEL_5572d2fe61ac43118afa9b304b16b510","value":31211167}},"baf15b3bf3774eed9738e244ebd7b16c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a7d4a56724634785a2f1416c8f08eb10","placeholder":"","style":"IPY_MODEL_aaed2b2a50ef4370bb8ae1951c386a91","value":" 31.2M/31.2M [00:01<00:00, 
24.5MB/s]"}},"8036e1c5dd76462ba3bdc17c73f2f06e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7b43840efecb47b2bcd4611b99df11d7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7b91e2472b9b44e29bd431dd22b1c737":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3064ea06da344e7dac706999c2785261":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5572d2fe61ac43118afa9b304b16b510":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a7d4a56724634785a2f1416c8f08eb10":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"aaed2b2a50ef4370bb8ae1951c386a91":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"20147e3fe9dc4e888f33069c3c3f523e":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ff646d16028842a29963ee5cb4eae215","IPY_MODEL_70d8e5233dfd49519a4a12e9fbc45351","IPY_MODEL_25b982891ac04ccb87f2555549dbbd18"],"layout":"IPY_MODEL_f5b81c8f4ca44375b763e77d8b7f6c15"
}},"ff646d16028842a29963ee5cb4eae215":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1e2345f0d2fa4f5582c5635344a9ff4d","placeholder":"","style":"IPY_MODEL_cada57a9f19247d69aa72a9b9c1d4000","value":"alt-parallel/validation-00000-of-00001.p(…): 100%"}},"70d8e5233dfd49519a4a12e9fbc45351":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e4d59e8900eb43909d06b969bd735289","max":1710203,"min":0,"orientation":"horizontal","style":"IPY_MODEL_21126f0d2ca6408ab3ffdc2d288a74e3","value":1710203}},"25b982891ac04ccb87f2555549dbbd18":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_ea27c3e675ac4f1588f59f47d06e1263","placeholder":"","style":"IPY_MODEL_edfc67720c9641cf8c8876f1bad6dcea","value":" 1.71M/1.71M [00:00<00:00, 
3.69MB/s]"}},"f5b81c8f4ca44375b763e77d8b7f6c15":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1e2345f0d2fa4f5582c5635344a9ff4d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cada57a9f19247d69aa72a9b9c1d4000":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e4d59e8900eb43909d06b969bd735289":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"21126f0d2ca6408ab3ffdc2d288a74e3":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"ea27c3e675ac4f1588f59f47d06e1263":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"edfc67720c9641cf8c8876f1bad6dcea":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"af5d3bec80a2485ba399db24ba326c91":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0afa3b2829104136a268120d7e48fa1e","IPY_MODEL_87ed005e275e4c05846fd9f4cb7e3e8c","IPY_MODEL_2fd5509f4c8b44c1ae0d8f65a9808e19"],"layout":"IPY_MODEL_cd37370826174938b41dddaa78e2ad2e"
}},"0afa3b2829104136a268120d7e48fa1e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_728ef89112d049d2adb5626823a31164","placeholder":"","style":"IPY_MODEL_cb17f9c954dc4d8ab4ad1411fb9e01da","value":"alt-parallel/test-00000-of-00001.parquet: 100%"}},"87ed005e275e4c05846fd9f4cb7e3e8c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ab1a3305c8e64d698ccf96bf2e89f8f6","max":1786537,"min":0,"orientation":"horizontal","style":"IPY_MODEL_f7afee3e635249d0b7962eb73876add8","value":1786537}},"2fd5509f4c8b44c1ae0d8f65a9808e19":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_39f6e71fb05246798a7e1e5f86dc6c8a","placeholder":"","style":"IPY_MODEL_013b681e3d32499095ffa169e0f8d27e","value":" 1.79M/1.79M [00:00<00:00, 
4.44MB/s]"}},"cd37370826174938b41dddaa78e2ad2e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"728ef89112d049d2adb5626823a31164":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cb17f9c954dc4d8ab4ad1411fb9e01da":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ab1a3305c8e64d698ccf96bf2e89f8f6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f7afee3e635249d0b7962eb73876add8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"39f6e71fb05246798a7e1e5f86dc6c8a":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"013b681e3d32499095ffa169e0f8d27e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"90b4ee1a9def4ef8be493eda8a40f873":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0afd65653e4148b4b01d8b75699c5c49","IPY_MODEL_99b699e79fb3466c80bf532f1f4978e8","IPY_MODEL_b57a41822d80427b8bf08861f424806d"],"layout":"IPY_MODEL_f95fa07a24a642ceb167385cd41237e6"
}},"0afd65653e4148b4b01d8b75699c5c49":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_61d30f08cf91410b8881944c71dbdf72","placeholder":"","style":"IPY_MODEL_25f96a4aa8c840efb6c8d3c954c3440a","value":"Generating train split: 100%"}},"99b699e79fb3466c80bf532f1f4978e8":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_414890d7ea3848d982503a684f9b1438","max":18088,"min":0,"orientation":"horizontal","style":"IPY_MODEL_a17e670578444551bbe6a33ce3eb1469","value":18088}},"b57a41822d80427b8bf08861f424806d":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3e4e6bcd2b114edc86291e3fb6fc64ae","placeholder":"","style":"IPY_MODEL_a15d1a9f2848416f812c1e2101054dca","value":" 18088/18088 [00:00<00:00, 51523.10 
examples/s]"}},"f95fa07a24a642ceb167385cd41237e6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"61d30f08cf91410b8881944c71dbdf72":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"25f96a4aa8c840efb6c8d3c954c3440a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"414890d7ea3848d982503a684f9b1438":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a17e670578444551bbe6a33ce3eb1469":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"3e4e6bcd2b114edc86291e3fb6fc64ae":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a15d1a9f2848416f812c1e2101054dca":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"8e2fa8bbd8e54193a1603669e5b73d99":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_48597d1c17ce4c4596e06db84fa57c0e","IPY_MODEL_235a020b60fd4417b223de7522d98904","IPY_MODEL_6b00786102084a9d8ac295a86b4a018e"],"layout":"IPY_MODEL_50b37c8c62434616b00ca71334033ea
3"}},"48597d1c17ce4c4596e06db84fa57c0e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e8c79aaefe8d4b58b63a6d630137c238","placeholder":"","style":"IPY_MODEL_d2b8d4ebfc064bf6826d2e71e2d764e6","value":"Generating validation split: 100%"}},"235a020b60fd4417b223de7522d98904":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3202b4ce97ad44ad9877d3f5f9fc4c41","max":1000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_323a619824c442b8aa8609d255b0d470","value":1000}},"6b00786102084a9d8ac295a86b4a018e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3c6fef78cd854e2ab3fa23af01faa51f","placeholder":"","style":"IPY_MODEL_29f1f1575fb443a6b1e01741a7d63f6a","value":" 1000/1000 [00:00<00:00, 25709.06 
examples/s]"}},"50b37c8c62434616b00ca71334033ea3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e8c79aaefe8d4b58b63a6d630137c238":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d2b8d4ebfc064bf6826d2e71e2d764e6":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3202b4ce97ad44ad9877d3f5f9fc4c41":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"323a619824c442b8aa8609d255b0d470":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"3c6fef78cd854e2ab3fa23af01faa51f":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"29f1f1575fb443a6b1e01741a7d63f6a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c543038331c449de80fd72f37458950b":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_9bcf8d42b16645ddb29750021e5e52d6","IPY_MODEL_1d09575ade5048ccb4501812157a2c21","IPY_MODEL_4f75e0e560704eeab8aeb09be89add0b"],"layout":"IPY_MODEL_cbefddfd5f5f4f8da3c1fa962a11073
7"}},"9bcf8d42b16645ddb29750021e5e52d6":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_48ce12da1224499f9238b21b0d3b2f4f","placeholder":"","style":"IPY_MODEL_e3b3496a577e4d92b2b0e7edd4f888c4","value":"Generating test split: 100%"}},"1d09575ade5048ccb4501812157a2c21":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_bf35bf464ace496aa23fd8a184be1cd3","max":1019,"min":0,"orientation":"horizontal","style":"IPY_MODEL_78aa0dae0714424b8019e3c6f25008d8","value":1019}},"4f75e0e560704eeab8aeb09be89add0b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_02a269db1c7c48d4b3c2ebb647abd98d","placeholder":"","style":"IPY_MODEL_043730347106465e9995143ebbd7852c","value":" 1019/1019 [00:00<00:00, 25704.84 
examples/s]"}},"cbefddfd5f5f4f8da3c1fa962a110737":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"48ce12da1224499f9238b21b0d3b2f4f":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e3b3496a577e4d92b2b0e7edd4f888c4":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"bf35bf464ace496aa23fd8a184be1cd3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"78aa0dae0714424b8019e3c6f25008d8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"02a269db1c7c48d4b3c2ebb647abd98d":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"043730347106465e9995143ebbd7852c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
|
Urdu_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","metadata":{"id":"1HmZNMCVfYGZ"},"source":["# Urdu-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Urdu (ur) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Urdu-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"1HmZNMCVfYGZ"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"i4bBOIlgfYGb","executionInfo":{"status":"ok","timestamp":1770440810575,"user_tz":-420,"elapsed":5851,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"a2893c57-6f17-4fd5-c960-f284633cceb2"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"i4bBOIlgfYGb"},{"cell_type":"markdown","metadata":{"id":"rYPtZV_DfYGb"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Urdu-English pairs from Opus-100."],"id":"rYPtZV_DfYGb"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":635,"referenced_widgets":["4c07e9f9ef4a43998f8815cbb8a4f7d4","94779a76a45b4befba74054bbfd467d0","0e2c97e85f024a879cf7f58511052578","d94a089f1efa411fb8ccb82d2e26ade0","458c9e4ce6b443e98e5f8fb1c53e5e29","d8e59fb3dff34a4a8d422555e0aea6f7","4df631a1c84e46ce8b342d9ef832a740","d91a021523834b389c9ce9c8f9cec2bb","e8606c142c2e4bf39002074eacde19e8","cdab68a2574c4f8796a9555e5636389b","d502ee27f9d441e49b9e52d6c4344a22","7727c5cc79c747ddba6c3fba9adedd03","ccf60a63a5d740f7a4a937d00d12298c","522456874d164a9491bc74b16283c957","b95b1fe20f094222b6406f553bd013c8","fa14c7d360444c12b945e6acb210d1bc","1b05a64ef6544cb8af0d833659dc7508","e4ec51cc4ac54d05abfc1eadbfae1bb0","3a78306c9ed445f7a0b0204d7dbf98d2","6bb9ddb68e014790b2ce6f3b81814a82","a04f65b9c24c4a9c9ff57452d4142902","6008b4c3a29f4ab29a6aa97f0fd40745","98d2c90e3255476c868ce2aec11bd40f","5ff6b67810814fceabd7655048f3edf6","fc5319fd0372476cbe90c7c6e66f6bf6","33e3b83f9e8f4066b4f9dd38f86a5f34","7557fb3a682a4abba297ee872c243c83","3696d6c431184ac2a787e8be379885e3","39796bec65a74876b423a5beefc37e83","ce9ec3c5bbab4d9c8e56d556421b6645","316db49fe7cf43489e3ca58a9d313a2a","d1213b9c69284d16883f408f5b1f8048","68256b9bfd2f4d848aea4a6174ce2028","b3876e04d1d844738ee51613344e4739","7380a8c67ceb4857b5d9ffc66a554630","b00a302b235145a8855dc91b74b282bd","4dba981f0e6748168c4e86b2925d324b","63c6aa1173c54d49b3eb0d805d7c477b","b78945a747ce40eb95756149fbfb95cc","a662d27f646f4326b3469543f5d2a63c","b21ffa7749674a67a61b0a7fa1ba99f7","d9b9049a68b449df87dad7775d845ee7","f6a6e8f216e84720ba36b15d4d7ac81e","ac4b27ebe30d42e9ba07cb23647ebc27","417c9e6a1dd74232819a230cf8d99943","42267e92152e473897fd47343f02b283","f73dff27cf9c4eb1834c4b2edba6ac7e","03bd7d8f89774129b1252f0aa3ce6fa0","d61a75049aa04f7eb44b63c8134de3a6","ea8e3f0d26ee4b9aa91028882ae7c1ea","bdcfae8bc78542f982ddff5
e76aca0f3","fbe8e753526a494f960e0e23a76d19ca","69df32f2825144f8b398f1fd9583a8be","dbcbeefa1f05437d834fda23b2e93d5a","f8a1cc4691eb412583c3aec6513ce447","f3cd3e9c0c0a420a87b80d32df51a888","651b748ffbcf4b66b19cc6ce63e34e7a","2fb00de56f9d40ffab06bde4e09095bd","7e81d321a5c84fbb8e6d60856e16b7a7","513556bb91324ecdad415564f9b6ab2d","63133d05ec304cfaac336b346308ce6d","061019974f254fd7afc9ca4fe05b3831","d8c92679f4e2420e89da403a69b75a5b","52dfaca0539f46218aec0ee50731b4bd","2b7c3d1a4fb2400baf9ca96a69649b57","0217f1e38530403685127fdd120b6ed0","c86c02301e7144cabab7b596c855c922","97a2cd7cb62f41e3b3c8a4a5a31697b2","a92af3d602c541488d20feeefa361d9c","9149268626854fdbae9a2c7c36f5ef15","cab3bc9f5746483cad0ed7fcd532fc73","f0db55c4f25249019e5d7a2691e50366","4bd624c5737e4adc87d41ef527720016","cc918e93af824c18adefb3fdf4b0f307","cf790630f3254fcea11210ca25667755","b222362d13e447d1873929f6f477084c","7af0010ef5f646c89a932b9299a948ee"]},"id":"QGa0ct6NfYGc","executionInfo":{"status":"ok","timestamp":1770440845487,"user_tz":-420,"elapsed":34909,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"d3a17c97-6d81-47f3-8e15-9cbceac894f7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Urdu-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"4c07e9f9ef4a43998f8815cbb8a4f7d4"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["en-ur/test-00000-of-00001.parquet: 0%| | 0.00/301k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7727c5cc79c747ddba6c3fba9adedd03"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-ur/train-00000-of-00001.parquet: 0%| | 0.00/148M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"98d2c90e3255476c868ce2aec11bd40f"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-ur/validation-00000-of-00001.parquet: 0%| | 0.00/296k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"b3876e04d1d844738ee51613344e4739"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"417c9e6a1dd74232819a230cf8d99943"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/753913 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f3cd3e9c0c0a420a87b80d32df51a888"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
# --- Data loading: Opus-100 (Urdu-English) ---------------------------------
# Upper bound on the number of sentence pairs kept, so tokenizer training and
# the demo training run stay manageable.
MAX_PAIRS = 50000

print("Loading Opus-100 Dataset (Urdu-English)...")
try:
    # Opus-100 has 'en-ur'
    # NOTE(review): train, validation and test are concatenated here, so no
    # split is held out for evaluation later on -- confirm this is intended.
    dataset = load_dataset("opus100", "en-ur", split="train+validation+test")
except Exception as e:
    # Report the failure, then re-raise. Swallowing it would leave `data`
    # undefined and the problem would resurface further down as an unrelated
    # NameError.
    print(f"Error loading from HF: {e}")
    raise
print(f"Loaded {len(dataset)} sentences from Opus-100 dataset.")

# Keep only the records that carry both sides of the translation pair.
data = [
    {'ur': item['translation']['ur'], 'en': item['translation']['en']}
    for item in dataset
    if 'translation' in item
    and 'ur' in item['translation']
    and 'en' in item['translation']
]

# Limit to manageable size. `random` is imported and seeded with the other
# imports at the top of the notebook, so no local import is needed here.
if len(data) > MAX_PAIRS:
    random.shuffle(data)
    data = data[:MAX_PAIRS]
    print(f"Subsampled to {MAX_PAIRS:,} examples for efficiency.")

print(f"Extracted {len(data)} Urdu-English pairs.")
# --- Clean the sentence pairs ----------------------------------------------
# Build the working frame from the extracted pairs and preview it.
df = pd.DataFrame(data)
print(df.head())

# Drop rows with a missing side, force both columns to str, then discard
# pairs where either side is blank after stripping whitespace.
df = df.dropna(subset=['ur', 'en']).astype({'ur': str, 'en': str})
has_text = df['ur'].str.strip().ne('') & df['en'].str.strip().ne('')
df = df[has_text]

# --- 3. Tokenization --------------------------------------------------------
# Save texts to files: one sentence per line, one corpus file per language.
corpus_files = {'ur': 'train_ur.txt', 'en': 'train_en_ur.txt'}
for column, corpus_path in corpus_files.items():
    with open(corpus_path, 'w', encoding='utf-8') as corpus:
        corpus.writelines(sentence + '\n' for sentence in df[column])

# Train SentencePiece models
vocab_size = 8000
model_type = 'bpe'

# (progress message, training corpus, model prefix) for each tokenizer.
tokenizer_jobs = (
    ("Training Urdu Tokenizer...", 'train_ur.txt', 'spm_ur'),
    ("Training English Tokenizer (for Urdu pair)...", 'train_en_ur.txt', 'spm_en_ur'),
)
for banner, corpus_path, prefix in tokenizer_jobs:
    print(banner)
    # Special-token ids are fixed explicitly: pad=0, bos=1, eos=2, unk=3.
    spm.SentencePieceTrainer.train(
        input=corpus_path,
        model_prefix=prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        pad_id=0, bos_id=1, eos_id=2, unk_id=3
    )
# Load the two SentencePiece models trained in the tokenization step.
sp_src = spm.SentencePieceProcessor(model_file='spm_ur.model')
sp_trg = spm.SentencePieceProcessor(model_file='spm_en_ur.model')


# --- 4. Dataset & Model -----------------------------------------------------
class TranslationDataset(Dataset):
    """Parallel-sentence dataset that tokenizes on access.

    Args:
        df: DataFrame holding one sentence pair per row.
        sp_src: tokenizer for the source side; must provide ``encode``,
            ``bos_id`` and ``eos_id``.
        sp_trg: tokenizer for the target side (same interface).
        src_col: name of the source-text column (default ``'ur'``).
        trg_col: name of the target-text column (default ``'en'``).
        max_len: optional cap on the length of each returned sequence,
            BOS/EOS included. ``None`` (default) keeps sentences untruncated.

    Each item is a ``(src_ids, trg_ids)`` pair of 1-D integer tensors, both
    wrapped in the tokenizer's BOS and EOS ids.
    """

    def __init__(self, df, sp_src, sp_trg, src_col='ur', trg_col='en', max_len=None):
        self.data = df
        self.sp_src = sp_src
        self.sp_trg = sp_trg
        self.max_len = max_len
        # Pull the text columns out once: a positional row lookup on the
        # DataFrame for every sample is far slower than list indexing.
        self._src_texts = df[src_col].tolist()
        self._trg_texts = df[trg_col].tolist()

    def __len__(self):
        return len(self._src_texts)

    def _encode(self, tokenizer, text):
        """Tokenize ``text`` and wrap it in BOS/EOS, honouring ``max_len``."""
        ids = tokenizer.encode(text, out_type=int)
        if self.max_len is not None:
            # Reserve two slots for the BOS and EOS markers.
            ids = ids[:max(self.max_len - 2, 0)]
        return torch.tensor([tokenizer.bos_id()] + ids + [tokenizer.eos_id()])

    def __getitem__(self, idx):
        src_ids = self._encode(self.sp_src, self._src_texts[idx])
        trg_ids = self._encode(self.sp_trg, self._trg_texts[idx])
        return src_ids, trg_ids


def collate_fn(batch, pad_idx=0):
    """Right-pad a list of ``(src, trg)`` tensor pairs into two batch tensors.

    ``pad_idx`` defaults to 0, the value the original implementation always
    padded with. Returns ``(src_pad, trg_pad)``, each laid out batch-first.
    """
    src_batch, trg_batch = zip(*batch)
    src_pad = pad_sequence(list(src_batch), batch_first=True, padding_value=pad_idx)
    trg_pad = pad_sequence(list(trg_batch), batch_first=True, padding_value=pad_idx)
    return src_pad, trg_pad


train_dataset = TranslationDataset(df, sp_src, sp_trg)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    Args:
        d_model: embedding width.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions precomputed in the table.

    The table is registered as the buffer ``pe`` with shape
    ``(max_len, d_model)``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so trim the frequencies to match instead of failing.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for the first ``x.size(1)`` positions, then dropout.

        ``x`` is indexed on dim 1 for its sequence length, i.e. the
        batch-first layout that ``TransformerModel`` feeds in.
        """
        x = x + self.pe[:x.size(1), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Encoder-decoder translation model built on ``nn.Transformer``.

    Args:
        src_vocab_size: size of the source vocabulary.
        trg_vocab_size: size of the target vocabulary.
        d_model, nhead, num_encoder_layers, num_decoder_layers,
        dim_feedforward, dropout: forwarded to ``nn.Transformer``.
        pad_idx: token id treated as padding on both sides.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 d_model=256, nhead=4, num_encoder_layers=2,
                 num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):
        super().__init__()
        self.d_model = d_model
        self.pad_idx = pad_idx
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.fc_out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        """Return target-vocabulary logits for every position of ``trg``.

        Args:
            src: ``(batch, src_len)`` tensor of source token ids.
            trg: ``(batch, trg_len)`` tensor of target token ids (decoder input).

        Returns:
            ``(batch, trg_len, trg_vocab_size)`` tensor of logits.
        """
        # True marks padding positions that attention must ignore.
        src_key_padding_mask = (src == self.pad_idx)
        trg_key_padding_mask = (trg == self.pad_idx)

        # Boolean causal mask (True = blocked). Keeping it boolean matches the
        # dtype of the padding masks that are combined with it.
        trg_len = trg.size(1)
        trg_mask = torch.triu(
            torch.ones(trg_len, trg_len, dtype=torch.bool, device=trg.device),
            diagonal=1,
        )

        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))

        output = self.transformer(
            src=src_emb,
            tgt=trg_emb,
            tgt_mask=trg_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=trg_key_padding_mask,
            # Without this the decoder's cross-attention also reads encoder
            # outputs at padding positions, which makes the result for one
            # sentence depend on how much padding its batch happens to have.
            memory_key_padding_mask=src_key_padding_mask,
        )
        return self.fc_out(output)
# --- Training ---------------------------------------------------------------
# Recorded output of the committed run (mean loss per epoch):
#   5.131, 4.244, 3.886, 3.640, 3.458, 3.318, 3.203, 3.109, 3.032, 2.962
model = TransformerModel(vocab_size, vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
# Index 0 is the padding id used by the tokenizers and the collate step, so
# padded positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

print("Starting Training...")
for epoch in range(10): # 10 Epochs for demo
    model.train()
    epoch_loss = 0
    for i, (src, trg) in enumerate(train_loader):
        src = src.to(device)
        trg = trg.to(device)

        # Teacher forcing: the decoder sees tokens [0, n-1) and is scored
        # against tokens [1, n).
        decoder_input = trg[:, :-1]
        expected = trg[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        loss = criterion(
            logits.reshape(-1, logits.shape[-1]),
            expected.reshape(-1),
        )
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        if i % 100 == 0:
            print(f"Step {i}, Loss: {loss.item():.3f}")
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}")

    # Save
    # NOTE(review): indentation is collapsed in the serialized notebook; the
    # save is taken to sit at epoch level (checkpoint after every epoch) --
    # confirm against the original file.
    torch.save(model.state_dict(), 'transformer_model_ur.pt')
# Copy to app
import shutil

# Publish the trained weights and both tokenizer models where the app
# expects to find them.
os.makedirs('app/models', exist_ok=True)
for artifact in ('transformer_model_ur.pt', 'spm_ur.model', 'spm_en_ur.model'):
    shutil.copy(artifact, f'app/models/{artifact}')
print("Models copied to app/models/")
a4a8d422555e0aea6f7","placeholder":"","style":"IPY_MODEL_4df631a1c84e46ce8b342d9ef832a740","value":"README.md: "}},"0e2c97e85f024a879cf7f58511052578":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_d91a021523834b389c9ce9c8f9cec2bb","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_e8606c142c2e4bf39002074eacde19e8","value":1}},"d94a089f1efa411fb8ccb82d2e26ade0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cdab68a2574c4f8796a9555e5636389b","placeholder":"","style":"IPY_MODEL_d502ee27f9d441e49b9e52d6c4344a22","value":" 65.4k/? 
[00:00<00:00, 6.80MB/s]"}},"458c9e4ce6b443e98e5f8fb1c53e5e29":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d8e59fb3dff34a4a8d422555e0aea6f7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"over
flow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4df631a1c84e46ce8b342d9ef832a740":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"d91a021523834b389c9ce9c8f9cec2bb":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"e8606c142c2e4bf39002074eacde19e8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"cdab68a2574c4f8796a9555e563
6389b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d502ee27f9d441e49b9e52d6c4344a22":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"7727c5cc79c747ddba6c3fba9adedd03":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ccf60a63a5d740f7a4a937d00d12298c","IPY_MODEL_522456874d164a9491bc74b16283c957","IPY_MODEL_b95b1fe20f094222b6406f553bd013c8"],"layout":"IPY_MODEL_fa14c7d360444c12b
945e6acb210d1bc"}},"ccf60a63a5d740f7a4a937d00d12298c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1b05a64ef6544cb8af0d833659dc7508","placeholder":"","style":"IPY_MODEL_e4ec51cc4ac54d05abfc1eadbfae1bb0","value":"en-ur/test-00000-of-00001.parquet: 100%"}},"522456874d164a9491bc74b16283c957":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3a78306c9ed445f7a0b0204d7dbf98d2","max":300728,"min":0,"orientation":"horizontal","style":"IPY_MODEL_6bb9ddb68e014790b2ce6f3b81814a82","value":300728}},"b95b1fe20f094222b6406f553bd013c8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a04f65b9c24c4a9c9ff57452d4142902","placeholder":"","style":"IPY_MODEL_6008b4c3a29f4ab29a6aa97f0fd40745","value":" 301k/301k [00:01<00:00, 
192kB/s]"}},"fa14c7d360444c12b945e6acb210d1bc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1b05a64ef6544cb8af0d833659dc7508":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e4ec51cc4ac54d05abfc1eadbfae1bb0":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3a78306c9ed445f7a0b0204d7dbf98d2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6bb9ddb68e014790b2ce6f3b81814a82":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a04f65b9c24c4a9c9ff57452d4142902":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6008b4c3a29f4ab29a6aa97f0fd40745":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"98d2c90e3255476c868ce2aec11bd40f":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_5ff6b67810814fceabd7655048f3edf6","IPY_MODEL_fc5319fd0372476cbe90c7c6e66f6bf6","IPY_MODEL_33e3b83f9e8f4066b4f9dd38f86a5f34"],"layout":"IPY_MODEL_7557fb3a682a4abba297ee872c243c83"}
},"5ff6b67810814fceabd7655048f3edf6":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3696d6c431184ac2a787e8be379885e3","placeholder":"","style":"IPY_MODEL_39796bec65a74876b423a5beefc37e83","value":"en-ur/train-00000-of-00001.parquet: 100%"}},"fc5319fd0372476cbe90c7c6e66f6bf6":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ce9ec3c5bbab4d9c8e56d556421b6645","max":147739018,"min":0,"orientation":"horizontal","style":"IPY_MODEL_316db49fe7cf43489e3ca58a9d313a2a","value":147739018}},"33e3b83f9e8f4066b4f9dd38f86a5f34":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d1213b9c69284d16883f408f5b1f8048","placeholder":"","style":"IPY_MODEL_68256b9bfd2f4d848aea4a6174ce2028","value":" 148M/148M [00:03<00:00, 
103MB/s]"}},"7557fb3a682a4abba297ee872c243c83":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3696d6c431184ac2a787e8be379885e3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"39796bec65a74876b423a5beefc37e83":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ce9ec3c5bbab4d9c8e56d556421b6645":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"316db49fe7cf43489e3ca58a9d313a2a":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"d1213b9c69284d16883f408f5b1f8048":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"68256b9bfd2f4d848aea4a6174ce2028":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"b3876e04d1d844738ee51613344e4739":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_7380a8c67ceb4857b5d9ffc66a554630","IPY_MODEL_b00a302b235145a8855dc91b74b282bd","IPY_MODEL_4dba981f0e6748168c4e86b2925d324b"],"layout":"IPY_MODEL_63c6aa1173c54d49b3eb0d805d7c477b"}
},"7380a8c67ceb4857b5d9ffc66a554630":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b78945a747ce40eb95756149fbfb95cc","placeholder":"","style":"IPY_MODEL_a662d27f646f4326b3469543f5d2a63c","value":"en-ur/validation-00000-of-00001.parquet: 100%"}},"b00a302b235145a8855dc91b74b282bd":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_b21ffa7749674a67a61b0a7fa1ba99f7","max":296298,"min":0,"orientation":"horizontal","style":"IPY_MODEL_d9b9049a68b449df87dad7775d845ee7","value":296298}},"4dba981f0e6748168c4e86b2925d324b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f6a6e8f216e84720ba36b15d4d7ac81e","placeholder":"","style":"IPY_MODEL_ac4b27ebe30d42e9ba07cb23647ebc27","value":" 296k/296k [00:00<00:00, 
394kB/s]"}},"63c6aa1173c54d49b3eb0d805d7c477b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b78945a747ce40eb95756149fbfb95cc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a662d27f646f4326b3469543f5d2a63c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"b21ffa7749674a67a61b0a7fa1ba99f7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d9b9049a68b449df87dad7775d845ee7":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"f6a6e8f216e84720ba36b15d4d7ac81e":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ac4b27ebe30d42e9ba07cb23647ebc27":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"417c9e6a1dd74232819a230cf8d99943":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_42267e92152e473897fd47343f02b283","IPY_MODEL_f73dff27cf9c4eb1834c4b2edba6ac7e","IPY_MODEL_03bd7d8f89774129b1252f0aa3ce6fa0"],"layout":"IPY_MODEL_d61a75049aa04f7eb44b63c8134de3a6"}
},"42267e92152e473897fd47343f02b283":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_ea8e3f0d26ee4b9aa91028882ae7c1ea","placeholder":"","style":"IPY_MODEL_bdcfae8bc78542f982ddff5e76aca0f3","value":"Generating test split: 100%"}},"f73dff27cf9c4eb1834c4b2edba6ac7e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_fbe8e753526a494f960e0e23a76d19ca","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_69df32f2825144f8b398f1fd9583a8be","value":2000}},"03bd7d8f89774129b1252f0aa3ce6fa0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_dbcbeefa1f05437d834fda23b2e93d5a","placeholder":"","style":"IPY_MODEL_f8a1cc4691eb412583c3aec6513ce447","value":" 2000/2000 [00:00<00:00, 48086.86 
examples/s]"}},"d61a75049aa04f7eb44b63c8134de3a6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ea8e3f0d26ee4b9aa91028882ae7c1ea":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bdcfae8bc78542f982ddff5e76aca0f3":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"fbe8e753526a494f960e0e23a76d19ca":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"69df32f2825144f8b398f1fd9583a8be":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"dbcbeefa1f05437d834fda23b2e93d5a":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f8a1cc4691eb412583c3aec6513ce447":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f3cd3e9c0c0a420a87b80d32df51a888":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_651b748ffbcf4b66b19cc6ce63e34e7a","IPY_MODEL_2fb00de56f9d40ffab06bde4e09095bd","IPY_MODEL_7e81d321a5c84fbb8e6d60856e16b7a7"],"layout":"IPY_MODEL_513556bb91324ecdad415564f9b6ab2
d"}},"651b748ffbcf4b66b19cc6ce63e34e7a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_63133d05ec304cfaac336b346308ce6d","placeholder":"","style":"IPY_MODEL_061019974f254fd7afc9ca4fe05b3831","value":"Generating train split: 100%"}},"2fb00de56f9d40ffab06bde4e09095bd":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_d8c92679f4e2420e89da403a69b75a5b","max":753913,"min":0,"orientation":"horizontal","style":"IPY_MODEL_52dfaca0539f46218aec0ee50731b4bd","value":753913}},"7e81d321a5c84fbb8e6d60856e16b7a7":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2b7c3d1a4fb2400baf9ca96a69649b57","placeholder":"","style":"IPY_MODEL_0217f1e38530403685127fdd120b6ed0","value":" 753913/753913 [00:01<00:00, 727575.52 
examples/s]"}},"513556bb91324ecdad415564f9b6ab2d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"63133d05ec304cfaac336b346308ce6d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"061019974f254fd7afc9ca4fe05b3831":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"d8c92679f4e2420e89da403a69b75a5b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"52dfaca0539f46218aec0ee50731b4bd":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"2b7c3d1a4fb2400baf9ca96a69649b57":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0217f1e38530403685127fdd120b6ed0":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c86c02301e7144cabab7b596c855c922":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_97a2cd7cb62f41e3b3c8a4a5a31697b2","IPY_MODEL_a92af3d602c541488d20feeefa361d9c","IPY_MODEL_9149268626854fdbae9a2c7c36f5ef15"],"layout":"IPY_MODEL_cab3bc9f5746483cad0ed7fcd532fc7
3"}},"97a2cd7cb62f41e3b3c8a4a5a31697b2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f0db55c4f25249019e5d7a2691e50366","placeholder":"","style":"IPY_MODEL_4bd624c5737e4adc87d41ef527720016","value":"Generating validation split: 100%"}},"a92af3d602c541488d20feeefa361d9c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_cc918e93af824c18adefb3fdf4b0f307","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_cf790630f3254fcea11210ca25667755","value":2000}},"9149268626854fdbae9a2c7c36f5ef15":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b222362d13e447d1873929f6f477084c","placeholder":"","style":"IPY_MODEL_7af0010ef5f646c89a932b9299a948ee","value":" 2000/2000 [00:00<00:00, 139973.44 
examples/s]"}},"cab3bc9f5746483cad0ed7fcd532fc73":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f0db55c4f25249019e5d7a2691e50366":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4bd624c5737e4adc87d41ef527720016":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"cc918e93af824c18adefb3fdf4b0f307":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cf790630f3254fcea11210ca25667755":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"b222362d13e447d1873929f6f477084c":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7af0010ef5f646c89a932b9299a948ee":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
|
Vietnamese_English_Transformer.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"cells":[{"cell_type":"markdown","metadata":{"id":"Y6Xiq5X1qAJ3"},"source":["# Vietnamese-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Vietnamese (vi) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **ALT (Asian Language Treebank)** dataset for Vietnamese-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"Y6Xiq5X1qAJ3"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"EH9of9XXqAJ5","executionInfo":{"status":"ok","timestamp":1770443538034,"user_tz":-420,"elapsed":14177,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"cff84c10-22ef-4cb6-f4c9-fcdc7ce7a7a0"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"EH9of9XXqAJ5"},{"cell_type":"markdown","metadata":{"id":"Xlli4N5iqAJ6"},"source":["## 2. 
Data Loading (ALT Dataset)\n","Loading Vietnamese-English pairs from ALT."],"id":"Xlli4N5iqAJ6"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":690,"referenced_widgets":["fb20b1e521b64aa1962f136fd63bbf9d","89d6648073bd422c9277abbe2fece8b8","929200ce40424a8a98aae0611dc3777a","cc1d34e078e544d1a5e5424aeca7f8b0","1f4431edf035470b82cfe98efcff7695","76b3fcfa58a64d24aefa15e4ecb2ce3b","d6e25207451a45fab08d448108835294","4a1909474d084d68a660bcf2422c58d8","bb784334e7ce4b9399f25920f4807366","e398947cea754173ab48c3dfdaf0d87f","caf41e44597047429d0be0ec9e2067c5","a15308a64214421bae1d2e0f0a862aa8","ab8b4fae3fcf4140912a978b3323adb5","0ce6f6b343d74a84ad68ad9488bd2a89","49cc8ebaaa1e437cbcbc8acfe813b4ef","103b3787e870426fbc6ff55ba1e40e3b","5f30b380e9df4de4a08c3b0a04625e79","18d5b221667e4d8eb534f99832765915","34b3dd2221554eb0b513d1d4853f04a3","7e940d9ee70449fb9556ff15bc7e2397","845d27d0d0f84dce84df211e959cc823","b6463d48ff6d4202b41be959f9c50268","401b5d9999f043a98772faf9b8a8f197","93f592745f444769bc86e9f9dd875e3f","dfa0355de7094119aa7465c6525dbe4a","fda7062fa4104029839de98df15a391d","7fa7685332fb41d79a6a61fbfea90ae0","83429e92713c453bbd36c47ed9150e44","4c2e26a4ddf742268c223fc17185919d","9499f61b2e044d8a9b196cc0f69e56c9","4988cd6403fe4808b685900ffd10a347","c11064e79ad84bb6ac585401b9decbb5","5e7852f70d304162b5cbb1424813ad92","5928fc881dc54140b47f4a98ae76a3b4","1770cefadd524409837537a1c2743153","552cb64ab51b4c038f7065d5ece72ad4","398ffcf356c14086a94fe4fc102ea618","c3b786bbff9c485f9c74ac3dae7857be","2efa929d583b4716a5bdf5496583d6b5","ddb822e7b8d947b4a59430c805653c3b","e7cac3c07a8e47179c2ca3a308dda345","7961adc6d65d42308947ff3d1c6d8c8e","dac510cdf0fc4e22866eadf9c0294f06","68148c6a30e34e96aa39aebb31be1c43","0c9399c9b77b414db4ef4b2d3a34f46e","456000f0d012409db8c3077e7278e499","08253a1253c3451b943560fc197352e2","fa202bbd9af449d6b65ec9f2b590c2b1","62d00b422ac44a07a2b72fbb22ecb1aa","1303c6f4973947dda8bf925b9a2c6aa9","151cfdb17570418ba52
3cd6e06e78685","c96e992a7e22432a897c7cd239b930a8","cfc12ece33a94fe4b34eae0c60d565a4","9203996d133f4d50a821f122aea5f75a","70e20e6d36a54c719b1cf1b4e5674acf","78b2cc67439d4cbf92e038eddd3f161b","ae68c0a697e14aaa86c9d0882a0ad3dc","f83e25cbf112462895c7bf73679bb347","0446501b448f475ab2dd1ac31e271df8","affae7b453604976aeee265fe28b337d","da49068b06c54390937853d411badb7a","329fbecbfe054fa6ab310f380e01779e","691731bd2d8d42e694f0d43aec6fc0e5","c2ffb0dd03664863973ee3362a99b63b","e18a194feab84ff885559e97433c0406","b309512414e2439fbdc4e92208646853","1f113a7f71b84b2d89de2b461cfa4daa","e5bcb09ac5294ef8afed6e13d3297c13","796c0bd2f45042f3aed902a50c7d8278","0ff64bbe47a5407fa961e979c93e2cdc","31e3616994124d17883f1361fd733633","f44ab67a089c464c95321c33cdaa2b88","aafe20126db3455495ac520639f536f1","74e314998e414ce9baba95bef1c09e56","e0d16a103b944bd4841f62c47557bc59","b43dba75938d4af58da0e016d5193054","92b02abc4b2b42ccb5c095013df90af9"]},"id":"wAS6astuqAJ6","executionInfo":{"status":"ok","timestamp":1770443551213,"user_tz":-420,"elapsed":13176,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"d9472227-5475-490a-bdfe-f07c701d947e"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading ALT Dataset...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"fb20b1e521b64aa1962f136fd63bbf9d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/train-00000-of-00001.parque(…): 0%| | 0.00/31.2M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a15308a64214421bae1d2e0f0a862aa8"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/validation-00000-of-00001.p(…): 0%| | 0.00/1.71M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"401b5d9999f043a98772faf9b8a8f197"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/test-00000-of-00001.parquet: 0%| | 0.00/1.79M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"5928fc881dc54140b47f4a98ae76a3b4"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/18088 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"0c9399c9b77b414db4ef4b2d3a34f46e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/1000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"78b2cc67439d4cbf92e038eddd3f161b"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/1019 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1f113a7f71b84b2d89de2b461cfa4daa"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 20107 sentences from ALT.\n","Extracted 20107 Vietnamese-English pairs.\n"]}],"source":["print(\"Loading ALT Dataset...\")\n","try:\n"," # ALT has 'vi' for Vietnamese\n"," dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from ALT.\")\n","\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'vi' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'vi': item['translation']['vi'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," print(f\"Extracted {len(data)} Vietnamese-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"wAS6astuqAJ6"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VzmCBLvzqAJ6","executionInfo":{"status":"ok","timestamp":1770443551236,"user_tz":-420,"elapsed":19,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"c1c6bdda-a123-41d0-8ac7-d838c096f534"},"outputs":[{"output_type":"stream","name":"stdout","text":[" vi \\\n","0 Ý đã đánh bại Bồ Đào Nha với tỉ số 31-5 ở Bảng... \n","1 Andrea Maisi đã mở tỉ số cho Ý ở phút thứ tư v... \n","2 Chiếm thế áp đảo trong hầu hết hiệp đầu nhưng ... \n","3 Bồ Đào Nha chưa bao giờ từ bỏ và David Penalva... \n","4 Ý đã dẫn 16-5 ở hiệp đầu nhưng ngang sức với B... \n","\n"," en \n","0 Italy have defeated Portugal 31-5 in Pool C of... \n","1 Andrea Masi opened the scoring in the fourth m... \n","2 Despite controlling the game for much of the f... \n","3 Portugal never gave up and David Penalva score... \n","4 Italy led 16-5 at half time but were matched b... 
\n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['vi', 'en'])\n","df['vi'] = df['vi'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['vi'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"VzmCBLvzqAJ6"},{"cell_type":"markdown","metadata":{"id":"A7u71TB0qAJ6"},"source":["## 3. Tokenization"],"id":"A7u71TB0qAJ6"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"O1KJ-ZMaqAJ6","executionInfo":{"status":"ok","timestamp":1770443554794,"user_tz":-420,"elapsed":3557,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"8b6ed1e3-dc97-48a9-f05c-905866f5da07"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Vietnamese Tokenizer...\n","Training English Tokenizer (for Vietnamese pair)...\n"]}],"source":["# Save texts to files\n","with open('train_vi.txt', 'w', encoding='utf-8') as f:\n"," for line in df['vi']: f.write(line + '\\n')\n","\n","with open('train_en_vi.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Vietnamese Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_vi.txt',\n"," model_prefix='spm_vi',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Vietnamese pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_vi.txt',\n"," model_prefix='spm_en_vi',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_vi.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_vi.model')"],"id":"O1KJ-ZMaqAJ6"},{"cell_type":"markdown","metadata":{"id":"MA5tAC0jqAJ6"},"source":["## 4. 
Dataset & Model"],"id":"MA5tAC0jqAJ6"},{"cell_type":"code","execution_count":5,"metadata":{"id":"w__12RzXqAJ6","executionInfo":{"status":"ok","timestamp":1770443554795,"user_tz":-420,"elapsed":5,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['vi']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"w__12RzXqAJ6"},{"cell_type":"code","execution_count":6,"metadata":{"id":"BILlnw20qAJ7","executionInfo":{"status":"ok","timestamp":1770443554797,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"BILlnw20qAJ7"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5PBRBuZgqAJ7","executionInfo":{"status":"ok","timestamp":1770444978379,"user_tz":-420,"elapsed":1423582,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"1a41f621-2330-4305-915c-a903f787e271"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting 
Training...\n","Step 0, Loss: 9.199\n","Step 100, Loss: 6.805\n","Step 200, Loss: 6.359\n","Step 300, Loss: 6.330\n","Epoch 1 Loss: 6.723\n","Step 0, Loss: 6.117\n","Step 100, Loss: 6.091\n","Step 200, Loss: 5.959\n","Step 300, Loss: 5.850\n","Epoch 2 Loss: 6.015\n","Step 0, Loss: 5.786\n","Step 100, Loss: 5.669\n","Step 200, Loss: 5.571\n","Step 300, Loss: 5.395\n","Epoch 3 Loss: 5.567\n","Step 0, Loss: 5.350\n","Step 100, Loss: 5.146\n","Step 200, Loss: 5.098\n","Step 300, Loss: 5.085\n","Epoch 4 Loss: 5.205\n","Step 0, Loss: 5.094\n","Step 100, Loss: 4.863\n","Step 200, Loss: 4.994\n","Step 300, Loss: 4.943\n","Epoch 5 Loss: 4.930\n","Step 0, Loss: 4.727\n","Step 100, Loss: 4.725\n","Step 200, Loss: 4.634\n","Step 300, Loss: 4.705\n","Epoch 6 Loss: 4.712\n","Step 0, Loss: 4.487\n","Step 100, Loss: 4.530\n","Step 200, Loss: 4.572\n","Step 300, Loss: 4.625\n","Epoch 7 Loss: 4.532\n","Step 0, Loss: 4.232\n","Step 100, Loss: 4.386\n","Step 200, Loss: 4.404\n","Step 300, Loss: 4.404\n","Epoch 8 Loss: 4.374\n","Step 0, Loss: 4.188\n","Step 100, Loss: 4.158\n","Step 200, Loss: 4.067\n","Step 300, Loss: 4.373\n","Epoch 9 Loss: 4.237\n","Step 0, Loss: 4.073\n","Step 100, Loss: 4.177\n","Step 200, Loss: 4.128\n","Step 300, Loss: 4.088\n","Epoch 10 Loss: 4.114\n","Step 0, Loss: 3.954\n","Step 100, Loss: 4.134\n","Step 200, Loss: 4.026\n","Step 300, Loss: 4.075\n","Epoch 11 Loss: 4.004\n","Step 0, Loss: 3.807\n","Step 100, Loss: 3.826\n","Step 200, Loss: 3.945\n","Step 300, Loss: 3.993\n","Epoch 12 Loss: 3.904\n","Step 0, Loss: 3.667\n","Step 100, Loss: 3.773\n","Step 200, Loss: 3.857\n","Step 300, Loss: 3.869\n","Epoch 13 Loss: 3.809\n","Step 0, Loss: 3.582\n","Step 100, Loss: 3.760\n","Step 200, Loss: 3.771\n","Step 300, Loss: 3.693\n","Epoch 14 Loss: 3.722\n","Step 0, Loss: 3.460\n","Step 100, Loss: 3.688\n","Step 200, Loss: 3.798\n","Step 300, Loss: 3.677\n","Epoch 15 Loss: 3.643\n","Step 0, Loss: 3.350\n","Step 100, Loss: 3.528\n","Step 200, Loss: 3.602\n","Step 300, 
Loss: 3.594\n","Epoch 16 Loss: 3.570\n","Step 0, Loss: 3.277\n","Step 100, Loss: 3.365\n","Step 200, Loss: 3.505\n","Step 300, Loss: 3.669\n","Epoch 17 Loss: 3.502\n","Step 0, Loss: 3.241\n","Step 100, Loss: 3.465\n","Step 200, Loss: 3.505\n","Step 300, Loss: 3.534\n","Epoch 18 Loss: 3.439\n","Step 0, Loss: 3.272\n","Step 100, Loss: 3.327\n","Step 200, Loss: 3.366\n","Step 300, Loss: 3.471\n","Epoch 19 Loss: 3.375\n","Step 0, Loss: 3.175\n","Step 100, Loss: 3.368\n","Step 200, Loss: 3.371\n","Step 300, Loss: 3.524\n","Epoch 20 Loss: 3.320\n","Step 0, Loss: 3.176\n","Step 100, Loss: 3.242\n","Step 200, Loss: 3.334\n","Step 300, Loss: 3.289\n","Epoch 21 Loss: 3.268\n","Step 0, Loss: 3.054\n","Step 100, Loss: 3.189\n","Step 200, Loss: 3.286\n","Step 300, Loss: 3.345\n","Epoch 22 Loss: 3.220\n","Step 0, Loss: 2.885\n","Step 100, Loss: 3.224\n","Step 200, Loss: 3.267\n","Step 300, Loss: 3.370\n","Epoch 23 Loss: 3.172\n","Step 0, Loss: 2.916\n","Step 100, Loss: 3.072\n","Step 200, Loss: 3.271\n","Step 300, Loss: 3.285\n","Epoch 24 Loss: 3.126\n","Step 0, Loss: 2.876\n","Step 100, Loss: 2.970\n","Step 200, Loss: 3.283\n","Step 300, Loss: 3.254\n","Epoch 25 Loss: 3.086\n","Step 0, Loss: 2.885\n","Step 100, Loss: 2.799\n","Step 200, Loss: 3.193\n","Step 300, Loss: 3.138\n","Epoch 26 Loss: 3.047\n","Step 0, Loss: 2.969\n","Step 100, Loss: 2.899\n","Step 200, Loss: 3.050\n","Step 300, Loss: 3.159\n","Epoch 27 Loss: 3.010\n","Step 0, Loss: 2.920\n","Step 100, Loss: 2.789\n","Step 200, Loss: 3.038\n","Step 300, Loss: 2.991\n","Epoch 28 Loss: 2.972\n","Step 0, Loss: 2.781\n","Step 100, Loss: 2.905\n","Step 200, Loss: 2.962\n","Step 300, Loss: 2.985\n","Epoch 29 Loss: 2.941\n","Step 0, Loss: 2.769\n","Step 100, Loss: 2.839\n","Step 200, Loss: 2.968\n","Step 300, Loss: 3.039\n","Epoch 30 Loss: 2.908\n","Step 0, Loss: 2.646\n","Step 100, Loss: 2.901\n","Step 200, Loss: 2.838\n","Step 300, Loss: 3.146\n","Epoch 31 Loss: 2.877\n","Step 0, Loss: 2.704\n","Step 100, Loss: 
2.683\n","Step 200, Loss: 2.751\n","Step 300, Loss: 2.991\n","Epoch 32 Loss: 2.848\n","Step 0, Loss: 2.643\n","Step 100, Loss: 2.790\n","Step 200, Loss: 2.930\n","Step 300, Loss: 2.879\n","Epoch 33 Loss: 2.816\n","Step 0, Loss: 2.700\n","Step 100, Loss: 2.728\n","Step 200, Loss: 2.832\n","Step 300, Loss: 2.819\n","Epoch 34 Loss: 2.791\n","Step 0, Loss: 2.608\n","Step 100, Loss: 2.701\n","Step 200, Loss: 2.897\n","Step 300, Loss: 2.904\n","Epoch 35 Loss: 2.766\n","Step 0, Loss: 2.662\n","Step 100, Loss: 2.692\n","Step 200, Loss: 2.754\n","Step 300, Loss: 2.699\n","Epoch 36 Loss: 2.741\n","Step 0, Loss: 2.577\n","Step 100, Loss: 2.596\n","Step 200, Loss: 2.766\n","Step 300, Loss: 2.778\n","Epoch 37 Loss: 2.718\n","Step 0, Loss: 2.515\n","Step 100, Loss: 2.777\n","Step 200, Loss: 2.766\n","Step 300, Loss: 2.710\n","Epoch 38 Loss: 2.696\n","Step 0, Loss: 2.627\n","Step 100, Loss: 2.528\n","Step 200, Loss: 2.744\n","Step 300, Loss: 2.689\n","Epoch 39 Loss: 2.670\n","Step 0, Loss: 2.468\n","Step 100, Loss: 2.600\n","Step 200, Loss: 2.690\n","Step 300, Loss: 2.738\n","Epoch 40 Loss: 2.652\n","Step 0, Loss: 2.496\n","Step 100, Loss: 2.726\n","Step 200, Loss: 2.581\n","Step 300, Loss: 2.683\n","Epoch 41 Loss: 2.633\n","Step 0, Loss: 2.477\n","Step 100, Loss: 2.534\n","Step 200, Loss: 2.756\n","Step 300, Loss: 2.716\n","Epoch 42 Loss: 2.607\n","Step 0, Loss: 2.439\n","Step 100, Loss: 2.644\n","Step 200, Loss: 2.660\n","Step 300, Loss: 2.713\n","Epoch 43 Loss: 2.593\n","Step 0, Loss: 2.510\n","Step 100, Loss: 2.524\n","Step 200, Loss: 2.621\n","Step 300, Loss: 2.761\n","Epoch 44 Loss: 2.569\n","Step 0, Loss: 2.446\n","Step 100, Loss: 2.339\n","Step 200, Loss: 2.598\n","Step 300, Loss: 2.721\n","Epoch 45 Loss: 2.553\n","Step 0, Loss: 2.442\n","Step 100, Loss: 2.435\n","Step 200, Loss: 2.676\n","Step 300, Loss: 2.520\n","Epoch 46 Loss: 2.537\n","Step 0, Loss: 2.414\n","Step 100, Loss: 2.367\n","Step 200, Loss: 2.617\n","Step 300, Loss: 2.624\n","Epoch 47 Loss: 2.521\n","Step 0, 
Loss: 2.365\n","Step 100, Loss: 2.521\n","Step 200, Loss: 2.483\n","Step 300, Loss: 2.530\n","Epoch 48 Loss: 2.498\n","Step 0, Loss: 2.389\n","Step 100, Loss: 2.312\n","Step 200, Loss: 2.437\n","Step 300, Loss: 2.613\n","Epoch 49 Loss: 2.485\n","Step 0, Loss: 2.360\n","Step 100, Loss: 2.506\n","Step 200, Loss: 2.559\n","Step 300, Loss: 2.589\n","Epoch 50 Loss: 2.472\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(50): # 50 Epochs for ALT\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_vi.pt')"],"id":"5PBRBuZgqAJ7"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cXFPvAxTqAJ7","executionInfo":{"status":"ok","timestamp":1770444978451,"user_tz":-420,"elapsed":54,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"f48d07a6-5625-4afc-f4ed-ac999485ae4f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_vi.pt', 'app/models/transformer_model_vi.pt')\n","shutil.copy('spm_vi.model', 'app/models/spm_vi.model')\n","shutil.copy('spm_en_vi.model', 
'app/models/spm_en_vi.model')\n","print(\"Models copied to app/models/\")"],"id":"cXFPvAxTqAJ7"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"fb20b1e521b64aa1962f136fd63bbf9d":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_89d6648073bd422c9277abbe2fece8b8","IPY_MODEL_929200ce40424a8a98aae0611dc3777a","IPY_MODEL_cc1d34e078e544d1a5e5424aeca7f8b0"],"layout":"IPY_MODEL_1f4431edf035470b82cfe98efcff7695"}},"89d6648073bd422c9277abbe2fece8b8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_76b3fcfa58a64d24aefa15e4ecb2ce3b","placeholder":"","style":"IPY_MODEL_d6e25207451a45fab08d448108835294","value":"README.md: 
"}},"929200ce40424a8a98aae0611dc3777a":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_4a1909474d084d68a660bcf2422c58d8","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_bb784334e7ce4b9399f25920f4807366","value":1}},"cc1d34e078e544d1a5e5424aeca7f8b0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e398947cea754173ab48c3dfdaf0d87f","placeholder":"","style":"IPY_MODEL_caf41e44597047429d0be0ec9e2067c5","value":" 13.2k/? 
[00:00<00:00, 841kB/s]"}},"1f4431edf035470b82cfe98efcff7695":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"76b3fcfa58a64d24aefa15e4ecb2ce3b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overf
low_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d6e25207451a45fab08d448108835294":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"4a1909474d084d68a660bcf2422c58d8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"bb784334e7ce4b9399f25920f4807366":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e398947cea754173ab48c3dfdaf0
d87f":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"caf41e44597047429d0be0ec9e2067c5":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a15308a64214421bae1d2e0f0a862aa8":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ab8b4fae3fcf4140912a978b3323adb5","IPY_MODEL_0ce6f6b343d74a84ad68ad9488bd2a89","IPY_MODEL_49cc8ebaaa1e437cbcbc8acfe813b4ef"],"layout":"IPY_MODEL_103b3787e870426fbc
6ff55ba1e40e3b"}},"ab8b4fae3fcf4140912a978b3323adb5":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_5f30b380e9df4de4a08c3b0a04625e79","placeholder":"","style":"IPY_MODEL_18d5b221667e4d8eb534f99832765915","value":"alt-parallel/train-00000-of-00001.parque(…): 100%"}},"0ce6f6b343d74a84ad68ad9488bd2a89":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_34b3dd2221554eb0b513d1d4853f04a3","max":31211167,"min":0,"orientation":"horizontal","style":"IPY_MODEL_7e940d9ee70449fb9556ff15bc7e2397","value":31211167}},"49cc8ebaaa1e437cbcbc8acfe813b4ef":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_845d27d0d0f84dce84df211e959cc823","placeholder":"","style":"IPY_MODEL_b6463d48ff6d4202b41be959f9c50268","value":" 31.2M/31.2M [00:01<00:00, 
11.6MB/s]"}},"103b3787e870426fbc6ff55ba1e40e3b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5f30b380e9df4de4a08c3b0a04625e79":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"18d5b221667e4d8eb534f99832765915":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"34b3dd2221554eb0b513d1d4853f04a3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7e940d9ee70449fb9556ff15bc7e2397":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"845d27d0d0f84dce84df211e959cc823":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b6463d48ff6d4202b41be959f9c50268":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"401b5d9999f043a98772faf9b8a8f197":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_93f592745f444769bc86e9f9dd875e3f","IPY_MODEL_dfa0355de7094119aa7465c6525dbe4a","IPY_MODEL_fda7062fa4104029839de98df15a391d"],"layout":"IPY_MODEL_7fa7685332fb41d79a6a61fbfea90ae0"
}},"93f592745f444769bc86e9f9dd875e3f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_83429e92713c453bbd36c47ed9150e44","placeholder":"","style":"IPY_MODEL_4c2e26a4ddf742268c223fc17185919d","value":"alt-parallel/validation-00000-of-00001.p(…): 100%"}},"dfa0355de7094119aa7465c6525dbe4a":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_9499f61b2e044d8a9b196cc0f69e56c9","max":1710203,"min":0,"orientation":"horizontal","style":"IPY_MODEL_4988cd6403fe4808b685900ffd10a347","value":1710203}},"fda7062fa4104029839de98df15a391d":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c11064e79ad84bb6ac585401b9decbb5","placeholder":"","style":"IPY_MODEL_5e7852f70d304162b5cbb1424813ad92","value":" 1.71M/1.71M [00:00<00:00, 
2.43MB/s]"}},"7fa7685332fb41d79a6a61fbfea90ae0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"83429e92713c453bbd36c47ed9150e44":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4c2e26a4ddf742268c223fc17185919d":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9499f61b2e044d8a9b196cc0f69e56c9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4988cd6403fe4808b685900ffd10a347":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"c11064e79ad84bb6ac585401b9decbb5":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5e7852f70d304162b5cbb1424813ad92":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"5928fc881dc54140b47f4a98ae76a3b4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_1770cefadd524409837537a1c2743153","IPY_MODEL_552cb64ab51b4c038f7065d5ece72ad4","IPY_MODEL_398ffcf356c14086a94fe4fc102ea618"],"layout":"IPY_MODEL_c3b786bbff9c485f9c74ac3dae7857be"
}},"1770cefadd524409837537a1c2743153":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2efa929d583b4716a5bdf5496583d6b5","placeholder":"","style":"IPY_MODEL_ddb822e7b8d947b4a59430c805653c3b","value":"alt-parallel/test-00000-of-00001.parquet: 100%"}},"552cb64ab51b4c038f7065d5ece72ad4":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e7cac3c07a8e47179c2ca3a308dda345","max":1786537,"min":0,"orientation":"horizontal","style":"IPY_MODEL_7961adc6d65d42308947ff3d1c6d8c8e","value":1786537}},"398ffcf356c14086a94fe4fc102ea618":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_dac510cdf0fc4e22866eadf9c0294f06","placeholder":"","style":"IPY_MODEL_68148c6a30e34e96aa39aebb31be1c43","value":" 1.79M/1.79M [00:00<00:00, 
2.16MB/s]"}},"c3b786bbff9c485f9c74ac3dae7857be":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2efa929d583b4716a5bdf5496583d6b5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ddb822e7b8d947b4a59430c805653c3b":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e7cac3c07a8e47179c2ca3a308dda345":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7961adc6d65d42308947ff3d1c6d8c8e":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"dac510cdf0fc4e22866eadf9c0294f06":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"68148c6a30e34e96aa39aebb31be1c43":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"0c9399c9b77b414db4ef4b2d3a34f46e":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_456000f0d012409db8c3077e7278e499","IPY_MODEL_08253a1253c3451b943560fc197352e2","IPY_MODEL_fa202bbd9af449d6b65ec9f2b590c2b1"],"layout":"IPY_MODEL_62d00b422ac44a07a2b72fbb22ecb1aa"
}},"456000f0d012409db8c3077e7278e499":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1303c6f4973947dda8bf925b9a2c6aa9","placeholder":"","style":"IPY_MODEL_151cfdb17570418ba523cd6e06e78685","value":"Generating train split: 100%"}},"08253a1253c3451b943560fc197352e2":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c96e992a7e22432a897c7cd239b930a8","max":18088,"min":0,"orientation":"horizontal","style":"IPY_MODEL_cfc12ece33a94fe4b34eae0c60d565a4","value":18088}},"fa202bbd9af449d6b65ec9f2b590c2b1":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9203996d133f4d50a821f122aea5f75a","placeholder":"","style":"IPY_MODEL_70e20e6d36a54c719b1cf1b4e5674acf","value":" 18088/18088 [00:00<00:00, 62254.98 
examples/s]"}},"62d00b422ac44a07a2b72fbb22ecb1aa":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1303c6f4973947dda8bf925b9a2c6aa9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"151cfdb17570418ba523cd6e06e78685":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c96e992a7e22432a897c7cd239b930a8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cfc12ece33a94fe4b34eae0c60d565a4":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"9203996d133f4d50a821f122aea5f75a":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"70e20e6d36a54c719b1cf1b4e5674acf":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"78b2cc67439d4cbf92e038eddd3f161b":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ae68c0a697e14aaa86c9d0882a0ad3dc","IPY_MODEL_f83e25cbf112462895c7bf73679bb347","IPY_MODEL_0446501b448f475ab2dd1ac31e271df8"],"layout":"IPY_MODEL_affae7b453604976aeee265fe28b337
d"}},"ae68c0a697e14aaa86c9d0882a0ad3dc":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_da49068b06c54390937853d411badb7a","placeholder":"","style":"IPY_MODEL_329fbecbfe054fa6ab310f380e01779e","value":"Generating validation split: 100%"}},"f83e25cbf112462895c7bf73679bb347":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_691731bd2d8d42e694f0d43aec6fc0e5","max":1000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_c2ffb0dd03664863973ee3362a99b63b","value":1000}},"0446501b448f475ab2dd1ac31e271df8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e18a194feab84ff885559e97433c0406","placeholder":"","style":"IPY_MODEL_b309512414e2439fbdc4e92208646853","value":" 1000/1000 [00:00<00:00, 25237.25 
examples/s]"}},"affae7b453604976aeee265fe28b337d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"da49068b06c54390937853d411badb7a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"329fbecbfe054fa6ab310f380e01779e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"691731bd2d8d42e694f0d43aec6fc0e5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c2ffb0dd03664863973ee3362a99b63b":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e18a194feab84ff885559e97433c0406":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b309512414e2439fbdc4e92208646853":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1f113a7f71b84b2d89de2b461cfa4daa":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_e5bcb09ac5294ef8afed6e13d3297c13","IPY_MODEL_796c0bd2f45042f3aed902a50c7d8278","IPY_MODEL_0ff64bbe47a5407fa961e979c93e2cdc"],"layout":"IPY_MODEL_31e3616994124d17883f1361fd73363
3"}},"e5bcb09ac5294ef8afed6e13d3297c13":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f44ab67a089c464c95321c33cdaa2b88","placeholder":"","style":"IPY_MODEL_aafe20126db3455495ac520639f536f1","value":"Generating test split: 100%"}},"796c0bd2f45042f3aed902a50c7d8278":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_74e314998e414ce9baba95bef1c09e56","max":1019,"min":0,"orientation":"horizontal","style":"IPY_MODEL_e0d16a103b944bd4841f62c47557bc59","value":1019}},"0ff64bbe47a5407fa961e979c93e2cdc":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b43dba75938d4af58da0e016d5193054","placeholder":"","style":"IPY_MODEL_92b02abc4b2b42ccb5c095013df90af9","value":" 1019/1019 [00:00<00:00, 27799.43 
examples/s]"}},"31e3616994124d17883f1361fd733633":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f44ab67a089c464c95321c33cdaa2b88":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"aafe20126db3455495ac520639f536f1":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"74e314998e414ce9baba95bef1c09e56":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e0d16a103b944bd4841f62c47557bc59":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"b43dba75938d4af58da0e016d5193054":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"92b02abc4b2b42ccb5c095013df90af9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
|
app/.dockerignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
nllb_model/
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
.ipynb_checkpoints/
|
| 5 |
+
.DS_Store
|
| 6 |
+
venv/
|
| 7 |
+
env/
|
| 8 |
+
.env
|
app/Dockerfile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /code

# Copy the dependencies file first so the dependency layer is cached
# independently of application code changes
COPY requirements.txt .

# Install dependencies
# Upgrade pip to avoid install issues
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Create a writable directory for cache/temp files
# (used as the Hugging Face cache when the NLLB model is not pre-downloaded).
# The directory is created by root at build time; make it world-writable so the
# cache still works when the container is started under a non-root UID.
RUN mkdir -p /tmp/cache && chmod -R 777 /tmp/cache
ENV HF_HOME=/tmp/cache

# Expose the port that HuggingFace Spaces expects (7860)
EXPOSE 7860

# Run the application using Gunicorn
# Bind to 0.0.0.0:7860
# app.py loads (and possibly downloads) every model at import time, which can
# exceed Gunicorn's default 30 s worker timeout; raise it so the worker is not
# killed while booting or during a slow CPU translation.
CMD ["gunicorn", "-b", "0.0.0.0:7860", "--timeout", "300", "app:app"]
|
app/app.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import sentencepiece as spm
|
| 5 |
+
import math
|
| 6 |
+
from flask import Flask, render_template, request, jsonify
|
| 7 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 8 |
+
|
| 9 |
+
app = Flask(__name__)
|
| 10 |
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 11 |
+
|
| 12 |
+
# --- 1. Transformer from Scratch Definition ---
|
| 13 |
+
# --- 1. Transformer from Scratch Definition ---
|
| 14 |
+
class TransformationModel(nn.Module):
    """Empty placeholder module.

    Defines no layers and no ``forward``. The model that this file actually
    builds and loads checkpoints into is ``TransformerModel``.
    """
|
| 18 |
+
|
| 19 |
+
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to an input, then apply dropout.

    The table is stored as the (non-trainable) buffer ``pe`` of shape
    ``(max_len, d_model)``: even columns hold sines, odd columns cosines.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns;
        # trim so the assignment shapes match. For even d_model this slice is
        # the whole tensor, so the table is unchanged.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(1)])``.

        Dimension 1 of ``x`` is used as the position axis and the table is
        broadcast over dimension 0.
        """
        x = x + self.pe[:x.size(1), :]
        return self.dropout(x)
|
| 34 |
+
|
| 35 |
+
class TransformerModel(nn.Module):
    """Sequence-to-sequence translation model wrapping a batch-first ``nn.Transformer``.

    Source and target token ids are embedded separately, scaled by
    ``sqrt(d_model)``, passed through a shared ``PositionalEncoding`` and fed
    to the transformer; a linear layer projects the result to target-vocabulary
    logits.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 d_model=512, nhead=8, num_encoder_layers=3,
                 num_decoder_layers=3, dim_feedforward=2048, dropout=0.1, pad_idx=0):
        super().__init__()

        self.d_model = d_model
        self.pad_idx = pad_idx

        # Token embeddings (one table per side) and the shared positional encoder.
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Core encoder-decoder stack; inputs are [batch, seq, feature].
        transformer_config = dict(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.Transformer(**transformer_config)

        # Projection from model width to target-vocabulary logits.
        self.fc_out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        """Return logits for every target position.

        Args:
            src: Source token ids, ``[batch_size, src_len]``.
            trg: Target token ids, ``[batch_size, trg_len]``.
        """
        scale = math.sqrt(self.d_model)
        src_emb = self.pos_encoder(self.src_embedding(src) * scale)
        trg_emb = self.pos_encoder(self.trg_embedding(trg) * scale)

        # Source positions equal to pad_idx are hidden from encoder attention.
        # No target padding mask is passed; only the causal mask restricts the decoder.
        src_padding = (src == self.pad_idx)
        causal_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)

        decoded = self.transformer(
            src=src_emb,
            tgt=trg_emb,
            tgt_mask=causal_mask,
            src_key_padding_mask=src_padding,
        )
        return self.fc_out(decoded)
|
| 93 |
+
|
| 94 |
+
# --- 2. Load Models ---
# Filesystem locations, all resolved relative to the directory of this file.
BASE_DIR = os.path.dirname(__file__)
NLLB_PATH = os.path.join(BASE_DIR, 'nllb_model')
NLLB_PATH_SYNC = os.path.join(BASE_DIR, '../../nllb_model')
TRANSFORMER_PATH = os.path.join(BASE_DIR, 'models/transformer_model.pt')
SPM_MY_PATH = os.path.join(BASE_DIR, 'models/spm_my.model')
SPM_EN_PATH = os.path.join(BASE_DIR, 'models/spm_en.model')

# NLLB model and tokenizer; both stay None until load_nllb() succeeds.
nllb_model = nllb_tokenizer = None

# Scratch-model registries, each keyed by language code:
# the transformer, its source tokenizer and its target tokenizer.
scratch_models, sp_src_models, sp_trg_models = {}, {}, {}

# Short language code -> NLLB language tag.
NLLB_LANG_MAP = dict(
    my='mya_Mymr',
    th='tha_Thai',
    zh='zho_Hans',
    hi='hin_Deva',
    ne='npi_Deva',
    ur='urd_Arab',
    vi='vie_Latn',
    tl='tgl_Latn',
    kk='kaz_Cyrl',
    bn='ben_Beng',
    de='deu_Latn',
)
|
| 125 |
+
|
| 126 |
+
def load_nllb():
    """Load the NLLB tokenizer and model into the module-level globals.

    Lookup order:
      1. the first existing directory among ``NLLB_PATH`` and ``NLLB_PATH_SYNC``;
      2. the ``facebook/nllb-200-distilled-600M`` hub checkpoint, used when no
         local directory exists or the local copy cannot be loaded. After a
         download the model is saved to ``NLLB_PATH`` on a best-effort basis.

    The globals are assigned together, only after both tokenizer and model
    loaded successfully. This function never raises: failures are printed and
    the globals are left unchanged.
    """
    global nllb_model, nllb_tokenizer
    hub_id = "facebook/nllb-200-distilled-600M"

    def _load(source):
        # Tokenizer first, then the model moved onto the configured device.
        tokenizer = AutoTokenizer.from_pretrained(source)
        model = AutoModelForSeq2SeqLM.from_pretrained(source).to(DEVICE)
        return tokenizer, model

    try:
        print("Loading NLLB Model...")
        loaded = None

        # Check if model exists locally
        local_path = next((p for p in (NLLB_PATH, NLLB_PATH_SYNC) if os.path.exists(p)), None)
        if local_path is not None:
            print(f"Loading from {local_path}...")
            try:
                loaded = _load(local_path)
            except Exception as local_err:
                # A directory can exist yet be empty or partially written
                # (e.g. an interrupted save); do not let it block the fallback.
                print(f"Local NLLB copy at {local_path} could not be loaded ({local_err}). Falling back to download...")

        if loaded is None:
            if local_path is None:
                print("NLLB model not found locally. Downloading facebook/nllb-200-distilled-600M...")
            loaded = _load(hub_id)

            # Save for later. A read-only filesystem must not be reported as a
            # load failure: the in-memory model is still usable.
            try:
                print(f"Saving NLLB model to {NLLB_PATH}...")
                loaded[0].save_pretrained(NLLB_PATH)
                loaded[1].save_pretrained(NLLB_PATH)
            except Exception as save_err:
                print(f"Could not save NLLB model to {NLLB_PATH}: {save_err}")

        nllb_tokenizer, nllb_model = loaded
        print("NLLB Model Loaded.")
    except Exception as e:
        print(f"Failed to load NLLB Model: {e}")
|
| 150 |
+
|
| 151 |
+
def translate_nllb(text, src_lang="mya_Mymr", tgt_lang="eng_Latn"):
    """Translate ``text`` with the loaded NLLB model.

    Args:
        text: Input passed to the NLLB tokenizer.
        src_lang: NLLB tag set as the tokenizer's source language.
        tgt_lang: NLLB tag whose token id is forced as the first generated token.

    Returns:
        The first decoded sequence, or an ``"Error ..."`` string when the model
        is not loaded or translation raises.
    """
    if not (nllb_model and nllb_tokenizer):
        return "Error: NLLB Model not loaded. Please wait for the model to download or check logs."

    try:
        # The source language is configured on the shared tokenizer instance.
        nllb_tokenizer.src_lang = src_lang
        encoded = nllb_tokenizer(text, return_tensors="pt").to(DEVICE)
        target_bos_id = nllb_tokenizer.convert_tokens_to_ids(tgt_lang)

        with torch.no_grad():
            generated = nllb_model.generate(
                **encoded,
                forced_bos_token_id=target_bos_id,
                max_length=128,
            )

        decoded = nllb_tokenizer.batch_decode(generated, skip_special_tokens=True)
        return decoded[0]
    except Exception as e:
        print(f"Error during NLLB translation: {e}")
        return f"Error translating: {str(e)}"
|
| 164 |
+
|
| 165 |
+
# Initial load: NLLB is loaded eagerly at import time, before any route is served
|
| 166 |
+
load_nllb()
|
| 167 |
+
|
| 168 |
+
def load_scratch_transformer():
    """Load every available from-scratch model and its tokenizers.

    For each supported language the checkpoint, the source tokenizer and the
    English tokenizer are looked up; if all three files exist the model is
    built, its weights are loaded, and model plus tokenizers are registered in
    ``scratch_models`` / ``sp_src_models`` / ``sp_trg_models`` under the
    language code.

    Languages that are already registered are skipped, so repeated calls (for
    example the lazy call in ``translate_scratch``) only retry missing ones
    instead of reloading every model. Failures are printed, never raised.
    """
    global scratch_models, sp_src_models, sp_trg_models

    languages = ['my', 'th', 'zh', 'hi', 'ne', 'ur', 'vi', 'tl', 'kk', 'bn', 'de']
    # Languages whose models use vocab_size=8000; the others use 4000.
    # Model params must match the training notebooks.
    large_vocab_langs = {'hi', 'ne', 'ur', 'vi', 'tl', 'kk', 'bn', 'de'}

    for lang in languages:
        if lang in scratch_models:
            continue

        # Burmese ('my') files carry no language suffix; every other language does.
        if lang == 'my':
            t_name, e_name = 'transformer_model.pt', 'spm_en.model'
        else:
            t_name, e_name = f'transformer_model_{lang}.pt', f'spm_en_{lang}.model'
        s_name = f'spm_{lang}.model'

        # Check local then sync
        t_path = os.path.join(BASE_DIR, f'models/{t_name}')
        if not os.path.exists(t_path):
            t_path = os.path.join(BASE_DIR, f'../../models/{t_name}')

        s_path = os.path.join(BASE_DIR, f'models/{s_name}')
        e_path = os.path.join(BASE_DIR, f'models/{e_name}')

        # Standard deployment structure (app/models) vs dev layout
        if not os.path.exists(t_path):
            t_path = os.path.join(BASE_DIR, f'../../app/models/{t_name}')
            s_path = os.path.join(BASE_DIR, f'../../app/models/{s_name}')
            e_path = os.path.join(BASE_DIR, f'../../app/models/{e_name}')

        if not (os.path.exists(t_path) and os.path.exists(s_path) and os.path.exists(e_path)):
            print(f"Scratch Transformer files for {lang} not found. Skipping.")
            continue

        try:
            print(f"Loading Scratch Model for {lang}...")
            sp_src = spm.SentencePieceProcessor(model_file=s_path)
            sp_trg = spm.SentencePieceProcessor(model_file=e_path)

            vocab_size = 8000 if lang in large_vocab_langs else 4000

            model = TransformerModel(
                src_vocab_size=vocab_size,
                trg_vocab_size=vocab_size,
                d_model=256, nhead=4, num_encoder_layers=2,
                num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0
            ).to(DEVICE)

            # NOTE(security): torch.load unpickles the file; only load
            # checkpoints shipped with this application.
            model.load_state_dict(torch.load(t_path, map_location=DEVICE))
            model.eval()

            # Register all three together, only after everything loaded, so a
            # failed load never leaves tokenizers registered without a model.
            sp_src_models[lang] = sp_src
            sp_trg_models[lang] = sp_trg
            scratch_models[lang] = model
            print(f"Scratch Transformer ({lang}) Loaded.")
        except Exception as e:
            print(f"Failed to load Scratch Transformer ({lang}): {e}")
|
| 221 |
+
|
| 222 |
+
def translate_scratch(text, lang='my', max_len=50):
    """Translate ``text`` into English with the from-scratch model for ``lang``.

    Decoding is greedy: starting from the target BOS id, the highest-scoring
    next token is appended until EOS is produced or ``max_len`` tokens have
    been generated.

    Args:
        text: Source-language text.
        lang: Language code used to look up the model and tokenizers.
        max_len: Maximum number of tokens to generate (default 50).

    Returns:
        The decoded translation, or an ``"Error ..."`` string when no model is
        available for ``lang`` or translation raises (same convention as
        ``translate_nllb``).
    """
    # Lazy loading if model not found
    if lang not in scratch_models:
        print(f"Model for {lang} not found. Attempting to load...")
        load_scratch_transformer()

    if lang not in scratch_models:
        return f"Error: Model for {lang} not available. Please train it first."

    model = scratch_models[lang]
    sp_src = sp_src_models[lang]
    sp_trg = sp_trg_models[lang]

    try:
        src_ids = [sp_src.bos_id()] + sp_src.encode_as_ids(text) + [sp_src.eos_id()]
        src_tensor = torch.LongTensor(src_ids).unsqueeze(0).to(DEVICE)

        eos_id = sp_trg.eos_id()
        outputs = [sp_trg.bos_id()]
        with torch.no_grad():
            for _ in range(max_len):
                trg_tensor = torch.LongTensor(outputs).unsqueeze(0).to(DEVICE)
                output = model(src_tensor, trg_tensor)
                # Logits at the last target position give the next token.
                best_guess = output.argmax(2)[:, -1].item()
                if best_guess == eos_id:
                    break
                outputs.append(best_guess)

        # Drop the leading BOS before decoding.
        return sp_trg.decode(outputs[1:])
    except Exception as e:
        print(f"Error during scratch translation ({lang}): {e}")
        return f"Error translating: {str(e)}"
|
| 249 |
+
|
| 250 |
+
# --- 4. Routes ---
|
| 251 |
+
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the translation page; on POST, translate the submitted text first.

    Form fields read: ``source_text``, ``model_choice`` (default ``'nllb'``)
    and ``lang_choice`` (default ``'my'``). Any ``model_choice`` other than
    ``'nllb'`` uses the from-scratch model.
    """
    page = {
        'translation': "",
        'original': "",
        'model_choice': "nllb",
        'lang_choice': "my",
    }

    if request.method != 'POST':
        return render_template('index.html', **page)

    form = request.form
    page['original'] = form.get('source_text', '')
    page['model_choice'] = form.get('model_choice', 'nllb')
    page['lang_choice'] = form.get('lang_choice', 'my')

    if page['original']:
        if page['model_choice'] == 'nllb':
            # Unknown language codes fall back to Burmese.
            src_code = NLLB_LANG_MAP.get(page['lang_choice'], 'mya_Mymr')
            page['translation'] = translate_nllb(page['original'], src_lang=src_code, tgt_lang='eng_Latn')
        else:
            page['translation'] = translate_scratch(page['original'], lang=page['lang_choice'])

    return render_template('index.html', **page)
|
| 272 |
+
|
| 273 |
+
@app.route('/api/translate', methods=['POST'])
def api_translate():
    """JSON translation endpoint.

    Request body (JSON object):
        text: Text to translate (required, non-empty string).
        model: ``'nllb'`` (default) or any other value for the scratch model.
        lang: Foreign-language code (default ``'my'``).
        direction: ``'f2e'`` (foreign -> English, default) or ``'e2f'``.

    Returns:
        ``{'translation', 'model', 'lang', 'direction'}`` on success, or
        ``{'error': ...}`` with HTTP 400 when the body is not a JSON object,
        ``text`` is missing/empty, or ``text``/``lang`` are not strings.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON
    # body; a non-object payload (list, null, number) is rejected the same way.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('text', '')
    model_type = data.get('model', 'nllb')
    lang = data.get('lang', 'my')
    direction = data.get('direction', 'f2e')  # f2e (Foreign to English) or e2f (English to Foreign)

    if not text:
        return jsonify({'error': 'No text provided'}), 400
    if not isinstance(text, str) or not isinstance(lang, str):
        return jsonify({'error': "'text' and 'lang' must be strings"}), 400

    # Language mapping for NLLB (global table); unknown codes fall back to Burmese.
    target_code = NLLB_LANG_MAP.get(lang, 'mya_Mymr')
    english_code = 'eng_Latn'

    if model_type == 'nllb':
        if direction == 'f2e':
            # Foreign -> English
            translation = translate_nllb(text, src_lang=target_code, tgt_lang=english_code)
        else:
            # English -> Foreign
            translation = translate_nllb(text, src_lang=english_code, tgt_lang=target_code)
    else:
        # Scratch model
        if direction == 'e2f':
            translation = f"Error: The Scratch Transformer model only supports {lang.upper()} -> English translation. Please use NLLB for English -> {lang.upper()}."
        else:
            translation = translate_scratch(text, lang=lang)

    return jsonify({'translation': translation, 'model': model_type, 'lang': lang, 'direction': direction})
|
| 303 |
+
|
| 304 |
+
# Load Scratch Models (eagerly, at import time, so they are ready under Gunicorn too)
load_scratch_transformer()

if __name__ == '__main__':
    # The server binds to all interfaces, so the interactive debugger must not
    # be on by default: it allows arbitrary code execution from the browser.
    # Opt in explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5001)
|
app/models/spm_bn.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eca3987b35b4fc63c8195574d289ed4a38cb928d9ba98ee3ed907c61e6ea8f17
|
| 3 |
+
size 432288
|
app/models/spm_de.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2bcfb1adf5c4d1ee77c4adf990b221b070eb9bae324eec5f304f1e1e5197c00c
|
| 3 |
+
size 363507
|
app/models/spm_en.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0682aa69df3a4b8759b1b84de76d420ac7bb0c1d3eb13654e893aa44e0fcf3fc
|
| 3 |
+
size 301789
|
app/models/spm_en_bn.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:215de7a00f9237d8d84f886f503fc4835c28e5e31bfea8132a1c4c267897db8e
|
| 3 |
+
size 367329
|
app/models/spm_en_de.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a0566dc48b91bd1a88a740adedb28d5aa6e98bfd55fab635fb392d0e5682f04
|
| 3 |
+
size 364310
|
app/models/spm_en_hi.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63103db31e76ca6114c7ca07e096519870b2453504cc7e4c261e9abf6656282a
|
| 3 |
+
size 367126
|
app/models/spm_en_kk.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:328fe08e93d2300a4e8275f0fae48603d4bea168fb4ecb0b802ea4d3bf920194
|
| 3 |
+
size 365178
|
app/models/spm_en_ne.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9d25c7975ef6934f32cade8b0a18ee152e68adc54177ea6933d5b59cc8d8b77
|
| 3 |
+
size 365442
|
app/models/spm_en_th.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26072cd673684251201725c9562f0399cb36147edcefa94e3d9f86f7fcefc48c
|
| 3 |
+
size 301805
|
app/models/spm_en_tl.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8d2ad373e42fd13404bbd3a93ff786c416032cff6cc589eafe15460706f719b
|
| 3 |
+
size 368786
|
app/models/spm_en_ur.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c2fd8602e6cfa0926d971e22706bb3139d204e61f9f272f289bf68eb3cdd250
|
| 3 |
+
size 371045
|
app/models/spm_en_vi.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2d56b5384f055ac5775104317cdc1fe075c2b815247da69d16ae750457d7c37
|
| 3 |
+
size 368792
|
app/models/spm_en_zh.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fb3964e62398422318800e5cfa3c8e99129ff6dd18026061e0dd82dedca65ef
|
| 3 |
+
size 301801
|
app/models/spm_hi.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9ef78a411f6cdc495619b08d51a14cdb31e479f8c8d697ad6719d80ae0b0bfc
|
| 3 |
+
size 420423
|
app/models/spm_kk.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32d5dae6db9558ec7d2644b468c37cd56a6e7c57c1f4f3787f66eeb3dbf1fec5
|
| 3 |
+
size 394807
|
app/models/spm_my.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f84cebd908633ca6dfa8b2f3a68564e192820b3b85660ea95223c4a8a18f50ab
|
| 3 |
+
size 345249
|
app/models/spm_ne.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2f8a4fe5b75ae9d389708a925309863d1dfba4ca7f2a116621a4bdc026a0930
|
| 3 |
+
size 421149
|
app/models/spm_th.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42b112d07c7803b68e31f0fcec6a4e1efb980690130b09397053368a75c90d12
|
| 3 |
+
size 327768
|
app/models/spm_tl.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a7be22c5f9aab20883ddeb0d47317f4b265b2ff3429dcf17fb31205a2d76124
|
| 3 |
+
size 369093
|
app/models/spm_ur.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d68c3c1184864ab9df74b153dc5d5eed0c12223485c04684279a1cc5cf6552b
|
| 3 |
+
size 392567
|
app/models/spm_vi.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6f4a9a8673ddbb5274ba70aa17fa6c5bfb382b04fb6144485c5fd3bc1cc8eb8
|
| 3 |
+
size 362817
|
app/models/spm_zh.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c3920cdc6dda7d859164562bf9bfb1092bf3ebe6a25272fc58c48c8968e8266
|
| 3 |
+
size 291274
|
app/models/transformer_model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:493c1cf6d24814251f4d690e5a247b955521fd98eebcbf0bc22730ad08c595ea
|
| 3 |
+
size 27998914
|
app/models/transformer_model_bn.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:858734606c7d355b405619fb011d240bc935bfc2cf5e6eee3f94ea201e5a102f
|
| 3 |
+
size 40303139
|
app/models/transformer_model_de.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2547249aa6c5a7a3a778b6c04e9892dd3c1d7025a51bb3dd9cbbda92c77846b7
|
| 3 |
+
size 40303139
|
app/models/transformer_model_hi.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c1dad73c3e1108571bb623f5e8a74a85a6ee1abdeb518de7475aa9dee2d0f608
|
| 3 |
+
size 40303139
|
app/models/transformer_model_kk.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c1a82208803dc4726864fab0a3b87cea99c0a61a116bc1641f5a7fa8def8e9ce
|
| 3 |
+
size 40303139
|
app/models/transformer_model_ne.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ee2bf41a678beb9e895f66609e686861338fa214c3e9fdbbd1d4ecad3909be9
|
| 3 |
+
size 40303139
|
app/models/transformer_model_th.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3f5834dc9188f5af71da97f3e332b31e4664c3fff5dbc95bf8cf65479d8a3ad
|
| 3 |
+
size 27999139
|
app/models/transformer_model_tl.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17ab6e54c69ac135b92861b7450bf5040fa0f0b8be8023d9bf38aabe04e73cf9
|
| 3 |
+
size 40303139
|