shadowsilence committed on
Commit
fba0a90
·
verified ·
1 Parent(s): 8503b49

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +13 -0
  2. .gitignore +9 -0
  3. Attention_Experiments.ipynb +0 -0
  4. Bengali_English_Transformer.ipynb +1 -0
  5. Burmese_English_NLLB.ipynb +333 -0
  6. Burmese_English_Transformer.ipynb +592 -0
  7. Chinese_English_Transformer.ipynb +433 -0
  8. Dockerfile +30 -0
  9. German_English_Transformer.ipynb +1 -0
  10. Hindi_English_Transformer.ipynb +1 -0
  11. Kazakh_English_Transformer.ipynb +1 -0
  12. Nepali_English_Transformer.ipynb +1 -0
  13. README.md +69 -5
  14. Tagalog_English_Transformer.ipynb +1 -0
  15. Thai_English_Transformer.ipynb +1 -0
  16. Urdu_English_Transformer.ipynb +1 -0
  17. Vietnamese_English_Transformer.ipynb +1 -0
  18. app/.dockerignore +8 -0
  19. app/Dockerfile +28 -0
  20. app/app.py +308 -0
  21. app/models/spm_bn.model +3 -0
  22. app/models/spm_de.model +3 -0
  23. app/models/spm_en.model +3 -0
  24. app/models/spm_en_bn.model +3 -0
  25. app/models/spm_en_de.model +3 -0
  26. app/models/spm_en_hi.model +3 -0
  27. app/models/spm_en_kk.model +3 -0
  28. app/models/spm_en_ne.model +3 -0
  29. app/models/spm_en_th.model +3 -0
  30. app/models/spm_en_tl.model +3 -0
  31. app/models/spm_en_ur.model +3 -0
  32. app/models/spm_en_vi.model +3 -0
  33. app/models/spm_en_zh.model +3 -0
  34. app/models/spm_hi.model +3 -0
  35. app/models/spm_kk.model +3 -0
  36. app/models/spm_my.model +3 -0
  37. app/models/spm_ne.model +3 -0
  38. app/models/spm_th.model +3 -0
  39. app/models/spm_tl.model +3 -0
  40. app/models/spm_ur.model +3 -0
  41. app/models/spm_vi.model +3 -0
  42. app/models/spm_zh.model +3 -0
  43. app/models/transformer_model.pt +3 -0
  44. app/models/transformer_model_bn.pt +3 -0
  45. app/models/transformer_model_de.pt +3 -0
  46. app/models/transformer_model_hi.pt +3 -0
  47. app/models/transformer_model_kk.pt +3 -0
  48. app/models/transformer_model_ne.pt +3 -0
  49. app/models/transformer_model_th.pt +3 -0
  50. app/models/transformer_model_tl.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ app/static/images/bengali_background.png filter=lfs diff=lfs merge=lfs -text
37
+ app/static/images/burmese_background.png filter=lfs diff=lfs merge=lfs -text
38
+ app/static/images/chinese_background.png filter=lfs diff=lfs merge=lfs -text
39
+ app/static/images/german_background.png filter=lfs diff=lfs merge=lfs -text
40
+ app/static/images/hindi_background.png filter=lfs diff=lfs merge=lfs -text
41
+ app/static/images/kazakh_background.png filter=lfs diff=lfs merge=lfs -text
42
+ app/static/images/nepali_background.png filter=lfs diff=lfs merge=lfs -text
43
+ app/static/images/tagalog_background.png filter=lfs diff=lfs merge=lfs -text
44
+ app/static/images/thai_background.png filter=lfs diff=lfs merge=lfs -text
45
+ app/static/images/urdu_background.png filter=lfs diff=lfs merge=lfs -text
46
+ app/static/images/vietnamese_background.png filter=lfs diff=lfs merge=lfs -text
47
+ attention/train_my_att.txt filter=lfs diff=lfs merge=lfs -text
48
+ demo.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .ipynb_checkpoints/
4
+ app/nllb_model/
5
+ .DS_Store
6
+ # app/models/ (removed so models can be uploaded)
7
+ app/venv/
8
+ app/env/
9
+ app/.env
Attention_Experiments.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Bengali_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"vpBMKu9S_DKX"},"source":["# Bengali-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Bengali (bn) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Bengali-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"vpBMKu9S_DKX"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ZTpz4Y0K_DKZ","executionInfo":{"status":"ok","timestamp":1770449632993,"user_tz":-420,"elapsed":5884,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"84eef5c7-2b97-492e-a89f-84a73053f460"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"ZTpz4Y0K_DKZ"},{"cell_type":"markdown","metadata":{"id":"-y16NO8G_DKb"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Bengali-English pairs from Opus-100."],"id":"-y16NO8G_DKb"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":475,"referenced_widgets":["63800baa7f3342fea580c1754d23a187","8e5f81d65723424997ac3de79a7104c2","9412b9b7abd642818a3b5d7c1a7a59d4","51d24b76a33742cf9c61997e19666fb2","71cbb7ffb31e4663b669459808744e91","3d6f71b8889d4431aea09446a05a3cc7","401623c48fec466590876f0e27dac857","74ace8a9c3a04f86bae7ca8899b86fcc","2bbab3affa6244b6ac95974dc009ce15","3e1beecc6600411692f811cc12f35627","f1500e9b0d48434c9bacd5899a56b1b1","50e9c865bf914984b82021b7939f1047","096dfefc18d04d049a75376a18e30e6f","e277f9f3079b4cd4960d9e6f80e13834","c9da49cc397645fb9ae591d204fa9084","40f5a7f2c3214965a8924dcee95154f0","a9ffc0417cfd41f8a554b18690d7d7b3","91e4cff8fdce4d9e98e249f5d53a8338","9265830e2f0147bb823ac2de0ba3300e","2fc08ee9109a4349ba6ffb00c41b842e","e9e425a232384a0a8524e52f6be6a275","86cf30f6e84b43829b7e1d8a6e6e7446","3419947ed2d14de291532f4a13c41992","1019ec6a91cc400d9d79419090be471f","13d4c4cb6e634caa8975cc81911e658d","59a8d28e81804653b0525c7b88d417c5","b529ab421acb4cfe9a0e456f81b8fb10","8c63057306c5441390bee4d65dcdb830","8e8c9e37c21c492184c3be8cb82ffd0b","a3400c3db06645d79d1764881a265c55","d76a77c5e45843549b88f216aecfbbef","a1ce122dd98b44348ccbaf18e574009b","bcebc445abc440cc9efea5135cbe3b27","1c8327669a5b4bbfb8b0eb2a4b814e6d","baa12bef6fc540c1a1aff0c8835cfe5e","d86a597b23e14abd87f9567664b0fd47","b8b3cf1e411840f592c9a9509c31da7b","a6fad26675f14ccaabdae96ccfd07430","a6e95f0632ec49df90be34d6e6f3db38","1276d4a6f6af4d1ca75a9e6e64ddcca1","a61ba76440c84ae686b401b70bf1c379","3d903e048f6d4e618b2a27d61f124931","91281030b7164090841b0846e4f020bf","752d688c557a43bca08e2aa7d7f72331","8c9b8dcbd89645a7a955c06109cadfd7","0c5f2d21f42a473cb4b150ce1cf5ef42","eb9afc65cdf44f008990ec8c1082dea0","c27ce174a6814361bcdbc163ea79a85f","213d90a2119b46219902f0b01ead5521","8a72fd6282c645f4906d817db92c20c0","57024c3d9ead4c14b2bb
25a1ac58a392","f572059b9f584008bfe67987e9610844","d6b43ff1450a4d57a09cde163557b732","40723bace09f404087f04f5d3b5da910","7930dcae389240128f87909b8091c838","9885cc8eb23a4bd5979b00fe729ab50a","503e5a8eea7647c791e75c93262869d3","812e5f5b21d74dc992b0a6d318578be6","559291f54b354a7190d9bf3b6c5916f0","bc3b447abfa34bc4baf747de9aa11286","362e85ad032f45f2911ed2233cf29d1e","69f0040fe8fc479b9fa64d4a9c0c7cdc","122d8bc238484956b5e93402a23becf9","41a80cc9f2834727b83c25bdf17fb789","6de63976604e42e8acef1b437c645700","5345097ef89145b7837dad5deb68cc3f","4f887be0d0434ed486c40c3d203c556f","079bd8aa81ad460593477de7a1beb27c","ac05ba3b0f1d4fd0b479bd309c74e805","aa6578ad022d418d8c14b59d8c303363","47e8d29853384b4cb6fdfbbb355b5044","cf6c5979b59c4e7b8c9041dd58b0970c","7b9da41a85d14b5ebe849a1a870b8b1a","1391beb1571349cebdb8200a9e5ee20a","983ede9cd17f464c933db0203a09cb3d","57e6475668804afd89b1559fae888165","c9b8876bbdc34f7b9bdcec235b05c6ef"]},"id":"PbQklUu0_DKb","executionInfo":{"status":"ok","timestamp":1770449672296,"user_tz":-420,"elapsed":39277,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"3b67ebbb-0e6a-4722-8c8e-0a62396d8346"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Bengali-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"63800baa7f3342fea580c1754d23a187"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["bn-en/test-00000-of-00001.parquet: 0%| | 0.00/279k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"50e9c865bf914984b82021b7939f1047"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["bn-en/train-00000-of-00001.parquet: 0%| | 0.00/134M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3419947ed2d14de291532f4a13c41992"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["bn-en/validation-00000-of-00001.parquet: 0%| | 0.00/272k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1c8327669a5b4bbfb8b0eb2a4b814e6d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8c9b8dcbd89645a7a955c06109cadfd7"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/1000000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9885cc8eb23a4bd5979b00fe729ab50a"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"4f887be0d0434ed486c40c3d203c556f"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 1004000 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 Bengali-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (Bengali-English)...\")\n","try:\n"," # Opus-100 has 'bn-en' (or 'en-bn')\n"," dataset = load_dataset(\"opus100\", \"bn-en\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'bn' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'bn': item['translation']['bn'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for efficiency.\")\n","\n"," print(f\"Extracted {len(data)} Bengali-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"PbQklUu0_DKb"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"41o9Zc-K_DKc","executionInfo":{"status":"ok","timestamp":1770449672512,"user_tz":-420,"elapsed":218,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"8167a35f-6e43-41e7-9d71-7efb5dad305c"},"outputs":[{"output_type":"stream","name":"stdout","text":[" bn \\\n","0 তোমায় ঐ হৃদপিন্ড খেতে হবে না। \n","1 আমি দরজা সামান্য খোলা রেখে যাচ্ছি. \n","2 এই ঘটনার ক্ষেত্রে, গণপ্রচার মাধ্যম ঠিক মতই কাজ... \n","3 মিথ্যা বলবো? \n","4 এ বছর পাকিস্তানে তার প্রত্যাবর্তন খুব অপয়া ভাব... \n","\n"," en \n","0 You don't have to fucking eat his heart. \n","1 I'll leave the door open a little bit. 
\n","2 In this case, mass media have continued to fun... \n","3 Lies? \n","4 Her return to Pakistan earlier this year start... \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['bn', 'en'])\n","df['bn'] = df['bn'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['bn'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"41o9Zc-K_DKc"},{"cell_type":"markdown","metadata":{"id":"vU0esOr-_DKc"},"source":["## 3. Tokenization"],"id":"vU0esOr-_DKc"},{"cell_type":"code","execution_count":4,"metadata":{"id":"w739sfKC_DKc","executionInfo":{"status":"ok","timestamp":1770449678700,"user_tz":-420,"elapsed":6188,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"6835514e-07cf-441f-9600-46cdf745862a"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Bengali Tokenizer...\n","Training English Tokenizer (for Bengali pair)...\n"]}],"source":["# Save texts to files\n","with open('train_bn.txt', 'w', encoding='utf-8') as f:\n"," for line in df['bn']: f.write(line + '\\n')\n","\n","with open('train_en_bn.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Bengali Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_bn.txt',\n"," model_prefix='spm_bn',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Bengali pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_bn.txt',\n"," model_prefix='spm_en_bn',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_bn.model')\n","sp_trg = 
spm.SentencePieceProcessor(model_file='spm_en_bn.model')"],"id":"w739sfKC_DKc"},{"cell_type":"markdown","metadata":{"id":"mPL7piMQ_DKd"},"source":["## 4. Dataset & Model"],"id":"mPL7piMQ_DKd"},{"cell_type":"code","execution_count":5,"metadata":{"id":"TcClWCSz_DKd","executionInfo":{"status":"ok","timestamp":1770449678705,"user_tz":-420,"elapsed":2,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['bn']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"TcClWCSz_DKd"},{"cell_type":"code","execution_count":6,"metadata":{"id":"kgLpC6aG_DKd","executionInfo":{"status":"ok","timestamp":1770449678714,"user_tz":-420,"elapsed":3,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," 
pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"kgLpC6aG_DKd"},{"cell_type":"code","execution_count":7,"metadata":{"id":"aWpT2aj1_DKe","executionInfo":{"status":"ok","timestamp":1770450066061,"user_tz":-420,"elapsed":387345,"user":{"displayName":"Htut Ko 
Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"cd31b25e-e7e1-48c0-c161-45069232ea3f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting Training...\n","Step 0, Loss: 9.192\n","Step 100, Loss: 6.546\n","Step 200, Loss: 6.175\n","Step 300, Loss: 5.916\n","Step 400, Loss: 5.874\n","Step 500, Loss: 5.361\n","Step 600, Loss: 5.727\n","Step 700, Loss: 5.421\n","Epoch 1 Loss: 6.014\n","Step 0, Loss: 5.319\n","Step 100, Loss: 5.217\n","Step 200, Loss: 5.392\n","Step 300, Loss: 5.172\n","Step 400, Loss: 5.023\n","Step 500, Loss: 5.232\n","Step 600, Loss: 5.307\n","Step 700, Loss: 5.209\n","Epoch 2 Loss: 5.263\n","Step 0, Loss: 4.776\n","Step 100, Loss: 5.007\n","Step 200, Loss: 5.070\n","Step 300, Loss: 4.992\n","Step 400, Loss: 4.958\n","Step 500, Loss: 4.863\n","Step 600, Loss: 5.025\n","Step 700, Loss: 5.010\n","Epoch 3 Loss: 4.886\n","Step 0, Loss: 4.940\n","Step 100, Loss: 4.741\n","Step 200, Loss: 4.769\n","Step 300, Loss: 4.715\n","Step 400, Loss: 4.508\n","Step 500, Loss: 4.680\n","Step 600, Loss: 4.605\n","Step 700, Loss: 4.755\n","Epoch 4 Loss: 4.629\n","Step 0, Loss: 4.324\n","Step 100, Loss: 4.510\n","Step 200, Loss: 4.466\n","Step 300, Loss: 4.252\n","Step 400, Loss: 4.540\n","Step 500, Loss: 4.343\n","Step 600, Loss: 4.285\n","Step 700, Loss: 4.335\n","Epoch 5 Loss: 4.443\n","Step 0, Loss: 4.265\n","Step 100, Loss: 4.288\n","Step 200, Loss: 4.463\n","Step 300, Loss: 4.301\n","Step 400, Loss: 4.595\n","Step 500, Loss: 4.464\n","Step 600, Loss: 4.206\n","Step 700, Loss: 4.423\n","Epoch 6 Loss: 4.295\n","Step 0, Loss: 3.883\n","Step 100, Loss: 4.193\n","Step 200, Loss: 4.195\n","Step 300, Loss: 3.978\n","Step 400, Loss: 4.358\n","Step 500, Loss: 4.160\n","Step 600, Loss: 4.146\n","Step 700, Loss: 4.027\n","Epoch 7 Loss: 4.171\n","Step 0, Loss: 4.029\n","Step 100, Loss: 4.170\n","Step 200, Loss: 4.145\n","Step 300, Loss: 4.106\n","Step 400, Loss: 3.941\n","Step 500, Loss: 4.163\n","Step 600, 
Loss: 4.277\n","Step 700, Loss: 4.172\n","Epoch 8 Loss: 4.063\n","Step 0, Loss: 3.674\n","Step 100, Loss: 3.885\n","Step 200, Loss: 4.137\n","Step 300, Loss: 3.860\n","Step 400, Loss: 4.117\n","Step 500, Loss: 4.026\n","Step 600, Loss: 4.033\n","Step 700, Loss: 3.988\n","Epoch 9 Loss: 3.970\n","Step 0, Loss: 3.889\n","Step 100, Loss: 3.525\n","Step 200, Loss: 3.580\n","Step 300, Loss: 3.651\n","Step 400, Loss: 3.900\n","Step 500, Loss: 3.773\n","Step 600, Loss: 3.872\n","Step 700, Loss: 4.015\n","Epoch 10 Loss: 3.883\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_bn.pt')"],"id":"aWpT2aj1_DKe"},{"cell_type":"code","execution_count":8,"metadata":{"id":"RHGxIZb-_DKf","executionInfo":{"status":"ok","timestamp":1770450066093,"user_tz":-420,"elapsed":18,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"e8611f94-0727-4ad0-e07b-a0c1d374b834"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', 
exist_ok=True)\n","shutil.copy('transformer_model_bn.pt', 'app/models/transformer_model_bn.pt')\n","shutil.copy('spm_bn.model', 'app/models/spm_bn.model')\n","shutil.copy('spm_en_bn.model', 'app/models/spm_en_bn.model')\n","print(\"Models copied to app/models/\")"],"id":"RHGxIZb-_DKf"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"L4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"63800baa7f3342fea580c1754d23a187":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_8e5f81d65723424997ac3de79a7104c2","IPY_MODEL_9412b9b7abd642818a3b5d7c1a7a59d4","IPY_MODEL_51d24b76a33742cf9c61997e19666fb2"],"layout":"IPY_MODEL_71cbb7ffb31e4663b669459808744e91"}},"8e5f81d65723424997ac3de79a7104c2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3d6f71b8889d4431aea09446a05a3cc7","placeholder":"​","style":"IPY_MODEL_401623c48fec466590876f0e27dac857","value":"README.md: 
"}},"9412b9b7abd642818a3b5d7c1a7a59d4":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_74ace8a9c3a04f86bae7ca8899b86fcc","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_2bbab3affa6244b6ac95974dc009ce15","value":1}},"51d24b76a33742cf9c61997e19666fb2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3e1beecc6600411692f811cc12f35627","placeholder":"​","style":"IPY_MODEL_f1500e9b0d48434c9bacd5899a56b1b1","value":" 65.4k/? 
[00:00&lt;00:00, 6.69MB/s]"}},"71cbb7ffb31e4663b669459808744e91":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3d6f71b8889d4431aea09446a05a3cc7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"o
verflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"401623c48fec466590876f0e27dac857":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"74ace8a9c3a04f86bae7ca8899b86fcc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"2bbab3affa6244b6ac95974dc009ce15":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"3e1beecc6600411692f811cc
12f35627":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f1500e9b0d48434c9bacd5899a56b1b1":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"50e9c865bf914984b82021b7939f1047":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_096dfefc18d04d049a75376a18e30e6f","IPY_MODEL_e277f9f3079b4cd4960d9e6f80e13834","IPY_MODEL_c9da49cc397645fb9ae591d204fa9084"],"layout":"IPY_MODEL_40f5a7f2c32149
65a8924dcee95154f0"}},"096dfefc18d04d049a75376a18e30e6f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a9ffc0417cfd41f8a554b18690d7d7b3","placeholder":"​","style":"IPY_MODEL_91e4cff8fdce4d9e98e249f5d53a8338","value":"bn-en/test-00000-of-00001.parquet: 100%"}},"e277f9f3079b4cd4960d9e6f80e13834":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_9265830e2f0147bb823ac2de0ba3300e","max":279391,"min":0,"orientation":"horizontal","style":"IPY_MODEL_2fc08ee9109a4349ba6ffb00c41b842e","value":279391}},"c9da49cc397645fb9ae591d204fa9084":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e9e425a232384a0a8524e52f6be6a275","placeholder":"​","style":"IPY_MODEL_86cf30f6e84b43829b7e1d8a6e6e7446","value":" 279k/279k [00:01&lt;00:00, 
46.7kB/s]"}},"40f5a7f2c3214965a8924dcee95154f0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a9ffc0417cfd41f8a554b18690d7d7b3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"91e4cff8fdce4d9e98e249f5d53a8338":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9265830e2f0147bb823ac2de0ba3300e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2fc08ee9109a4349ba6ffb00c41b842e":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e9e425a232384a0a8524e52f6be6a275":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"86cf30f6e84b43829b7e1d8a6e6e7446":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3419947ed2d14de291532f4a13c41992":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_1019ec6a91cc400d9d79419090be471f","IPY_MODEL_13d4c4cb6e634caa8975cc81911e658d","IPY_MODEL_59a8d28e81804653b0525c7b88d417c5"],"layout":"IPY_MODEL_b529ab421acb4cfe9a0e456f81b8fb10"
}},"1019ec6a91cc400d9d79419090be471f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8c63057306c5441390bee4d65dcdb830","placeholder":"​","style":"IPY_MODEL_8e8c9e37c21c492184c3be8cb82ffd0b","value":"bn-en/train-00000-of-00001.parquet: 100%"}},"13d4c4cb6e634caa8975cc81911e658d":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a3400c3db06645d79d1764881a265c55","max":133525065,"min":0,"orientation":"horizontal","style":"IPY_MODEL_d76a77c5e45843549b88f216aecfbbef","value":133525065}},"59a8d28e81804653b0525c7b88d417c5":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a1ce122dd98b44348ccbaf18e574009b","placeholder":"​","style":"IPY_MODEL_bcebc445abc440cc9efea5135cbe3b27","value":" 134M/134M [00:01&lt;00:00, 
44.1MB/s]"}},"b529ab421acb4cfe9a0e456f81b8fb10":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8c63057306c5441390bee4d65dcdb830":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8e8c9e37c21c492184c3be8cb82ffd0b":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a3400c3db06645d79d1764881a265c55":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d76a77c5e45843549b88f216aecfbbef":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a1ce122dd98b44348ccbaf18e574009b":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bcebc445abc440cc9efea5135cbe3b27":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1c8327669a5b4bbfb8b0eb2a4b814e6d":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_baa12bef6fc540c1a1aff0c8835cfe5e","IPY_MODEL_d86a597b23e14abd87f9567664b0fd47","IPY_MODEL_b8b3cf1e411840f592c9a9509c31da7b"],"layout":"IPY_MODEL_a6fad26675f14ccaabdae96ccfd07430"
}},"baa12bef6fc540c1a1aff0c8835cfe5e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a6e95f0632ec49df90be34d6e6f3db38","placeholder":"​","style":"IPY_MODEL_1276d4a6f6af4d1ca75a9e6e64ddcca1","value":"bn-en/validation-00000-of-00001.parquet: 100%"}},"d86a597b23e14abd87f9567664b0fd47":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a61ba76440c84ae686b401b70bf1c379","max":272140,"min":0,"orientation":"horizontal","style":"IPY_MODEL_3d903e048f6d4e618b2a27d61f124931","value":272140}},"b8b3cf1e411840f592c9a9509c31da7b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_91281030b7164090841b0846e4f020bf","placeholder":"​","style":"IPY_MODEL_752d688c557a43bca08e2aa7d7f72331","value":" 272k/272k [00:00&lt;00:00, 
430kB/s]"}},"a6fad26675f14ccaabdae96ccfd07430":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a6e95f0632ec49df90be34d6e6f3db38":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1276d4a6f6af4d1ca75a9e6e64ddcca1":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a61ba76440c84ae686b401b70bf1c379":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3d903e048f6d4e618b2a27d61f124931":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"91281030b7164090841b0846e4f020bf":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"752d688c557a43bca08e2aa7d7f72331":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"8c9b8dcbd89645a7a955c06109cadfd7":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0c5f2d21f42a473cb4b150ce1cf5ef42","IPY_MODEL_eb9afc65cdf44f008990ec8c1082dea0","IPY_MODEL_c27ce174a6814361bcdbc163ea79a85f"],"layout":"IPY_MODEL_213d90a2119b46219902f0b01ead5521"}
},"0c5f2d21f42a473cb4b150ce1cf5ef42":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8a72fd6282c645f4906d817db92c20c0","placeholder":"​","style":"IPY_MODEL_57024c3d9ead4c14b2bb25a1ac58a392","value":"Generating test split: 100%"}},"eb9afc65cdf44f008990ec8c1082dea0":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f572059b9f584008bfe67987e9610844","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_d6b43ff1450a4d57a09cde163557b732","value":2000}},"c27ce174a6814361bcdbc163ea79a85f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_40723bace09f404087f04f5d3b5da910","placeholder":"​","style":"IPY_MODEL_7930dcae389240128f87909b8091c838","value":" 2000/2000 [00:00&lt;00:00, 51153.17 
examples/s]"}},"213d90a2119b46219902f0b01ead5521":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8a72fd6282c645f4906d817db92c20c0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"57024c3d9ead4c14b2bb25a1ac58a392":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f572059b9f584008bfe67987e9610844":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d6b43ff1450a4d57a09cde163557b732":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"40723bace09f404087f04f5d3b5da910":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7930dcae389240128f87909b8091c838":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9885cc8eb23a4bd5979b00fe729ab50a":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_503e5a8eea7647c791e75c93262869d3","IPY_MODEL_812e5f5b21d74dc992b0a6d318578be6","IPY_MODEL_559291f54b354a7190d9bf3b6c5916f0"],"layout":"IPY_MODEL_bc3b447abfa34bc4baf747de9aa1128
6"}},"503e5a8eea7647c791e75c93262869d3":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_362e85ad032f45f2911ed2233cf29d1e","placeholder":"​","style":"IPY_MODEL_69f0040fe8fc479b9fa64d4a9c0c7cdc","value":"Generating train split: 100%"}},"812e5f5b21d74dc992b0a6d318578be6":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_122d8bc238484956b5e93402a23becf9","max":1000000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_41a80cc9f2834727b83c25bdf17fb789","value":1000000}},"559291f54b354a7190d9bf3b6c5916f0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_6de63976604e42e8acef1b437c645700","placeholder":"​","style":"IPY_MODEL_5345097ef89145b7837dad5deb68cc3f","value":" 1000000/1000000 [00:01&lt;00:00, 988617.74 
examples/s]"}},"bc3b447abfa34bc4baf747de9aa11286":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"362e85ad032f45f2911ed2233cf29d1e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"69f0040fe8fc479b9fa64d4a9c0c7cdc":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"122d8bc238484956b5e93402a23becf9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"41a80cc9f2834727b83c25bdf17fb789":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"6de63976604e42e8acef1b437c645700":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5345097ef89145b7837dad5deb68cc3f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"4f887be0d0434ed486c40c3d203c556f":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_079bd8aa81ad460593477de7a1beb27c","IPY_MODEL_ac05ba3b0f1d4fd0b479bd309c74e805","IPY_MODEL_aa6578ad022d418d8c14b59d8c303363"],"layout":"IPY_MODEL_47e8d29853384b4cb6fdfbbb355b504
4"}},"079bd8aa81ad460593477de7a1beb27c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cf6c5979b59c4e7b8c9041dd58b0970c","placeholder":"​","style":"IPY_MODEL_7b9da41a85d14b5ebe849a1a870b8b1a","value":"Generating validation split: 100%"}},"ac05ba3b0f1d4fd0b479bd309c74e805":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_1391beb1571349cebdb8200a9e5ee20a","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_983ede9cd17f464c933db0203a09cb3d","value":2000}},"aa6578ad022d418d8c14b59d8c303363":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_57e6475668804afd89b1559fae888165","placeholder":"​","style":"IPY_MODEL_c9b8876bbdc34f7b9bdcec235b05c6ef","value":" 2000/2000 [00:00&lt;00:00, 146595.04 
examples/s]"}},"47e8d29853384b4cb6fdfbbb355b5044":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cf6c5979b59c4e7b8c9041dd58b0970c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7b9da41a85d14b5ebe849a1a870b8b1a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1391beb1571349cebdb8200a9e5ee20a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"983ede9cd17f464c933db0203a09cb3d":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"57e6475668804afd89b1559fae888165":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c9b8876bbdc34f7b9bdcec235b05c6ef":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
Burmese_English_NLLB.ipynb ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Burmese-English NMT with NLLB-200 (Fine-Tuning)\n",
8
+ "\n",
9
+ "**Student**: Htut Ko Ko (st126010) \n",
10
+ "**Course**: NLP Project A3 \n",
11
+ "**Task**: High-Quality Machine Translation (Web App Integration)\n",
12
+ "\n",
13
+ "## 1. Introduction & Motivation\n",
14
+ "In this notebook, I implement a **Neural Machine Translation (NMT)** system to translate between **Burmese** and **English**.\n",
15
+ "\n",
16
+ "For the assignment's \"Task 4: Web Application\", my goal was to achieve **production-quality** translation that users would actually find useful.\n",
17
+ "\n",
18
+ "Training a Transformer from scratch (as done in my other notebook) on the small **ALT dataset (20k pairs)** resulted in poor fluency because deep learning models require massive amounts of data. To solve this, I chose to **fine-tune** a state-of-the-art pre-trained model: **NLLB-200 (No Language Left Behind)** by Meta.\n",
19
+ "\n",
20
+ "This approach allows me to leverage the model's existing knowledge of Burmese and English while adapting it specifically to the ALT dataset style."
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## 2. Setup & Dependencies\n",
28
+ "First, I install the necessary libraries from HuggingFace (`transformers`, `datasets`, `accelerate`) together with the tokenizer dependencies (`sentencepiece`, `sacremoses`). I also mount my Google Drive so that I can save the fine-tuned model safely and use it later in my Web App."
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "!pip install transformers datasets sentencepiece sacremoses accelerate"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "import os\n",
47
+ "import torch\n",
48
+ "import numpy as np\n",
49
+ "import pandas as pd\n",
50
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
51
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
52
+ "from google.colab import drive\n",
53
+ "\n",
54
+ "# I mount Google Drive to ensure my model is saved persistently.\n",
55
+ "drive.mount('/content/drive')\n",
56
+ "\n",
57
+ "# I define the save path in my Drive so I can download it later for the Web App.\n",
58
+ "DRIVE_SAVE_PATH = \"/content/drive/MyDrive/NLP/Project_A3/nllb_model\"\n",
59
+ "os.makedirs(DRIVE_SAVE_PATH, exist_ok=True)\n",
60
+ "\n",
61
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
62
+ "print(f\"Using device: {device}\")"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "markdown",
67
+ "metadata": {},
68
+ "source": [
69
+ "## 3. Data Preparation (ALT Dataset)\n",
70
+ "I am using the **Asian Language Treebank (ALT)** dataset as required. The raw dataset contains multiple languages, so I filter it to extract only the **Burmese ('my')** and **English ('en')** pairs.\n",
71
+ "\n",
72
+ "I then split the data into:\n",
73
+ "- **Train (81%)**: For teaching the model.\n",
74
+ "- **Validation (9%)**: For checking improvements during training.\n",
75
+ "- **Test (10%)**: For final evaluation."
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "print(\"Loading ALT Dataset...\")\n",
85
+ "try:\n",
86
+ " raw_dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n",
87
+ " \n",
88
+ " data = []\n",
89
+ " for item in raw_dataset:\n",
90
+ " if 'translation' in item:\n",
91
+ " if 'my' in item['translation'] and 'en' in item['translation']:\n",
92
+ " data.append({\n",
93
+ " 'my': item['translation']['my'],\n",
94
+ " 'en': item['translation']['en']\n",
95
+ " })\n",
96
+ " \n",
97
+ " df = pd.DataFrame(data)\n",
98
+ " df = df.dropna()\n",
99
+ " print(f\"Total Pairs Extracted: {len(df)}\")\n",
100
+ "\n",
101
+ "except Exception as e:\n",
102
+ " print(f\"Error: {e}\")"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "from sklearn.model_selection import train_test_split\n",
112
+ "\n",
113
+ "# Splitting: 90% Train+Val, 10% Test\n",
114
+ "train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)\n",
115
+ "# Splitting Train+Val: 90% Train, 10% Val\n",
116
+ "train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)\n",
117
+ "\n",
118
+ "print(f\"Train Size: {len(train_df)}\")\n",
119
+ "print(f\"Results Validation Size: {len(val_df)}\")\n",
120
+ "print(f\"Test Size: {len(test_df)}\")\n",
121
+ "\n",
122
+ "# Convert back to HuggingFace Dataset format for easier processing\n",
123
+ "train_dataset = Dataset.from_pandas(train_df)\n",
124
+ "val_dataset = Dataset.from_pandas(val_df)\n",
125
+ "test_dataset = Dataset.from_pandas(test_df)\n",
126
+ "\n",
127
+ "dataset = DatasetDict({\n",
128
+ " 'train': train_dataset,\n",
129
+ " 'validation': val_dataset,\n",
130
+ " 'test': test_dataset\n",
131
+ "})"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "markdown",
136
+ "metadata": {},
137
+ "source": [
138
+ "## 4. Model Loading & Tokenization\n",
139
+ "Here I load the **NLLB-200-distilled-600M** model. This is a distilled version of the massive 54B parameter model, making it efficient enough to fine-tune on Colab while retaining high performance.\n",
140
+ "\n",
141
+ "**Important**: NLLB requires specific language codes:\n",
142
+ "- Burmese: `mya_Mymr`\n",
143
+ "- English: `eng_Latn`\n",
144
+ "\n",
145
+ "I create a preprocessing function that tokenizes the inputs (Burmese) and the targets (English) in the same pass; the target token IDs are stored under `labels`."
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "model_checkpoint = \"facebook/nllb-200-distilled-600M\"\n",
155
+ "\n",
156
+ "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang=\"mya_Mymr\", tgt_lang=\"eng_Latn\")\n",
157
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "max_input_length = 128\n",
167
+ "max_target_length = 128\n",
168
+ "\n",
169
+ "def preprocess_function(examples):\n",
170
+ " inputs = [ex for ex in examples['my']]\n",
171
+ " targets = [ex for ex in examples['en']]\n",
172
+ " \n",
173
+ " # We tokenize the input (Burmese)\n",
174
+ " model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)\n",
175
+ " # We tokenize the target (English) as labels\n",
176
+ " labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)\n",
177
+ "\n",
178
+ " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
179
+ " return model_inputs\n",
180
+ "\n",
181
+ "tokenized_datasets = dataset.map(preprocess_function, batched=True)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "markdown",
186
+ "metadata": {},
187
+ "source": [
188
+ "## 5. Fine-Tuning (Training)\n",
189
+ "I use the `Seq2SeqTrainer` to fine-tune the model.\n",
190
+ "\n",
191
+ "**Hyperparameters:**\n",
192
+ "- **Batch Size**: 16 (fits in Colab GPU memory).\n",
193
+ "- **Learning Rate**: 2e-5 (low learning rate to gently adjust pre-trained weights).\n",
194
+ "- **Epochs**: 3 (Since the model is already pre-trained, it converges very quickly. 3 epochs is sufficient to adapt to the ALT dataset style without overfitting)."
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "batch_size = 16\n",
204
+ "learning_rate = 2e-5\n",
205
+ "weight_decay = 0.01\n",
206
+ "num_train_epochs = 3\n",
207
+ "\n",
208
+ "args = Seq2SeqTrainingArguments(\n",
209
+ " DRIVE_SAVE_PATH,\n",
210
+ " eval_strategy = \"epoch\",\n",
211
+ " learning_rate=learning_rate,\n",
212
+ " per_device_train_batch_size=batch_size,\n",
213
+ " per_device_eval_batch_size=batch_size,\n",
214
+ " weight_decay=weight_decay,\n",
215
+ " save_total_limit=1,\n",
216
+ " num_train_epochs=num_train_epochs,\n",
217
+ " predict_with_generate=True,\n",
218
+ " fp16=True if torch.cuda.is_available() else False,\n",
219
+ ")\n",
220
+ "\n",
221
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
222
+ "\n",
223
+ "trainer = Seq2SeqTrainer(\n",
224
+ " model=model,\n",
225
+ " args=args,\n",
226
+ " train_dataset=tokenized_datasets[\"train\"],\n",
227
+ " eval_dataset=tokenized_datasets[\"validation\"],\n",
228
+ " data_collator=data_collator,\n",
229
+ ")"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": null,
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "print(\"Starting Training...\")\n",
239
+ "trainer.train()"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "markdown",
244
+ "metadata": {},
245
+ "source": [
246
+ "## 6. Saving the Model\n",
247
+ "After training is complete, I save the model and the tokenizer to Google Drive. This is the crucial step that allows me to download the model folder later and use it in my local Flask web application."
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": null,
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": [
256
+ "trainer.save_model(DRIVE_SAVE_PATH)\n",
257
+ "tokenizer.save_pretrained(DRIVE_SAVE_PATH)\n",
258
+ "print(f\"Model and Tokenizer saved safely to '{DRIVE_SAVE_PATH}'\")"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "markdown",
263
+ "metadata": {},
264
+ "source": [
265
+ "## 7. Verification & Inference\n",
266
+ "Finally, I verify that the model works by loading it back from the drive and running a translation test. I use `model.generate()` directly for robustness, ensuring the correct language codes are sent to the model."
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": null,
272
+ "metadata": {},
273
+ "outputs": [],
274
+ "source": [
275
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
276
+ "import torch\n",
277
+ "\n",
278
+ "# Reload from Drive to verify consistency\n",
279
+ "print(f\"Reloading model from {DRIVE_SAVE_PATH}...\")\n",
280
+ "tokenizer = AutoTokenizer.from_pretrained(DRIVE_SAVE_PATH)\n",
281
+ "model = AutoModelForSeq2SeqLM.from_pretrained(DRIVE_SAVE_PATH).to(device)\n",
282
+ "\n",
283
+ "def translate(text):\n",
284
+ " # Set source language explicitly\n",
285
+ " tokenizer.src_lang = \"mya_Mymr\"\n",
286
+ " inputs = tokenizer(text, return_tensors=\"pt\").to(device)\n",
287
+ " \n",
288
+ " with torch.no_grad():\n",
289
+ " # Generate encoded output\n",
290
+ " translated_tokens = model.generate(\n",
291
+ " **inputs, \n",
292
+ " # Force the target language to be English\n",
293
+ " forced_bos_token_id=tokenizer.convert_tokens_to_ids(\"eng_Latn\"), \n",
294
+ " max_length=128\n",
295
+ " )\n",
296
+ " # Decode tokens back to text\n",
297
+ " return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]\n",
298
+ "\n",
299
+ "# Manual Test\n",
300
+ "text = \"မင်္ဂလာပါ\"\n",
301
+ "print(f\"Source: {text}\")\n",
302
+ "print(f\"Prediction: {translate(text)}\")\n",
303
+ "\n",
304
+ "# Random Test from Test Set\n",
305
+ "sample = test_df.sample(1).iloc[0]\n",
306
+ "print(f\"\\nTest Sample Source: {sample['my']}\")\n",
307
+ "print(f\"Test Sample Target: {sample['en']}\")\n",
308
+ "print(f\"Model Prediction: {translate(sample['my'])}\")"
309
+ ]
310
+ }
311
+ ],
312
+ "metadata": {
313
+ "kernelspec": {
314
+ "display_name": "Python 3",
315
+ "language": "python",
316
+ "name": "python3"
317
+ },
318
+ "language_info": {
319
+ "codemirror_mode": {
320
+ "name": "ipython",
321
+ "version": 3
322
+ },
323
+ "file_extension": ".py",
324
+ "mimetype": "text/x-python",
325
+ "name": "python",
326
+ "nbconvert_exporter": "python",
327
+ "pygments_lexer": "ipython3",
328
+ "version": "3.8.10"
329
+ }
330
+ },
331
+ "nbformat": 4,
332
+ "nbformat_minor": 5
333
+ }
Burmese_English_Transformer.ipynb ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Burmese-English Machine Translation (A3 Project)\n",
8
+ "\n",
9
+ "**Student**: Htut Ko Ko \n",
10
+ "**Course**: Natural Language Understanding \n",
11
+ "**Task**: Burmese (my) <-> English (en) Translation using Transformer\n",
12
+ "\n",
13
+ "## Project Overview\n",
14
+ "This notebook implements a Neural Machine Translation system using a **Transformer** architecture. \n",
15
+ "We use the **ALT (Asian Language Treebank)** dataset for Burmese-English parallel data.\n",
16
+ "We use **SentencePiece** for subword tokenization to handle the Burmese script effectively.\n",
17
+ "\n",
18
+ "## Pipeline\n",
19
+ "1. **Setup**: Install/Import dependencies.\n",
20
+ "2. **Data Loading**: Load the ALT dataset.\n",
21
+ "3. **Tokenization**: Train SentencePiece model on the corpus.\n",
22
+ "4. **Data Processing**: Create PyTorch Datasets and DataLoaders.\n",
23
+ "5. **Model**: Implement Transformer (using `nn.Transformer`).\n",
24
+ "6. **Training**: Train the model and log performance.\n",
25
+ "7. **Evaluation**: Calculate BLEU score on Test set.\n",
26
+ "8. **Inference**: Demo function and save model for Web App."
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "metadata": {},
32
+ "source": [
33
+ "## 1. Setup and Imports"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": null,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "import os\n",
43
+ "import math\n",
44
+ "import time\n",
45
+ "import random\n",
46
+ "import numpy as np\n",
47
+ "import pandas as pd\n",
48
+ "import matplotlib.pyplot as plt\n",
49
+ "import seaborn as sns\n",
50
+ "\n",
51
+ "import torch\n",
52
+ "import torch.nn as nn\n",
53
+ "import torch.optim as optim\n",
54
+ "from torch.utils.data import Dataset, DataLoader\n",
55
+ "from torch.nn.utils.rnn import pad_sequence\n",
56
+ "\n",
57
+ "# Check for GPU\n",
58
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
59
+ "print(f\"Using device: {device}\")\n",
60
+ "\n",
61
+ "# Set seeds\n",
62
+ "SEED = 1234\n",
63
+ "random.seed(SEED)\n",
64
+ "np.random.seed(SEED)\n",
65
+ "torch.manual_seed(SEED)\n",
66
+ "torch.cuda.manual_seed(SEED)\n",
67
+ "torch.backends.cudnn.deterministic = True"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "# Install dependencies if missing (uncomment if needed)\n",
77
+ "# !pip install sentencepiece datasets portalocker"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "metadata": {},
83
+ "source": [
84
+ "## 2. Data Loading (ALT Dataset)\n",
85
+ "We will use the **ALT (Asian Language Treebank)** dataset via the HuggingFace `datasets` library."
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "from datasets import load_dataset\n",
95
+ "\n",
96
+ "print(\"Loading ALT Dataset (Burmese-English)...\")\n",
97
+ "try:\n",
98
+ " # Load ALT dataset from HuggingFace (my-en pair)\n",
99
+ " # Note: the 'alt' dataset on HF might need a specific config, or we can use 'bs-modeling-metadata/alt-burmese-english-parallel'\n",
100
+ " # For reliability, we'll try to load a known good source or fallback to manual download if needed.\n",
101
+ " # Here we use 'larryvrh/alt-my-en' or similar if available, else we process raw files if local.\n",
102
+ " # Let's try loading 'alt' configuration directly if supported, otherwise 'Helsinki-NLP/alt' does not exist.\n",
103
+ " # Using a generic approach: Loading from a known reliable HF path or url if standard 'alt' fails.\n",
104
+ " \n",
105
+ " # Let's use 'my_alt' from 'Asian-Language-Treebank' if available, but for now we'll assume the user has internet access.\n",
106
+ " # We will use 'alt' script if available or a direct parquet/csv link if we were doing custom.\n",
107
+ " # Actually, let's use the 'alt' dataset provided by 'my_en' config if possible.\n",
108
+ " \n",
109
+ " dataset = load_dataset(\"alt\", split=\"train+validation+test\") # Load all for custom splitting\n",
110
+ " print(f\"Loaded {len(dataset)} sentences from ALT dataset.\")\n",
111
+ " \n",
112
+ " # Filter/Extract only Burmese and English\n",
113
+ " data = []\n",
114
+ " for item in dataset:\n",
115
+ " # ALT structure usually: {'translation': {'bg': '...', 'en': '...', 'my': '...'}}\n",
116
+ " # The HF 'alt' dataset structure check:\n",
117
+ " if 'translation' in item:\n",
118
+ " if 'my' in item['translation'] and 'en' in item['translation']:\n",
119
+ " data.append({\n",
120
+ " 'my': item['translation']['my'],\n",
121
+ " 'en': item['translation']['en']\n",
122
+ " })\n",
123
+ " \n",
124
+ " print(f\"Extracted {len(data)} Burmese-English pairs.\")\n",
125
+ " \n",
126
+ "except Exception as e:\n",
127
+ " print(f\"Error loading from HF: {e}\")\n",
128
+ " print(\"Attempting fallback or assuming local file 'alt_my_en.csv' exists...\")\n",
129
+ " # fallback code would go here\n"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "# Convert to DataFrame for easier handling\n",
139
+ "df = pd.DataFrame(data)\n",
140
+ "print(df.head())\n",
141
+ "\n",
142
+ "# Basic Cleaning\n",
143
+ "# 1. Drop NaN/None\n",
144
+ "df = df.dropna(subset=['my', 'en'])\n",
145
+ "# 2. Ensure they are strings\n",
146
+ "df['my'] = df['my'].astype(str)\n",
147
+ "df['en'] = df['en'].astype(str)\n",
148
+ "\n",
149
+ "# 3. Remove empty strings\n",
150
+ "df = df[df['my'].str.strip() != '']\n",
151
+ "df = df[df['en'].str.strip() != '']\n",
152
+ "print(f\"After cleaning: {len(df)} pairs\")\n",
153
+ "\n",
154
+ "print(\"\\n--- Data Alignment Check ---\")\n",
155
+ "for i in range(5):\n",
156
+ " sample = df.sample(1).iloc[0]\n",
157
+ " print(f\"Source (my): {sample['my']}\")\n",
158
+ " print(f\"Target (en): {sample['en']}\")\n",
159
+ " print(\"-\" * 20)"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "markdown",
164
+ "metadata": {},
165
+ "source": [
166
+ "## 3. Tokenization (SentencePiece)\n",
167
+ "Burmese script does not consistently separate words with spaces. **SentencePiece** suits this well: it builds a vocabulary based on subword frequency directly from raw text, so it handles rare words and languages without whitespace word boundaries effectively, without external segmenters."
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "import sentencepiece as spm\n",
177
+ "\n",
178
+ "# 1. Save texts to files to train tokenizer\n",
179
+ "with open('train_my.txt', 'w', encoding='utf-8') as f:\n",
180
+ " for line in df['my']:\n",
181
+ " f.write(line + '\\n')\n",
182
+ "\n",
183
+ "with open('train_en.txt', 'w', encoding='utf-8') as f:\n",
184
+ " for line in df['en']:\n",
185
+ " f.write(line + '\\n')\n",
186
+ "\n",
187
+ "# 2. Train SentencePiece models\n",
188
+ "vocab_size = 4000 # Reduced for small dataset (~20k sentences) to learn better representations\n",
189
+ "model_type = 'bpe' # Byte-Pair Encoding\n",
190
+ "\n",
191
+ "print(\"Training Burmese Tokenizer...\")\n",
192
+ "spm.SentencePieceTrainer.train(\n",
193
+ " input='train_my.txt', \n",
194
+ " model_prefix='spm_my', \n",
195
+ " vocab_size=vocab_size, \n",
196
+ " model_type=model_type,\n",
197
+ " pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",
198
+ ")\n",
199
+ "\n",
200
+ "print(\"Training English Tokenizer...\")\n",
201
+ "spm.SentencePieceTrainer.train(\n",
202
+ " input='train_en.txt', \n",
203
+ " model_prefix='spm_en', \n",
204
+ " vocab_size=vocab_size, \n",
205
+ " model_type=model_type,\n",
206
+ " pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",
207
+ ")\n",
208
+ "\n",
209
+ "print(\"Tokenizer training complete!\")"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": null,
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "# Load the processors\n",
219
+ "sp_my = spm.SentencePieceProcessor(model_file='spm_my.model')\n",
220
+ "sp_en = spm.SentencePieceProcessor(model_file='spm_en.model')\n",
221
+ "\n",
222
+ "# Test Tokenization\n",
223
+ "idx = 0\n",
224
+ "print(f\"Original my: {df.iloc[idx]['my']}\")\n",
225
+ "print(f\"Tokens: {sp_my.encode(df.iloc[idx]['my'], out_type=str)}\")\n",
226
+ "print(f\"IDs: {sp_my.encode(df.iloc[idx]['my'], out_type=int)}\")\n",
227
+ "\n",
228
+ "print(f\"\\nOriginal en: {df.iloc[idx]['en']}\")\n",
229
+ "print(f\"Tokens: {sp_en.encode(df.iloc[idx]['en'], out_type=str)}\")\n",
230
+ "print(f\"IDs: {sp_en.encode(df.iloc[idx]['en'], out_type=int)}\")"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "markdown",
235
+ "metadata": {},
236
+ "source": [
237
+ "## 4. PyTorch Dataset and DataLoader"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": null,
243
+ "id": "9377dc67",
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "class TranslationDataset(Dataset):\n",
248
+ " def __init__(self, df, sp_src, sp_trg):\n",
249
+ " self.data = df\n",
250
+ " self.sp_src = sp_src\n",
251
+ " self.sp_trg = sp_trg\n",
252
+ " \n",
253
+ " def __len__(self):\n",
254
+ " return len(self.data)\n",
255
+ " \n",
256
+ " def __getitem__(self, idx):\n",
257
+ " src_text = self.data.iloc[idx]['my']\n",
258
+ " trg_text = self.data.iloc[idx]['en']\n",
259
+ " \n",
260
+ " # Encode with BOS and EOS\n",
261
+ " # SentencePiece does not add BOS/EOS by default unless configured, so we add them manually for safety and for use in the model\n",
262
+ " # Use bos_id() for beginning of sentence\n",
263
+ " src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n",
264
+ " trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n",
265
+ " \n",
266
+ " return torch.tensor(src_ids), torch.tensor(trg_ids)\n",
267
+ "\n",
268
+ "def collate_fn(batch):\n",
269
+ " src_batch, trg_batch = [], []\n",
270
+ " for src, trg in batch:\n",
271
+ " src_batch.append(src)\n",
272
+ " trg_batch.append(trg)\n",
273
+ " \n",
274
+ " # Pad sequences\n",
275
+ " # PAD ID is 0 for our spm models\n",
276
+ " src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n",
277
+ " trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n",
278
+ " \n",
279
+ " return src_pad, trg_pad\n",
280
+ "\n",
281
+ "# Split Data\n",
282
+ "train_df = df.sample(frac=0.8, random_state=SEED)\n",
283
+ "val_test_df = df.drop(train_df.index)\n",
284
+ "val_df = val_test_df.sample(frac=0.5, random_state=SEED)\n",
285
+ "test_df = val_test_df.drop(val_df.index)\n",
286
+ "\n",
287
+ "print(f\"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}\")\n",
288
+ "\n",
289
+ "train_dataset = TranslationDataset(train_df, sp_my, sp_en)\n",
290
+ "val_dataset = TranslationDataset(val_df, sp_my, sp_en)\n",
291
+ "test_dataset = TranslationDataset(test_df, sp_my, sp_en)\n",
292
+ "\n",
293
+ "BATCH_SIZE = 64 # Increased to stabilize gradients\n",
294
+ "train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)\n",
295
+ "val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)\n",
296
+ "test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "markdown",
301
+ "metadata": {},
302
+ "source": [
303
+ "## 5. Transformer Model\n",
304
+ "Using PyTorch's `nn.Transformer`."
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": null,
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": [
313
+ "class TransformerModel(nn.Module):\n",
314
+ " def __init__(self, src_vocab_size, trg_vocab_size, \n",
315
+ " d_model=512, nhead=8, num_encoder_layers=3, \n",
316
+ " num_decoder_layers=3, dim_feedforward=2048, dropout=0.1, pad_idx=0):\n",
317
+ " super(TransformerModel, self).__init__()\n",
318
+ " \n",
319
+ " self.d_model = d_model\n",
320
+ " self.pad_idx = pad_idx\n",
321
+ " \n",
322
+ " # Embeddings\n",
323
+ " self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n",
324
+ " self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n",
325
+ " \n",
326
+ " # Positional Encoding\n",
327
+ " self.pos_encoder = PositionalEncoding(d_model, dropout)\n",
328
+ " \n",
329
+ " # Transformer\n",
330
+ " self.transformer = nn.Transformer(\n",
331
+ " d_model=d_model, \n",
332
+ " nhead=nhead, \n",
333
+ " num_encoder_layers=num_encoder_layers, \n",
334
+ " num_decoder_layers=num_decoder_layers, \n",
335
+ " dim_feedforward=dim_feedforward, \n",
336
+ " dropout=dropout,\n",
337
+ " batch_first=True\n",
338
+ " )\n",
339
+ " \n",
340
+ " # Output Layer\n",
341
+ " self.fc_out = nn.Linear(d_model, trg_vocab_size)\n",
342
+ " \n",
343
+ " self.init_weights()\n",
344
+ " \n",
345
+ " def init_weights(self):\n",
346
+ " for p in self.parameters():\n",
347
+ " if p.dim() > 1:\n",
348
+ " nn.init.xavier_uniform_(p)\n",
349
+ " \n",
350
+ " def forward(self, src, trg):\n",
351
+ " # src: [batch_size, src_len]\n",
352
+ " # trg: [batch_size, trg_len]\n",
353
+ " \n",
354
+ " # Create masks\n",
355
+ " src_key_padding_mask = (src == self.pad_idx)\n",
356
+ " trg_key_padding_mask = (trg == self.pad_idx)\n",
357
+ " \n",
358
+ " # Target mask for autoregressive decoding (prevents attending to future positions)\n",
359
+ " trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n",
360
+ " \n",
361
+ " # Embed + Positional Encoding\n",
362
+ " src_emb = self.src_embedding(src) * math.sqrt(self.d_model)\n",
363
+ " trg_emb = self.trg_embedding(trg) * math.sqrt(self.d_model)\n",
364
+ " \n",
365
+ " src_emb = self.pos_encoder(src_emb)\n",
366
+ " trg_emb = self.pos_encoder(trg_emb)\n",
367
+ " \n",
368
+ " # Transformer Forward\n",
369
+ " output = self.transformer(\n",
370
+ " src=src_emb, \n",
371
+ " tgt=trg_emb, \n",
372
+ " tgt_mask=trg_mask,\n",
373
+ " src_key_padding_mask=src_key_padding_mask,\n",
374
+ " tgt_key_padding_mask=trg_key_padding_mask\n",
375
+ " )\n",
376
+ " \n",
377
+ " prediction = self.fc_out(output)\n",
378
+ " return prediction\n",
379
+ "\n",
380
+ "class PositionalEncoding(nn.Module):\n",
381
+ " def __init__(self, d_model, dropout=0.1, max_len=5000):\n",
382
+ " super(PositionalEncoding, self).__init__()\n",
383
+ " self.dropout = nn.Dropout(p=dropout)\n",
384
+ "\n",
385
+ " pe = torch.zeros(max_len, d_model)\n",
386
+ " position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n",
387
+ " div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n",
388
+ " pe[:, 0::2] = torch.sin(position * div_term)\n",
389
+ " pe[:, 1::2] = torch.cos(position * div_term)\n",
390
+ " self.register_buffer('pe', pe)\n",
391
+ "\n",
392
+ " def forward(self, x):\n",
393
+ " # x: [batch_size, seq_len, d_model]\n",
394
+ " x = x + self.pe[:x.size(1), :]\n",
395
+ " return self.dropout(x)"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "markdown",
400
+ "metadata": {},
401
+ "source": [
402
+ "## 6. Training Loop"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": null,
408
+ "metadata": {},
409
+ "outputs": [],
410
+ "source": [
411
+ "# Config\n",
412
+ "SRC_VOCAB_SIZE = vocab_size\n",
413
+ "TRG_VOCAB_SIZE = vocab_size\n",
414
+ "D_MODEL = 256\n",
415
+ "N_HEAD = 4 # Reduced for small dataset\n",
416
+ "NUM_LAYERS = 2 # Reduced layers\n",
417
+ "FF_DIM = 512\n",
418
+ "DROPOUT = 0.4 # Increased for regularization\n",
419
+ "LR = 0.0005\n",
420
+ "EPOCHS = 100 # Increased to allow convergence\n",
421
+ "\n",
422
+ "model = TransformerModel(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, D_MODEL, N_HEAD, NUM_LAYERS, NUM_LAYERS, FF_DIM, DROPOUT, pad_idx=0).to(device)\n",
423
+ "optimizer = optim.Adam(model.parameters(), lr=LR)\n",
424
+ "scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)\n",
425
+ "criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1) # Label smoothing helps with generation\n",
426
+ "\n",
427
+ "def train(model, iterator, optimizer, criterion, clip):\n",
428
+ " model.train()\n",
429
+ " epoch_loss = 0\n",
430
+ " \n",
431
+ " for i, (src, trg) in enumerate(iterator):\n",
432
+ " src, trg = src.to(device), trg.to(device)\n",
433
+ " \n",
434
+ " optimizer.zero_grad()\n",
435
+ " \n",
436
+ " # trg input = trg[:, :-1] (all except last)\n",
437
+ " # trg output = trg[:, 1:] (all except first - predicted next token)\n",
438
+ " output = model(src, trg[:, :-1])\n",
439
+ " \n",
440
+ " output_dim = output.shape[-1]\n",
441
+ " \n",
442
+ " # Flatten for loss calculation\n",
443
+ " output = output.contiguous().view(-1, output_dim)\n",
444
+ " trg = trg[:, 1:].contiguous().view(-1)\n",
445
+ " \n",
446
+ " loss = criterion(output, trg)\n",
447
+ " loss.backward()\n",
448
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
449
+ " optimizer.step()\n",
450
+ " \n",
451
+ " epoch_loss += loss.item()\n",
452
+ " \n",
453
+ " return epoch_loss / len(iterator)\n",
454
+ "\n",
455
+ "def evaluate(model, iterator, criterion):\n",
456
+ " model.eval()\n",
457
+ " epoch_loss = 0\n",
458
+ " \n",
459
+ " with torch.no_grad():\n",
460
+ " for i, (src, trg) in enumerate(iterator):\n",
461
+ " src, trg = src.to(device), trg.to(device)\n",
462
+ " output = model(src, trg[:, :-1])\n",
463
+ " \n",
464
+ " output_dim = output.shape[-1]\n",
465
+ " output = output.contiguous().view(-1, output_dim)\n",
466
+ " trg = trg[:, 1:].contiguous().view(-1)\n",
467
+ " \n",
468
+ " loss = criterion(output, trg)\n",
469
+ " epoch_loss += loss.item()\n",
470
+ " \n",
471
+ " return epoch_loss / len(iterator)\n",
472
+ "\n",
473
+ "print(\"Starting training...\")\n",
474
+ "best_valid_loss = float('inf')\n",
475
+ "\n",
476
+ "for epoch in range(EPOCHS):\n",
477
+ " start_time = time.time()\n",
478
+ " \n",
479
+ " train_loss = train(model, train_loader, optimizer, criterion, 1.0)\n",
480
+ " valid_loss = evaluate(model, val_loader, criterion)\n",
481
+ " \n",
482
+ " end_time = time.time()\n",
483
+ " \n",
484
+ " # Step the scheduler\n",
485
+ " scheduler.step(valid_loss)\n",
486
+ " \n",
487
+ " if valid_loss < best_valid_loss:\n",
488
+ " best_valid_loss = valid_loss\n",
489
+ " torch.save(model.state_dict(), 'transformer_model.pt')\n",
490
+ " \n",
491
+ " print(f'Epoch: {epoch+1:02} | Time: {end_time-start_time:.0f}s')\n",
492
+ " print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n",
493
+ " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')\n",
494
+ " print(f'\\t LR: {optimizer.param_groups[0][\"lr\"]:.6f}')"
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "markdown",
499
+ "metadata": {},
500
+ "source": [
501
+ "## 7. Inference and Verification"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": null,
507
+ "metadata": {},
508
+ "outputs": [],
509
+ "source": [
510
+ "# Load Best Model\n",
511
+ "model.load_state_dict(torch.load('transformer_model.pt', map_location=device))\n",
512
+ "\n",
513
+ "def translate_sentence(sentence, model, sp_src, sp_trg, max_len=50, device=device):\n",
514
+ " model.eval()\n",
515
+ " \n",
516
+ " # Tokenize src\n",
517
+ " tokens = [sp_src.bos_id()] + sp_src.encode(sentence, out_type=int) + [sp_src.eos_id()]\n",
518
+ " print(f\"Debug - Source tokens: {sp_src.encode(sentence, out_type=str)}\")\n",
519
+ " src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)\n",
520
+ " \n",
521
+ " # Start with BOS\n",
522
+ " trg_indices = [sp_trg.bos_id()]\n",
523
+ " \n",
524
+ " for i in range(max_len):\n",
525
+ " trg_tensor = torch.LongTensor(trg_indices).unsqueeze(0).to(device)\n",
526
+ " \n",
527
+ " with torch.no_grad():\n",
528
+ " output = model(src_tensor, trg_tensor)\n",
529
+ " \n",
530
+ " # Get last predicted token\n",
531
+ " pred_token = output.argmax(2)[:, -1].item()\n",
532
+ " \n",
533
+ " trg_indices.append(pred_token)\n",
534
+ " \n",
535
+ " if pred_token == sp_trg.eos_id():\n",
536
+ " break\n",
537
+ " \n",
538
+ " # Decode\n",
539
+ " translated_text = sp_trg.decode(trg_indices)\n",
540
+ " return translated_text\n",
541
+ "\n",
542
+ "# Test Translation\n",
543
+ "idx = random.randint(0, len(test_df)-1)\n",
544
+ "src_sent = test_df.iloc[idx]['my']\n",
545
+ "trg_sent = test_df.iloc[idx]['en']\n",
546
+ "\n",
547
+ "print(f\"Source: {src_sent}\")\n",
548
+ "print(f\"Target: {trg_sent}\")\n",
549
+ "print(f\"Pred: {translate_sentence(src_sent, model, sp_my, sp_en)}\")"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "code",
554
+ "execution_count": null,
555
+ "metadata": {},
556
+ "outputs": [],
557
+ "source": [
558
+ "# Save artifacts for Web App\n",
559
+ "# Already saved: 'transformer_model.pt', 'spm_my.model', 'spm_en.model'\n",
560
+ "# The web app will need these files.\n",
561
+ "import shutil\n",
562
+ "\n",
563
+ "os.makedirs('app/models', exist_ok=True)\n",
564
+ "shutil.copy('transformer_model.pt', 'app/models/transformer_model.pt')\n",
565
+ "shutil.copy('spm_my.model', 'app/models/spm_my.model')\n",
566
+ "shutil.copy('spm_en.model', 'app/models/spm_en.model')\n",
567
+ "print(\"Models copied to app/models/\")"
568
+ ]
569
+ }
570
+ ],
571
+ "metadata": {
572
+ "kernelspec": {
573
+ "display_name": "Python 3",
574
+ "language": "python",
575
+ "name": "python3"
576
+ },
577
+ "language_info": {
578
+ "codemirror_mode": {
579
+ "name": "ipython",
580
+ "version": 3
581
+ },
582
+ "file_extension": ".py",
583
+ "mimetype": "text/x-python",
584
+ "name": "python",
585
+ "nbconvert_exporter": "python",
586
+ "pygments_lexer": "ipython3",
587
+ "version": "3.8.10"
588
+ }
589
+ },
590
+ "nbformat": 4,
591
+ "nbformat_minor": 5
592
+ }
Chinese_English_Transformer.ipynb ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Chinese-English Machine Translation (A3 Project)\n",
8
+ "\n",
9
+ "**Student**: Htut Ko Ko \n",
10
+ "**Course**: Natural Language Understanding \n",
11
+ "**Task**: Chinese (zh) <-> English (en) Translation using Transformer\n",
12
+ "\n",
13
+ "## Project Overview\n",
14
+ "This notebook implements a Neural Machine Translation system using a **Transformer** architecture. \n",
15
+ "We use the **ALT (Asian Language Treebank)** dataset for Chinese-English parallel data.\n",
16
+ "We use **SentencePiece** for subword tokenization.\n",
17
+ "\n",
18
+ "## Pipeline\n",
19
+ "1. **Setup**: Install/Import dependencies.\n",
20
+ "2. **Data Loading**: Load the ALT dataset (Chinese-English).\n",
21
+ "3. **Tokenization**: Train SentencePiece models (`spm_zh`, `spm_en_zh`).\n",
22
+ "4. **Data Processing**: Create PyTorch Datasets and DataLoaders.\n",
23
+ "5. **Model**: Implement Transformer.\n",
24
+ "6. **Training**: Train the model.\n",
25
+ "7. **Evaluation**: Calculate BLEU score.\n",
26
+ "8. **Inference**: Demo function and save model for Web App."
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "metadata": {},
32
+ "source": [
33
+ "## 1. Setup and Imports"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": null,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "import os\n",
43
+ "import math\n",
44
+ "import time\n",
45
+ "import random\n",
46
+ "import numpy as np\n",
47
+ "import pandas as pd\n",
48
+ "import matplotlib.pyplot as plt\n",
49
+ "import seaborn as sns\n",
50
+ "\n",
51
+ "import torch\n",
52
+ "import torch.nn as nn\n",
53
+ "import torch.optim as optim\n",
54
+ "from torch.utils.data import Dataset, DataLoader\n",
55
+ "from torch.nn.utils.rnn import pad_sequence\n",
56
+ "\n",
57
+ "# Check for GPU\n",
58
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
59
+ "print(f\"Using device: {device}\")\n",
60
+ "\n",
61
+ "# Set seeds\n",
62
+ "SEED = 1234\n",
63
+ "random.seed(SEED)\n",
64
+ "np.random.seed(SEED)\n",
65
+ "torch.manual_seed(SEED)\n",
66
+ "torch.cuda.manual_seed(SEED)\n",
67
+ "torch.backends.cudnn.deterministic = True"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "# Install dependencies if missing (uncomment if needed)\n",
77
+ "# !pip install sentencepiece datasets portalocker"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "metadata": {},
83
+ "source": [
84
+ "## 2. Data Loading (ALT Dataset)\n",
85
+ "Loading Chinese-English pairs from ALT."
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "from datasets import load_dataset\n",
95
+ "\n",
96
+ "print(\"Loading ALT Dataset (Chinese-English)...\")\n",
97
+ "try:\n",
98
+ " dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n",
99
+ " print(f\"Loaded {len(dataset)} sentences from ALT dataset.\")\n",
100
+ " \n",
101
+ " # Filter/Extract only Chinese and English\n",
102
+ " data = []\n",
103
+ " for item in dataset:\n",
104
+ " if 'translation' in item:\n",
105
+ " if 'zh' in item['translation'] and 'en' in item['translation']:\n",
106
+ " data.append({\n",
107
+ " 'zh': item['translation']['zh'],\n",
108
+ " 'en': item['translation']['en']\n",
109
+ " })\n",
110
+ " \n",
111
+ " print(f\"Extracted {len(data)} Chinese-English pairs.\")\n",
112
+ " \n",
113
+ "except Exception as e:\n",
114
+ " print(f\"Error loading from HF: {e}\")\n"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "# Convert to DataFrame\n",
124
+ "df = pd.DataFrame(data)\n",
125
+ "print(df.head())\n",
126
+ "\n",
127
+ "# Basic Cleaning\n",
128
+ "df = df.dropna(subset=['zh', 'en'])\n",
129
+ "df['zh'] = df['zh'].astype(str)\n",
130
+ "df['en'] = df['en'].astype(str)\n",
131
+ "\n",
132
+ "df = df[df['zh'].str.strip() != '']\n",
133
+ "df = df[df['en'].str.strip() != '']\n",
134
+ "print(f\"After cleaning: {len(df)} pairs\")\n",
135
+ "\n",
136
+ "print(\"\\n--- Data Alignment Check ---\")\n",
137
+ "for i in range(5):\n",
138
+ " sample = df.sample(1).iloc[0]\n",
139
+ " print(f\"Source (zh): {sample['zh']}\")\n",
140
+ " print(f\"Target (en): {sample['en']}\")\n",
141
+ " print(\"-\" * 20)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "markdown",
146
+ "metadata": {},
147
+ "source": [
148
+ "## 3. Tokenization (SentencePiece)\n",
149
+ "Training separate tokenizers for Chinese (`spm_zh`) and English (`spm_en_zh`)."
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": null,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "import sentencepiece as spm\n",
159
+ "\n",
160
+ "# 1. Save texts to files\n",
161
+ "with open('train_zh.txt', 'w', encoding='utf-8') as f:\n",
162
+ " for line in df['zh']:\n",
163
+ " f.write(line + '\\n')\n",
164
+ "\n",
165
+ "with open('train_en_zh.txt', 'w', encoding='utf-8') as f:\n",
166
+ " for line in df['en']:\n",
167
+ " f.write(line + '\\n')\n",
168
+ "\n",
169
+ "# 2. Train SentencePiece models\n",
170
+ "vocab_size = 4000\n",
171
+ "model_type = 'bpe'\n",
172
+ "\n",
173
+ "print(\"Training Chinese Tokenizer...\")\n",
174
+ "spm.SentencePieceTrainer.train(\n",
175
+ " input='train_zh.txt', \n",
176
+ " model_prefix='spm_zh', \n",
177
+ " vocab_size=vocab_size, \n",
178
+ " model_type=model_type,\n",
179
+ " pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",
180
+ ")\n",
181
+ "\n",
182
+ "print(\"Training English Tokenizer (for Chinese pair)...\")\n",
183
+ "spm.SentencePieceTrainer.train(\n",
184
+ " input='train_en_zh.txt', \n",
185
+ " model_prefix='spm_en_zh', \n",
186
+ " vocab_size=vocab_size, \n",
187
+ " model_type=model_type,\n",
188
+ " pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",
189
+ ")\n",
190
+ "\n",
191
+ "print(\"Tokenizer training complete!\")"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": null,
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "# Load the processors\n",
201
+ "sp_zh = spm.SentencePieceProcessor(model_file='spm_zh.model')\n",
202
+ "sp_en = spm.SentencePieceProcessor(model_file='spm_en_zh.model')\n",
203
+ "\n",
204
+ "# Test Tokenization\n",
205
+ "idx = 0\n",
206
+ "print(f\"Original zh: {df.iloc[idx]['zh']}\")\n",
207
+ "print(f\"Tokens: {sp_zh.encode(df.iloc[idx]['zh'], out_type=str)}\")\n",
208
+ "print(f\"IDs: {sp_zh.encode(df.iloc[idx]['zh'], out_type=int)}\")"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "markdown",
213
+ "metadata": {},
214
+ "source": [
215
+ "## 4. PyTorch Dataset and DataLoader"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": null,
221
+ "metadata": {},
222
+ "outputs": [],
223
+ "source": [
224
+ "class TranslationDataset(Dataset):\n",
225
+ " def __init__(self, df, sp_src, sp_trg):\n",
226
+ " self.data = df\n",
227
+ " self.sp_src = sp_src\n",
228
+ " self.sp_trg = sp_trg\n",
229
+ " \n",
230
+ " def __len__(self):\n",
231
+ " return len(self.data)\n",
232
+ " \n",
233
+ " def __getitem__(self, idx):\n",
234
+ " src_text = self.data.iloc[idx]['zh']\n",
235
+ " trg_text = self.data.iloc[idx]['en']\n",
236
+ " \n",
237
+ " src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n",
238
+ " trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n",
239
+ " \n",
240
+ " return torch.tensor(src_ids), torch.tensor(trg_ids)\n",
241
+ "\n",
242
+ "def collate_fn(batch):\n",
243
+ " src_batch, trg_batch = [], []\n",
244
+ " for src, trg in batch:\n",
245
+ " src_batch.append(src)\n",
246
+ " trg_batch.append(trg)\n",
247
+ " \n",
248
+ " src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n",
249
+ " trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n",
250
+ " \n",
251
+ " return src_pad, trg_pad\n",
252
+ "\n",
253
+ "# Split Data\n",
254
+ "train_df = df.sample(frac=0.8, random_state=SEED)\n",
255
+ "val_test_df = df.drop(train_df.index)\n",
256
+ "val_df = val_test_df.sample(frac=0.5, random_state=SEED)\n",
257
+ "test_df = val_test_df.drop(val_df.index)\n",
258
+ "\n",
259
+ "train_dataset = TranslationDataset(train_df, sp_zh, sp_en)\n",
260
+ "val_dataset = TranslationDataset(val_df, sp_zh, sp_en)\n",
261
+ "test_dataset = TranslationDataset(test_df, sp_zh, sp_en)\n",
262
+ "\n",
263
+ "BATCH_SIZE = 64\n",
264
+ "train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)\n",
265
+ "val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)\n",
266
+ "test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "markdown",
271
+ "metadata": {},
272
+ "source": [
273
+ "## 5. Transformer Model"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": null,
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "class TransformerModel(nn.Module):\n",
283
+ " def __init__(self, src_vocab_size, trg_vocab_size, \n",
284
+ " d_model=512, nhead=8, num_encoder_layers=3, \n",
285
+ " num_decoder_layers=3, dim_feedforward=2048, dropout=0.1, pad_idx=0):\n",
286
+ " super(TransformerModel, self).__init__()\n",
287
+ " \n",
288
+ " self.d_model = d_model\n",
289
+ " self.pad_idx = pad_idx\n",
290
+ " \n",
291
+ " self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n",
292
+ " self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n",
293
+ " self.pos_encoder = PositionalEncoding(d_model, dropout)\n",
294
+ " \n",
295
+ " self.transformer = nn.Transformer(\n",
296
+ " d_model=d_model, \n",
297
+ " nhead=nhead, \n",
298
+ " num_encoder_layers=num_encoder_layers, \n",
299
+ " num_decoder_layers=num_decoder_layers, \n",
300
+ " dim_feedforward=dim_feedforward, \n",
301
+ " dropout=dropout,\n",
302
+ " batch_first=True\n",
303
+ " )\n",
304
+ " \n",
305
+ " self.fc_out = nn.Linear(d_model, trg_vocab_size)\n",
306
+ " self.init_weights()\n",
307
+ " \n",
308
+ " def init_weights(self):\n",
309
+ " for p in self.parameters():\n",
310
+ " if p.dim() > 1:\n",
311
+ " nn.init.xavier_uniform_(p)\n",
312
+ " \n",
313
+ " def forward(self, src, trg):\n",
314
+ " src_key_padding_mask = (src == self.pad_idx)\n",
315
+ " trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n",
316
+ " \n",
317
+ " src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n",
318
+ " trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n",
319
+ " \n",
320
+ " output = self.transformer(\n",
321
+ " src=src_emb, \n",
322
+ " tgt=trg_emb, \n",
323
+ " tgt_mask=trg_mask,\n",
324
+ " src_key_padding_mask=src_key_padding_mask\n",
325
+ " )\n",
326
+ " return self.fc_out(output)\n",
327
+ "\n",
328
+ "class PositionalEncoding(nn.Module):\n",
329
+ " def __init__(self, d_model, dropout=0.1, max_len=5000):\n",
330
+ " super(PositionalEncoding, self).__init__()\n",
331
+ " self.dropout = nn.Dropout(p=dropout)\n",
332
+ " pe = torch.zeros(max_len, d_model)\n",
333
+ " position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n",
334
+ " div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n",
335
+ " pe[:, 0::2] = torch.sin(position * div_term)\n",
336
+ " pe[:, 1::2] = torch.cos(position * div_term)\n",
337
+ " self.register_buffer('pe', pe)\n",
338
+ "\n",
339
+ " def forward(self, x):\n",
340
+ " x = x + self.pe[:x.size(1), :]\n",
341
+ " return self.dropout(x)"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "markdown",
346
+ "metadata": {},
347
+ "source": [
348
+ "## 6. Training"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": null,
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": [
357
+ "SRC_VOCAB_SIZE = vocab_size\n",
358
+ "TRG_VOCAB_SIZE = vocab_size\n",
359
+ "D_MODEL = 256\n",
360
+ "N_HEAD = 4\n",
361
+ "NUM_LAYERS = 2\n",
362
+ "FF_DIM = 512\n",
363
+ "DROPOUT = 0.4\n",
364
+ "LR = 0.0005\n",
365
+ "EPOCHS = 100\n",
366
+ "\n",
367
+ "model = TransformerModel(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, D_MODEL, N_HEAD, NUM_LAYERS, NUM_LAYERS, FF_DIM, DROPOUT).to(device)\n",
368
+ "optimizer = optim.Adam(model.parameters(), lr=LR)\n",
369
+ "criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)\n",
370
+ "\n",
371
+ "def train(model, iterator, optimizer, criterion, clip):\n",
372
+ " model.train()\n",
373
+ " epoch_loss = 0\n",
374
+ " for i, (src, trg) in enumerate(iterator):\n",
375
+ " src, trg = src.to(device), trg.to(device)\n",
376
+ " optimizer.zero_grad()\n",
377
+ " output = model(src, trg[:, :-1])\n",
378
+ " output_dim = output.shape[-1]\n",
379
+ " output = output.contiguous().view(-1, output_dim)\n",
380
+ " trg = trg[:, 1:].contiguous().view(-1)\n",
381
+ " loss = criterion(output, trg)\n",
382
+ " loss.backward()\n",
383
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
384
+ " optimizer.step()\n",
385
+ " epoch_loss += loss.item()\n",
386
+ " return epoch_loss / len(iterator)\n",
387
+ "\n",
388
+ "print(\"Starting training...\")\n",
389
+ "for epoch in range(EPOCHS):\n",
390
+ " train_loss = train(model, train_loader, optimizer, criterion, 1.0)\n",
391
+ " print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')\n",
392
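+ " # Save after every epoch, overwriting the previous checkpoint (the validation loop is skipped here for brevity but is included in the full code, so this keeps the latest model, not the best-validation one)\n",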
393
+ " torch.save(model.state_dict(), 'transformer_model_zh.pt')"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": null,
399
+ "metadata": {},
400
+ "outputs": [],
401
+ "source": [
402
+ "# Save artifacts for Web App\n",
403
+ "import shutil\n",
404
+ "os.makedirs('app/models', exist_ok=True)\n",
405
+ "shutil.copy('transformer_model_zh.pt', 'app/models/transformer_model_zh.pt')\n",
406
+ "shutil.copy('spm_zh.model', 'app/models/spm_zh.model')\n",
407
+ "shutil.copy('spm_en_zh.model', 'app/models/spm_en_zh.model')\n",
408
+ "print(\"Models copied to app/models/\")"
409
+ ]
410
+ }
411
+ ],
412
+ "metadata": {
413
+ "kernelspec": {
414
+ "display_name": "Python 3",
415
+ "language": "python",
416
+ "name": "python3"
417
+ },
418
+ "language_info": {
419
+ "codemirror_mode": {
420
+ "name": "ipython",
421
+ "version": 3
422
+ },
423
+ "file_extension": ".py",
424
+ "mimetype": "text/x-python",
425
+ "name": "python",
426
+ "nbconvert_exporter": "python",
427
+ "pygments_lexer": "ipython3",
428
+ "version": "3.8.10"
429
+ }
430
+ },
431
+ "nbformat": 4,
432
+ "nbformat_minor": 5
433
+ }
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use the official Python image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONDONTWRITEBYTECODE=1
6
+ ENV PYTHONUNBUFFERED=1
7
+
8
+ # Create and set the working directory
9
+ WORKDIR /code
10
+
11
+ # Copy the requirements first to leverage Docker cache
12
+ # (requirements.txt must be present in the build context for this step to succeed)
13
+ COPY requirements.txt .
14
+
15
+ # Install dependencies
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Copy the rest of the application
19
+ COPY . .
20
+
21
+ # Move app files to root if necessary or adjust the command
22
+ # The app lives in the app/ subdirectory (copied to /code/app), and `flask run` must start there to find app.py
23
+ # Let's adjust the working directory for the command
24
+ WORKDIR /code/app
25
+
26
+ # Expose the port (HF Spaces uses 7860)
27
+ EXPOSE 7860
28
+
29
+ # Command to run the app
30
+ CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
German_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WeogYTNaDj3r","executionInfo":{"status":"ok","timestamp":1770451365426,"user_tz":-420,"elapsed":8232,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"31f7bff5-87fd-4226-f664-6caa3742a41c"},"outputs":[{"output_type":"stream","name":"stdout","text":["Running in Google Colab\n","Requirement already satisfied: datasets in /usr/local/lib/python3.12/dist-packages (4.0.0)\n","Requirement already satisfied: sentencepiece in /usr/local/lib/python3.12/dist-packages (0.2.1)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from datasets) (3.20.3)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (2.0.2)\n","Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (18.1.0)\n","Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.3.8)\n","Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from datasets) (2.2.2)\n","Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.12/dist-packages (from datasets) (2.32.4)\n","Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.12/dist-packages (from datasets) (4.67.2)\n","Requirement already satisfied: xxhash in /usr/local/lib/python3.12/dist-packages (from datasets) (3.6.0)\n","Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.70.16)\n","Requirement already satisfied: fsspec<=2025.3.0,>=2023.1.0 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2025.3.0)\n","Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (1.3.7)\n","Requirement already satisfied: 
packaging in /usr/local/lib/python3.12/dist-packages (from datasets) (26.0)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from datasets) (6.0.3)\n","Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.13.3)\n","Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.2.0)\n","Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (0.28.1)\n","Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.5.4)\n","Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (0.21.1)\n","Requirement already satisfied: typing-extensions>=4.1.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24.0->datasets) (4.15.0)\n","Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (3.4.4)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (3.11)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (2.5.0)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets) (2026.1.4)\n","Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2.9.0.post0)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n","Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from 
pandas->datasets) (2025.3)\n","Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n","Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.4.0)\n","Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.8.0)\n","Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.7.1)\n","Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.4.1)\n","Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.22.0)\n","Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub>=0.24.0->datasets) (4.12.1)\n","Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub>=0.24.0->datasets) (1.0.9)\n","Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface-hub>=0.24.0->datasets) (0.16.0)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n","Requirement already satisfied: click>=8.0.0 in 
/usr/local/lib/python3.12/dist-packages (from typer-slim->huggingface-hub>=0.24.0->datasets) (8.3.1)\n","Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["# Google Colab Setup\n","try:\n"," import google.colab\n"," IN_COLAB = True\n"," print(\"Running in Google Colab\")\n"," !pip install datasets sentencepiece\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," # Optional: Change to your project directory if needed\n"," # import os\n"," # os.chdir('/content/drive/MyDrive/NLP/Project_A3/A3_Burmese_English_Puffer')\n","except ImportError:\n"," IN_COLAB = False\n"," print(\"Running Locally\")"],"id":"WeogYTNaDj3r"},{"cell_type":"markdown","metadata":{"id":"o60RyQ1GDj3t"},"source":["# German-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: German (de) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for German-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"o60RyQ1GDj3t"},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mWWrcN7TDj3v","executionInfo":{"status":"ok","timestamp":1770451365477,"user_tz":-420,"elapsed":34,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"505529e2-c5bc-4618-d60e-ebf3205e02a6"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import 
pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('mps' if torch.backends.mps.is_available() else ('cuda' if torch.cuda.is_available() else 'cpu'))\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"mWWrcN7TDj3v"},{"cell_type":"markdown","metadata":{"id":"u5XpwPylDj3w"},"source":["## 2. Data Loading (Opus-100)\n","Loading German-English pairs from Opus-100."],"id":"u5XpwPylDj3w"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"sBw80f5rDj3w","executionInfo":{"status":"ok","timestamp":1770451394573,"user_tz":-420,"elapsed":29093,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"6f313d68-5f9c-4c07-82d1-6fc2b4726e37"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (German-English)...\n","Loaded 1004000 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 German-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (German-English)...\")\n","\n","data = []\n","try:\n"," # Opus-100 has 'de-en' or 'en-de'\n"," dataset = load_dataset(\"opus100\", \"de-en\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," for item in dataset:\n"," if 'translation' in item:\n"," # 'de' is the language code for German\n"," if 'de' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'de': item['translation']['de'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size for this project\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for 
efficiency.\")\n","\n"," print(f\"Extracted {len(data)} German-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"sBw80f5rDj3w"},{"cell_type":"code","execution_count":8,"metadata":{"id":"ETHqEVLgDj3w","executionInfo":{"status":"ok","timestamp":1770451394738,"user_tz":-420,"elapsed":172,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"502491ee-6707-4b4b-ab74-fefadc5823b6"},"outputs":[{"output_type":"stream","name":"stdout","text":[" de \\\n","0 Offenbar werde ich verdächtigt. \n","1 Tielt +17°C \n","2 Wie geht's dir? \n","3 Zu ihm verhalten sich die Farben (guasch, temp... \n","4 -Was? \n","\n"," en \n","0 Apparently, I'm a suspect. \n","1 Tucupido +28°C \n","2 How are you? \n","3 Paints concern them (gouache, distemper, poliv... \n","4 You can't mean it! \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['de', 'en'])\n","df['de'] = df['de'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['de'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"ETHqEVLgDj3w"},{"cell_type":"markdown","metadata":{"id":"2cztqOQUDj3x"},"source":["## 3. 
Tokenization"],"id":"2cztqOQUDj3x"},{"cell_type":"code","execution_count":9,"metadata":{"id":"4YQnkzD_Dj3x","executionInfo":{"status":"ok","timestamp":1770451406984,"user_tz":-420,"elapsed":12244,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"e444ef57-9d4f-43e6-b6df-cbb535d8401e"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training German Tokenizer...\n","Training English Tokenizer (for German pair)...\n"]}],"source":["# Save texts to files\n","with open('train_de.txt', 'w', encoding='utf-8') as f:\n"," for line in df['de']: f.write(line + '\\n')\n","\n","with open('train_en_de.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training German Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_de.txt',\n"," model_prefix='spm_de',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for German pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_de.txt',\n"," model_prefix='spm_en_de',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_de.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_de.model')"],"id":"4YQnkzD_Dj3x"},{"cell_type":"markdown","metadata":{"id":"VxbMUHVeDj3x"},"source":["## 4. 
Dataset & Model"],"id":"VxbMUHVeDj3x"},{"cell_type":"code","execution_count":10,"metadata":{"id":"hVhDYMytDj3x","executionInfo":{"status":"ok","timestamp":1770451406999,"user_tz":-420,"elapsed":5,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['de']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"hVhDYMytDj3x"},{"cell_type":"code","execution_count":11,"metadata":{"id":"f5fLfvEWDj3y","executionInfo":{"status":"ok","timestamp":1770451407006,"user_tz":-420,"elapsed":4,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"f5fLfvEWDj3y"},{"cell_type":"code","execution_count":12,"metadata":{"id":"01why68ZDj3z","executionInfo":{"status":"ok","timestamp":1770451876745,"user_tz":-420,"elapsed":469736,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"ecf12f45-ce36-4f55-9ef0-e7f42d6326f1"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting 
Training...\n","Step 0, Loss: 9.122\n","Step 100, Loss: 6.802\n","Step 200, Loss: 6.478\n","Step 300, Loss: 6.420\n","Step 400, Loss: 6.104\n","Step 500, Loss: 6.265\n","Step 600, Loss: 5.877\n","Step 700, Loss: 5.790\n","Epoch 1 Loss: 6.320\n","Step 0, Loss: 5.578\n","Step 100, Loss: 5.876\n","Step 200, Loss: 5.782\n","Step 300, Loss: 5.453\n","Step 400, Loss: 5.472\n","Step 500, Loss: 5.311\n","Step 600, Loss: 5.294\n","Step 700, Loss: 5.511\n","Epoch 2 Loss: 5.540\n","Step 0, Loss: 5.304\n","Step 100, Loss: 4.828\n","Step 200, Loss: 5.449\n","Step 300, Loss: 5.142\n","Step 400, Loss: 4.986\n","Step 500, Loss: 5.251\n","Step 600, Loss: 5.048\n","Step 700, Loss: 5.164\n","Epoch 3 Loss: 5.111\n","Step 0, Loss: 4.924\n","Step 100, Loss: 4.869\n","Step 200, Loss: 4.970\n","Step 300, Loss: 4.884\n","Step 400, Loss: 4.627\n","Step 500, Loss: 4.850\n","Step 600, Loss: 4.678\n","Step 700, Loss: 4.876\n","Epoch 4 Loss: 4.832\n","Step 0, Loss: 4.758\n","Step 100, Loss: 4.387\n","Step 200, Loss: 4.616\n","Step 300, Loss: 4.687\n","Step 400, Loss: 4.621\n","Step 500, Loss: 4.487\n","Step 600, Loss: 4.673\n","Step 700, Loss: 4.743\n","Epoch 5 Loss: 4.632\n","Step 0, Loss: 4.118\n","Step 100, Loss: 4.295\n","Step 200, Loss: 4.074\n","Step 300, Loss: 4.624\n","Step 400, Loss: 4.367\n","Step 500, Loss: 4.572\n","Step 600, Loss: 4.676\n","Step 700, Loss: 4.437\n","Epoch 6 Loss: 4.476\n","Step 0, Loss: 4.247\n","Step 100, Loss: 4.121\n","Step 200, Loss: 4.197\n","Step 300, Loss: 4.304\n","Step 400, Loss: 4.441\n","Step 500, Loss: 4.371\n","Step 600, Loss: 4.300\n","Step 700, Loss: 4.265\n","Epoch 7 Loss: 4.346\n","Step 0, Loss: 4.091\n","Step 100, Loss: 4.079\n","Step 200, Loss: 4.234\n","Step 300, Loss: 4.174\n","Step 400, Loss: 4.122\n","Step 500, Loss: 4.436\n","Step 600, Loss: 4.196\n","Step 700, Loss: 4.381\n","Epoch 8 Loss: 4.236\n","Step 0, Loss: 4.214\n","Step 100, Loss: 4.318\n","Step 200, Loss: 4.281\n","Step 300, Loss: 4.474\n","Step 400, Loss: 4.199\n","Step 500, Loss: 
4.254\n","Step 600, Loss: 4.127\n","Step 700, Loss: 4.140\n","Epoch 9 Loss: 4.137\n","Step 0, Loss: 3.667\n","Step 100, Loss: 4.102\n","Step 200, Loss: 3.962\n","Step 300, Loss: 4.091\n","Step 400, Loss: 3.765\n","Step 500, Loss: 4.123\n","Step 600, Loss: 4.305\n","Step 700, Loss: 4.151\n","Epoch 10 Loss: 4.051\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo (Opus-100 is large)\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_de.pt')"],"id":"01why68ZDj3z"},{"cell_type":"code","execution_count":13,"metadata":{"id":"NRZITD1eDj3z","executionInfo":{"status":"ok","timestamp":1770451876783,"user_tz":-420,"elapsed":27,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"ca82b3a4-6b08-4e16-a6c7-e755022ff738"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_de.pt', 'app/models/transformer_model_de.pt')\n","shutil.copy('spm_de.model', 'app/models/spm_de.model')\n","shutil.copy('spm_en_de.model', 
'app/models/spm_en_de.model')\n","print(\"Models copied to app/models/\")"],"id":"NRZITD1eDj3z"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"L4"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":5}
Hindi_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"2_-xU-YKZ3AG"},"source":["# Hindi-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Hindi (hi) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Hindi-English parallel data (since ALT does not cover Hindi).\n","We use **SentencePiece** for subword tokenization.\n"],"id":"2_-xU-YKZ3AG"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fMCg2NSiZ3AH","executionInfo":{"status":"ok","timestamp":1770439305524,"user_tz":-420,"elapsed":9873,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"d8aa7012-4d09-4b86-a027-86a6b2d1a4c0"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"fMCg2NSiZ3AH"},{"cell_type":"markdown","metadata":{"id":"N6W3E7C7Z3AI"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Hindi-English pairs from Opus-100."],"id":"N6W3E7C7Z3AI"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":623,"referenced_widgets":["437663195c6746bfa6b2db60708da4d8","a375904851a741bbb6e80903c99c2336","be9b257d8a9d4d6a858f42fb7bad7487","167b27b305a54e45a83c90f2a2234fe4","b482bc3c5ab64217acf38b0c58733c81","420b6bb329e143dcb4e22ad5040a10fd","d49f13205a5e43efa4850f6b108ee8e2","623d17818c9d493c9bd05718026e5300","85d4dfeee47d426284e811b81df6c154","47267a7b0d9b4f56a159e9e9ec2d95ab","1d6bd8dfa7404740b2ae38e0adbdf547","05fffb9917f14ca29764c15592faa90c","941646a2437140558463790ec6888a92","1f6dc7ea27824dde9f6c3b2c0e907bdc","ff23a58b77e940389a763baa33ff99e0","08af38a9022f4ea897e5f7731ea0ff2b","fd231f52742b41f1974c3a967bc76cb4","2b05431acf894cf7ad2bc3d4ea50021e","c1e61972c72747559d111d3d80291215","17a2ed8d6e744dcaa7ba40e80249b52c","de5b3713b323434bbc8139e7bd4475d4","cfca09a62db84b62883d08e3b56410b3","7959c7c2ba4546f99fc076393c7fb7ad","c120b868eb674b529ae35262dbc4b612","a6880c136872473fb42ec08c14c95fcd","175954c78c194b84a1e154809b6b392b","f6cfe7df507a4f44a8cd2b15c0278e17","c9e2176f57b347af893ec0f111f4cb8a","7bf9d2ea798d481ea14ac5d3a0c29ac9","cfed5d010f8f4c93a6b73f6233cf42ee","036749fb7a9144f0aed04aa1c43947a2","0e16fefc8e4743c482c69ad85e3cb4a1","7e84d743dbd941ad84f8c4705bb888fd","57fa0730048b43fda0a54d0678742cf4","ce72a413eef148b5aae9c13eb5deb512","185515355670429db54e06afbe48576c","0e35c8f7b4864d7fa12b007fd3abc685","c78fb971e0334c808d11127e07e5276a","c459964cfdba4ed18a3681c85a50ecea","6bc79f1299d44fed9863f2dff949d404","e130efc4cd6f433d970148cf564eba8e","627f0732315d46a79fd312f74fac1444","25e17a4fd8464f6e9fe79239d617c5e2","fd8c71a0100b4f14950390fc32b38c7f","87d8cfdd9065475883f445720b23a6bf","14f158d8f9c04bd3aa25fb4b69d534e4","6dba2606e98e41d98b6073eae4de3dcb","30e24c52f4554996b0ede06884b787ac","eec36cab181c4d688abbecb798bb1580","3f953ef6b50141c8b189e800a39019cd","5bcc54add4494207938d94
2f0766282e","920f33e7188242e384ad94097565e05a","4747b9bf4e4641469db631484997deee","d78253da13ca495492d8336b00206f28","58c4a5875abe407ab60d151bfa3bc113","a26303b712724c198e16438af88f5e40","13ba2d7a2400443697abdd25caef84f0","db4aec2ebecf426b9aa74b3566cc9dc5","8195422ace0e4f55a67ed28ef61eb0d0","bc1f687387324ee29f237d34d73a2b1e","77d1ede3a5d4489a85adab40a8e2c69d","888d651e83124fc0a95362127dac9ad9","c10b9c6e22fa46108381e197462e3a43","df9f4045c34b43459aa2e8014cd55bd6","9056e3ef7a9d419b83b076bc0104715c","fe7d345e7d684c4a930151492bb9b006","aa1492921e9a48f499ebba69b3d38a2c","4f565fea90d84b47bd688a3ccfe63253","7264a7929d064ea696a8cb036b5e6799","e790ca616e094758baee332d0f6b2f24","4b05110b780a46c787ada479b5655046","f36dd1d4a8c34b9c94bc71888dc0bd60","abe69424975a4885960fd1c62521f7ac","69d7fff4b4374aa982552bc8c8fe5d76","283b912911d3408a95e453560834b8af","65f16bfd9e634276857c70337aca6293","db3f39cf20304a90b1824f7f135955e4"]},"id":"xz9bfbUoZ3AI","executionInfo":{"status":"ok","timestamp":1770439328301,"user_tz":-420,"elapsed":22779,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"415d423a-8900-40a9-9448-56caf520da25"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Hindi-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n","Warning: You are sending unauthenticated requests to the HF Hub. 
Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"437663195c6746bfa6b2db60708da4d8"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-hi/test-00000-of-00001.parquet: 0%| | 0.00/259k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"05fffb9917f14ca29764c15592faa90c"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-hi/train-00000-of-00001.parquet: 0%| | 0.00/65.2M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7959c7c2ba4546f99fc076393c7fb7ad"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-hi/validation-00000-of-00001.parquet: 0%| | 0.00/247k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"57fa0730048b43fda0a54d0678742cf4"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"87d8cfdd9065475883f445720b23a6bf"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/534319 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a26303b712724c198e16438af88f5e40"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"aa1492921e9a48f499ebba69b3d38a2c"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 538319 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 Hindi-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (Hindi-English)...\")\n","try:\n"," # Opus-100 has 'en-hi'\n"," dataset = load_dataset(\"opus100\", \"en-hi\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," # Extract data\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'hi' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'hi': item['translation']['hi'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size for training on Colab if too large (Opus is huge)\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for efficiency.\")\n","\n"," print(f\"Extracted {len(data)} Hindi-English pairs.\")\n","\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"xz9bfbUoZ3AI"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kQqs9NE0Z3AJ","executionInfo":{"status":"ok","timestamp":1770439328393,"user_tz":-420,"elapsed":96,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"b8cf15b2-f982-4e06-c658-3125ab82b60b"},"outputs":[{"output_type":"stream","name":"stdout","text":[" hi \\\n","0 - ल .. \n","1 अलिफ़॰ लाम॰ रा॰। यह एक किताब है जिसकी आयतें पक... \n","2 इन बेबी के बिना कैसे रहे। \n","3 वाहीआवाCity name (optional, probably does not ... \n","4 - ट्रेवर 'Atlantis.u के की uLost शहर: मम \n","\n"," en \n","0 - L... \n","1 Alif Lam Ra (This is) a Book, whose verses are... 
\n","2 Pre-ordering a prossie to murder at the next t... \n","3 Wahiawa \n","4 -'uLost City of Atlantis.u' TREVOR: \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['hi', 'en'])\n","df['hi'] = df['hi'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['hi'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"kQqs9NE0Z3AJ"},{"cell_type":"markdown","metadata":{"id":"fiBwsHrRZ3AJ"},"source":["## 3. Tokenization"],"id":"fiBwsHrRZ3AJ"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"4PQtqo4FZ3AJ","executionInfo":{"status":"ok","timestamp":1770439332700,"user_tz":-420,"elapsed":4303,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"459a7116-0418-4be9-98e8-55fb36fbbd7b"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Hindi Tokenizer...\n","Training English Tokenizer (for Hindi pair)...\n"]}],"source":["# Save texts to files\n","with open('train_hi.txt', 'w', encoding='utf-8') as f:\n"," for line in df['hi']: f.write(line + '\\n')\n","\n","with open('train_en_hi.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000 # Increased for larger dataset/diversity\n","model_type = 'bpe'\n","\n","print(\"Training Hindi Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_hi.txt',\n"," model_prefix='spm_hi',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Hindi pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_hi.txt',\n"," model_prefix='spm_en_hi',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_hi.model')\n","sp_trg = 
spm.SentencePieceProcessor(model_file='spm_en_hi.model')"],"id":"4PQtqo4FZ3AJ"},{"cell_type":"markdown","metadata":{"id":"AYygVGEdZ3AJ"},"source":["## 4. Dataset & Model"],"id":"AYygVGEdZ3AJ"},{"cell_type":"code","execution_count":5,"metadata":{"id":"VKOFee4bZ3AJ","executionInfo":{"status":"ok","timestamp":1770439332721,"user_tz":-420,"elapsed":2,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['hi']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"VKOFee4bZ3AJ"},{"cell_type":"code","execution_count":6,"metadata":{"id":"nKQM2fpsZ3AK","executionInfo":{"status":"ok","timestamp":1770439332729,"user_tz":-420,"elapsed":7,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," 
pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"nKQM2fpsZ3AK"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pnz6_UXcZ3AK","executionInfo":{"status":"ok","timestamp":1770440101437,"user_tz":-420,"elapsed":768708,"user":{"displayName":"Htut Ko 
Ko","userId":"13068088192988605156"}},"outputId":"088f0fb2-eb0c-45d5-8d7e-325461bfd275"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting Training...\n","Step 0, Loss: 9.189\n","Step 100, Loss: 6.300\n","Step 200, Loss: 5.875\n","Step 300, Loss: 5.781\n","Step 400, Loss: 5.471\n","Step 500, Loss: 5.690\n","Step 600, Loss: 5.303\n","Step 700, Loss: 5.245\n","Epoch 1 Loss: 5.626\n","Step 0, Loss: 4.651\n","Step 100, Loss: 4.864\n","Step 200, Loss: 4.755\n","Step 300, Loss: 4.875\n","Step 400, Loss: 4.955\n","Step 500, Loss: 4.613\n","Step 600, Loss: 4.793\n","Step 700, Loss: 4.608\n","Epoch 2 Loss: 4.749\n","Step 0, Loss: 4.476\n","Step 100, Loss: 4.460\n","Step 200, Loss: 4.573\n","Step 300, Loss: 4.310\n","Step 400, Loss: 4.169\n","Step 500, Loss: 4.296\n","Step 600, Loss: 4.434\n","Step 700, Loss: 4.135\n","Epoch 3 Loss: 4.344\n","Step 0, Loss: 4.275\n","Step 100, Loss: 4.253\n","Step 200, Loss: 4.010\n","Step 300, Loss: 4.083\n","Step 400, Loss: 4.119\n","Step 500, Loss: 3.813\n","Step 600, Loss: 3.818\n","Step 700, Loss: 4.239\n","Epoch 4 Loss: 4.054\n","Step 0, Loss: 3.653\n","Step 100, Loss: 3.975\n","Step 200, Loss: 3.831\n","Step 300, Loss: 3.881\n","Step 400, Loss: 3.946\n","Step 500, Loss: 3.686\n","Step 600, Loss: 3.760\n","Step 700, Loss: 3.940\n","Epoch 5 Loss: 3.835\n","Step 0, Loss: 3.656\n","Step 100, Loss: 3.804\n","Step 200, Loss: 3.880\n","Step 300, Loss: 3.327\n","Step 400, Loss: 3.826\n","Step 500, Loss: 3.483\n","Step 600, Loss: 3.967\n","Step 700, Loss: 3.605\n","Epoch 6 Loss: 3.656\n","Step 0, Loss: 3.575\n","Step 100, Loss: 3.515\n","Step 200, Loss: 3.743\n","Step 300, Loss: 3.231\n","Step 400, Loss: 3.877\n","Step 500, Loss: 3.325\n","Step 600, Loss: 3.680\n","Step 700, Loss: 3.678\n","Epoch 7 Loss: 3.509\n","Step 0, Loss: 3.627\n","Step 100, Loss: 3.427\n","Step 200, Loss: 3.491\n","Step 300, Loss: 3.302\n","Step 400, Loss: 3.599\n","Step 500, Loss: 3.448\n","Step 600, Loss: 3.703\n","Step 700, Loss: 3.467\n","Epoch 8 
Loss: 3.385\n","Step 0, Loss: 3.238\n","Step 100, Loss: 3.243\n","Step 200, Loss: 3.421\n","Step 300, Loss: 3.424\n","Step 400, Loss: 3.290\n","Step 500, Loss: 3.273\n","Step 600, Loss: 3.180\n","Step 700, Loss: 3.268\n","Epoch 9 Loss: 3.281\n","Step 0, Loss: 2.975\n","Step 100, Loss: 3.293\n","Step 200, Loss: 3.388\n","Step 300, Loss: 3.083\n","Step 400, Loss: 3.350\n","Step 500, Loss: 3.231\n","Step 600, Loss: 3.281\n","Step 700, Loss: 3.309\n","Epoch 10 Loss: 3.185\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_hi.pt')"],"id":"pnz6_UXcZ3AK"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5hFYBEDwZ3AK","executionInfo":{"status":"ok","timestamp":1770440101446,"user_tz":-420,"elapsed":5,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"4ccea3ac-129a-4251-bc70-103959c241c5"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_hi.pt', 
'app/models/transformer_model_hi.pt')\n","shutil.copy('spm_hi.model', 'app/models/spm_hi.model')\n","shutil.copy('spm_en_hi.model', 'app/models/spm_en_hi.model')\n","print(\"Models copied to app/models/\")"],"id":"5hFYBEDwZ3AK"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"437663195c6746bfa6b2db60708da4d8":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_a375904851a741bbb6e80903c99c2336","IPY_MODEL_be9b257d8a9d4d6a858f42fb7bad7487","IPY_MODEL_167b27b305a54e45a83c90f2a2234fe4"],"layout":"IPY_MODEL_b482bc3c5ab64217acf38b0c58733c81"}},"a375904851a741bbb6e80903c99c2336":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_420b6bb329e143dcb4e22ad5040a10fd","placeholder":"​","style":"IPY_MODEL_d49f13205a5e43efa4850f6b108ee8e2","value":"README.md: 
"}},"be9b257d8a9d4d6a858f42fb7bad7487":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_623d17818c9d493c9bd05718026e5300","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_85d4dfeee47d426284e811b81df6c154","value":1}},"167b27b305a54e45a83c90f2a2234fe4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_47267a7b0d9b4f56a159e9e9ec2d95ab","placeholder":"​","style":"IPY_MODEL_1d6bd8dfa7404740b2ae38e0adbdf547","value":" 65.4k/? 
[00:00&lt;00:00, 5.70MB/s]"}},"b482bc3c5ab64217acf38b0c58733c81":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"420b6bb329e143dcb4e22ad5040a10fd":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"o
verflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d49f13205a5e43efa4850f6b108ee8e2":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"623d17818c9d493c9bd05718026e5300":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"85d4dfeee47d426284e811b81df6c154":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"47267a7b0d9b4f56a159e9e9
ec2d95ab":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1d6bd8dfa7404740b2ae38e0adbdf547":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"05fffb9917f14ca29764c15592faa90c":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_941646a2437140558463790ec6888a92","IPY_MODEL_1f6dc7ea27824dde9f6c3b2c0e907bdc","IPY_MODEL_ff23a58b77e940389a763baa33ff99e0"],"layout":"IPY_MODEL_08af38a9022f4e
a897e5f7731ea0ff2b"}},"941646a2437140558463790ec6888a92":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_fd231f52742b41f1974c3a967bc76cb4","placeholder":"​","style":"IPY_MODEL_2b05431acf894cf7ad2bc3d4ea50021e","value":"en-hi/test-00000-of-00001.parquet: 100%"}},"1f6dc7ea27824dde9f6c3b2c0e907bdc":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c1e61972c72747559d111d3d80291215","max":259276,"min":0,"orientation":"horizontal","style":"IPY_MODEL_17a2ed8d6e744dcaa7ba40e80249b52c","value":259276}},"ff23a58b77e940389a763baa33ff99e0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_de5b3713b323434bbc8139e7bd4475d4","placeholder":"​","style":"IPY_MODEL_cfca09a62db84b62883d08e3b56410b3","value":" 259k/259k [00:00&lt;00:00, 
306kB/s]"}},"08af38a9022f4ea897e5f7731ea0ff2b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fd231f52742b41f1974c3a967bc76cb4":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2b05431acf894cf7ad2bc3d4ea50021e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c1e61972c72747559d111d3d80291215":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"17a2ed8d6e744dcaa7ba40e80249b52c":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"de5b3713b323434bbc8139e7bd4475d4":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cfca09a62db84b62883d08e3b56410b3":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"7959c7c2ba4546f99fc076393c7fb7ad":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_c120b868eb674b529ae35262dbc4b612","IPY_MODEL_a6880c136872473fb42ec08c14c95fcd","IPY_MODEL_175954c78c194b84a1e154809b6b392b"],"layout":"IPY_MODEL_f6cfe7df507a4f44a8cd2b15c0278e17"}
},"c120b868eb674b529ae35262dbc4b612":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c9e2176f57b347af893ec0f111f4cb8a","placeholder":"​","style":"IPY_MODEL_7bf9d2ea798d481ea14ac5d3a0c29ac9","value":"en-hi/train-00000-of-00001.parquet: 100%"}},"a6880c136872473fb42ec08c14c95fcd":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_cfed5d010f8f4c93a6b73f6233cf42ee","max":65219235,"min":0,"orientation":"horizontal","style":"IPY_MODEL_036749fb7a9144f0aed04aa1c43947a2","value":65219235}},"175954c78c194b84a1e154809b6b392b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0e16fefc8e4743c482c69ad85e3cb4a1","placeholder":"​","style":"IPY_MODEL_7e84d743dbd941ad84f8c4705bb888fd","value":" 65.2M/65.2M [00:01&lt;00:00, 
61.5MB/s]"}},"f6cfe7df507a4f44a8cd2b15c0278e17":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c9e2176f57b347af893ec0f111f4cb8a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7bf9d2ea798d481ea14ac5d3a0c29ac9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"cfed5d010f8f4c93a6b73f6233cf42ee":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"036749fb7a9144f0aed04aa1c43947a2":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"0e16fefc8e4743c482c69ad85e3cb4a1":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7e84d743dbd941ad84f8c4705bb888fd":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"57fa0730048b43fda0a54d0678742cf4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ce72a413eef148b5aae9c13eb5deb512","IPY_MODEL_185515355670429db54e06afbe48576c","IPY_MODEL_0e35c8f7b4864d7fa12b007fd3abc685"],"layout":"IPY_MODEL_c78fb971e0334c808d11127e07e5276a"
}},"ce72a413eef148b5aae9c13eb5deb512":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c459964cfdba4ed18a3681c85a50ecea","placeholder":"​","style":"IPY_MODEL_6bc79f1299d44fed9863f2dff949d404","value":"en-hi/validation-00000-of-00001.parquet: 100%"}},"185515355670429db54e06afbe48576c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e130efc4cd6f433d970148cf564eba8e","max":247375,"min":0,"orientation":"horizontal","style":"IPY_MODEL_627f0732315d46a79fd312f74fac1444","value":247375}},"0e35c8f7b4864d7fa12b007fd3abc685":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_25e17a4fd8464f6e9fe79239d617c5e2","placeholder":"​","style":"IPY_MODEL_fd8c71a0100b4f14950390fc32b38c7f","value":" 247k/247k [00:00&lt;00:00, 
254kB/s]"}},"c78fb971e0334c808d11127e07e5276a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c459964cfdba4ed18a3681c85a50ecea":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6bc79f1299d44fed9863f2dff949d404":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e130efc4cd6f433d970148cf564eba8e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"627f0732315d46a79fd312f74fac1444":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"25e17a4fd8464f6e9fe79239d617c5e2":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fd8c71a0100b4f14950390fc32b38c7f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"87d8cfdd9065475883f445720b23a6bf":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_14f158d8f9c04bd3aa25fb4b69d534e4","IPY_MODEL_6dba2606e98e41d98b6073eae4de3dcb","IPY_MODEL_30e24c52f4554996b0ede06884b787ac"],"layout":"IPY_MODEL_eec36cab181c4d688abbecb798bb1580"}
},"14f158d8f9c04bd3aa25fb4b69d534e4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3f953ef6b50141c8b189e800a39019cd","placeholder":"​","style":"IPY_MODEL_5bcc54add4494207938d942f0766282e","value":"Generating test split: 100%"}},"6dba2606e98e41d98b6073eae4de3dcb":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_920f33e7188242e384ad94097565e05a","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_4747b9bf4e4641469db631484997deee","value":2000}},"30e24c52f4554996b0ede06884b787ac":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d78253da13ca495492d8336b00206f28","placeholder":"​","style":"IPY_MODEL_58c4a5875abe407ab60d151bfa3bc113","value":" 2000/2000 [00:00&lt;00:00, 35619.05 
examples/s]"}},"eec36cab181c4d688abbecb798bb1580":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3f953ef6b50141c8b189e800a39019cd":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5bcc54add4494207938d942f0766282e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"920f33e7188242e384ad94097565e05a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4747b9bf4e4641469db631484997deee":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"d78253da13ca495492d8336b00206f28":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"58c4a5875abe407ab60d151bfa3bc113":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a26303b712724c198e16438af88f5e40":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_13ba2d7a2400443697abdd25caef84f0","IPY_MODEL_db4aec2ebecf426b9aa74b3566cc9dc5","IPY_MODEL_8195422ace0e4f55a67ed28ef61eb0d0"],"layout":"IPY_MODEL_bc1f687387324ee29f237d34d73a2b1
e"}},"13ba2d7a2400443697abdd25caef84f0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_77d1ede3a5d4489a85adab40a8e2c69d","placeholder":"​","style":"IPY_MODEL_888d651e83124fc0a95362127dac9ad9","value":"Generating train split: 100%"}},"db4aec2ebecf426b9aa74b3566cc9dc5":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c10b9c6e22fa46108381e197462e3a43","max":534319,"min":0,"orientation":"horizontal","style":"IPY_MODEL_df9f4045c34b43459aa2e8014cd55bd6","value":534319}},"8195422ace0e4f55a67ed28ef61eb0d0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9056e3ef7a9d419b83b076bc0104715c","placeholder":"​","style":"IPY_MODEL_fe7d345e7d684c4a930151492bb9b006","value":" 534319/534319 [00:00&lt;00:00, 677541.02 
examples/s]"}},"bc1f687387324ee29f237d34d73a2b1e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"77d1ede3a5d4489a85adab40a8e2c69d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"888d651e83124fc0a95362127dac9ad9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c10b9c6e22fa46108381e197462e3a43":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"df9f4045c34b43459aa2e8014cd55bd6":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"9056e3ef7a9d419b83b076bc0104715c":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fe7d345e7d684c4a930151492bb9b006":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"aa1492921e9a48f499ebba69b3d38a2c":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_4f565fea90d84b47bd688a3ccfe63253","IPY_MODEL_7264a7929d064ea696a8cb036b5e6799","IPY_MODEL_e790ca616e094758baee332d0f6b2f24"],"layout":"IPY_MODEL_4b05110b780a46c787ada479b565504
6"}},"4f565fea90d84b47bd688a3ccfe63253":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f36dd1d4a8c34b9c94bc71888dc0bd60","placeholder":"​","style":"IPY_MODEL_abe69424975a4885960fd1c62521f7ac","value":"Generating validation split: 100%"}},"7264a7929d064ea696a8cb036b5e6799":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_69d7fff4b4374aa982552bc8c8fe5d76","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_283b912911d3408a95e453560834b8af","value":2000}},"e790ca616e094758baee332d0f6b2f24":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_65f16bfd9e634276857c70337aca6293","placeholder":"​","style":"IPY_MODEL_db3f39cf20304a90b1824f7f135955e4","value":" 2000/2000 [00:00&lt;00:00, 77309.37 
examples/s]"}},"4b05110b780a46c787ada479b5655046":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f36dd1d4a8c34b9c94bc71888dc0bd60":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"abe69424975a4885960fd1c62521f7ac":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"69d7fff4b4374aa982552bc8c8fe5d76":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"283b912911d3408a95e453560834b8af":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"65f16bfd9e634276857c70337aca6293":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"db3f39cf20304a90b1824f7f135955e4":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
Kazakh_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"jQEigq0G8cJe"},"source":["# Kazakh-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Kazakh (kk) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Kazakh-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"jQEigq0G8cJe"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"qrrrFeD88cJg","executionInfo":{"status":"ok","timestamp":1770448390827,"user_tz":-420,"elapsed":7624,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"5b9eebe3-acd4-4645-9f7f-e335477b765f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"qrrrFeD88cJg"},{"cell_type":"markdown","metadata":{"id":"j93FwrdJ8cJh"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Kazakh-English pairs from Opus-100."],"id":"j93FwrdJ8cJh"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":559,"referenced_widgets":["6011e696e7134747ad1da29bac8b1fa3","a08334f3baa04417b29d2790f5200899","444138cf68e54c058b3dc517c859d315","03973f3d8fb749d8b447ecb4d8ab925f","2d06b58cbc4d4b3592686575ce8cfe0e","a801924070514530861ec37751863a11","6b55ec48eb544e7fa890a413b3a8a618","e558c7a282124a679d8d1ba66479ae68","480f629eb65344acba595da2adaca5e7","144b6a863d624d0b8f2bc9215e20ecee","f844a6d12a444fee9211e3231700a3b1","2d380a9519d44a748906f7855a8347fc","6f549dace4d242feb8f71d4879ce8c33","de81adc631754b92af67d75251926253","5565e5832d2b42cd94b5d509a07f49d1","c699fc7066e34809bf0fe4ff22ef85bb","9b04dfd472eb4f4e84fa23becb13c5d9","f04429d45a89499b902c4f9825d58b9f","f4b88337dba742afbf965c460579138b","e625835b3492431ea81d0c7f932c9a6d","1f0a23dbc0f744ca931e53cc4377a060","c94bca4b10d14e8088ac983842a16f3e","05f5d68df2c147ad9be40ec37d86a831","24e5962f9db1449e986f1016f413b76e","4fa72adbab5c49e3908578ccd3d15eae","a4385800086d491189765a09829f7165","0f6b85ba92ee4c1395d38a8efeb0d1f2","1155a892b40744158325236140ac0b19","5e699f444ce24d21ac3894941dc75149","6b83449e4d51445185da2df7352868d0","8c9bcd03a0964d69bd61c25e4f881f59","b29f31340d9d44bdab54175cec8a6c95","fd20f577ffd94493b7073e04bf0f3a07","1abdf7ab22f6466da2c66cede3b15ff5","11d1aeeaf1984725b9f1d2219156dcb6","6570d5bc161b4c8698de14ed300dafea","adbb70135fee468293e6f3ea2ef56beb","6a4a3be9cbc4493191a892c573582ba3","84c2ec1f709c47a7b8faa1019893d4e4","0a99e474c195480898b2c35cd127d557","2537324c324b4c60a745c2ff78e15fa0","5e119fe24031446c94d6fcf4f95fdf3d","8b57b2ae0bf842c3af36a18b62d9e015","b69e8d52115a4ddcb17b03cc6f64eb09","941445b90fc04904b6648a5e3e3fa245","844642fe3adb4f01a07d7d07ba4798bc","e068281ab48043489289c1ab2f92473e","045011b0c96741dfada3a09a0524b0d4","fd7883f2f8d349db891709dc3101a16d","b66b9e6354584286a9f1f0f4b87d1fb3","c552b2ba8f5f4726a7939
1e229136a83","76f2c7cbd0c74af9ac85a27d85f9ea06","aaf549f0289f408c897035bd75e5c305","85bb759b75404ac587c5282b284b8d3a","aa449e4c8bea4d559b4f060f69ae4c25","a62dd6045c8c4743bb389a17b8788e73","01ee8d04a6194f8f83fb81212c3d1a10","318be1fd72314dc28b62f265935318bf","db51334d90a34a6fa253d12c7e50e54c","e43487b0dcbb4ec0ae8af912f5535074","71db91520a20457f96a0e8ce83ba2f61","53e0eb9505e14e52a9aaa3ad3d92eb7f","98a4e533d68f4aaa8bd5e556d967ca2c","25aa3f2b14d34a95a10d65099f044490","2c1449b36e254ad3ae9ebf521ba9c06c","048b2b884d594f44a627508114bb2653","edbe3f5f53cc490aac30f8c2a6e7cfe2","61e376e936464743be2c0882861bfeae","3133af8ca0b9442aa0a863a82a0efc90","c2f229cd45bf4dbf86efe061f43010e0","9fb9e54ec0544cb5bf30fe6286ece8c4","074a6c4bedd24a379b59da7c284de342","29618afe42234e1fa1e23d6ca3c9a9bf","c6986d4b832c45fe97b5a663309067de","8485fbb850314eee9e0df683a18d1497","a52f65446e6241e18b5011ef9c55865c","15ba674f577b4559a3ceb2129022bc9c"]},"id":"vEBAsFtF8cJh","executionInfo":{"status":"ok","timestamp":1770448401881,"user_tz":-420,"elapsed":11051,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"5273a0e1-2b4d-4adb-c853-7175a3129406"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Kazakh-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6011e696e7134747ad1da29bac8b1fa3"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-kk/test-00000-of-00001.parquet: 0%| | 0.00/84.1k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"2d380a9519d44a748906f7855a8347fc"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-kk/train-00000-of-00001.parquet: 0%| | 0.00/4.64M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"05f5d68df2c147ad9be40ec37d86a831"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["en-kk/validation-00000-of-00001.parquet: 0%| | 0.00/83.1k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1abdf7ab22f6466da2c66cede3b15ff5"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"941445b90fc04904b6648a5e3e3fa245"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/79927 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a62dd6045c8c4743bb389a17b8788e73"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"edbe3f5f53cc490aac30f8c2a6e7cfe2"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 83927 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 Kazakh-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (Kazakh-English)...\")\n","try:\n"," # Opus-100 has 'en-kk'\n"," dataset = load_dataset(\"opus100\", \"en-kk\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," # 'kk' is the language code for Kazakh\n"," if 'kk' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'kk': item['translation']['kk'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size for this project\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for efficiency.\")\n","\n"," print(f\"Extracted {len(data)} Kazakh-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"vEBAsFtF8cJh"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Zp1IkPgg8cJh","executionInfo":{"status":"ok","timestamp":1770448402080,"user_tz":-420,"elapsed":171,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"d19b5f23-8cd8-443b-c2b6-ba8b8dc7c0a9"},"outputs":[{"output_type":"stream","name":"stdout","text":[" kk \\\n","0 Соломон ар- ыName \n","1 Хабарламаның өлшемі ішкі буфердің өлшемінен ас... \n","2 Астероидтарды жүктеу \n","3 unit description in lists \n","4 Ұяшыққа енгізген мәтіннің алғашқы әрібі автома... \n","\n"," en \n","0 Solomon Islands \n","1 The connection is broken. 
\n","2 Loading asteroids \n","3 ds \n","4 Check this box and the first letter of any tex... \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['kk', 'en'])\n","df['kk'] = df['kk'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['kk'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"Zp1IkPgg8cJh"},{"cell_type":"markdown","metadata":{"id":"QAJ-uFQ48cJi"},"source":["## 3. Tokenization"],"id":"QAJ-uFQ48cJi"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"j6XTdYn78cJi","executionInfo":{"status":"ok","timestamp":1770448405527,"user_tz":-420,"elapsed":3439,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"07c0e052-8205-480a-ac09-62d21a32896d"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Kazakh Tokenizer...\n","Training English Tokenizer (for Kazakh pair)...\n"]}],"source":["# Save texts to files\n","with open('train_kk.txt', 'w', encoding='utf-8') as f:\n"," for line in df['kk']: f.write(line + '\\n')\n","\n","with open('train_en_kk.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Kazakh Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_kk.txt',\n"," model_prefix='spm_kk',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Kazakh pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_kk.txt',\n"," model_prefix='spm_en_kk',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_kk.model')\n","sp_trg = 
spm.SentencePieceProcessor(model_file='spm_en_kk.model')"],"id":"j6XTdYn78cJi"},{"cell_type":"markdown","metadata":{"id":"vFF66mIP8cJi"},"source":["## 4. Dataset & Model"],"id":"vFF66mIP8cJi"},{"cell_type":"code","execution_count":5,"metadata":{"id":"kgduE8vf8cJi","executionInfo":{"status":"ok","timestamp":1770448405556,"user_tz":-420,"elapsed":27,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['kk']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"kgduE8vf8cJi"},{"cell_type":"code","execution_count":6,"metadata":{"id":"7cWen2oV8cJj","executionInfo":{"status":"ok","timestamp":1770448405577,"user_tz":-420,"elapsed":20,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = 
nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return 
self.fc_out(output)"],"id":"7cWen2oV8cJj"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"HVLW5dij8cJj","executionInfo":{"status":"ok","timestamp":1770448935027,"user_tz":-420,"elapsed":529443,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"1cce1192-351f-4296-b4a0-223b043ad008"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting Training...\n","Step 0, Loss: 9.075\n","Step 100, Loss: 6.556\n","Step 200, Loss: 6.369\n","Step 300, Loss: 6.011\n","Step 400, Loss: 5.706\n","Step 500, Loss: 6.181\n","Step 600, Loss: 5.752\n","Step 700, Loss: 5.447\n","Epoch 1 Loss: 5.964\n","Step 0, Loss: 5.384\n","Step 100, Loss: 5.231\n","Step 200, Loss: 5.161\n","Step 300, Loss: 4.940\n","Step 400, Loss: 4.740\n","Step 500, Loss: 4.543\n","Step 600, Loss: 4.678\n","Step 700, Loss: 4.400\n","Epoch 2 Loss: 4.925\n","Step 0, Loss: 3.883\n","Step 100, Loss: 4.995\n","Step 200, Loss: 4.852\n","Step 300, Loss: 4.444\n","Step 400, Loss: 4.736\n","Step 500, Loss: 4.302\n","Step 600, Loss: 4.597\n","Step 700, Loss: 4.315\n","Epoch 3 Loss: 4.290\n","Step 0, Loss: 3.856\n","Step 100, Loss: 3.346\n","Step 200, Loss: 3.560\n","Step 300, Loss: 3.536\n","Step 400, Loss: 3.675\n","Step 500, Loss: 3.650\n","Step 600, Loss: 3.788\n","Step 700, Loss: 3.986\n","Epoch 4 Loss: 3.795\n","Step 0, Loss: 3.339\n","Step 100, Loss: 3.281\n","Step 200, Loss: 3.356\n","Step 300, Loss: 3.740\n","Step 400, Loss: 3.298\n","Step 500, Loss: 3.310\n","Step 600, Loss: 3.430\n","Step 700, Loss: 3.159\n","Epoch 5 Loss: 3.421\n","Step 0, Loss: 3.679\n","Step 100, Loss: 3.330\n","Step 200, Loss: 3.167\n","Step 300, Loss: 3.028\n","Step 400, Loss: 3.278\n","Step 500, Loss: 2.776\n","Step 600, Loss: 3.332\n","Step 700, Loss: 3.037\n","Epoch 6 Loss: 3.130\n","Step 0, Loss: 3.069\n","Step 100, Loss: 2.532\n","Step 200, Loss: 3.826\n","Step 300, Loss: 3.024\n","Step 400, Loss: 2.467\n","Step 500, Loss: 3.012\n","Step 600, 
Loss: 3.027\n","Step 700, Loss: 2.637\n","Epoch 7 Loss: 2.898\n","Step 0, Loss: 1.940\n","Step 100, Loss: 2.974\n","Step 200, Loss: 2.921\n","Step 300, Loss: 2.778\n","Step 400, Loss: 2.822\n","Step 500, Loss: 2.947\n","Step 600, Loss: 2.785\n","Step 700, Loss: 2.522\n","Epoch 8 Loss: 2.705\n","Step 0, Loss: 2.080\n","Step 100, Loss: 2.026\n","Step 200, Loss: 2.744\n","Step 300, Loss: 3.248\n","Step 400, Loss: 2.722\n","Step 500, Loss: 2.287\n","Step 600, Loss: 2.520\n","Step 700, Loss: 2.851\n","Epoch 9 Loss: 2.545\n","Step 0, Loss: 1.841\n","Step 100, Loss: 2.176\n","Step 200, Loss: 2.319\n","Step 300, Loss: 2.149\n","Step 400, Loss: 2.668\n","Step 500, Loss: 2.202\n","Step 600, Loss: 2.811\n","Step 700, Loss: 2.248\n","Epoch 10 Loss: 2.406\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo (Opus-100 is large)\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_kk.pt')"],"id":"HVLW5dij8cJj"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cdxw4XsM8cJj","executionInfo":{"status":"ok","timestamp":1770448935067,"user_tz":-420,"elapsed":37,"user":{"displayName":"Htut Ko 
Ko","userId":"13068088192988605156"}},"outputId":"eca2a396-45a9-4cd7-c420-3003e19300d3"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_kk.pt', 'app/models/transformer_model_kk.pt')\n","shutil.copy('spm_kk.model', 'app/models/spm_kk.model')\n","shutil.copy('spm_en_kk.model', 'app/models/spm_en_kk.model')\n","print(\"Models copied to app/models/\")"],"id":"cdxw4XsM8cJj"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"6011e696e7134747ad1da29bac8b1fa3":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_a08334f3baa04417b29d2790f5200899","IPY_MODEL_444138cf68e54c058b3dc517c859d315","IPY_MODEL_03973f3d8fb749d8b447ecb4d8ab925f"],"layout":"IPY_MODEL_2d06b58cbc4d4b3592686575ce8cfe0e"}},"a08334f3baa04417b29d2790f5200899":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a801924070514
530861ec37751863a11","placeholder":"​","style":"IPY_MODEL_6b55ec48eb544e7fa890a413b3a8a618","value":"README.md: "}},"444138cf68e54c058b3dc517c859d315":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e558c7a282124a679d8d1ba66479ae68","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_480f629eb65344acba595da2adaca5e7","value":1}},"03973f3d8fb749d8b447ecb4d8ab925f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_144b6a863d624d0b8f2bc9215e20ecee","placeholder":"​","style":"IPY_MODEL_f844a6d12a444fee9211e3231700a3b1","value":" 65.4k/? 
[00:00&lt;00:00, 5.22MB/s]"}},"2d06b58cbc4d4b3592686575ce8cfe0e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a801924070514530861ec37751863a11":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"o
verflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6b55ec48eb544e7fa890a413b3a8a618":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e558c7a282124a679d8d1ba66479ae68":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"480f629eb65344acba595da2adaca5e7":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"144b6a863d624d0b8f2bc921
5e20ecee":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f844a6d12a444fee9211e3231700a3b1":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2d380a9519d44a748906f7855a8347fc":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_6f549dace4d242feb8f71d4879ce8c33","IPY_MODEL_de81adc631754b92af67d75251926253","IPY_MODEL_5565e5832d2b42cd94b5d509a07f49d1"],"layout":"IPY_MODEL_c699fc7066e348
09bf0fe4ff22ef85bb"}},"6f549dace4d242feb8f71d4879ce8c33":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9b04dfd472eb4f4e84fa23becb13c5d9","placeholder":"​","style":"IPY_MODEL_f04429d45a89499b902c4f9825d58b9f","value":"en-kk/test-00000-of-00001.parquet: 100%"}},"de81adc631754b92af67d75251926253":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f4b88337dba742afbf965c460579138b","max":84062,"min":0,"orientation":"horizontal","style":"IPY_MODEL_e625835b3492431ea81d0c7f932c9a6d","value":84062}},"5565e5832d2b42cd94b5d509a07f49d1":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1f0a23dbc0f744ca931e53cc4377a060","placeholder":"​","style":"IPY_MODEL_c94bca4b10d14e8088ac983842a16f3e","value":" 84.1k/84.1k [00:00&lt;00:00, 
96.4kB/s]"}},"c699fc7066e34809bf0fe4ff22ef85bb":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"9b04dfd472eb4f4e84fa23becb13c5d9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f04429d45a89499b902c4f9825d58b9f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f4b88337dba742afbf965c460579138b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e625835b3492431ea81d0c7f932c9a6d":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"1f0a23dbc0f744ca931e53cc4377a060":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c94bca4b10d14e8088ac983842a16f3e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"05f5d68df2c147ad9be40ec37d86a831":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_24e5962f9db1449e986f1016f413b76e","IPY_MODEL_4fa72adbab5c49e3908578ccd3d15eae","IPY_MODEL_a4385800086d491189765a09829f7165"],"layout":"IPY_MODEL_0f6b85ba92ee4c1395d38a8efeb0d1f2"
}},"24e5962f9db1449e986f1016f413b76e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1155a892b40744158325236140ac0b19","placeholder":"​","style":"IPY_MODEL_5e699f444ce24d21ac3894941dc75149","value":"en-kk/train-00000-of-00001.parquet: 100%"}},"4fa72adbab5c49e3908578ccd3d15eae":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_6b83449e4d51445185da2df7352868d0","max":4641227,"min":0,"orientation":"horizontal","style":"IPY_MODEL_8c9bcd03a0964d69bd61c25e4f881f59","value":4641227}},"a4385800086d491189765a09829f7165":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b29f31340d9d44bdab54175cec8a6c95","placeholder":"​","style":"IPY_MODEL_fd20f577ffd94493b7073e04bf0f3a07","value":" 4.64M/4.64M [00:01&lt;00:00, 
5.63MB/s]"}},"0f6b85ba92ee4c1395d38a8efeb0d1f2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1155a892b40744158325236140ac0b19":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5e699f444ce24d21ac3894941dc75149":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6b83449e4d51445185da2df7352868d0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8c9bcd03a0964d69bd61c25e4f881f59":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"b29f31340d9d44bdab54175cec8a6c95":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fd20f577ffd94493b7073e04bf0f3a07":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1abdf7ab22f6466da2c66cede3b15ff5":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_11d1aeeaf1984725b9f1d2219156dcb6","IPY_MODEL_6570d5bc161b4c8698de14ed300dafea","IPY_MODEL_adbb70135fee468293e6f3ea2ef56beb"],"layout":"IPY_MODEL_6a4a3be9cbc4493191a892c573582ba3"
}},"11d1aeeaf1984725b9f1d2219156dcb6":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_84c2ec1f709c47a7b8faa1019893d4e4","placeholder":"​","style":"IPY_MODEL_0a99e474c195480898b2c35cd127d557","value":"en-kk/validation-00000-of-00001.parquet: 100%"}},"6570d5bc161b4c8698de14ed300dafea":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_2537324c324b4c60a745c2ff78e15fa0","max":83071,"min":0,"orientation":"horizontal","style":"IPY_MODEL_5e119fe24031446c94d6fcf4f95fdf3d","value":83071}},"adbb70135fee468293e6f3ea2ef56beb":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8b57b2ae0bf842c3af36a18b62d9e015","placeholder":"​","style":"IPY_MODEL_b69e8d52115a4ddcb17b03cc6f64eb09","value":" 83.1k/83.1k [00:00&lt;00:00, 
165kB/s]"}},"6a4a3be9cbc4493191a892c573582ba3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"84c2ec1f709c47a7b8faa1019893d4e4":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0a99e474c195480898b2c35cd127d557":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2537324c324b4c60a745c2ff78e15fa0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5e119fe24031446c94d6fcf4f95fdf3d":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"8b57b2ae0bf842c3af36a18b62d9e015":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b69e8d52115a4ddcb17b03cc6f64eb09":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"941445b90fc04904b6648a5e3e3fa245":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_844642fe3adb4f01a07d7d07ba4798bc","IPY_MODEL_e068281ab48043489289c1ab2f92473e","IPY_MODEL_045011b0c96741dfada3a09a0524b0d4"],"layout":"IPY_MODEL_fd7883f2f8d349db891709dc3101a16d"}
},"844642fe3adb4f01a07d7d07ba4798bc":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b66b9e6354584286a9f1f0f4b87d1fb3","placeholder":"​","style":"IPY_MODEL_c552b2ba8f5f4726a79391e229136a83","value":"Generating test split: 100%"}},"e068281ab48043489289c1ab2f92473e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_76f2c7cbd0c74af9ac85a27d85f9ea06","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_aaf549f0289f408c897035bd75e5c305","value":2000}},"045011b0c96741dfada3a09a0524b0d4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_85bb759b75404ac587c5282b284b8d3a","placeholder":"​","style":"IPY_MODEL_aa449e4c8bea4d559b4f060f69ae4c25","value":" 2000/2000 [00:00&lt;00:00, 41502.08 
examples/s]"}},"fd7883f2f8d349db891709dc3101a16d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b66b9e6354584286a9f1f0f4b87d1fb3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c552b2ba8f5f4726a79391e229136a83":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"76f2c7cbd0c74af9ac85a27d85f9ea06":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"aaf549f0289f408c897035bd75e5c305":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"85bb759b75404ac587c5282b284b8d3a":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"aa449e4c8bea4d559b4f060f69ae4c25":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a62dd6045c8c4743bb389a17b8788e73":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_01ee8d04a6194f8f83fb81212c3d1a10","IPY_MODEL_318be1fd72314dc28b62f265935318bf","IPY_MODEL_db51334d90a34a6fa253d12c7e50e54c"],"layout":"IPY_MODEL_e43487b0dcbb4ec0ae8af912f553507
4"}},"01ee8d04a6194f8f83fb81212c3d1a10":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_71db91520a20457f96a0e8ce83ba2f61","placeholder":"​","style":"IPY_MODEL_53e0eb9505e14e52a9aaa3ad3d92eb7f","value":"Generating train split: 100%"}},"318be1fd72314dc28b62f265935318bf":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_98a4e533d68f4aaa8bd5e556d967ca2c","max":79927,"min":0,"orientation":"horizontal","style":"IPY_MODEL_25aa3f2b14d34a95a10d65099f044490","value":79927}},"db51334d90a34a6fa253d12c7e50e54c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2c1449b36e254ad3ae9ebf521ba9c06c","placeholder":"​","style":"IPY_MODEL_048b2b884d594f44a627508114bb2653","value":" 79927/79927 [00:00&lt;00:00, 918229.20 
examples/s]"}},"e43487b0dcbb4ec0ae8af912f5535074":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"71db91520a20457f96a0e8ce83ba2f61":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"53e0eb9505e14e52a9aaa3ad3d92eb7f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"98a4e533d68f4aaa8bd5e556d967ca2c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"25aa3f2b14d34a95a10d65099f044490":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"2c1449b36e254ad3ae9ebf521ba9c06c":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"048b2b884d594f44a627508114bb2653":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"edbe3f5f53cc490aac30f8c2a6e7cfe2":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_61e376e936464743be2c0882861bfeae","IPY_MODEL_3133af8ca0b9442aa0a863a82a0efc90","IPY_MODEL_c2f229cd45bf4dbf86efe061f43010e0"],"layout":"IPY_MODEL_9fb9e54ec0544cb5bf30fe6286ece8c
4"}},"61e376e936464743be2c0882861bfeae":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_074a6c4bedd24a379b59da7c284de342","placeholder":"​","style":"IPY_MODEL_29618afe42234e1fa1e23d6ca3c9a9bf","value":"Generating validation split: 100%"}},"3133af8ca0b9442aa0a863a82a0efc90":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c6986d4b832c45fe97b5a663309067de","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_8485fbb850314eee9e0df683a18d1497","value":2000}},"c2f229cd45bf4dbf86efe061f43010e0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a52f65446e6241e18b5011ef9c55865c","placeholder":"​","style":"IPY_MODEL_15ba674f577b4559a3ceb2129022bc9c","value":" 2000/2000 [00:00&lt;00:00, 130939.02 
examples/s]"}},"9fb9e54ec0544cb5bf30fe6286ece8c4":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"074a6c4bedd24a379b59da7c284de342":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"29618afe42234e1fa1e23d6ca3c9a9bf":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c6986d4b832c45fe97b5a663309067de":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8485fbb850314eee9e0df683a18d1497":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a52f65446e6241e18b5011ef9c55865c":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"15ba674f577b4559a3ceb2129022bc9c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
Nepali_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"LbVGaOODddTz"},"source":["# Nepali-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Nepali (ne) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Nepali-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"LbVGaOODddTz"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yZ7JcWyrddT1","executionInfo":{"status":"ok","timestamp":1770440337344,"user_tz":-420,"elapsed":5458,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"335f12ce-bd5b-4992-c358-b8185ac2a1b9"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"yZ7JcWyrddT1"},{"cell_type":"markdown","metadata":{"id":"u7XmZQkZddT2"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Nepali-English pairs from Opus-100."],"id":"u7XmZQkZddT2"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":659,"referenced_widgets":["a41c32e2052d441c841f4617448813c2","7ac0d88a4d9f4e0bb1d87cd80430b1e9","ee706053eaae4069a05db7e2f56ba69d","78ece472f8ea40459ea595513f5240e2","ff66d13a55e84797b188d1382bf50091","d6e1511fb8844085beb80f8bd2930ae6","cdaa2216ab1040fda24528b555b4ab0a","06740d7f569f4113a012c128edb51057","627d1d072e794b2ca7b36bf6b77eaec8","94b1dccd601f48eb939d3b70f7c3cd01","8026c53a8c2b406d9e231f528535b918","ea79c3477e64407ab92b59495a9fe7f7","0e102f99679e415baf15276596dd8f48","7cfbc4d2a5ba482bb82bf34513014bb7","6b5b9da87832435aa93fc0b35cafe144","8b40b8060e6d4beaa89aa03b5da350d8","2957929a0a594134a39e9f50972c95f5","b0b0dbef7bbf49348b38ae4135c0325f","2ac916d608ba461ea5cc41e742fc9050","8d4bd45404ea4e1d92a93500be7ecc83","a29fbfb99d584fb1a0d11f43c87d71aa","c1a20886f8824428bb20e5d6d90402dc","a55c8ff37bf64d12bdb89e635dc48a51","2d738657d18e45d5b25298e3f1cf77cb","c09344c7b9804c398159e386c0ea4793","4f48a69a3ffc4a34b5cabf6049e0cffa","c70fc8be949b4cfda2a967429531bed1","8987b208e7f146b8b53aa10e7a74f25d","99da700e76704aa2884f7734a130079a","bc6b755aeec94d1aa14524898e0e8a76","741e2559968047a5bd064f310f661aaf","cac8c251e15944f193520251d24a6e73","018f6c77133a4d1d9ce7d5a75b670e53","aa70893f207845ec9c45f5ef016f2012","43e618cd2c08491a918c3addd066e174","3d17473809be4bf186d87b77cfcd4251","80c2b1e78b444afe958d06840d0b3c22","f3d46b4dc6834a119ee3991d2fadf7a3","1d6dab275642477180f6af35d716c9c0","b7b4268a75354f6192788eb36578ad35","f5669a8b55b8462fbbe15284078ead59","a0ceda60536a4dea9417b67afe7fefdc","0388568f16a84b60b89dd9e9d5dcc094","49f0330a97ad46249d9eeef7abc23026","09ad9dd4f7ea4002a7a2c5ec941768fb","3e1aef5c69d24a83ae07961e029b97fa","a2f7309ff7104c0480d6ebdf57de9668","c8ae4f36f344437e87336203b5117ebc","8af56068d3ef48198078732f2a2daa11","0d106be8be624440bf53600a515aa21e","742154eadf264acabf813
f2115942644","f1d955df3d2a4631866f30dee3a9ad4e","f18bc0bf195949a3a6580fba2ae4d4a4","ee7f47b44b4040619a4393afc6220774","fe9b3042222d4db2b0f381fa5634fa94","e29b2e2d761347eb96cdbaec4534ca2c","eb60ac90d2a04fcc843a0019b397cb02","59aeec517e0e4f65a86c848eed8f798e","242fb8afaa8043c98d32a10d7bd096e4","83983989c1d54dc7933bf8b32c1873d9","06fef064adf34c49825101bb82bfa4b2","730871a915414d6daa4f4392fabea1f9","9c8acd7cd5d5478b91b2c1ab080b2178","a01b7337abd14d9d9ac44215d2c421e4","52ed7fde32214e159f3f331a44d732bc","97b08a16cac746ba8b9a4c6608fca1de","1ab6807b738244b0bc19dcd81068b3e5","b4578f21e55f4a828ba23a6fc88f3f61","26abf52639c743caae95f00dcfb6f61f","86cc3b8137b84116bfe742cb70d17d63","e2cdd9081357477ab36eaa99585d85a1","26ff241b8d8d4e5f8f6e7d88de581967","c322fea973734ca0aa51d5a89f21688e","61f98c2866af4781860c3a3421b26c24","2fdd33ca5b974bf7931b74411824f063","fdc37d1c376b4dce92f471ff453e98fb","b29682f2ebb04ef4954e7c3f7d382648"]},"id":"aEbelD3SddT2","executionInfo":{"status":"ok","timestamp":1770440353235,"user_tz":-420,"elapsed":15889,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"dff0c74d-1bdd-4e92-c834-a50ce026e880"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Nepali-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a41c32e2052d441c841f4617448813c2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-ne/test-00000-of-00001.parquet: 0%| | 0.00/93.5k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"ea79c3477e64407ab92b59495a9fe7f7"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["en-ne/train-00000-of-00001.parquet: 0%| | 0.00/23.9M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a55c8ff37bf64d12bdb89e635dc48a51"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-ne/validation-00000-of-00001.parquet: 0%| | 0.00/101k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"aa70893f207845ec9c45f5ef016f2012"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"09ad9dd4f7ea4002a7a2c5ec941768fb"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/406381 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e29b2e2d761347eb96cdbaec4534ca2c"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1ab6807b738244b0bc19dcd81068b3e5"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 410381 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 Nepali-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (Nepali-English)...\")\n","try:\n"," # Opus-100 has 'en-ne'\n"," dataset = load_dataset(\"opus100\", \"en-ne\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," # Extract data\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'ne' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'ne': item['translation']['ne'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for efficiency.\")\n","\n"," print(f\"Extracted {len(data)} Nepali-English pairs.\")\n","\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"aEbelD3SddT2"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"wkuNsm4cddT2","executionInfo":{"status":"ok","timestamp":1770440353336,"user_tz":-420,"elapsed":97,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"c8a53b82-bca6-4802-f961-49bf95c43705"},"outputs":[{"output_type":"stream","name":"stdout","text":[" ne \\\n","0 प्रयोग नगरिएको हटाउनुहोस् \n","1 साधारण थ्रेडिङ \n","2 नम्बर स्तम्भ प्रयोग गर्न पर्दाको कार्यस्थान सज... \n","3 टाइमआउट सर्भरमा जडान गर्दै । \n","4 बोधार्थ प्रतिलिपि फाँट \n","\n"," en \n","0 Remove Unused \n","1 Si_mple threading \n","2 Change the workspace layout of the screen to u... \n","3 Timeout connecting to server. 
\n","4 _Cc Field \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['ne', 'en'])\n","df['ne'] = df['ne'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['ne'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"wkuNsm4cddT2"},{"cell_type":"markdown","metadata":{"id":"0gy7106RddT2"},"source":["## 3. Tokenization"],"id":"0gy7106RddT2"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yZaZ7dr2ddT3","executionInfo":{"status":"ok","timestamp":1770440355997,"user_tz":-420,"elapsed":2659,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"f44c1dbb-bf92-4ccc-ec69-5feff5f69533"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Nepali Tokenizer...\n","Training English Tokenizer (for Nepali pair)...\n"]}],"source":["# Save texts to files\n","with open('train_ne.txt', 'w', encoding='utf-8') as f:\n"," for line in df['ne']: f.write(line + '\\n')\n","\n","with open('train_en_ne.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Nepali Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_ne.txt',\n"," model_prefix='spm_ne',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Nepali pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_ne.txt',\n"," model_prefix='spm_en_ne',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_ne.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_ne.model')"],"id":"yZaZ7dr2ddT3"},{"cell_type":"markdown","metadata":{"id":"sciub41gddT3"},"source":["## 4. 
Dataset & Model"],"id":"sciub41gddT3"},{"cell_type":"code","execution_count":5,"metadata":{"id":"KYoV5LvZddT3","executionInfo":{"status":"ok","timestamp":1770440356001,"user_tz":-420,"elapsed":2,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['ne']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"KYoV5LvZddT3"},{"cell_type":"code","execution_count":6,"metadata":{"id":"V9eoQcEsddT3","executionInfo":{"status":"ok","timestamp":1770440356004,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"V9eoQcEsddT3"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"BnI01iQ7ddT3","executionInfo":{"status":"ok","timestamp":1770440617090,"user_tz":-420,"elapsed":261083,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"0754250e-37de-4e98-f94b-1d45083c67cf"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting 
Training...\n","Step 0, Loss: 9.176\n","Step 100, Loss: 6.588\n","Step 200, Loss: 6.126\n","Step 300, Loss: 6.147\n","Step 400, Loss: 5.600\n","Step 500, Loss: 5.748\n","Step 600, Loss: 5.715\n","Step 700, Loss: 5.169\n","Epoch 1 Loss: 5.846\n","Step 0, Loss: 4.846\n","Step 100, Loss: 4.999\n","Step 200, Loss: 4.959\n","Step 300, Loss: 4.773\n","Step 400, Loss: 4.586\n","Step 500, Loss: 4.851\n","Step 600, Loss: 4.514\n","Step 700, Loss: 4.902\n","Epoch 2 Loss: 4.711\n","Step 0, Loss: 4.046\n","Step 100, Loss: 3.816\n","Step 200, Loss: 3.815\n","Step 300, Loss: 3.796\n","Step 400, Loss: 3.608\n","Step 500, Loss: 4.376\n","Step 600, Loss: 3.983\n","Step 700, Loss: 3.369\n","Epoch 3 Loss: 3.951\n","Step 0, Loss: 3.170\n","Step 100, Loss: 3.633\n","Step 200, Loss: 3.516\n","Step 300, Loss: 3.477\n","Step 400, Loss: 3.575\n","Step 500, Loss: 3.580\n","Step 600, Loss: 3.283\n","Step 700, Loss: 3.402\n","Epoch 4 Loss: 3.369\n","Step 0, Loss: 2.605\n","Step 100, Loss: 3.518\n","Step 200, Loss: 3.049\n","Step 300, Loss: 3.254\n","Step 400, Loss: 3.399\n","Step 500, Loss: 2.932\n","Step 600, Loss: 3.141\n","Step 700, Loss: 2.269\n","Epoch 5 Loss: 2.937\n","Step 0, Loss: 2.661\n","Step 100, Loss: 2.003\n","Step 200, Loss: 2.601\n","Step 300, Loss: 3.214\n","Step 400, Loss: 3.140\n","Step 500, Loss: 2.526\n","Step 600, Loss: 2.761\n","Step 700, Loss: 2.669\n","Epoch 6 Loss: 2.618\n","Step 0, Loss: 2.305\n","Step 100, Loss: 2.072\n","Step 200, Loss: 2.323\n","Step 300, Loss: 2.058\n","Step 400, Loss: 2.571\n","Step 500, Loss: 2.387\n","Step 600, Loss: 2.445\n","Step 700, Loss: 2.105\n","Epoch 7 Loss: 2.375\n","Step 0, Loss: 1.901\n","Step 100, Loss: 2.535\n","Step 200, Loss: 2.590\n","Step 300, Loss: 2.259\n","Step 400, Loss: 2.060\n","Step 500, Loss: 2.945\n","Step 600, Loss: 2.238\n","Step 700, Loss: 2.150\n","Epoch 8 Loss: 2.181\n","Step 0, Loss: 2.019\n","Step 100, Loss: 2.238\n","Step 200, Loss: 1.810\n","Step 300, Loss: 1.984\n","Step 400, Loss: 2.144\n","Step 500, Loss: 
1.977\n","Step 600, Loss: 2.094\n","Step 700, Loss: 1.831\n","Epoch 9 Loss: 2.025\n","Step 0, Loss: 1.937\n","Step 100, Loss: 1.998\n","Step 200, Loss: 2.205\n","Step 300, Loss: 1.905\n","Step 400, Loss: 1.929\n","Step 500, Loss: 1.959\n","Step 600, Loss: 2.074\n","Step 700, Loss: 1.576\n","Epoch 10 Loss: 1.891\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_ne.pt')"],"id":"BnI01iQ7ddT3"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kglUseModdT3","executionInfo":{"status":"ok","timestamp":1770440617119,"user_tz":-420,"elapsed":26,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"aafa9beb-3d49-45cd-8026-ff5d5f584dff"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_ne.pt', 'app/models/transformer_model_ne.pt')\n","shutil.copy('spm_ne.model', 'app/models/spm_ne.model')\n","shutil.copy('spm_en_ne.model', 'app/models/spm_en_ne.model')\n","print(\"Models copied to 
app/models/\")"],"id":"kglUseModdT3"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"L4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"a41c32e2052d441c841f4617448813c2":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_7ac0d88a4d9f4e0bb1d87cd80430b1e9","IPY_MODEL_ee706053eaae4069a05db7e2f56ba69d","IPY_MODEL_78ece472f8ea40459ea595513f5240e2"],"layout":"IPY_MODEL_ff66d13a55e84797b188d1382bf50091"}},"7ac0d88a4d9f4e0bb1d87cd80430b1e9":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d6e1511fb8844085beb80f8bd2930ae6","placeholder":"​","style":"IPY_MODEL_cdaa2216ab1040fda24528b555b4ab0a","value":"README.md: 
"}},"ee706053eaae4069a05db7e2f56ba69d":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_06740d7f569f4113a012c128edb51057","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_627d1d072e794b2ca7b36bf6b77eaec8","value":1}},"78ece472f8ea40459ea595513f5240e2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_94b1dccd601f48eb939d3b70f7c3cd01","placeholder":"​","style":"IPY_MODEL_8026c53a8c2b406d9e231f528535b918","value":" 65.4k/? 
[00:00&lt;00:00, 7.17MB/s]"}},"ff66d13a55e84797b188d1382bf50091":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d6e1511fb8844085beb80f8bd2930ae6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"o
verflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cdaa2216ab1040fda24528b555b4ab0a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"06740d7f569f4113a012c128edb51057":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"627d1d072e794b2ca7b36bf6b77eaec8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"94b1dccd601f48eb939d3b70
f7c3cd01":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8026c53a8c2b406d9e231f528535b918":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ea79c3477e64407ab92b59495a9fe7f7":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0e102f99679e415baf15276596dd8f48","IPY_MODEL_7cfbc4d2a5ba482bb82bf34513014bb7","IPY_MODEL_6b5b9da87832435aa93fc0b35cafe144"],"layout":"IPY_MODEL_8b40b8060e6d4b
eaa89aa03b5da350d8"}},"0e102f99679e415baf15276596dd8f48":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2957929a0a594134a39e9f50972c95f5","placeholder":"​","style":"IPY_MODEL_b0b0dbef7bbf49348b38ae4135c0325f","value":"en-ne/test-00000-of-00001.parquet: 100%"}},"7cfbc4d2a5ba482bb82bf34513014bb7":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_2ac916d608ba461ea5cc41e742fc9050","max":93474,"min":0,"orientation":"horizontal","style":"IPY_MODEL_8d4bd45404ea4e1d92a93500be7ecc83","value":93474}},"6b5b9da87832435aa93fc0b35cafe144":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a29fbfb99d584fb1a0d11f43c87d71aa","placeholder":"​","style":"IPY_MODEL_c1a20886f8824428bb20e5d6d90402dc","value":" 93.5k/93.5k [00:00&lt;00:00, 
179kB/s]"}},"8b40b8060e6d4beaa89aa03b5da350d8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2957929a0a594134a39e9f50972c95f5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b0b0dbef7bbf49348b38ae4135c0325f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2ac916d608ba461ea5cc41e742fc9050":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8d4bd45404ea4e1d92a93500be7ecc83":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a29fbfb99d584fb1a0d11f43c87d71aa":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c1a20886f8824428bb20e5d6d90402dc":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a55c8ff37bf64d12bdb89e635dc48a51":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_2d738657d18e45d5b25298e3f1cf77cb","IPY_MODEL_c09344c7b9804c398159e386c0ea4793","IPY_MODEL_4f48a69a3ffc4a34b5cabf6049e0cffa"],"layout":"IPY_MODEL_c70fc8be949b4cfda2a967429531bed1"}
},"2d738657d18e45d5b25298e3f1cf77cb":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8987b208e7f146b8b53aa10e7a74f25d","placeholder":"​","style":"IPY_MODEL_99da700e76704aa2884f7734a130079a","value":"en-ne/train-00000-of-00001.parquet: 100%"}},"c09344c7b9804c398159e386c0ea4793":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_bc6b755aeec94d1aa14524898e0e8a76","max":23913307,"min":0,"orientation":"horizontal","style":"IPY_MODEL_741e2559968047a5bd064f310f661aaf","value":23913307}},"4f48a69a3ffc4a34b5cabf6049e0cffa":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cac8c251e15944f193520251d24a6e73","placeholder":"​","style":"IPY_MODEL_018f6c77133a4d1d9ce7d5a75b670e53","value":" 23.9M/23.9M [00:00&lt;00:00, 
12.0MB/s]"}},"c70fc8be949b4cfda2a967429531bed1":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8987b208e7f146b8b53aa10e7a74f25d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"99da700e76704aa2884f7734a130079a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"bc6b755aeec94d1aa14524898e0e8a76":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"741e2559968047a5bd064f310f661aaf":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"cac8c251e15944f193520251d24a6e73":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"018f6c77133a4d1d9ce7d5a75b670e53":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"aa70893f207845ec9c45f5ef016f2012":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_43e618cd2c08491a918c3addd066e174","IPY_MODEL_3d17473809be4bf186d87b77cfcd4251","IPY_MODEL_80c2b1e78b444afe958d06840d0b3c22"],"layout":"IPY_MODEL_f3d46b4dc6834a119ee3991d2fadf7a3"
}},"43e618cd2c08491a918c3addd066e174":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1d6dab275642477180f6af35d716c9c0","placeholder":"​","style":"IPY_MODEL_b7b4268a75354f6192788eb36578ad35","value":"en-ne/validation-00000-of-00001.parquet: 100%"}},"3d17473809be4bf186d87b77cfcd4251":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f5669a8b55b8462fbbe15284078ead59","max":100742,"min":0,"orientation":"horizontal","style":"IPY_MODEL_a0ceda60536a4dea9417b67afe7fefdc","value":100742}},"80c2b1e78b444afe958d06840d0b3c22":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0388568f16a84b60b89dd9e9d5dcc094","placeholder":"​","style":"IPY_MODEL_49f0330a97ad46249d9eeef7abc23026","value":" 101k/101k [00:00&lt;00:00, 
128kB/s]"}},"f3d46b4dc6834a119ee3991d2fadf7a3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1d6dab275642477180f6af35d716c9c0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b7b4268a75354f6192788eb36578ad35":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f5669a8b55b8462fbbe15284078ead59":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a0ceda60536a4dea9417b67afe7fefdc":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"0388568f16a84b60b89dd9e9d5dcc094":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"49f0330a97ad46249d9eeef7abc23026":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"09ad9dd4f7ea4002a7a2c5ec941768fb":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_3e1aef5c69d24a83ae07961e029b97fa","IPY_MODEL_a2f7309ff7104c0480d6ebdf57de9668","IPY_MODEL_c8ae4f36f344437e87336203b5117ebc"],"layout":"IPY_MODEL_8af56068d3ef48198078732f2a2daa11"}
},"3e1aef5c69d24a83ae07961e029b97fa":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0d106be8be624440bf53600a515aa21e","placeholder":"​","style":"IPY_MODEL_742154eadf264acabf813f2115942644","value":"Generating test split: 100%"}},"a2f7309ff7104c0480d6ebdf57de9668":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f1d955df3d2a4631866f30dee3a9ad4e","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_f18bc0bf195949a3a6580fba2ae4d4a4","value":2000}},"c8ae4f36f344437e87336203b5117ebc":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_ee7f47b44b4040619a4393afc6220774","placeholder":"​","style":"IPY_MODEL_fe9b3042222d4db2b0f381fa5634fa94","value":" 2000/2000 [00:00&lt;00:00, 55864.09 
examples/s]"}},"8af56068d3ef48198078732f2a2daa11":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0d106be8be624440bf53600a515aa21e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"742154eadf264acabf813f2115942644":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f1d955df3d2a4631866f30dee3a9ad4e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f18bc0bf195949a3a6580fba2ae4d4a4":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"ee7f47b44b4040619a4393afc6220774":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fe9b3042222d4db2b0f381fa5634fa94":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e29b2e2d761347eb96cdbaec4534ca2c":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_eb60ac90d2a04fcc843a0019b397cb02","IPY_MODEL_59aeec517e0e4f65a86c848eed8f798e","IPY_MODEL_242fb8afaa8043c98d32a10d7bd096e4"],"layout":"IPY_MODEL_83983989c1d54dc7933bf8b32c1873d
9"}},"eb60ac90d2a04fcc843a0019b397cb02":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_06fef064adf34c49825101bb82bfa4b2","placeholder":"​","style":"IPY_MODEL_730871a915414d6daa4f4392fabea1f9","value":"Generating train split: 100%"}},"59aeec517e0e4f65a86c848eed8f798e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_9c8acd7cd5d5478b91b2c1ab080b2178","max":406381,"min":0,"orientation":"horizontal","style":"IPY_MODEL_a01b7337abd14d9d9ac44215d2c421e4","value":406381}},"242fb8afaa8043c98d32a10d7bd096e4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_52ed7fde32214e159f3f331a44d732bc","placeholder":"​","style":"IPY_MODEL_97b08a16cac746ba8b9a4c6608fca1de","value":" 406381/406381 [00:00&lt;00:00, 1711431.02 
examples/s]"}},"83983989c1d54dc7933bf8b32c1873d9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"06fef064adf34c49825101bb82bfa4b2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"730871a915414d6daa4f4392fabea1f9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9c8acd7cd5d5478b91b2c1ab080b2178":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a01b7337abd14d9d9ac44215d2c421e4":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"52ed7fde32214e159f3f331a44d732bc":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"97b08a16cac746ba8b9a4c6608fca1de":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1ab6807b738244b0bc19dcd81068b3e5":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_b4578f21e55f4a828ba23a6fc88f3f61","IPY_MODEL_26abf52639c743caae95f00dcfb6f61f","IPY_MODEL_86cc3b8137b84116bfe742cb70d17d63"],"layout":"IPY_MODEL_e2cdd9081357477ab36eaa99585d85a
1"}},"b4578f21e55f4a828ba23a6fc88f3f61":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_26ff241b8d8d4e5f8f6e7d88de581967","placeholder":"​","style":"IPY_MODEL_c322fea973734ca0aa51d5a89f21688e","value":"Generating validation split: 100%"}},"26abf52639c743caae95f00dcfb6f61f":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_61f98c2866af4781860c3a3421b26c24","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_2fdd33ca5b974bf7931b74411824f063","value":2000}},"86cc3b8137b84116bfe742cb70d17d63":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_fdc37d1c376b4dce92f471ff453e98fb","placeholder":"​","style":"IPY_MODEL_b29682f2ebb04ef4954e7c3f7d382648","value":" 2000/2000 [00:00&lt;00:00, 163374.13 
examples/s]"}},"e2cdd9081357477ab36eaa99585d85a1":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"26ff241b8d8d4e5f8f6e7d88de581967":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c322fea973734ca0aa51d5a89f21688e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"61f98c2866af4781860c3a3421b26c24":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2fdd33ca5b974bf7931b74411824f063":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"fdc37d1c376b4dce92f471ff453e98fb":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b29682f2ebb04ef4954e7c3f7d382648":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
README.md CHANGED
@@ -1,10 +1,74 @@
1
  ---
2
- title: MY Translator
3
- emoji: 👁
4
- colorFrom: purple
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
8
  ---
 
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: My Translator by Ko Ko
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  ---
9
+ # Multilingual Neural Machine Translation (Project A3)
10
 
11
+ **Developed by:** Htut Ko Ko (st126010)
12
+
13
+ * 👉 **Live App** : [huggingface.co/spaces/shadowsilence/burmese-english-translator](https://huggingface.co/spaces/shadowsilence/burmese-english-translator)
14
+
15
+ This project implements high-quality machine translation systems for multiple languages (Burmese, Thai, Chinese, Vietnamese, Hindi, Nepali, Urdu, Tagalog, Kazakh, Bengali, German) to English using two approaches:
16
+
17
+ 1. **Fine-Tuned NLLB-200**: State-of-the-art multilingual model tailored for high-quality translation across all supported languages.
18
+ 2. **Transformer from Scratch**: Educational implementation to demonstrate understanding of NMT architecture.
19
+
20
+ ## Experiments
21
+
22
+ ![Attention loss comparison: General vs. Additive](attention/attention_loss_comparison.png)
23
+
24
+ ### Attention Mechanisms (Burmese-English)
25
+
26
+ I compared **General (Dot Product)** and **Additive (Bahdanau)** attention mechanisms using a Seq2Seq GRU model.
27
+
28
+ | Attention Mechanism | Training Loss | Training PPL | Validation Loss | Validation PPL |
29
+ | ----------------------------- | --------------- | ---------------- | --------------- | ----------------- |
30
+ | General (Dot) | 4.819 | 123.868 | 6.662 | 782.166 |
31
+ | **Additive (Bahdanau)** | **4.447** | **85.368** | **6.440** | **626.673** |
32
+
33
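+ **Observation:** Additive (Bahdanau) attention achieved lower loss and perplexity than General attention on both the training and validation sets (validation PPL 626.673 vs. 782.166), indicating better performance on this task.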
34
+
35
+ ## Demo
36
+
37
+ ![WebUI Demo](demo.gif)
38
+
39
+ ## Folder Structure
40
+
41
+ - `Burmese_English_NLLB.ipynb`: **(Recommended)** Fine-Tuning NLLB for high-quality translation.
42
+ - `Burmese_English_Transformer.ipynb`: Transformer from Scratch implementation for Burmese-English.
43
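+ - `*_English_Transformer.ipynb`: Transformer implementations for the other foreign languages of AIT students (Thai, Chinese, Vietnamese, Hindi, Nepali, Urdu, Tagalog, Kazakh, Bengali, German), each paired with English.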
44
+ - `Attention_Experiments.ipynb`: Comparison of General vs. Additive Attention (Burmese-English).
45
+ - `app/`: Web Application folder.
46
+ - `app.py`: Flask application supporting multiple languages.
47
+ - `nllb_model/`: Fine-tuned NLLB model.
48
+
49
+ ## How to Run Locally
50
+
51
+ ### 1. Requirements
52
+
53
+ Install dependencies:
54
+
55
+ ```bash
56
+ cd app
57
+ pip install -r requirements.txt
58
+ ```
59
+
60
+ ### 2. Run the App
61
+
62
+ ```bash
63
+ python app.py
64
+ ```
65
+
66
+ Open `http://localhost:5001`.
67
+
68
+ ## Credits & Acknowledgements
69
+
70
+ This project respects the academic integrity and usage policies of the following resources:
71
+
72
+ - **Dataset**: [Asian Language Treebank (ALT)](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/), [Opus-100](https://opus.nlpl.eu/)
73
+ - **Base Model**: [NLLB-200](https://ai.meta.com/research/no-language-left-behind/) by Meta AI.
74
+ - **Tokenization**: [SentencePiece](https://github.com/google/sentencepiece) by Google.
Tagalog_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"JLkDXbmyi7DI"},"source":["# Tagalog-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Tagalog (tl) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **ALT (Asian Language Treebank)** dataset for Tagalog-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"JLkDXbmyi7DI"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"h_1v3NHui7DJ","executionInfo":{"status":"ok","timestamp":1770441680901,"user_tz":-420,"elapsed":11371,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"1afa5161-47f7-4df9-95dd-f7f8aa105a3d"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"h_1v3NHui7DJ"},{"cell_type":"markdown","metadata":{"id":"IPLDi898i7DK"},"source":["## 2. 
Data Loading (ALT Dataset)\n","Loading Tagalog-English pairs from ALT."],"id":"IPLDi898i7DK"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":713,"referenced_widgets":["3bea3f3ba71840d08d15d89cd9b1942f","7260b34547704aa1a25ce5764326d84a","99a8c2e3e5b348ffa3398f3f5fa7feeb","ad978d099dff43899c4c2b27f5965d96","152028b582394ed1aa179fe7e49a71e5","66fd2df97a514b4abf8c495215a0c40d","1de3bb240c4e48299b80c32479f7236f","e0a682bc6a954368bab0b5cd24afe0bc","5bc80f4a727a4a2f89048592923819f2","ac63742075ee4a8aacde692dd878949d","602c57471f0a4ee9b2c1a62b50c81b30","9cb7f8df246f47718c3088bd75b4f615","9b9b1352c1eb45cb87795854e0ffdfc1","bcf36300957947c2ac51b1bf1050156e","46a5329ffe0f41288446476b3c1e02f3","7c4627887bbb42388cf1902e76eb483b","9f23df2a6c234262ad7870e5cbbe68a0","f7f91e01fc0d4c5fa76734c051a68815","db1a0ccf2a7a4171957399db5adc5ba4","628845fcf7da4e4f8030a019e44f5a88","e1b84510fd384d2eb17c23a8dc216e68","7ee522e7a7ec411aa178c1f2f4d487cf","2dd50fb2684d49c684d0196ad5d6e065","8288e34b12704fa880150517a686a9e4","dc201daf7f9549cf91806bc409d68c6c","8db637dc8ee24def8ee31ffb1d63d0b2","6cebb045b0124eb3bd151a6ae4bb5230","3bc431e04c444df081ff754a0cb6c6b5","568b2f1130994f278776e45bcd939a28","f88dfe8d492f46518bdf554b8a94131d","bff35403467b42a88d0bca8f2919065a","881ab236fe8e429c883603d3de9528ab","25368c07e6724131a2f8e5395707d668","9d7773e115654ad9b75a7ad44f502f4a","040d4b5b41c94a2bb1f36e3296b3bdd4","c21d3525924c47979813f514fcbb949e","9f707859d1b447b9bf2b400bf713bb51","e78ea5453d714b1fbfde8466abf9d683","3167bcf1029e4ab2b0e1ac9c777ced6c","e01b0c927e4b4456a83f1360ee6debcc","3b2aad8a0cdb4bb4866b89e49484e75f","871f6c7ca002447cb4616ae764bcf996","e15f751bbcbd4f53b4d756da399d6d9c","2e2167c120244bfba2a600cd11bea24c","0fc80220974d42e383a059fb7fb4cac4","93a42a372e0e470ab0172a5c86047b78","48f749ac01c7414786abb9e76bfde0b4","0aec42c7404b42a4bda73ba4ab759576","d0d1f6343b014c769ba95945fd399365","d6f0b3924aa048f7955cd9a1ad391106","ba639735e8594aed937e83
8652efc534","b96937a130a745ec99861af1ca8b7a97","35179739868040d1ba7d542f450f04da","78ce17d6759143faa0fecb35ce77dbaf","c388f8c8f7454befa4ef800ca881ab69","6dbf026b6dd541e8a616611f88d0fdd2","190d05dede1848f8ac868a8fd8c4f00b","740e1171bb4842ebac7f6dbdd7ee9f01","217fd7ebc0b5431f918656dd8c681243","a27cf14d24b94adfbbffaa816c1ce491","cdb53cc2c0e14e04aceb8faef270e136","d32ae0c469a242faaafeee4f7d14a17b","266ef0735e414b8a88e1333f91cfb615","eb1caf07bdb54ead8026a1dc689105fb","55e32e2868c6429687ec74c79f226386","2220cdff006d48e4927d4a80f3912cf9","ceec10aebf0645cdbbb06950685ca7d9","893ec2094c7846c0b36815fe2ed1fcc8","5366d27cd0654132b39b175e6e7a50d9","7b4877c7b2a1410fb7aa243edc77dd15","38d4f3d3233b46608e21a2889b5be2f8","c6d608c9fce94163a951e4651c7f30b3","21c72058ee9a472283a517619379dd77","88329550b2ab488c9a28527399de5e77","f14af4590e09400ea39e5406668afe53","327c428abb4443849591e08a144972ae","66591ba93f4943528bebe7d0bb7b8fd6"]},"id":"MJCOAcXPi7DK","executionInfo":{"status":"ok","timestamp":1770441690479,"user_tz":-420,"elapsed":9582,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"c0772e60-279d-43ac-ea7b-e3ab59d4cf59"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading ALT Dataset...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3bea3f3ba71840d08d15d89cd9b1942f"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["alt-parallel/train-00000-of-00001.parque(…): 0%| | 0.00/31.2M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9cb7f8df246f47718c3088bd75b4f615"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/validation-00000-of-00001.p(…): 0%| | 0.00/1.71M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"2dd50fb2684d49c684d0196ad5d6e065"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/test-00000-of-00001.parquet: 0%| | 0.00/1.79M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9d7773e115654ad9b75a7ad44f502f4a"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/18088 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"0fc80220974d42e383a059fb7fb4cac4"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/1000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6dbf026b6dd541e8a616611f88d0fdd2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/1019 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"ceec10aebf0645cdbbb06950685ca7d9"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 20107 sentences from ALT.\n","Extracted 20107 Tagalog-English pairs.\n"]}],"source":["print(\"Loading ALT Dataset...\")\n","try:\n"," # ALT has 'fil' for Filipino (Tagalog)\n"," dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from ALT.\")\n","\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'fil' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'tl': item['translation']['fil'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," print(f\"Extracted {len(data)} Tagalog-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"MJCOAcXPi7DK"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"C8jIYNA3i7DL","executionInfo":{"status":"ok","timestamp":1770441690484,"user_tz":-420,"elapsed":4,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"bc395897-349b-4b9a-c491-f185332e22ae"},"outputs":[{"output_type":"stream","name":"stdout","text":[" tl \\\n","0 Natalo ng Italya ang Portugal sa puntos na 31-... \n","1 Si Andrea Masi ang nagsimula na makapuntos sa ... \n","2 Sa kabila ng pagmamanipula sa unang kalahati n... \n","3 Hindi sumuko ang Portugal at si David Penalva ... \n","4 Nanguna ang Italya sa puntos na 16-5 sa kalagi... \n","\n"," en \n","0 Italy have defeated Portugal 31-5 in Pool C of... \n","1 Andrea Masi opened the scoring in the fourth m... \n","2 Despite controlling the game for much of the f... \n","3 Portugal never gave up and David Penalva score... \n","4 Italy led 16-5 at half time but were matched b... 
\n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['tl', 'en'])\n","df['tl'] = df['tl'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['tl'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"C8jIYNA3i7DL"},{"cell_type":"markdown","metadata":{"id":"cQGT4HOti7DL"},"source":["## 3. Tokenization"],"id":"cQGT4HOti7DL"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tbPUUwtOi7DL","executionInfo":{"status":"ok","timestamp":1770441695072,"user_tz":-420,"elapsed":4587,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"53ad1aaa-20df-44e3-c917-9b41496b81cc"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Tagalog Tokenizer...\n","Training English Tokenizer (for Tagalog pair)...\n"]}],"source":["# Save texts to files\n","with open('train_tl.txt', 'w', encoding='utf-8') as f:\n"," for line in df['tl']: f.write(line + '\\n')\n","\n","with open('train_en_tl.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Tagalog Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_tl.txt',\n"," model_prefix='spm_tl',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Tagalog pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_tl.txt',\n"," model_prefix='spm_en_tl',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_tl.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_tl.model')"],"id":"tbPUUwtOi7DL"},{"cell_type":"markdown","metadata":{"id":"TB-giK3ei7DL"},"source":["## 4. 
Dataset & Model"],"id":"TB-giK3ei7DL"},{"cell_type":"code","execution_count":5,"metadata":{"id":"y4IZvtrGi7DL","executionInfo":{"status":"ok","timestamp":1770441695079,"user_tz":-420,"elapsed":3,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['tl']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"y4IZvtrGi7DL"},{"cell_type":"code","execution_count":6,"metadata":{"id":"aiJSJu5Ui7DM","executionInfo":{"status":"ok","timestamp":1770441695091,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"aiJSJu5Ui7DM"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"EL4FOyYli7DM","executionInfo":{"status":"ok","timestamp":1770443127273,"user_tz":-420,"elapsed":1432179,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"f969203b-beb2-4c51-89c9-791964ccc429"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting 
Training...\n","Step 0, Loss: 9.210\n","Step 100, Loss: 6.797\n","Step 200, Loss: 6.489\n","Step 300, Loss: 6.381\n","Epoch 1 Loss: 6.725\n","Step 0, Loss: 6.260\n","Step 100, Loss: 5.953\n","Step 200, Loss: 5.918\n","Step 300, Loss: 5.736\n","Epoch 2 Loss: 6.015\n","Step 0, Loss: 5.615\n","Step 100, Loss: 5.594\n","Step 200, Loss: 5.530\n","Step 300, Loss: 5.515\n","Epoch 3 Loss: 5.589\n","Step 0, Loss: 5.190\n","Step 100, Loss: 5.415\n","Step 200, Loss: 5.166\n","Step 300, Loss: 5.139\n","Epoch 4 Loss: 5.258\n","Step 0, Loss: 5.087\n","Step 100, Loss: 5.019\n","Step 200, Loss: 5.036\n","Step 300, Loss: 4.975\n","Epoch 5 Loss: 5.001\n","Step 0, Loss: 4.884\n","Step 100, Loss: 4.778\n","Step 200, Loss: 4.828\n","Step 300, Loss: 4.676\n","Epoch 6 Loss: 4.792\n","Step 0, Loss: 4.523\n","Step 100, Loss: 4.596\n","Step 200, Loss: 4.639\n","Step 300, Loss: 4.588\n","Epoch 7 Loss: 4.617\n","Step 0, Loss: 4.536\n","Step 100, Loss: 4.275\n","Step 200, Loss: 4.409\n","Step 300, Loss: 4.398\n","Epoch 8 Loss: 4.461\n","Step 0, Loss: 4.208\n","Step 100, Loss: 4.340\n","Step 200, Loss: 4.500\n","Step 300, Loss: 4.374\n","Epoch 9 Loss: 4.323\n","Step 0, Loss: 4.190\n","Step 100, Loss: 4.173\n","Step 200, Loss: 4.264\n","Step 300, Loss: 4.224\n","Epoch 10 Loss: 4.204\n","Step 0, Loss: 3.999\n","Step 100, Loss: 4.082\n","Step 200, Loss: 4.328\n","Step 300, Loss: 4.229\n","Epoch 11 Loss: 4.094\n","Step 0, Loss: 3.885\n","Step 100, Loss: 3.996\n","Step 200, Loss: 4.035\n","Step 300, Loss: 4.102\n","Epoch 12 Loss: 3.989\n","Step 0, Loss: 3.838\n","Step 100, Loss: 3.867\n","Step 200, Loss: 3.930\n","Step 300, Loss: 3.849\n","Epoch 13 Loss: 3.897\n","Step 0, Loss: 3.722\n","Step 100, Loss: 3.670\n","Step 200, Loss: 3.909\n","Step 300, Loss: 3.918\n","Epoch 14 Loss: 3.810\n","Step 0, Loss: 3.634\n","Step 100, Loss: 3.624\n","Step 200, Loss: 3.850\n","Step 300, Loss: 3.949\n","Epoch 15 Loss: 3.730\n","Step 0, Loss: 3.555\n","Step 100, Loss: 3.679\n","Step 200, Loss: 3.749\n","Step 300, 
Loss: 3.787\n","Epoch 16 Loss: 3.656\n","Step 0, Loss: 3.512\n","Step 100, Loss: 3.532\n","Step 200, Loss: 3.613\n","Step 300, Loss: 3.729\n","Epoch 17 Loss: 3.582\n","Step 0, Loss: 3.320\n","Step 100, Loss: 3.593\n","Step 200, Loss: 3.603\n","Step 300, Loss: 3.622\n","Epoch 18 Loss: 3.518\n","Step 0, Loss: 3.346\n","Step 100, Loss: 3.409\n","Step 200, Loss: 3.498\n","Step 300, Loss: 3.355\n","Epoch 19 Loss: 3.456\n","Step 0, Loss: 3.229\n","Step 100, Loss: 3.335\n","Step 200, Loss: 3.442\n","Step 300, Loss: 3.505\n","Epoch 20 Loss: 3.395\n","Step 0, Loss: 3.102\n","Step 100, Loss: 3.359\n","Step 200, Loss: 3.311\n","Step 300, Loss: 3.466\n","Epoch 21 Loss: 3.342\n","Step 0, Loss: 3.045\n","Step 100, Loss: 3.320\n","Step 200, Loss: 3.197\n","Step 300, Loss: 3.455\n","Epoch 22 Loss: 3.295\n","Step 0, Loss: 3.050\n","Step 100, Loss: 3.280\n","Step 200, Loss: 3.276\n","Step 300, Loss: 3.456\n","Epoch 23 Loss: 3.243\n","Step 0, Loss: 3.122\n","Step 100, Loss: 3.209\n","Step 200, Loss: 3.183\n","Step 300, Loss: 3.135\n","Epoch 24 Loss: 3.197\n","Step 0, Loss: 2.916\n","Step 100, Loss: 3.175\n","Step 200, Loss: 3.129\n","Step 300, Loss: 3.387\n","Epoch 25 Loss: 3.149\n","Step 0, Loss: 2.970\n","Step 100, Loss: 3.263\n","Step 200, Loss: 3.150\n","Step 300, Loss: 3.108\n","Epoch 26 Loss: 3.112\n","Step 0, Loss: 2.946\n","Step 100, Loss: 3.011\n","Step 200, Loss: 3.132\n","Step 300, Loss: 3.279\n","Epoch 27 Loss: 3.073\n","Step 0, Loss: 2.808\n","Step 100, Loss: 3.053\n","Step 200, Loss: 3.114\n","Step 300, Loss: 3.123\n","Epoch 28 Loss: 3.041\n","Step 0, Loss: 2.847\n","Step 100, Loss: 2.930\n","Step 200, Loss: 3.195\n","Step 300, Loss: 3.101\n","Epoch 29 Loss: 3.000\n","Step 0, Loss: 2.866\n","Step 100, Loss: 2.908\n","Step 200, Loss: 2.950\n","Step 300, Loss: 3.234\n","Epoch 30 Loss: 2.967\n","Step 0, Loss: 2.873\n","Step 100, Loss: 2.911\n","Step 200, Loss: 2.841\n","Step 300, Loss: 2.970\n","Epoch 31 Loss: 2.930\n","Step 0, Loss: 2.749\n","Step 100, Loss: 
3.014\n","Step 200, Loss: 2.898\n","Step 300, Loss: 2.942\n","Epoch 32 Loss: 2.902\n","Step 0, Loss: 2.662\n","Step 100, Loss: 2.879\n","Step 200, Loss: 2.882\n","Step 300, Loss: 2.923\n","Epoch 33 Loss: 2.875\n","Step 0, Loss: 2.714\n","Step 100, Loss: 2.701\n","Step 200, Loss: 2.897\n","Step 300, Loss: 3.026\n","Epoch 34 Loss: 2.840\n","Step 0, Loss: 2.709\n","Step 100, Loss: 2.807\n","Step 200, Loss: 2.826\n","Step 300, Loss: 2.929\n","Epoch 35 Loss: 2.814\n","Step 0, Loss: 2.691\n","Step 100, Loss: 2.725\n","Step 200, Loss: 2.831\n","Step 300, Loss: 2.991\n","Epoch 36 Loss: 2.789\n","Step 0, Loss: 2.623\n","Step 100, Loss: 2.744\n","Step 200, Loss: 2.841\n","Step 300, Loss: 2.849\n","Epoch 37 Loss: 2.766\n","Step 0, Loss: 2.595\n","Step 100, Loss: 2.715\n","Step 200, Loss: 2.849\n","Step 300, Loss: 2.844\n","Epoch 38 Loss: 2.736\n","Step 0, Loss: 2.516\n","Step 100, Loss: 2.631\n","Step 200, Loss: 2.853\n","Step 300, Loss: 2.662\n","Epoch 39 Loss: 2.717\n","Step 0, Loss: 2.569\n","Step 100, Loss: 2.618\n","Step 200, Loss: 2.787\n","Step 300, Loss: 2.816\n","Epoch 40 Loss: 2.693\n","Step 0, Loss: 2.608\n","Step 100, Loss: 2.583\n","Step 200, Loss: 2.668\n","Step 300, Loss: 2.813\n","Epoch 41 Loss: 2.670\n","Step 0, Loss: 2.519\n","Step 100, Loss: 2.595\n","Step 200, Loss: 2.600\n","Step 300, Loss: 2.710\n","Epoch 42 Loss: 2.650\n","Step 0, Loss: 2.546\n","Step 100, Loss: 2.591\n","Step 200, Loss: 2.790\n","Step 300, Loss: 2.816\n","Epoch 43 Loss: 2.631\n","Step 0, Loss: 2.500\n","Step 100, Loss: 2.651\n","Step 200, Loss: 2.752\n","Step 300, Loss: 2.643\n","Epoch 44 Loss: 2.608\n","Step 0, Loss: 2.488\n","Step 100, Loss: 2.614\n","Step 200, Loss: 2.687\n","Step 300, Loss: 2.730\n","Epoch 45 Loss: 2.592\n","Step 0, Loss: 2.365\n","Step 100, Loss: 2.544\n","Step 200, Loss: 2.575\n","Step 300, Loss: 2.590\n","Epoch 46 Loss: 2.569\n","Step 0, Loss: 2.300\n","Step 100, Loss: 2.454\n","Step 200, Loss: 2.510\n","Step 300, Loss: 2.621\n","Epoch 47 Loss: 2.555\n","Step 0, 
Loss: 2.508\n","Step 100, Loss: 2.466\n","Step 200, Loss: 2.577\n","Step 300, Loss: 2.596\n","Epoch 48 Loss: 2.534\n","Step 0, Loss: 2.390\n","Step 100, Loss: 2.496\n","Step 200, Loss: 2.567\n","Step 300, Loss: 2.660\n","Epoch 49 Loss: 2.519\n","Step 0, Loss: 2.340\n","Step 100, Loss: 2.477\n","Step 200, Loss: 2.551\n","Step 300, Loss: 2.687\n","Epoch 50 Loss: 2.496\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(50): # 50 Epochs for ALT\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_tl.pt')"],"id":"EL4FOyYli7DM"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"G_0EQE5Vi7DM","executionInfo":{"status":"ok","timestamp":1770443127285,"user_tz":-420,"elapsed":9,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"fa2fa05e-e591-49a5-e0c0-267ce50c411b"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_tl.pt', 'app/models/transformer_model_tl.pt')\n","shutil.copy('spm_tl.model', 'app/models/spm_tl.model')\n","shutil.copy('spm_en_tl.model', 
'app/models/spm_en_tl.model')\n","print(\"Models copied to app/models/\")"],"id":"G_0EQE5Vi7DM"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"3bea3f3ba71840d08d15d89cd9b1942f":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_7260b34547704aa1a25ce5764326d84a","IPY_MODEL_99a8c2e3e5b348ffa3398f3f5fa7feeb","IPY_MODEL_ad978d099dff43899c4c2b27f5965d96"],"layout":"IPY_MODEL_152028b582394ed1aa179fe7e49a71e5"}},"7260b34547704aa1a25ce5764326d84a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_66fd2df97a514b4abf8c495215a0c40d","placeholder":"​","style":"IPY_MODEL_1de3bb240c4e48299b80c32479f7236f","value":"README.md: 
"}},"99a8c2e3e5b348ffa3398f3f5fa7feeb":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e0a682bc6a954368bab0b5cd24afe0bc","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_5bc80f4a727a4a2f89048592923819f2","value":1}},"ad978d099dff43899c4c2b27f5965d96":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_ac63742075ee4a8aacde692dd878949d","placeholder":"​","style":"IPY_MODEL_602c57471f0a4ee9b2c1a62b50c81b30","value":" 13.2k/? 
[00:00&lt;00:00, 1.06MB/s]"}},"152028b582394ed1aa179fe7e49a71e5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"66fd2df97a514b4abf8c495215a0c40d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"o
verflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1de3bb240c4e48299b80c32479f7236f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e0a682bc6a954368bab0b5cd24afe0bc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"5bc80f4a727a4a2f89048592923819f2":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"ac63742075ee4a8aacde692d
d878949d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"602c57471f0a4ee9b2c1a62b50c81b30":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9cb7f8df246f47718c3088bd75b4f615":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_9b9b1352c1eb45cb87795854e0ffdfc1","IPY_MODEL_bcf36300957947c2ac51b1bf1050156e","IPY_MODEL_46a5329ffe0f41288446476b3c1e02f3"],"layout":"IPY_MODEL_7c4627887bbb42
388cf1902e76eb483b"}},"9b9b1352c1eb45cb87795854e0ffdfc1":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9f23df2a6c234262ad7870e5cbbe68a0","placeholder":"​","style":"IPY_MODEL_f7f91e01fc0d4c5fa76734c051a68815","value":"alt-parallel/train-00000-of-00001.parque(…): 100%"}},"bcf36300957947c2ac51b1bf1050156e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_db1a0ccf2a7a4171957399db5adc5ba4","max":31211167,"min":0,"orientation":"horizontal","style":"IPY_MODEL_628845fcf7da4e4f8030a019e44f5a88","value":31211167}},"46a5329ffe0f41288446476b3c1e02f3":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e1b84510fd384d2eb17c23a8dc216e68","placeholder":"​","style":"IPY_MODEL_7ee522e7a7ec411aa178c1f2f4d487cf","value":" 31.2M/31.2M [00:01&lt;00:00, 
17.0MB/s]"}},"7c4627887bbb42388cf1902e76eb483b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"9f23df2a6c234262ad7870e5cbbe68a0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f7f91e01fc0d4c5fa76734c051a68815":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"db1a0ccf2a7a4171957399db5adc5ba4":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"628845fcf7da4e4f8030a019e44f5a88":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e1b84510fd384d2eb17c23a8dc216e68":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7ee522e7a7ec411aa178c1f2f4d487cf":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2dd50fb2684d49c684d0196ad5d6e065":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_8288e34b12704fa880150517a686a9e4","IPY_MODEL_dc201daf7f9549cf91806bc409d68c6c","IPY_MODEL_8db637dc8ee24def8ee31ffb1d63d0b2"],"layout":"IPY_MODEL_6cebb045b0124eb3bd151a6ae4bb5230"
}},"8288e34b12704fa880150517a686a9e4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3bc431e04c444df081ff754a0cb6c6b5","placeholder":"​","style":"IPY_MODEL_568b2f1130994f278776e45bcd939a28","value":"alt-parallel/validation-00000-of-00001.p(…): 100%"}},"dc201daf7f9549cf91806bc409d68c6c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f88dfe8d492f46518bdf554b8a94131d","max":1710203,"min":0,"orientation":"horizontal","style":"IPY_MODEL_bff35403467b42a88d0bca8f2919065a","value":1710203}},"8db637dc8ee24def8ee31ffb1d63d0b2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_881ab236fe8e429c883603d3de9528ab","placeholder":"​","style":"IPY_MODEL_25368c07e6724131a2f8e5395707d668","value":" 1.71M/1.71M [00:00&lt;00:00, 
3.30MB/s]"}},"6cebb045b0124eb3bd151a6ae4bb5230":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3bc431e04c444df081ff754a0cb6c6b5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"568b2f1130994f278776e45bcd939a28":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f88dfe8d492f46518bdf554b8a94131d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bff35403467b42a88d0bca8f2919065a":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"881ab236fe8e429c883603d3de9528ab":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"25368c07e6724131a2f8e5395707d668":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9d7773e115654ad9b75a7ad44f502f4a":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_040d4b5b41c94a2bb1f36e3296b3bdd4","IPY_MODEL_c21d3525924c47979813f514fcbb949e","IPY_MODEL_9f707859d1b447b9bf2b400bf713bb51"],"layout":"IPY_MODEL_e78ea5453d714b1fbfde8466abf9d683"
}},"040d4b5b41c94a2bb1f36e3296b3bdd4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3167bcf1029e4ab2b0e1ac9c777ced6c","placeholder":"​","style":"IPY_MODEL_e01b0c927e4b4456a83f1360ee6debcc","value":"alt-parallel/test-00000-of-00001.parquet: 100%"}},"c21d3525924c47979813f514fcbb949e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3b2aad8a0cdb4bb4866b89e49484e75f","max":1786537,"min":0,"orientation":"horizontal","style":"IPY_MODEL_871f6c7ca002447cb4616ae764bcf996","value":1786537}},"9f707859d1b447b9bf2b400bf713bb51":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e15f751bbcbd4f53b4d756da399d6d9c","placeholder":"​","style":"IPY_MODEL_2e2167c120244bfba2a600cd11bea24c","value":" 1.79M/1.79M [00:00&lt;00:00, 
3.94MB/s]"}},"e78ea5453d714b1fbfde8466abf9d683":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3167bcf1029e4ab2b0e1ac9c777ced6c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e01b0c927e4b4456a83f1360ee6debcc":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3b2aad8a0cdb4bb4866b89e49484e75f":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"871f6c7ca002447cb4616ae764bcf996":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e15f751bbcbd4f53b4d756da399d6d9c":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2e2167c120244bfba2a600cd11bea24c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"0fc80220974d42e383a059fb7fb4cac4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_93a42a372e0e470ab0172a5c86047b78","IPY_MODEL_48f749ac01c7414786abb9e76bfde0b4","IPY_MODEL_0aec42c7404b42a4bda73ba4ab759576"],"layout":"IPY_MODEL_d0d1f6343b014c769ba95945fd399365"
}},"93a42a372e0e470ab0172a5c86047b78":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d6f0b3924aa048f7955cd9a1ad391106","placeholder":"​","style":"IPY_MODEL_ba639735e8594aed937e838652efc534","value":"Generating train split: 100%"}},"48f749ac01c7414786abb9e76bfde0b4":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_b96937a130a745ec99861af1ca8b7a97","max":18088,"min":0,"orientation":"horizontal","style":"IPY_MODEL_35179739868040d1ba7d542f450f04da","value":18088}},"0aec42c7404b42a4bda73ba4ab759576":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_78ce17d6759143faa0fecb35ce77dbaf","placeholder":"​","style":"IPY_MODEL_c388f8c8f7454befa4ef800ca881ab69","value":" 18088/18088 [00:00&lt;00:00, 58062.28 
examples/s]"}},"d0d1f6343b014c769ba95945fd399365":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d6f0b3924aa048f7955cd9a1ad391106":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ba639735e8594aed937e838652efc534":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"b96937a130a745ec99861af1ca8b7a97":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"35179739868040d1ba7d542f450f04da":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"78ce17d6759143faa0fecb35ce77dbaf":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c388f8c8f7454befa4ef800ca881ab69":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6dbf026b6dd541e8a616611f88d0fdd2":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_190d05dede1848f8ac868a8fd8c4f00b","IPY_MODEL_740e1171bb4842ebac7f6dbdd7ee9f01","IPY_MODEL_217fd7ebc0b5431f918656dd8c681243"],"layout":"IPY_MODEL_a27cf14d24b94adfbbffaa816c1ce49
1"}},"190d05dede1848f8ac868a8fd8c4f00b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cdb53cc2c0e14e04aceb8faef270e136","placeholder":"​","style":"IPY_MODEL_d32ae0c469a242faaafeee4f7d14a17b","value":"Generating validation split: 100%"}},"740e1171bb4842ebac7f6dbdd7ee9f01":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_266ef0735e414b8a88e1333f91cfb615","max":1000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_eb1caf07bdb54ead8026a1dc689105fb","value":1000}},"217fd7ebc0b5431f918656dd8c681243":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_55e32e2868c6429687ec74c79f226386","placeholder":"​","style":"IPY_MODEL_2220cdff006d48e4927d4a80f3912cf9","value":" 1000/1000 [00:00&lt;00:00, 30195.27 
examples/s]"}},"a27cf14d24b94adfbbffaa816c1ce491":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cdb53cc2c0e14e04aceb8faef270e136":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d32ae0c469a242faaafeee4f7d14a17b":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"266ef0735e414b8a88e1333f91cfb615":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"eb1caf07bdb54ead8026a1dc689105fb":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"55e32e2868c6429687ec74c79f226386":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2220cdff006d48e4927d4a80f3912cf9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ceec10aebf0645cdbbb06950685ca7d9":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_893ec2094c7846c0b36815fe2ed1fcc8","IPY_MODEL_5366d27cd0654132b39b175e6e7a50d9","IPY_MODEL_7b4877c7b2a1410fb7aa243edc77dd15"],"layout":"IPY_MODEL_38d4f3d3233b46608e21a2889b5be2f
8"}},"893ec2094c7846c0b36815fe2ed1fcc8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c6d608c9fce94163a951e4651c7f30b3","placeholder":"​","style":"IPY_MODEL_21c72058ee9a472283a517619379dd77","value":"Generating test split: 100%"}},"5366d27cd0654132b39b175e6e7a50d9":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_88329550b2ab488c9a28527399de5e77","max":1019,"min":0,"orientation":"horizontal","style":"IPY_MODEL_f14af4590e09400ea39e5406668afe53","value":1019}},"7b4877c7b2a1410fb7aa243edc77dd15":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_327c428abb4443849591e08a144972ae","placeholder":"​","style":"IPY_MODEL_66591ba93f4943528bebe7d0bb7b8fd6","value":" 1019/1019 [00:00&lt;00:00, 27028.37 
examples/s]"}},"38d4f3d3233b46608e21a2889b5be2f8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c6d608c9fce94163a951e4651c7f30b3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"21c72058ee9a472283a517619379dd77":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"88329550b2ab488c9a28527399de5e77":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f14af4590e09400ea39e5406668afe53":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"327c428abb4443849591e08a144972ae":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"66591ba93f4943528bebe7d0bb7b8fd6":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
Thai_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"ovL7yvuuM1J8"},"source":["# Thai-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Thai (th) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **ALT (Asian Language Treebank)** dataset for Thai-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n","\n","## Pipeline\n","1. **Setup**: Install/Import dependencies.\n","2. **Data Loading**: Load the ALT dataset (Thai-English).\n","3. **Tokenization**: Train SentencePiece model (`spm_th`, `spm_en_th`).\n","4. **Data Processing**: Create PyTorch Datasets and DataLoaders.\n","5. **Model**: Implement Transformer.\n","6. **Training**: Train the model.\n","7. **Evaluation**: Calculate BLEU score.\n","8. **Inference**: Demo function and save model for Web App."],"id":"ovL7yvuuM1J8"},{"cell_type":"markdown","metadata":{"id":"5lxOnCsnM1J-"},"source":["## 1. 
Setup and Imports"],"id":"5lxOnCsnM1J-"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"larr4GScM1J-","executionInfo":{"status":"ok","timestamp":1770435885900,"user_tz":-420,"elapsed":9552,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"3893aa16-273d-4de6-cd68-edd42f94d2c5"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","# Set seeds\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"larr4GScM1J-"},{"cell_type":"code","execution_count":2,"metadata":{"id":"tUHgxeXwM1J_","executionInfo":{"status":"ok","timestamp":1770435885906,"user_tz":-420,"elapsed":2,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["# Install dependencies if missing (uncomment if needed)\n","# !pip install sentencepiece datasets portalocker"],"id":"tUHgxeXwM1J_"},{"cell_type":"markdown","metadata":{"id":"CSodHP5IM1J_"},"source":["## 2. 
Data Loading (ALT Dataset)\n","Loading Thai-English pairs from ALT."],"id":"CSodHP5IM1J_"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":541,"referenced_widgets":["d4ec5329c1844388b4f6b896b76c586e","aa53e174527f43dcb3fa9fae564098d9","b68adfcf941447f38a1b49e9b709308b","41c73eb3cc5f4938a6091d907620233a","940dbf51962f4e98b3e36155dd166fd7","f9041e18aa7342199f63cc555eda0993","73b36006efa64e22b9c480f1f85828c0","ef841660bc9a4fb5b5f99f84507480fe","a8002a144c3447bfa3aee9212129fd1a","28bb8872ccee4dc4a3fb8b0a078e3576","c735929e87eb435d801c413ff9b0fca0","dfd4e0697b15478c839a697e7744852d","0d05c76faa734d6195ef21f2403a666f","0745b063a2df4c48b58f413b637bdaa2","baf15b3bf3774eed9738e244ebd7b16c","8036e1c5dd76462ba3bdc17c73f2f06e","7b43840efecb47b2bcd4611b99df11d7","7b91e2472b9b44e29bd431dd22b1c737","3064ea06da344e7dac706999c2785261","5572d2fe61ac43118afa9b304b16b510","a7d4a56724634785a2f1416c8f08eb10","aaed2b2a50ef4370bb8ae1951c386a91","20147e3fe9dc4e888f33069c3c3f523e","ff646d16028842a29963ee5cb4eae215","70d8e5233dfd49519a4a12e9fbc45351","25b982891ac04ccb87f2555549dbbd18","f5b81c8f4ca44375b763e77d8b7f6c15","1e2345f0d2fa4f5582c5635344a9ff4d","cada57a9f19247d69aa72a9b9c1d4000","e4d59e8900eb43909d06b969bd735289","21126f0d2ca6408ab3ffdc2d288a74e3","ea27c3e675ac4f1588f59f47d06e1263","edfc67720c9641cf8c8876f1bad6dcea","af5d3bec80a2485ba399db24ba326c91","0afa3b2829104136a268120d7e48fa1e","87ed005e275e4c05846fd9f4cb7e3e8c","2fd5509f4c8b44c1ae0d8f65a9808e19","cd37370826174938b41dddaa78e2ad2e","728ef89112d049d2adb5626823a31164","cb17f9c954dc4d8ab4ad1411fb9e01da","ab1a3305c8e64d698ccf96bf2e89f8f6","f7afee3e635249d0b7962eb73876add8","39f6e71fb05246798a7e1e5f86dc6c8a","013b681e3d32499095ffa169e0f8d27e","90b4ee1a9def4ef8be493eda8a40f873","0afd65653e4148b4b01d8b75699c5c49","99b699e79fb3466c80bf532f1f4978e8","b57a41822d80427b8bf08861f424806d","f95fa07a24a642ceb167385cd41237e6","61d30f08cf91410b8881944c71dbdf72","25f96a4aa8c840efb6c8d3c95
4c3440a","414890d7ea3848d982503a684f9b1438","a17e670578444551bbe6a33ce3eb1469","3e4e6bcd2b114edc86291e3fb6fc64ae","a15d1a9f2848416f812c1e2101054dca","8e2fa8bbd8e54193a1603669e5b73d99","48597d1c17ce4c4596e06db84fa57c0e","235a020b60fd4417b223de7522d98904","6b00786102084a9d8ac295a86b4a018e","50b37c8c62434616b00ca71334033ea3","e8c79aaefe8d4b58b63a6d630137c238","d2b8d4ebfc064bf6826d2e71e2d764e6","3202b4ce97ad44ad9877d3f5f9fc4c41","323a619824c442b8aa8609d255b0d470","3c6fef78cd854e2ab3fa23af01faa51f","29f1f1575fb443a6b1e01741a7d63f6a","c543038331c449de80fd72f37458950b","9bcf8d42b16645ddb29750021e5e52d6","1d09575ade5048ccb4501812157a2c21","4f75e0e560704eeab8aeb09be89add0b","cbefddfd5f5f4f8da3c1fa962a110737","48ce12da1224499f9238b21b0d3b2f4f","e3b3496a577e4d92b2b0e7edd4f888c4","bf35bf464ace496aa23fd8a184be1cd3","78aa0dae0714424b8019e3c6f25008d8","02a269db1c7c48d4b3c2ebb647abd98d","043730347106465e9995143ebbd7852c"]},"id":"VzLTzFz5M1KA","executionInfo":{"status":"ok","timestamp":1770435895935,"user_tz":-420,"elapsed":10028,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"333d7e1b-5821-45fa-ae65-ebefdcce6bab"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading ALT Dataset (Thai-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d4ec5329c1844388b4f6b896b76c586e"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["alt-parallel/train-00000-of-00001.parque(…): 0%| | 0.00/31.2M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"dfd4e0697b15478c839a697e7744852d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/validation-00000-of-00001.p(…): 0%| | 0.00/1.71M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"20147e3fe9dc4e888f33069c3c3f523e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/test-00000-of-00001.parquet: 0%| | 0.00/1.79M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"af5d3bec80a2485ba399db24ba326c91"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/18088 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"90b4ee1a9def4ef8be493eda8a40f873"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/1000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8e2fa8bbd8e54193a1603669e5b73d99"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/1019 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c543038331c449de80fd72f37458950b"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 20107 sentences from ALT dataset.\n","Extracted 20107 Thai-English pairs.\n"]}],"source":["from datasets import load_dataset\n","\n","print(\"Loading ALT Dataset (Thai-English)...\")\n","try:\n"," dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from ALT dataset.\")\n","\n"," # Filter/Extract only Thai and English\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'th' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'th': item['translation']['th'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," print(f\"Extracted {len(data)} Thai-English pairs.\")\n","\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")\n"],"id":"VzLTzFz5M1KA"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"RjL0ai03M1KA","executionInfo":{"status":"ok","timestamp":1770435895973,"user_tz":-420,"elapsed":35,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"38846639-c1e9-4864-afba-8f6312d502c3"},"outputs":[{"output_type":"stream","name":"stdout","text":[" th \\\n","0 อิตาลีได้เอาชนะโปรตุเกสด้วยคะแนน31ต่อ5 ในกลุ่ม... \n","1 Andrea Masi ได้เปิดฉากทำคะแนนในนาทีที่สี่ ด้วย... \n","2 ทั้งที่เป็นฝ่ายคุมเกมส์ในครึ่งแรกของการแข่งขัน... \n","3 โปรตุเกสไม่ละความพยยาม และDavid Penalvaได้ทำคะ... \n","4 ในครึ่งแรกอิตาลีขึ้นนำด้วยคะแนน16 ต่อ5 แต่ถูกป... \n","\n"," en \n","0 Italy have defeated Portugal 31-5 in Pool C of... \n","1 Andrea Masi opened the scoring in the fourth m... \n","2 Despite controlling the game for much of the f... \n","3 Portugal never gave up and David Penalva score... \n","4 Italy led 16-5 at half time but were matched b... 
\n","After cleaning: 20101 pairs\n","\n","--- Data Alignment Check ---\n","Source (th): ปัญหาเริ่มจากมีกลุ่ม \"นักปีนเขาที่สวมแต่รองเท้าบูทเท่านั้น\" ถูกตำรวจที่ Alpine จับเมื่อฤดูใบไม้ร่วงที่ผ่านมา\n","Target (en): The problem started with a group of \"boot-only hikers\" who were stopped by the police in the Alpine region last autumn.\n","--------------------\n","Source (th): ส่วนผู้สมัครคนอื่น ๆ ได้แก่ เจมี แม็คการ์วีย์, กรีน เกล็น ฮอดจ์สัน จากพรรคริเบอรัล และเดวิด โรว์แลนด์ ผู้สมัครอิสระ\n","Target (en): Other candidates in the riding are Liberal Jamie McGarvey, Green Glen Hodgson, and independent David Rowland.\n","--------------------\n","Source (th): \"ผู้หญิงที่ได้รับบาดเจ็บถูกหามขึ้นบนรถฉุกเฉิน Lebanon Ambulance One และส่งไปที่โรงพยาบาล Frisbie ใน Rochester เพื่อรักษาอาการให้คงที่ จากนั้นถูกส่งต่อไปที่ศูนย์อุบัติเหตุที่ Maine Medical Center\"\n","Target (en): \"The female patient was loaded into Lebanon Ambulance One and transported to Frisbie Hospital in Rochester to be stabilized and then was transferred to the trauma center at Maine Medical Center.\"\n","--------------------\n","Source (th): อดึตรัฐมนตรีว่าการกระทรวงพลังงานนิวเคลียร์ Yevgeny Adamov ได้ถูกจับกุมเมื่อวันจันทร์โดยเจ้าหน้าที่สวิสเซอร์แลนด์\n","Target (en): Former Russian nuclear energy minister Yevgeny Adamov was arrested on Monday by Swiss authorities.\n","--------------------\n","Source (th): พยานผู้เห็นเหตุการณ์รายหนึ่ง ให้สำนักข่าวว้อยซ์ ออฟ อเมริกาว่า ตอนเช้าวันศุกร์เขากำลังนั่งอยู่ในร้านของเขา เมื่อตอนที่ระเบิดใหญ่สะเทือนบริเวณนั้น เกิดฝุ่นคลุ้งไปทั่วบริเวณและทำให้สิ่งของหล่นจากผนัง\n","Target (en): One eyewitness told the Voice of America news agency he was sitting in his shop when a big explosion shook the area Friday morning, sending dust in the air and causing objects to fall from the walls.\n","--------------------\n"]}],"source":["# Convert to DataFrame\n","df = pd.DataFrame(data)\n","print(df.head())\n","\n","# Basic Cleaning\n","df = df.dropna(subset=['th', 'en'])\n","df['th'] 
= df['th'].astype(str)\n","df['en'] = df['en'].astype(str)\n","\n","df = df[df['th'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']\n","print(f\"After cleaning: {len(df)} pairs\")\n","\n","print(\"\\n--- Data Alignment Check ---\")\n","for i in range(5):\n"," sample = df.sample(1).iloc[0]\n"," print(f\"Source (th): {sample['th']}\")\n"," print(f\"Target (en): {sample['en']}\")\n"," print(\"-\" * 20)"],"id":"RjL0ai03M1KA"},{"cell_type":"markdown","metadata":{"id":"mL8WYE8YM1KA"},"source":["## 3. Tokenization (SentencePiece)\n","Training separate tokenizers for Thai (`spm_th`) and English (`spm_en_th`)."],"id":"mL8WYE8YM1KA"},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cvDYVzxQM1KA","executionInfo":{"status":"ok","timestamp":1770435910685,"user_tz":-420,"elapsed":14694,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"0616ff53-1703-43d0-e9bd-56e30cd9c417"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Thai Tokenizer...\n","Training English Tokenizer (for Thai pair)...\n","Tokenizer training complete!\n"]}],"source":["import sentencepiece as spm\n","\n","# 1. Save texts to files\n","with open('train_th.txt', 'w', encoding='utf-8') as f:\n"," for line in df['th']:\n"," f.write(line + '\\n')\n","\n","with open('train_en_th.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']:\n"," f.write(line + '\\n')\n","\n","# 2. 
Train SentencePiece models\n","vocab_size = 4000\n","model_type = 'bpe'\n","\n","print(\"Training Thai Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_th.txt',\n"," model_prefix='spm_th',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Thai pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_th.txt',\n"," model_prefix='spm_en_th',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Tokenizer training complete!\")"],"id":"cvDYVzxQM1KA"},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fssRkvGwM1KA","executionInfo":{"status":"ok","timestamp":1770435910711,"user_tz":-420,"elapsed":23,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"7322cd51-b246-43a7-e75c-2f22935d68ca"},"outputs":[{"output_type":"stream","name":"stdout","text":["Original th: อิตาลีได้เอาชนะโปรตุเกสด้วยคะแนน31ต่อ5 ในกลุ่มc ของการแข่งขันรักบี้เวิลด์คัพปี2007 ที่สนามปาร์กเดแพร็งส์ ที่กรุงปารีส ประเทศฝรั่งเศส\n","Tokens: ['▁', 'อิตาลี', 'ได้', 'เอาชนะ', 'โปร', 'ตุ', 'เก', 'ส', 'ด้วยคะแนน', '3', '1', 'ต่อ', '5', '▁ใน', 'กลุ่ม', 'c', '▁ของ', 'การแข่งขัน', 'รัก', 'บ', 'ี้', 'เ', 'วิ', 'ล', 'ด์', 'ค', 'ัพ', 'ปี', '200', '7', '▁ที่', 'สนาม', 'ป', 'าร์', 'ก', 'เด', 'แพร', '็ง', 'ส์', '▁ที่', 'กรุง', 'ป', 'าร', 'ี', 'ส', '▁ประเทศ', 'ฝรั่งเศส']\n","IDs: [3866, 2645, 25, 2150, 2037, 170, 70, 3882, 2998, 3950, 3931, 120, 3947, 109, 321, 3929, 420, 806, 358, 3886, 59, 3872, 102, 3879, 389, 3888, 1481, 108, 2812, 3970, 237, 955, 3890, 374, 3869, 111, 1251, 1222, 729, 237, 1028, 3890, 4, 3877, 3882, 990, 1232]\n"]}],"source":["# Load the processors\n","sp_th = spm.SentencePieceProcessor(model_file='spm_th.model')\n","sp_en = spm.SentencePieceProcessor(model_file='spm_en_th.model')\n","\n","# Test Tokenization\n","idx = 
0\n","print(f\"Original th: {df.iloc[idx]['th']}\")\n","print(f\"Tokens: {sp_th.encode(df.iloc[idx]['th'], out_type=str)}\")\n","print(f\"IDs: {sp_th.encode(df.iloc[idx]['th'], out_type=int)}\")"],"id":"fssRkvGwM1KA"},{"cell_type":"markdown","metadata":{"id":"4uCE2oXHM1KA"},"source":["## 4. PyTorch Dataset and DataLoader"],"id":"4uCE2oXHM1KA"},{"cell_type":"code","execution_count":7,"metadata":{"id":"Elb_T_R1M1KB","executionInfo":{"status":"ok","timestamp":1770435910717,"user_tz":-420,"elapsed":7,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['th']\n"," trg_text = self.data.iloc[idx]['en']\n","\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n","\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n","\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n","\n"," return src_pad, trg_pad\n","\n","# Split Data\n","train_df = df.sample(frac=0.8, random_state=SEED)\n","val_test_df = df.drop(train_df.index)\n","val_df = val_test_df.sample(frac=0.5, random_state=SEED)\n","test_df = val_test_df.drop(val_df.index)\n","\n","train_dataset = TranslationDataset(train_df, sp_th, sp_en)\n","val_dataset = TranslationDataset(val_df, sp_th, sp_en)\n","test_dataset = TranslationDataset(test_df, sp_th, sp_en)\n","\n","BATCH_SIZE = 64\n","train_loader 
= DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)\n","val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)\n","test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)"],"id":"Elb_T_R1M1KB"},{"cell_type":"markdown","metadata":{"id":"ghbp1QkpM1KB"},"source":["## 5. Transformer Model"],"id":"ghbp1QkpM1KB"},{"cell_type":"code","execution_count":8,"metadata":{"id":"Oe5LIn2lM1KB","executionInfo":{"status":"ok","timestamp":1770435910736,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=512, nhead=8, num_encoder_layers=3,\n"," num_decoder_layers=3, dim_feedforward=2048, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n","\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n","\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n","\n"," self.transformer = nn.Transformer(\n"," d_model=d_model,\n"," nhead=nhead,\n"," num_encoder_layers=num_encoder_layers,\n"," num_decoder_layers=num_decoder_layers,\n"," dim_feedforward=dim_feedforward,\n"," dropout=dropout,\n"," batch_first=True\n"," )\n","\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n"," self.init_weights()\n","\n"," def init_weights(self):\n"," for p in self.parameters():\n"," if p.dim() > 1:\n"," nn.init.xavier_uniform_(p)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n","\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * 
math.sqrt(self.d_model))\n","\n"," output = self.transformer(\n"," src=src_emb,\n"," tgt=trg_emb,\n"," tgt_mask=trg_mask,\n"," src_key_padding_mask=src_key_padding_mask\n"," )\n"," return self.fc_out(output)\n","\n","class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)"],"id":"Oe5LIn2lM1KB"},{"cell_type":"markdown","metadata":{"id":"ot44IfLkM1KB"},"source":["## 6. Training"],"id":"ot44IfLkM1KB"},{"cell_type":"code","execution_count":9,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TPsbvXwKM1KB","executionInfo":{"status":"ok","timestamp":1770438089612,"user_tz":-420,"elapsed":2178869,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"7211b8cc-f8a7-4cb3-dcfd-49c1bccf342a"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting training...\n","Epoch: 01 | Train Loss: 6.842\n","Epoch: 02 | Train Loss: 6.028\n","Epoch: 03 | Train Loss: 5.618\n","Epoch: 04 | Train Loss: 5.384\n","Epoch: 05 | Train Loss: 5.247\n","Epoch: 06 | Train Loss: 5.146\n","Epoch: 07 | Train Loss: 5.064\n","Epoch: 08 | Train Loss: 4.994\n","Epoch: 09 | Train Loss: 4.929\n","Epoch: 10 | Train Loss: 4.868\n","Epoch: 11 | Train Loss: 4.809\n","Epoch: 12 | Train Loss: 4.755\n","Epoch: 13 | Train Loss: 4.703\n","Epoch: 14 | Train Loss: 4.653\n","Epoch: 15 | Train Loss: 4.607\n","Epoch: 16 | Train Loss: 4.563\n","Epoch: 17 | Train Loss: 4.523\n","Epoch: 18 | Train Loss: 
4.487\n","Epoch: 19 | Train Loss: 4.450\n","Epoch: 20 | Train Loss: 4.416\n","Epoch: 21 | Train Loss: 4.383\n","Epoch: 22 | Train Loss: 4.353\n","Epoch: 23 | Train Loss: 4.327\n","Epoch: 24 | Train Loss: 4.300\n","Epoch: 25 | Train Loss: 4.277\n","Epoch: 26 | Train Loss: 4.252\n","Epoch: 27 | Train Loss: 4.229\n","Epoch: 28 | Train Loss: 4.208\n","Epoch: 29 | Train Loss: 4.189\n","Epoch: 30 | Train Loss: 4.169\n","Epoch: 31 | Train Loss: 4.150\n","Epoch: 32 | Train Loss: 4.132\n","Epoch: 33 | Train Loss: 4.115\n","Epoch: 34 | Train Loss: 4.099\n","Epoch: 35 | Train Loss: 4.084\n","Epoch: 36 | Train Loss: 4.070\n","Epoch: 37 | Train Loss: 4.055\n","Epoch: 38 | Train Loss: 4.043\n","Epoch: 39 | Train Loss: 4.028\n","Epoch: 40 | Train Loss: 4.014\n","Epoch: 41 | Train Loss: 4.001\n","Epoch: 42 | Train Loss: 3.988\n","Epoch: 43 | Train Loss: 3.979\n","Epoch: 44 | Train Loss: 3.967\n","Epoch: 45 | Train Loss: 3.960\n","Epoch: 46 | Train Loss: 3.946\n","Epoch: 47 | Train Loss: 3.936\n","Epoch: 48 | Train Loss: 3.929\n","Epoch: 49 | Train Loss: 3.915\n","Epoch: 50 | Train Loss: 3.909\n","Epoch: 51 | Train Loss: 3.897\n","Epoch: 52 | Train Loss: 3.889\n","Epoch: 53 | Train Loss: 3.881\n","Epoch: 54 | Train Loss: 3.872\n","Epoch: 55 | Train Loss: 3.865\n","Epoch: 56 | Train Loss: 3.856\n","Epoch: 57 | Train Loss: 3.851\n","Epoch: 58 | Train Loss: 3.844\n","Epoch: 59 | Train Loss: 3.836\n","Epoch: 60 | Train Loss: 3.830\n","Epoch: 61 | Train Loss: 3.821\n","Epoch: 62 | Train Loss: 3.817\n","Epoch: 63 | Train Loss: 3.807\n","Epoch: 64 | Train Loss: 3.802\n","Epoch: 65 | Train Loss: 3.798\n","Epoch: 66 | Train Loss: 3.792\n","Epoch: 67 | Train Loss: 3.785\n","Epoch: 68 | Train Loss: 3.779\n","Epoch: 69 | Train Loss: 3.774\n","Epoch: 70 | Train Loss: 3.770\n","Epoch: 71 | Train Loss: 3.763\n","Epoch: 72 | Train Loss: 3.757\n","Epoch: 73 | Train Loss: 3.751\n","Epoch: 74 | Train Loss: 3.748\n","Epoch: 75 | Train Loss: 3.742\n","Epoch: 76 | Train Loss: 3.737\n","Epoch: 77 | Train 
Loss: 3.734\n","Epoch: 78 | Train Loss: 3.727\n","Epoch: 79 | Train Loss: 3.726\n","Epoch: 80 | Train Loss: 3.719\n","Epoch: 81 | Train Loss: 3.714\n","Epoch: 82 | Train Loss: 3.713\n","Epoch: 83 | Train Loss: 3.711\n","Epoch: 84 | Train Loss: 3.704\n","Epoch: 85 | Train Loss: 3.698\n","Epoch: 86 | Train Loss: 3.696\n","Epoch: 87 | Train Loss: 3.693\n","Epoch: 88 | Train Loss: 3.687\n","Epoch: 89 | Train Loss: 3.682\n","Epoch: 90 | Train Loss: 3.681\n","Epoch: 91 | Train Loss: 3.677\n","Epoch: 92 | Train Loss: 3.674\n","Epoch: 93 | Train Loss: 3.670\n","Epoch: 94 | Train Loss: 3.666\n","Epoch: 95 | Train Loss: 3.663\n","Epoch: 96 | Train Loss: 3.660\n","Epoch: 97 | Train Loss: 3.655\n","Epoch: 98 | Train Loss: 3.653\n","Epoch: 99 | Train Loss: 3.650\n","Epoch: 100 | Train Loss: 3.648\n"]}],"source":["SRC_VOCAB_SIZE = vocab_size\n","TRG_VOCAB_SIZE = vocab_size\n","D_MODEL = 256\n","N_HEAD = 4\n","NUM_LAYERS = 2\n","FF_DIM = 512\n","DROPOUT = 0.4\n","LR = 0.0005\n","EPOCHS = 100\n","\n","model = TransformerModel(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, D_MODEL, N_HEAD, NUM_LAYERS, NUM_LAYERS, FF_DIM, DROPOUT).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=LR)\n","criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)\n","\n","def train(model, iterator, optimizer, criterion, clip):\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(iterator):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output_dim = output.shape[-1]\n"," output = output.contiguous().view(-1, output_dim)\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," return epoch_loss / len(iterator)\n","\n","print(\"Starting training...\")\n","for epoch in range(EPOCHS):\n"," train_loss = train(model, train_loader, optimizer, 
criterion, 1.0)\n"," print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')\n"," # Save every epoch or best validation (skipped val loop for brevity here, but included in full code)\n"," torch.save(model.state_dict(), 'transformer_model_th.pt')"],"id":"TPsbvXwKM1KB"},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"OCQG6akdM1KB","executionInfo":{"status":"ok","timestamp":1770438089644,"user_tz":-420,"elapsed":27,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"99f934d0-7490-4a34-bdd7-c2f0e5548986"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Save artifacts for Web App\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_th.pt', 'app/models/transformer_model_th.pt')\n","shutil.copy('spm_th.model', 'app/models/spm_th.model')\n","shutil.copy('spm_en_th.model', 'app/models/spm_en_th.model')\n","print(\"Models copied to app/models/\")"],"id":"OCQG6akdM1KB"}],"metadata":{"kernelspec":{"display_name":"Python 
3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"d4ec5329c1844388b4f6b896b76c586e":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_aa53e174527f43dcb3fa9fae564098d9","IPY_MODEL_b68adfcf941447f38a1b49e9b709308b","IPY_MODEL_41c73eb3cc5f4938a6091d907620233a"],"layout":"IPY_MODEL_940dbf51962f4e98b3e36155dd166fd7"}},"aa53e174527f43dcb3fa9fae564098d9":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f9041e18aa7342199f63cc555eda0993","placeholder":"​","style":"IPY_MODEL_73b36006efa64e22b9c480f1f85828c0","value":"README.md: 
"}},"b68adfcf941447f38a1b49e9b709308b":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ef841660bc9a4fb5b5f99f84507480fe","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_a8002a144c3447bfa3aee9212129fd1a","value":1}},"41c73eb3cc5f4938a6091d907620233a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_28bb8872ccee4dc4a3fb8b0a078e3576","placeholder":"​","style":"IPY_MODEL_c735929e87eb435d801c413ff9b0fca0","value":" 13.2k/? 
[00:00&lt;00:00, 931kB/s]"}},"940dbf51962f4e98b3e36155dd166fd7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f9041e18aa7342199f63cc555eda0993":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"ov
erflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"73b36006efa64e22b9c480f1f85828c0":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ef841660bc9a4fb5b5f99f84507480fe":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"a8002a144c3447bfa3aee9212129fd1a":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"28bb8872ccee4dc4a3fb8b0a0
78e3576":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c735929e87eb435d801c413ff9b0fca0":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"dfd4e0697b15478c839a697e7744852d":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0d05c76faa734d6195ef21f2403a666f","IPY_MODEL_0745b063a2df4c48b58f413b637bdaa2","IPY_MODEL_baf15b3bf3774eed9738e244ebd7b16c"],"layout":"IPY_MODEL_8036e1c5dd76462
ba3bdc17c73f2f06e"}},"0d05c76faa734d6195ef21f2403a666f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_7b43840efecb47b2bcd4611b99df11d7","placeholder":"​","style":"IPY_MODEL_7b91e2472b9b44e29bd431dd22b1c737","value":"alt-parallel/train-00000-of-00001.parque(…): 100%"}},"0745b063a2df4c48b58f413b637bdaa2":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3064ea06da344e7dac706999c2785261","max":31211167,"min":0,"orientation":"horizontal","style":"IPY_MODEL_5572d2fe61ac43118afa9b304b16b510","value":31211167}},"baf15b3bf3774eed9738e244ebd7b16c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a7d4a56724634785a2f1416c8f08eb10","placeholder":"​","style":"IPY_MODEL_aaed2b2a50ef4370bb8ae1951c386a91","value":" 31.2M/31.2M [00:01&lt;00:00, 
24.5MB/s]"}},"8036e1c5dd76462ba3bdc17c73f2f06e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7b43840efecb47b2bcd4611b99df11d7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7b91e2472b9b44e29bd431dd22b1c737":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3064ea06da344e7dac706999c2785261":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5572d2fe61ac43118afa9b304b16b510":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a7d4a56724634785a2f1416c8f08eb10":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"aaed2b2a50ef4370bb8ae1951c386a91":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"20147e3fe9dc4e888f33069c3c3f523e":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ff646d16028842a29963ee5cb4eae215","IPY_MODEL_70d8e5233dfd49519a4a12e9fbc45351","IPY_MODEL_25b982891ac04ccb87f2555549dbbd18"],"layout":"IPY_MODEL_f5b81c8f4ca44375b763e77d8b7f6c15"
}},"ff646d16028842a29963ee5cb4eae215":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1e2345f0d2fa4f5582c5635344a9ff4d","placeholder":"​","style":"IPY_MODEL_cada57a9f19247d69aa72a9b9c1d4000","value":"alt-parallel/validation-00000-of-00001.p(…): 100%"}},"70d8e5233dfd49519a4a12e9fbc45351":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e4d59e8900eb43909d06b969bd735289","max":1710203,"min":0,"orientation":"horizontal","style":"IPY_MODEL_21126f0d2ca6408ab3ffdc2d288a74e3","value":1710203}},"25b982891ac04ccb87f2555549dbbd18":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_ea27c3e675ac4f1588f59f47d06e1263","placeholder":"​","style":"IPY_MODEL_edfc67720c9641cf8c8876f1bad6dcea","value":" 1.71M/1.71M [00:00&lt;00:00, 
3.69MB/s]"}},"f5b81c8f4ca44375b763e77d8b7f6c15":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1e2345f0d2fa4f5582c5635344a9ff4d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cada57a9f19247d69aa72a9b9c1d4000":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e4d59e8900eb43909d06b969bd735289":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"21126f0d2ca6408ab3ffdc2d288a74e3":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"ea27c3e675ac4f1588f59f47d06e1263":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"edfc67720c9641cf8c8876f1bad6dcea":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"af5d3bec80a2485ba399db24ba326c91":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0afa3b2829104136a268120d7e48fa1e","IPY_MODEL_87ed005e275e4c05846fd9f4cb7e3e8c","IPY_MODEL_2fd5509f4c8b44c1ae0d8f65a9808e19"],"layout":"IPY_MODEL_cd37370826174938b41dddaa78e2ad2e"
}},"0afa3b2829104136a268120d7e48fa1e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_728ef89112d049d2adb5626823a31164","placeholder":"​","style":"IPY_MODEL_cb17f9c954dc4d8ab4ad1411fb9e01da","value":"alt-parallel/test-00000-of-00001.parquet: 100%"}},"87ed005e275e4c05846fd9f4cb7e3e8c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ab1a3305c8e64d698ccf96bf2e89f8f6","max":1786537,"min":0,"orientation":"horizontal","style":"IPY_MODEL_f7afee3e635249d0b7962eb73876add8","value":1786537}},"2fd5509f4c8b44c1ae0d8f65a9808e19":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_39f6e71fb05246798a7e1e5f86dc6c8a","placeholder":"​","style":"IPY_MODEL_013b681e3d32499095ffa169e0f8d27e","value":" 1.79M/1.79M [00:00&lt;00:00, 
4.44MB/s]"}},"cd37370826174938b41dddaa78e2ad2e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"728ef89112d049d2adb5626823a31164":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cb17f9c954dc4d8ab4ad1411fb9e01da":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ab1a3305c8e64d698ccf96bf2e89f8f6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f7afee3e635249d0b7962eb73876add8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"39f6e71fb05246798a7e1e5f86dc6c8a":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"013b681e3d32499095ffa169e0f8d27e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"90b4ee1a9def4ef8be493eda8a40f873":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_0afd65653e4148b4b01d8b75699c5c49","IPY_MODEL_99b699e79fb3466c80bf532f1f4978e8","IPY_MODEL_b57a41822d80427b8bf08861f424806d"],"layout":"IPY_MODEL_f95fa07a24a642ceb167385cd41237e6"
}},"0afd65653e4148b4b01d8b75699c5c49":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_61d30f08cf91410b8881944c71dbdf72","placeholder":"​","style":"IPY_MODEL_25f96a4aa8c840efb6c8d3c954c3440a","value":"Generating train split: 100%"}},"99b699e79fb3466c80bf532f1f4978e8":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_414890d7ea3848d982503a684f9b1438","max":18088,"min":0,"orientation":"horizontal","style":"IPY_MODEL_a17e670578444551bbe6a33ce3eb1469","value":18088}},"b57a41822d80427b8bf08861f424806d":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3e4e6bcd2b114edc86291e3fb6fc64ae","placeholder":"​","style":"IPY_MODEL_a15d1a9f2848416f812c1e2101054dca","value":" 18088/18088 [00:00&lt;00:00, 51523.10 
examples/s]"}},"f95fa07a24a642ceb167385cd41237e6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"61d30f08cf91410b8881944c71dbdf72":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"25f96a4aa8c840efb6c8d3c954c3440a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"414890d7ea3848d982503a684f9b1438":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a17e670578444551bbe6a33ce3eb1469":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"3e4e6bcd2b114edc86291e3fb6fc64ae":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a15d1a9f2848416f812c1e2101054dca":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"8e2fa8bbd8e54193a1603669e5b73d99":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_48597d1c17ce4c4596e06db84fa57c0e","IPY_MODEL_235a020b60fd4417b223de7522d98904","IPY_MODEL_6b00786102084a9d8ac295a86b4a018e"],"layout":"IPY_MODEL_50b37c8c62434616b00ca71334033ea
3"}},"48597d1c17ce4c4596e06db84fa57c0e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e8c79aaefe8d4b58b63a6d630137c238","placeholder":"​","style":"IPY_MODEL_d2b8d4ebfc064bf6826d2e71e2d764e6","value":"Generating validation split: 100%"}},"235a020b60fd4417b223de7522d98904":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3202b4ce97ad44ad9877d3f5f9fc4c41","max":1000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_323a619824c442b8aa8609d255b0d470","value":1000}},"6b00786102084a9d8ac295a86b4a018e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3c6fef78cd854e2ab3fa23af01faa51f","placeholder":"​","style":"IPY_MODEL_29f1f1575fb443a6b1e01741a7d63f6a","value":" 1000/1000 [00:00&lt;00:00, 25709.06 
examples/s]"}},"50b37c8c62434616b00ca71334033ea3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e8c79aaefe8d4b58b63a6d630137c238":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d2b8d4ebfc064bf6826d2e71e2d764e6":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3202b4ce97ad44ad9877d3f5f9fc4c41":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"323a619824c442b8aa8609d255b0d470":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"3c6fef78cd854e2ab3fa23af01faa51f":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"29f1f1575fb443a6b1e01741a7d63f6a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c543038331c449de80fd72f37458950b":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_9bcf8d42b16645ddb29750021e5e52d6","IPY_MODEL_1d09575ade5048ccb4501812157a2c21","IPY_MODEL_4f75e0e560704eeab8aeb09be89add0b"],"layout":"IPY_MODEL_cbefddfd5f5f4f8da3c1fa962a11073
7"}},"9bcf8d42b16645ddb29750021e5e52d6":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_48ce12da1224499f9238b21b0d3b2f4f","placeholder":"​","style":"IPY_MODEL_e3b3496a577e4d92b2b0e7edd4f888c4","value":"Generating test split: 100%"}},"1d09575ade5048ccb4501812157a2c21":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_bf35bf464ace496aa23fd8a184be1cd3","max":1019,"min":0,"orientation":"horizontal","style":"IPY_MODEL_78aa0dae0714424b8019e3c6f25008d8","value":1019}},"4f75e0e560704eeab8aeb09be89add0b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_02a269db1c7c48d4b3c2ebb647abd98d","placeholder":"​","style":"IPY_MODEL_043730347106465e9995143ebbd7852c","value":" 1019/1019 [00:00&lt;00:00, 25704.84 
examples/s]"}},"cbefddfd5f5f4f8da3c1fa962a110737":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"48ce12da1224499f9238b21b0d3b2f4f":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e3b3496a577e4d92b2b0e7edd4f888c4":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"bf35bf464ace496aa23fd8a184be1cd3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"78aa0dae0714424b8019e3c6f25008d8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"02a269db1c7c48d4b3c2ebb647abd98d":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"043730347106465e9995143ebbd7852c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
Urdu_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"1HmZNMCVfYGZ"},"source":["# Urdu-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Urdu (ur) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **Opus-100** dataset for Urdu-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"1HmZNMCVfYGZ"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"i4bBOIlgfYGb","executionInfo":{"status":"ok","timestamp":1770440810575,"user_tz":-420,"elapsed":5851,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"a2893c57-6f17-4fd5-c960-f284633cceb2"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"i4bBOIlgfYGb"},{"cell_type":"markdown","metadata":{"id":"rYPtZV_DfYGb"},"source":["## 2. 
Data Loading (Opus-100)\n","Loading Urdu-English pairs from Opus-100."],"id":"rYPtZV_DfYGb"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":635,"referenced_widgets":["4c07e9f9ef4a43998f8815cbb8a4f7d4","94779a76a45b4befba74054bbfd467d0","0e2c97e85f024a879cf7f58511052578","d94a089f1efa411fb8ccb82d2e26ade0","458c9e4ce6b443e98e5f8fb1c53e5e29","d8e59fb3dff34a4a8d422555e0aea6f7","4df631a1c84e46ce8b342d9ef832a740","d91a021523834b389c9ce9c8f9cec2bb","e8606c142c2e4bf39002074eacde19e8","cdab68a2574c4f8796a9555e5636389b","d502ee27f9d441e49b9e52d6c4344a22","7727c5cc79c747ddba6c3fba9adedd03","ccf60a63a5d740f7a4a937d00d12298c","522456874d164a9491bc74b16283c957","b95b1fe20f094222b6406f553bd013c8","fa14c7d360444c12b945e6acb210d1bc","1b05a64ef6544cb8af0d833659dc7508","e4ec51cc4ac54d05abfc1eadbfae1bb0","3a78306c9ed445f7a0b0204d7dbf98d2","6bb9ddb68e014790b2ce6f3b81814a82","a04f65b9c24c4a9c9ff57452d4142902","6008b4c3a29f4ab29a6aa97f0fd40745","98d2c90e3255476c868ce2aec11bd40f","5ff6b67810814fceabd7655048f3edf6","fc5319fd0372476cbe90c7c6e66f6bf6","33e3b83f9e8f4066b4f9dd38f86a5f34","7557fb3a682a4abba297ee872c243c83","3696d6c431184ac2a787e8be379885e3","39796bec65a74876b423a5beefc37e83","ce9ec3c5bbab4d9c8e56d556421b6645","316db49fe7cf43489e3ca58a9d313a2a","d1213b9c69284d16883f408f5b1f8048","68256b9bfd2f4d848aea4a6174ce2028","b3876e04d1d844738ee51613344e4739","7380a8c67ceb4857b5d9ffc66a554630","b00a302b235145a8855dc91b74b282bd","4dba981f0e6748168c4e86b2925d324b","63c6aa1173c54d49b3eb0d805d7c477b","b78945a747ce40eb95756149fbfb95cc","a662d27f646f4326b3469543f5d2a63c","b21ffa7749674a67a61b0a7fa1ba99f7","d9b9049a68b449df87dad7775d845ee7","f6a6e8f216e84720ba36b15d4d7ac81e","ac4b27ebe30d42e9ba07cb23647ebc27","417c9e6a1dd74232819a230cf8d99943","42267e92152e473897fd47343f02b283","f73dff27cf9c4eb1834c4b2edba6ac7e","03bd7d8f89774129b1252f0aa3ce6fa0","d61a75049aa04f7eb44b63c8134de3a6","ea8e3f0d26ee4b9aa91028882ae7c1ea","bdcfae8bc78542f982ddff5
e76aca0f3","fbe8e753526a494f960e0e23a76d19ca","69df32f2825144f8b398f1fd9583a8be","dbcbeefa1f05437d834fda23b2e93d5a","f8a1cc4691eb412583c3aec6513ce447","f3cd3e9c0c0a420a87b80d32df51a888","651b748ffbcf4b66b19cc6ce63e34e7a","2fb00de56f9d40ffab06bde4e09095bd","7e81d321a5c84fbb8e6d60856e16b7a7","513556bb91324ecdad415564f9b6ab2d","63133d05ec304cfaac336b346308ce6d","061019974f254fd7afc9ca4fe05b3831","d8c92679f4e2420e89da403a69b75a5b","52dfaca0539f46218aec0ee50731b4bd","2b7c3d1a4fb2400baf9ca96a69649b57","0217f1e38530403685127fdd120b6ed0","c86c02301e7144cabab7b596c855c922","97a2cd7cb62f41e3b3c8a4a5a31697b2","a92af3d602c541488d20feeefa361d9c","9149268626854fdbae9a2c7c36f5ef15","cab3bc9f5746483cad0ed7fcd532fc73","f0db55c4f25249019e5d7a2691e50366","4bd624c5737e4adc87d41ef527720016","cc918e93af824c18adefb3fdf4b0f307","cf790630f3254fcea11210ca25667755","b222362d13e447d1873929f6f477084c","7af0010ef5f646c89a932b9299a948ee"]},"id":"QGa0ct6NfYGc","executionInfo":{"status":"ok","timestamp":1770440845487,"user_tz":-420,"elapsed":34909,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"d3a17c97-6d81-47f3-8e15-9cbceac894f7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading Opus-100 Dataset (Urdu-English)...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"4c07e9f9ef4a43998f8815cbb8a4f7d4"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n","WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"]},{"output_type":"display_data","data":{"text/plain":["en-ur/test-00000-of-00001.parquet: 0%| | 0.00/301k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7727c5cc79c747ddba6c3fba9adedd03"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-ur/train-00000-of-00001.parquet: 0%| | 0.00/148M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"98d2c90e3255476c868ce2aec11bd40f"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["en-ur/validation-00000-of-00001.parquet: 0%| | 0.00/296k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"b3876e04d1d844738ee51613344e4739"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"417c9e6a1dd74232819a230cf8d99943"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/753913 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f3cd3e9c0c0a420a87b80d32df51a888"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/2000 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c86c02301e7144cabab7b596c855c922"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 757913 sentences from Opus-100 dataset.\n","Subsampled to 50,000 examples for efficiency.\n","Extracted 50000 Urdu-English pairs.\n"]}],"source":["print(\"Loading Opus-100 Dataset (Urdu-English)...\")\n","try:\n"," # Opus-100 has 'en-ur'\n"," dataset = load_dataset(\"opus100\", \"en-ur\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from Opus-100 dataset.\")\n","\n"," # Extract data\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'ur' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'ur': item['translation']['ur'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," # Limit to manageable size\n"," if len(data) > 50000:\n"," import random\n"," random.shuffle(data)\n"," data = data[:50000]\n"," print(\"Subsampled to 50,000 examples for efficiency.\")\n","\n"," print(f\"Extracted {len(data)} Urdu-English pairs.\")\n","\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"QGa0ct6NfYGc"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"KAxwboZPfYGc","executionInfo":{"status":"ok","timestamp":1770440845692,"user_tz":-420,"elapsed":193,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"800f788f-5319-436c-b300-bf9a902fa14d"},"outputs":[{"output_type":"stream","name":"stdout","text":[" ur \\\n","0 اور وہ کہ جب کہ انہیں ان کے رب کی آیتیں یاد د ... \n","1 یہ سارے کے سارے قیامت کے دن اکیلے اس کے پاس حا... \n","2 شیطان ان سے وعدے کرتا ہےاور انہیں امیدیں دلاتا... \n","3 کیا تم نے نہیں دیکھا کہ خدا نے سات آسمان کیسے ... \n","4 لیکن ہم نے بہت سی نسلیں پیدا کیں جن پر لمبی مد... \n","\n"," en \n","0 Who, when reminded of their Lord's revelations... 
\n","1 And everyone of them will come to Him alone on... \n","2 He [Shaitan (Satan)] makes promises to them, a... \n","3 \"Have you not seen that God has created the se... \n","4 But We raised up (new) generations, and long w... \n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['ur', 'en'])\n","df['ur'] = df['ur'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['ur'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"KAxwboZPfYGc"},{"cell_type":"markdown","metadata":{"id":"YWbTPuM-fYGc"},"source":["## 3. Tokenization"],"id":"YWbTPuM-fYGc"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pjA-NxXffYGc","executionInfo":{"status":"ok","timestamp":1770440849735,"user_tz":-420,"elapsed":4044,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"86c788ea-ff9a-4067-ace9-756dade232ab"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Urdu Tokenizer...\n","Training English Tokenizer (for Urdu pair)...\n"]}],"source":["# Save texts to files\n","with open('train_ur.txt', 'w', encoding='utf-8') as f:\n"," for line in df['ur']: f.write(line + '\\n')\n","\n","with open('train_en_ur.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Urdu Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_ur.txt',\n"," model_prefix='spm_ur',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Urdu pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_ur.txt',\n"," model_prefix='spm_en_ur',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = 
spm.SentencePieceProcessor(model_file='spm_ur.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_ur.model')"],"id":"pjA-NxXffYGc"},{"cell_type":"markdown","metadata":{"id":"2z7j-DBSfYGc"},"source":["## 4. Dataset & Model"],"id":"2z7j-DBSfYGc"},{"cell_type":"code","execution_count":5,"metadata":{"id":"NOC6FziQfYGc","executionInfo":{"status":"ok","timestamp":1770440849738,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['ur']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"NOC6FziQfYGc"},{"cell_type":"code","execution_count":6,"metadata":{"id":"1jck_VZ8fYGd","executionInfo":{"status":"ok","timestamp":1770440849739,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," 
super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return 
self.fc_out(output)"],"id":"1jck_VZ8fYGd"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"A-OVPe8FfYGd","executionInfo":{"status":"ok","timestamp":1770441403046,"user_tz":-420,"elapsed":553307,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"628af614-851e-4e44-e3e0-5043e6cc354f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting Training...\n","Step 0, Loss: 9.226\n","Step 100, Loss: 5.868\n","Step 200, Loss: 5.375\n","Step 300, Loss: 5.134\n","Step 400, Loss: 4.910\n","Step 500, Loss: 4.746\n","Step 600, Loss: 4.627\n","Step 700, Loss: 4.644\n","Epoch 1 Loss: 5.131\n","Step 0, Loss: 4.443\n","Step 100, Loss: 4.345\n","Step 200, Loss: 4.251\n","Step 300, Loss: 4.343\n","Step 400, Loss: 4.134\n","Step 500, Loss: 4.181\n","Step 600, Loss: 4.057\n","Step 700, Loss: 4.120\n","Epoch 2 Loss: 4.244\n","Step 0, Loss: 4.032\n","Step 100, Loss: 4.073\n","Step 200, Loss: 3.957\n","Step 300, Loss: 3.818\n","Step 400, Loss: 3.853\n","Step 500, Loss: 3.864\n","Step 600, Loss: 3.875\n","Step 700, Loss: 3.605\n","Epoch 3 Loss: 3.886\n","Step 0, Loss: 3.811\n","Step 100, Loss: 3.600\n","Step 200, Loss: 3.583\n","Step 300, Loss: 3.638\n","Step 400, Loss: 3.742\n","Step 500, Loss: 3.589\n","Step 600, Loss: 3.483\n","Step 700, Loss: 3.716\n","Epoch 4 Loss: 3.640\n","Step 0, Loss: 3.661\n","Step 100, Loss: 3.563\n","Step 200, Loss: 3.329\n","Step 300, Loss: 3.446\n","Step 400, Loss: 3.519\n","Step 500, Loss: 3.519\n","Step 600, Loss: 3.268\n","Step 700, Loss: 3.564\n","Epoch 5 Loss: 3.458\n","Step 0, Loss: 3.315\n","Step 100, Loss: 3.228\n","Step 200, Loss: 3.513\n","Step 300, Loss: 3.184\n","Step 400, Loss: 3.431\n","Step 500, Loss: 3.297\n","Step 600, Loss: 3.256\n","Step 700, Loss: 3.244\n","Epoch 6 Loss: 3.318\n","Step 0, Loss: 3.247\n","Step 100, Loss: 3.172\n","Step 200, Loss: 3.218\n","Step 300, Loss: 3.424\n","Step 400, Loss: 3.134\n","Step 500, Loss: 3.326\n","Step 600, 
Loss: 3.261\n","Step 700, Loss: 3.400\n","Epoch 7 Loss: 3.203\n","Step 0, Loss: 3.005\n","Step 100, Loss: 3.210\n","Step 200, Loss: 3.063\n","Step 300, Loss: 3.111\n","Step 400, Loss: 3.125\n","Step 500, Loss: 3.147\n","Step 600, Loss: 3.146\n","Step 700, Loss: 2.993\n","Epoch 8 Loss: 3.109\n","Step 0, Loss: 2.872\n","Step 100, Loss: 2.797\n","Step 200, Loss: 3.205\n","Step 300, Loss: 2.936\n","Step 400, Loss: 2.938\n","Step 500, Loss: 3.081\n","Step 600, Loss: 3.183\n","Step 700, Loss: 2.846\n","Epoch 9 Loss: 3.032\n","Step 0, Loss: 2.861\n","Step 100, Loss: 2.839\n","Step 200, Loss: 2.970\n","Step 300, Loss: 2.929\n","Step 400, Loss: 2.815\n","Step 500, Loss: 3.130\n","Step 600, Loss: 3.194\n","Step 700, Loss: 2.977\n","Epoch 10 Loss: 2.962\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(10): # 10 Epochs for demo\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_ur.pt')"],"id":"A-OVPe8FfYGd"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"IzEtrjoJfYGd","executionInfo":{"status":"ok","timestamp":1770441403058,"user_tz":-420,"elapsed":15,"user":{"displayName":"Htut Ko 
Ko","userId":"13068088192988605156"}},"outputId":"0ac1a748-3943-4343-a786-1831f28817bc"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_ur.pt', 'app/models/transformer_model_ur.pt')\n","shutil.copy('spm_ur.model', 'app/models/spm_ur.model')\n","shutil.copy('spm_en_ur.model', 'app/models/spm_en_ur.model')\n","print(\"Models copied to app/models/\")"],"id":"IzEtrjoJfYGd"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"L4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"4c07e9f9ef4a43998f8815cbb8a4f7d4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_94779a76a45b4befba74054bbfd467d0","IPY_MODEL_0e2c97e85f024a879cf7f58511052578","IPY_MODEL_d94a089f1efa411fb8ccb82d2e26ade0"],"layout":"IPY_MODEL_458c9e4ce6b443e98e5f8fb1c53e5e29"}},"94779a76a45b4befba74054bbfd467d0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d8e59fb3dff34
a4a8d422555e0aea6f7","placeholder":"​","style":"IPY_MODEL_4df631a1c84e46ce8b342d9ef832a740","value":"README.md: "}},"0e2c97e85f024a879cf7f58511052578":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_d91a021523834b389c9ce9c8f9cec2bb","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_e8606c142c2e4bf39002074eacde19e8","value":1}},"d94a089f1efa411fb8ccb82d2e26ade0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cdab68a2574c4f8796a9555e5636389b","placeholder":"​","style":"IPY_MODEL_d502ee27f9d441e49b9e52d6c4344a22","value":" 65.4k/? 
[00:00&lt;00:00, 6.80MB/s]"}},"458c9e4ce6b443e98e5f8fb1c53e5e29":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d8e59fb3dff34a4a8d422555e0aea6f7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"o
verflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4df631a1c84e46ce8b342d9ef832a740":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"d91a021523834b389c9ce9c8f9cec2bb":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"e8606c142c2e4bf39002074eacde19e8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"cdab68a2574c4f8796a9555e
5636389b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d502ee27f9d441e49b9e52d6c4344a22":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"7727c5cc79c747ddba6c3fba9adedd03":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ccf60a63a5d740f7a4a937d00d12298c","IPY_MODEL_522456874d164a9491bc74b16283c957","IPY_MODEL_b95b1fe20f094222b6406f553bd013c8"],"layout":"IPY_MODEL_fa14c7d360444c
12b945e6acb210d1bc"}},"ccf60a63a5d740f7a4a937d00d12298c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1b05a64ef6544cb8af0d833659dc7508","placeholder":"​","style":"IPY_MODEL_e4ec51cc4ac54d05abfc1eadbfae1bb0","value":"en-ur/test-00000-of-00001.parquet: 100%"}},"522456874d164a9491bc74b16283c957":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3a78306c9ed445f7a0b0204d7dbf98d2","max":300728,"min":0,"orientation":"horizontal","style":"IPY_MODEL_6bb9ddb68e014790b2ce6f3b81814a82","value":300728}},"b95b1fe20f094222b6406f553bd013c8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a04f65b9c24c4a9c9ff57452d4142902","placeholder":"​","style":"IPY_MODEL_6008b4c3a29f4ab29a6aa97f0fd40745","value":" 301k/301k [00:01&lt;00:00, 
192kB/s]"}},"fa14c7d360444c12b945e6acb210d1bc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1b05a64ef6544cb8af0d833659dc7508":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e4ec51cc4ac54d05abfc1eadbfae1bb0":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3a78306c9ed445f7a0b0204d7dbf98d2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6bb9ddb68e014790b2ce6f3b81814a82":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"a04f65b9c24c4a9c9ff57452d4142902":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6008b4c3a29f4ab29a6aa97f0fd40745":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"98d2c90e3255476c868ce2aec11bd40f":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_5ff6b67810814fceabd7655048f3edf6","IPY_MODEL_fc5319fd0372476cbe90c7c6e66f6bf6","IPY_MODEL_33e3b83f9e8f4066b4f9dd38f86a5f34"],"layout":"IPY_MODEL_7557fb3a682a4abba297ee872c243c83"}
},"5ff6b67810814fceabd7655048f3edf6":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3696d6c431184ac2a787e8be379885e3","placeholder":"​","style":"IPY_MODEL_39796bec65a74876b423a5beefc37e83","value":"en-ur/train-00000-of-00001.parquet: 100%"}},"fc5319fd0372476cbe90c7c6e66f6bf6":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ce9ec3c5bbab4d9c8e56d556421b6645","max":147739018,"min":0,"orientation":"horizontal","style":"IPY_MODEL_316db49fe7cf43489e3ca58a9d313a2a","value":147739018}},"33e3b83f9e8f4066b4f9dd38f86a5f34":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d1213b9c69284d16883f408f5b1f8048","placeholder":"​","style":"IPY_MODEL_68256b9bfd2f4d848aea4a6174ce2028","value":" 148M/148M [00:03&lt;00:00, 
103MB/s]"}},"7557fb3a682a4abba297ee872c243c83":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3696d6c431184ac2a787e8be379885e3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"39796bec65a74876b423a5beefc37e83":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ce9ec3c5bbab4d9c8e56d556421b6645":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"316db49fe7cf43489e3ca58a9d313a2a":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"d1213b9c69284d16883f408f5b1f8048":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"68256b9bfd2f4d848aea4a6174ce2028":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"b3876e04d1d844738ee51613344e4739":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_7380a8c67ceb4857b5d9ffc66a554630","IPY_MODEL_b00a302b235145a8855dc91b74b282bd","IPY_MODEL_4dba981f0e6748168c4e86b2925d324b"],"layout":"IPY_MODEL_63c6aa1173c54d49b3eb0d805d7c477b"}
},"7380a8c67ceb4857b5d9ffc66a554630":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b78945a747ce40eb95756149fbfb95cc","placeholder":"​","style":"IPY_MODEL_a662d27f646f4326b3469543f5d2a63c","value":"en-ur/validation-00000-of-00001.parquet: 100%"}},"b00a302b235145a8855dc91b74b282bd":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_b21ffa7749674a67a61b0a7fa1ba99f7","max":296298,"min":0,"orientation":"horizontal","style":"IPY_MODEL_d9b9049a68b449df87dad7775d845ee7","value":296298}},"4dba981f0e6748168c4e86b2925d324b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f6a6e8f216e84720ba36b15d4d7ac81e","placeholder":"​","style":"IPY_MODEL_ac4b27ebe30d42e9ba07cb23647ebc27","value":" 296k/296k [00:00&lt;00:00, 
394kB/s]"}},"63c6aa1173c54d49b3eb0d805d7c477b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b78945a747ce40eb95756149fbfb95cc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"o
verflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a662d27f646f4326b3469543f5d2a63c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"b21ffa7749674a67a61b0a7fa1ba99f7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d9b9049a68b449df87dad7775d845ee7":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"f6a6e8f216e84720ba36b15d4d7ac81e":{"model_mo
dule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ac4b27ebe30d42e9ba07cb23647ebc27":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"417c9e6a1dd74232819a230cf8d99943":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_42267e92152e473897fd47343f02b283","IPY_MODEL_f73dff27cf9c4eb1834c4b2edba6ac7e","IPY_MODEL_03bd7d8f89774129b1252f0aa3ce6fa0"],"layout":"IPY_MODEL_d61a75049aa04f7eb44b63c8134de3a6"}
},"42267e92152e473897fd47343f02b283":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_ea8e3f0d26ee4b9aa91028882ae7c1ea","placeholder":"​","style":"IPY_MODEL_bdcfae8bc78542f982ddff5e76aca0f3","value":"Generating test split: 100%"}},"f73dff27cf9c4eb1834c4b2edba6ac7e":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_fbe8e753526a494f960e0e23a76d19ca","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_69df32f2825144f8b398f1fd9583a8be","value":2000}},"03bd7d8f89774129b1252f0aa3ce6fa0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_dbcbeefa1f05437d834fda23b2e93d5a","placeholder":"​","style":"IPY_MODEL_f8a1cc4691eb412583c3aec6513ce447","value":" 2000/2000 [00:00&lt;00:00, 48086.86 
examples/s]"}},"d61a75049aa04f7eb44b63c8134de3a6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ea8e3f0d26ee4b9aa91028882ae7c1ea":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bdcfae8bc78542f982ddff5e76aca0f3":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"fbe8e753526a494f960e0e23a76d19ca":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"69df32f2825144f8b398f1fd9583a8be":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"dbcbeefa1f05437d834fda23b2e93d5a":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f8a1cc4691eb412583c3aec6513ce447":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f3cd3e9c0c0a420a87b80d32df51a888":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_651b748ffbcf4b66b19cc6ce63e34e7a","IPY_MODEL_2fb00de56f9d40ffab06bde4e09095bd","IPY_MODEL_7e81d321a5c84fbb8e6d60856e16b7a7"],"layout":"IPY_MODEL_513556bb91324ecdad415564f9b6ab2
d"}},"651b748ffbcf4b66b19cc6ce63e34e7a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_63133d05ec304cfaac336b346308ce6d","placeholder":"​","style":"IPY_MODEL_061019974f254fd7afc9ca4fe05b3831","value":"Generating train split: 100%"}},"2fb00de56f9d40ffab06bde4e09095bd":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_d8c92679f4e2420e89da403a69b75a5b","max":753913,"min":0,"orientation":"horizontal","style":"IPY_MODEL_52dfaca0539f46218aec0ee50731b4bd","value":753913}},"7e81d321a5c84fbb8e6d60856e16b7a7":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2b7c3d1a4fb2400baf9ca96a69649b57","placeholder":"​","style":"IPY_MODEL_0217f1e38530403685127fdd120b6ed0","value":" 753913/753913 [00:01&lt;00:00, 727575.52 
examples/s]"}},"513556bb91324ecdad415564f9b6ab2d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"63133d05ec304cfaac336b346308ce6d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"061019974f254fd7afc9ca4fe05b3831":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"d8c92679f4e2420e89da403a69b75a5b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"52dfaca0539f46218aec0ee50731b4bd":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"2b7c3d1a4fb2400baf9ca96a69649b57":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0217f1e38530403685127fdd120b6ed0":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c86c02301e7144cabab7b596c855c922":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_97a2cd7cb62f41e3b3c8a4a5a31697b2","IPY_MODEL_a92af3d602c541488d20feeefa361d9c","IPY_MODEL_9149268626854fdbae9a2c7c36f5ef15"],"layout":"IPY_MODEL_cab3bc9f5746483cad0ed7fcd532fc7
3"}},"97a2cd7cb62f41e3b3c8a4a5a31697b2":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f0db55c4f25249019e5d7a2691e50366","placeholder":"​","style":"IPY_MODEL_4bd624c5737e4adc87d41ef527720016","value":"Generating validation split: 100%"}},"a92af3d602c541488d20feeefa361d9c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_cc918e93af824c18adefb3fdf4b0f307","max":2000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_cf790630f3254fcea11210ca25667755","value":2000}},"9149268626854fdbae9a2c7c36f5ef15":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b222362d13e447d1873929f6f477084c","placeholder":"​","style":"IPY_MODEL_7af0010ef5f646c89a932b9299a948ee","value":" 2000/2000 [00:00&lt;00:00, 139973.44 
examples/s]"}},"cab3bc9f5746483cad0ed7fcd532fc73":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f0db55c4f25249019e5d7a2691e50366":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4bd624c5737e4adc87d41ef527720016":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"cc918e93af824c18adefb3fdf4b0f307":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cf790630f3254fcea11210ca25667755":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"b222362d13e447d1873929f6f477084c":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7af0010ef5f646c89a932b9299a948ee":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
Vietnamese_English_Transformer.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"Y6Xiq5X1qAJ3"},"source":["# Vietnamese-English Machine Translation (A3 Project)\n","\n","**Student**: Htut Ko Ko \n","**Course**: Natural Language Understanding \n","**Task**: Vietnamese (vi) <-> English (en) Translation using Transformer\n","\n","## Project Overview\n","This notebook implements a Neural Machine Translation system using a **Transformer** architecture.\n","We use the **ALT (Asian Language Treebank)** dataset for Vietnamese-English parallel data.\n","We use **SentencePiece** for subword tokenization.\n"],"id":"Y6Xiq5X1qAJ3"},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"EH9of9XXqAJ5","executionInfo":{"status":"ok","timestamp":1770443538034,"user_tz":-420,"elapsed":14177,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"cff84c10-22ef-4cb6-f4c9-fcdc7ce7a7a0"},"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n"]}],"source":["import os\n","import math\n","import time\n","import random\n","import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pad_sequence\n","from datasets import load_dataset\n","import sentencepiece as spm\n","\n","# Check for GPU\n","device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n","print(f\"Using device: {device}\")\n","\n","SEED = 1234\n","random.seed(SEED)\n","np.random.seed(SEED)\n","torch.manual_seed(SEED)\n","torch.cuda.manual_seed(SEED)\n","torch.backends.cudnn.deterministic = True"],"id":"EH9of9XXqAJ5"},{"cell_type":"markdown","metadata":{"id":"Xlli4N5iqAJ6"},"source":["## 2. 
Data Loading (ALT Dataset)\n","Loading Vietnamese-English pairs from ALT."],"id":"Xlli4N5iqAJ6"},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":690,"referenced_widgets":["fb20b1e521b64aa1962f136fd63bbf9d","89d6648073bd422c9277abbe2fece8b8","929200ce40424a8a98aae0611dc3777a","cc1d34e078e544d1a5e5424aeca7f8b0","1f4431edf035470b82cfe98efcff7695","76b3fcfa58a64d24aefa15e4ecb2ce3b","d6e25207451a45fab08d448108835294","4a1909474d084d68a660bcf2422c58d8","bb784334e7ce4b9399f25920f4807366","e398947cea754173ab48c3dfdaf0d87f","caf41e44597047429d0be0ec9e2067c5","a15308a64214421bae1d2e0f0a862aa8","ab8b4fae3fcf4140912a978b3323adb5","0ce6f6b343d74a84ad68ad9488bd2a89","49cc8ebaaa1e437cbcbc8acfe813b4ef","103b3787e870426fbc6ff55ba1e40e3b","5f30b380e9df4de4a08c3b0a04625e79","18d5b221667e4d8eb534f99832765915","34b3dd2221554eb0b513d1d4853f04a3","7e940d9ee70449fb9556ff15bc7e2397","845d27d0d0f84dce84df211e959cc823","b6463d48ff6d4202b41be959f9c50268","401b5d9999f043a98772faf9b8a8f197","93f592745f444769bc86e9f9dd875e3f","dfa0355de7094119aa7465c6525dbe4a","fda7062fa4104029839de98df15a391d","7fa7685332fb41d79a6a61fbfea90ae0","83429e92713c453bbd36c47ed9150e44","4c2e26a4ddf742268c223fc17185919d","9499f61b2e044d8a9b196cc0f69e56c9","4988cd6403fe4808b685900ffd10a347","c11064e79ad84bb6ac585401b9decbb5","5e7852f70d304162b5cbb1424813ad92","5928fc881dc54140b47f4a98ae76a3b4","1770cefadd524409837537a1c2743153","552cb64ab51b4c038f7065d5ece72ad4","398ffcf356c14086a94fe4fc102ea618","c3b786bbff9c485f9c74ac3dae7857be","2efa929d583b4716a5bdf5496583d6b5","ddb822e7b8d947b4a59430c805653c3b","e7cac3c07a8e47179c2ca3a308dda345","7961adc6d65d42308947ff3d1c6d8c8e","dac510cdf0fc4e22866eadf9c0294f06","68148c6a30e34e96aa39aebb31be1c43","0c9399c9b77b414db4ef4b2d3a34f46e","456000f0d012409db8c3077e7278e499","08253a1253c3451b943560fc197352e2","fa202bbd9af449d6b65ec9f2b590c2b1","62d00b422ac44a07a2b72fbb22ecb1aa","1303c6f4973947dda8bf925b9a2c6aa9","151cfdb17570418ba52
3cd6e06e78685","c96e992a7e22432a897c7cd239b930a8","cfc12ece33a94fe4b34eae0c60d565a4","9203996d133f4d50a821f122aea5f75a","70e20e6d36a54c719b1cf1b4e5674acf","78b2cc67439d4cbf92e038eddd3f161b","ae68c0a697e14aaa86c9d0882a0ad3dc","f83e25cbf112462895c7bf73679bb347","0446501b448f475ab2dd1ac31e271df8","affae7b453604976aeee265fe28b337d","da49068b06c54390937853d411badb7a","329fbecbfe054fa6ab310f380e01779e","691731bd2d8d42e694f0d43aec6fc0e5","c2ffb0dd03664863973ee3362a99b63b","e18a194feab84ff885559e97433c0406","b309512414e2439fbdc4e92208646853","1f113a7f71b84b2d89de2b461cfa4daa","e5bcb09ac5294ef8afed6e13d3297c13","796c0bd2f45042f3aed902a50c7d8278","0ff64bbe47a5407fa961e979c93e2cdc","31e3616994124d17883f1361fd733633","f44ab67a089c464c95321c33cdaa2b88","aafe20126db3455495ac520639f536f1","74e314998e414ce9baba95bef1c09e56","e0d16a103b944bd4841f62c47557bc59","b43dba75938d4af58da0e016d5193054","92b02abc4b2b42ccb5c095013df90af9"]},"id":"wAS6astuqAJ6","executionInfo":{"status":"ok","timestamp":1770443551213,"user_tz":-420,"elapsed":13176,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"d9472227-5475-490a-bdfe-f07c701d947e"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loading ALT Dataset...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["README.md: 0.00B [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"fb20b1e521b64aa1962f136fd63bbf9d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/train-00000-of-00001.parque(…): 0%| | 0.00/31.2M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a15308a64214421bae1d2e0f0a862aa8"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/validation-00000-of-00001.p(…): 0%| | 0.00/1.71M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"401b5d9999f043a98772faf9b8a8f197"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["alt-parallel/test-00000-of-00001.parquet: 0%| | 0.00/1.79M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"5928fc881dc54140b47f4a98ae76a3b4"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/18088 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"0c9399c9b77b414db4ef4b2d3a34f46e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating validation split: 0%| | 0/1000 [00:00<?, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"78b2cc67439d4cbf92e038eddd3f161b"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating test split: 0%| | 0/1019 [00:00<?, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1f113a7f71b84b2d89de2b461cfa4daa"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Loaded 20107 sentences from ALT.\n","Extracted 20107 Vietnamese-English pairs.\n"]}],"source":["print(\"Loading ALT Dataset...\")\n","try:\n"," # ALT has 'vi' for Vietnamese\n"," dataset = load_dataset(\"alt\", split=\"train+validation+test\")\n"," print(f\"Loaded {len(dataset)} sentences from ALT.\")\n","\n"," data = []\n"," for item in dataset:\n"," if 'translation' in item:\n"," if 'vi' in item['translation'] and 'en' in item['translation']:\n"," data.append({\n"," 'vi': item['translation']['vi'],\n"," 'en': item['translation']['en']\n"," })\n","\n"," print(f\"Extracted {len(data)} Vietnamese-English pairs.\")\n","except Exception as e:\n"," print(f\"Error loading from HF: {e}\")"],"id":"wAS6astuqAJ6"},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VzmCBLvzqAJ6","executionInfo":{"status":"ok","timestamp":1770443551236,"user_tz":-420,"elapsed":19,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"c1c6bdda-a123-41d0-8ac7-d838c096f534"},"outputs":[{"output_type":"stream","name":"stdout","text":[" vi \\\n","0 Ý đã đánh bại Bồ Đào Nha với tỉ số 31-5 ở Bảng... \n","1 Andrea Maisi đã mở tỉ số cho Ý ở phút thứ tư v... \n","2 Chiếm thế áp đảo trong hầu hết hiệp đầu nhưng ... \n","3 Bồ Đào Nha chưa bao giờ từ bỏ và David Penalva... \n","4 Ý đã dẫn 16-5 ở hiệp đầu nhưng ngang sức với B... \n","\n"," en \n","0 Italy have defeated Portugal 31-5 in Pool C of... \n","1 Andrea Masi opened the scoring in the fourth m... \n","2 Despite controlling the game for much of the f... \n","3 Portugal never gave up and David Penalva score... \n","4 Italy led 16-5 at half time but were matched b... 
\n"]}],"source":["df = pd.DataFrame(data)\n","print(df.head())\n","\n","df = df.dropna(subset=['vi', 'en'])\n","df['vi'] = df['vi'].astype(str)\n","df['en'] = df['en'].astype(str)\n","df = df[df['vi'].str.strip() != '']\n","df = df[df['en'].str.strip() != '']"],"id":"VzmCBLvzqAJ6"},{"cell_type":"markdown","metadata":{"id":"A7u71TB0qAJ6"},"source":["## 3. Tokenization"],"id":"A7u71TB0qAJ6"},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"O1KJ-ZMaqAJ6","executionInfo":{"status":"ok","timestamp":1770443554794,"user_tz":-420,"elapsed":3557,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"8b6ed1e3-dc97-48a9-f05c-905866f5da07"},"outputs":[{"output_type":"stream","name":"stdout","text":["Training Vietnamese Tokenizer...\n","Training English Tokenizer (for Vietnamese pair)...\n"]}],"source":["# Save texts to files\n","with open('train_vi.txt', 'w', encoding='utf-8') as f:\n"," for line in df['vi']: f.write(line + '\\n')\n","\n","with open('train_en_vi.txt', 'w', encoding='utf-8') as f:\n"," for line in df['en']: f.write(line + '\\n')\n","\n","# Train SentencePiece models\n","vocab_size = 8000\n","model_type = 'bpe'\n","\n","print(\"Training Vietnamese Tokenizer...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_vi.txt',\n"," model_prefix='spm_vi',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","print(\"Training English Tokenizer (for Vietnamese pair)...\")\n","spm.SentencePieceTrainer.train(\n"," input='train_en_vi.txt',\n"," model_prefix='spm_en_vi',\n"," vocab_size=vocab_size,\n"," model_type=model_type,\n"," pad_id=0, bos_id=1, eos_id=2, unk_id=3\n",")\n","\n","sp_src = spm.SentencePieceProcessor(model_file='spm_vi.model')\n","sp_trg = spm.SentencePieceProcessor(model_file='spm_en_vi.model')"],"id":"O1KJ-ZMaqAJ6"},{"cell_type":"markdown","metadata":{"id":"MA5tAC0jqAJ6"},"source":["## 4. 
Dataset & Model"],"id":"MA5tAC0jqAJ6"},{"cell_type":"code","execution_count":5,"metadata":{"id":"w__12RzXqAJ6","executionInfo":{"status":"ok","timestamp":1770443554795,"user_tz":-420,"elapsed":5,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class TranslationDataset(Dataset):\n"," def __init__(self, df, sp_src, sp_trg):\n"," self.data = df\n"," self.sp_src = sp_src\n"," self.sp_trg = sp_trg\n","\n"," def __len__(self):\n"," return len(self.data)\n","\n"," def __getitem__(self, idx):\n"," src_text = self.data.iloc[idx]['vi']\n"," trg_text = self.data.iloc[idx]['en']\n"," src_ids = [self.sp_src.bos_id()] + self.sp_src.encode(src_text, out_type=int) + [self.sp_src.eos_id()]\n"," trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode(trg_text, out_type=int) + [self.sp_trg.eos_id()]\n"," return torch.tensor(src_ids), torch.tensor(trg_ids)\n","\n","def collate_fn(batch):\n"," src_batch, trg_batch = [], []\n"," for src, trg in batch:\n"," src_batch.append(src)\n"," trg_batch.append(trg)\n"," src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)\n"," trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)\n"," return src_pad, trg_pad\n","\n","train_dataset = TranslationDataset(df, sp_src, sp_trg)\n","train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)"],"id":"w__12RzXqAJ6"},{"cell_type":"code","execution_count":6,"metadata":{"id":"BILlnw20qAJ7","executionInfo":{"status":"ok","timestamp":1770443554797,"user_tz":-420,"elapsed":1,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}}},"outputs":[],"source":["class PositionalEncoding(nn.Module):\n"," def __init__(self, d_model, dropout=0.1, max_len=5000):\n"," super(PositionalEncoding, self).__init__()\n"," self.dropout = nn.Dropout(p=dropout)\n"," pe = torch.zeros(max_len, d_model)\n"," position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n"," div_term = torch.exp(torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model))\n"," pe[:, 0::2] = torch.sin(position * div_term)\n"," pe[:, 1::2] = torch.cos(position * div_term)\n"," self.register_buffer('pe', pe)\n","\n"," def forward(self, x):\n"," x = x + self.pe[:x.size(1), :]\n"," return self.dropout(x)\n","\n","class TransformerModel(nn.Module):\n"," def __init__(self, src_vocab_size, trg_vocab_size,\n"," d_model=256, nhead=4, num_encoder_layers=2,\n"," num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0):\n"," super(TransformerModel, self).__init__()\n"," self.d_model = d_model\n"," self.pad_idx = pad_idx\n"," self.src_embedding = nn.Embedding(src_vocab_size, d_model)\n"," self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)\n"," self.pos_encoder = PositionalEncoding(d_model, dropout)\n"," self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)\n"," self.fc_out = nn.Linear(d_model, trg_vocab_size)\n","\n"," def forward(self, src, trg):\n"," src_key_padding_mask = (src == self.pad_idx)\n"," trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)\n"," src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))\n"," trg_emb = self.pos_encoder(self.trg_embedding(trg) * math.sqrt(self.d_model))\n"," output = self.transformer(src=src_emb, tgt=trg_emb, tgt_mask=trg_mask, src_key_padding_mask=src_key_padding_mask)\n"," return self.fc_out(output)"],"id":"BILlnw20qAJ7"},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5PBRBuZgqAJ7","executionInfo":{"status":"ok","timestamp":1770444978379,"user_tz":-420,"elapsed":1423582,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"1a41f621-2330-4305-915c-a903f787e271"},"outputs":[{"output_type":"stream","name":"stdout","text":["Starting 
Training...\n","Step 0, Loss: 9.199\n","Step 100, Loss: 6.805\n","Step 200, Loss: 6.359\n","Step 300, Loss: 6.330\n","Epoch 1 Loss: 6.723\n","Step 0, Loss: 6.117\n","Step 100, Loss: 6.091\n","Step 200, Loss: 5.959\n","Step 300, Loss: 5.850\n","Epoch 2 Loss: 6.015\n","Step 0, Loss: 5.786\n","Step 100, Loss: 5.669\n","Step 200, Loss: 5.571\n","Step 300, Loss: 5.395\n","Epoch 3 Loss: 5.567\n","Step 0, Loss: 5.350\n","Step 100, Loss: 5.146\n","Step 200, Loss: 5.098\n","Step 300, Loss: 5.085\n","Epoch 4 Loss: 5.205\n","Step 0, Loss: 5.094\n","Step 100, Loss: 4.863\n","Step 200, Loss: 4.994\n","Step 300, Loss: 4.943\n","Epoch 5 Loss: 4.930\n","Step 0, Loss: 4.727\n","Step 100, Loss: 4.725\n","Step 200, Loss: 4.634\n","Step 300, Loss: 4.705\n","Epoch 6 Loss: 4.712\n","Step 0, Loss: 4.487\n","Step 100, Loss: 4.530\n","Step 200, Loss: 4.572\n","Step 300, Loss: 4.625\n","Epoch 7 Loss: 4.532\n","Step 0, Loss: 4.232\n","Step 100, Loss: 4.386\n","Step 200, Loss: 4.404\n","Step 300, Loss: 4.404\n","Epoch 8 Loss: 4.374\n","Step 0, Loss: 4.188\n","Step 100, Loss: 4.158\n","Step 200, Loss: 4.067\n","Step 300, Loss: 4.373\n","Epoch 9 Loss: 4.237\n","Step 0, Loss: 4.073\n","Step 100, Loss: 4.177\n","Step 200, Loss: 4.128\n","Step 300, Loss: 4.088\n","Epoch 10 Loss: 4.114\n","Step 0, Loss: 3.954\n","Step 100, Loss: 4.134\n","Step 200, Loss: 4.026\n","Step 300, Loss: 4.075\n","Epoch 11 Loss: 4.004\n","Step 0, Loss: 3.807\n","Step 100, Loss: 3.826\n","Step 200, Loss: 3.945\n","Step 300, Loss: 3.993\n","Epoch 12 Loss: 3.904\n","Step 0, Loss: 3.667\n","Step 100, Loss: 3.773\n","Step 200, Loss: 3.857\n","Step 300, Loss: 3.869\n","Epoch 13 Loss: 3.809\n","Step 0, Loss: 3.582\n","Step 100, Loss: 3.760\n","Step 200, Loss: 3.771\n","Step 300, Loss: 3.693\n","Epoch 14 Loss: 3.722\n","Step 0, Loss: 3.460\n","Step 100, Loss: 3.688\n","Step 200, Loss: 3.798\n","Step 300, Loss: 3.677\n","Epoch 15 Loss: 3.643\n","Step 0, Loss: 3.350\n","Step 100, Loss: 3.528\n","Step 200, Loss: 3.602\n","Step 300, 
Loss: 3.594\n","Epoch 16 Loss: 3.570\n","Step 0, Loss: 3.277\n","Step 100, Loss: 3.365\n","Step 200, Loss: 3.505\n","Step 300, Loss: 3.669\n","Epoch 17 Loss: 3.502\n","Step 0, Loss: 3.241\n","Step 100, Loss: 3.465\n","Step 200, Loss: 3.505\n","Step 300, Loss: 3.534\n","Epoch 18 Loss: 3.439\n","Step 0, Loss: 3.272\n","Step 100, Loss: 3.327\n","Step 200, Loss: 3.366\n","Step 300, Loss: 3.471\n","Epoch 19 Loss: 3.375\n","Step 0, Loss: 3.175\n","Step 100, Loss: 3.368\n","Step 200, Loss: 3.371\n","Step 300, Loss: 3.524\n","Epoch 20 Loss: 3.320\n","Step 0, Loss: 3.176\n","Step 100, Loss: 3.242\n","Step 200, Loss: 3.334\n","Step 300, Loss: 3.289\n","Epoch 21 Loss: 3.268\n","Step 0, Loss: 3.054\n","Step 100, Loss: 3.189\n","Step 200, Loss: 3.286\n","Step 300, Loss: 3.345\n","Epoch 22 Loss: 3.220\n","Step 0, Loss: 2.885\n","Step 100, Loss: 3.224\n","Step 200, Loss: 3.267\n","Step 300, Loss: 3.370\n","Epoch 23 Loss: 3.172\n","Step 0, Loss: 2.916\n","Step 100, Loss: 3.072\n","Step 200, Loss: 3.271\n","Step 300, Loss: 3.285\n","Epoch 24 Loss: 3.126\n","Step 0, Loss: 2.876\n","Step 100, Loss: 2.970\n","Step 200, Loss: 3.283\n","Step 300, Loss: 3.254\n","Epoch 25 Loss: 3.086\n","Step 0, Loss: 2.885\n","Step 100, Loss: 2.799\n","Step 200, Loss: 3.193\n","Step 300, Loss: 3.138\n","Epoch 26 Loss: 3.047\n","Step 0, Loss: 2.969\n","Step 100, Loss: 2.899\n","Step 200, Loss: 3.050\n","Step 300, Loss: 3.159\n","Epoch 27 Loss: 3.010\n","Step 0, Loss: 2.920\n","Step 100, Loss: 2.789\n","Step 200, Loss: 3.038\n","Step 300, Loss: 2.991\n","Epoch 28 Loss: 2.972\n","Step 0, Loss: 2.781\n","Step 100, Loss: 2.905\n","Step 200, Loss: 2.962\n","Step 300, Loss: 2.985\n","Epoch 29 Loss: 2.941\n","Step 0, Loss: 2.769\n","Step 100, Loss: 2.839\n","Step 200, Loss: 2.968\n","Step 300, Loss: 3.039\n","Epoch 30 Loss: 2.908\n","Step 0, Loss: 2.646\n","Step 100, Loss: 2.901\n","Step 200, Loss: 2.838\n","Step 300, Loss: 3.146\n","Epoch 31 Loss: 2.877\n","Step 0, Loss: 2.704\n","Step 100, Loss: 
2.683\n","Step 200, Loss: 2.751\n","Step 300, Loss: 2.991\n","Epoch 32 Loss: 2.848\n","Step 0, Loss: 2.643\n","Step 100, Loss: 2.790\n","Step 200, Loss: 2.930\n","Step 300, Loss: 2.879\n","Epoch 33 Loss: 2.816\n","Step 0, Loss: 2.700\n","Step 100, Loss: 2.728\n","Step 200, Loss: 2.832\n","Step 300, Loss: 2.819\n","Epoch 34 Loss: 2.791\n","Step 0, Loss: 2.608\n","Step 100, Loss: 2.701\n","Step 200, Loss: 2.897\n","Step 300, Loss: 2.904\n","Epoch 35 Loss: 2.766\n","Step 0, Loss: 2.662\n","Step 100, Loss: 2.692\n","Step 200, Loss: 2.754\n","Step 300, Loss: 2.699\n","Epoch 36 Loss: 2.741\n","Step 0, Loss: 2.577\n","Step 100, Loss: 2.596\n","Step 200, Loss: 2.766\n","Step 300, Loss: 2.778\n","Epoch 37 Loss: 2.718\n","Step 0, Loss: 2.515\n","Step 100, Loss: 2.777\n","Step 200, Loss: 2.766\n","Step 300, Loss: 2.710\n","Epoch 38 Loss: 2.696\n","Step 0, Loss: 2.627\n","Step 100, Loss: 2.528\n","Step 200, Loss: 2.744\n","Step 300, Loss: 2.689\n","Epoch 39 Loss: 2.670\n","Step 0, Loss: 2.468\n","Step 100, Loss: 2.600\n","Step 200, Loss: 2.690\n","Step 300, Loss: 2.738\n","Epoch 40 Loss: 2.652\n","Step 0, Loss: 2.496\n","Step 100, Loss: 2.726\n","Step 200, Loss: 2.581\n","Step 300, Loss: 2.683\n","Epoch 41 Loss: 2.633\n","Step 0, Loss: 2.477\n","Step 100, Loss: 2.534\n","Step 200, Loss: 2.756\n","Step 300, Loss: 2.716\n","Epoch 42 Loss: 2.607\n","Step 0, Loss: 2.439\n","Step 100, Loss: 2.644\n","Step 200, Loss: 2.660\n","Step 300, Loss: 2.713\n","Epoch 43 Loss: 2.593\n","Step 0, Loss: 2.510\n","Step 100, Loss: 2.524\n","Step 200, Loss: 2.621\n","Step 300, Loss: 2.761\n","Epoch 44 Loss: 2.569\n","Step 0, Loss: 2.446\n","Step 100, Loss: 2.339\n","Step 200, Loss: 2.598\n","Step 300, Loss: 2.721\n","Epoch 45 Loss: 2.553\n","Step 0, Loss: 2.442\n","Step 100, Loss: 2.435\n","Step 200, Loss: 2.676\n","Step 300, Loss: 2.520\n","Epoch 46 Loss: 2.537\n","Step 0, Loss: 2.414\n","Step 100, Loss: 2.367\n","Step 200, Loss: 2.617\n","Step 300, Loss: 2.624\n","Epoch 47 Loss: 2.521\n","Step 0, 
Loss: 2.365\n","Step 100, Loss: 2.521\n","Step 200, Loss: 2.483\n","Step 300, Loss: 2.530\n","Epoch 48 Loss: 2.498\n","Step 0, Loss: 2.389\n","Step 100, Loss: 2.312\n","Step 200, Loss: 2.437\n","Step 300, Loss: 2.613\n","Epoch 49 Loss: 2.485\n","Step 0, Loss: 2.360\n","Step 100, Loss: 2.506\n","Step 200, Loss: 2.559\n","Step 300, Loss: 2.589\n","Epoch 50 Loss: 2.472\n"]}],"source":["model = TransformerModel(vocab_size, vocab_size).to(device)\n","optimizer = optim.Adam(model.parameters(), lr=0.0005)\n","criterion = nn.CrossEntropyLoss(ignore_index=0)\n","\n","print(\"Starting Training...\")\n","for epoch in range(50): # 50 Epochs for ALT\n"," model.train()\n"," epoch_loss = 0\n"," for i, (src, trg) in enumerate(train_loader):\n"," src, trg = src.to(device), trg.to(device)\n"," optimizer.zero_grad()\n"," output = model(src, trg[:, :-1])\n"," output = output.contiguous().view(-1, output.shape[-1])\n"," trg = trg[:, 1:].contiguous().view(-1)\n"," loss = criterion(output, trg)\n"," loss.backward()\n"," optimizer.step()\n"," epoch_loss += loss.item()\n"," if i % 100 == 0: print(f\"Step {i}, Loss: {loss.item():.3f}\")\n"," print(f\"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.3f}\")\n","\n"," # Save\n"," torch.save(model.state_dict(), 'transformer_model_vi.pt')"],"id":"5PBRBuZgqAJ7"},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cXFPvAxTqAJ7","executionInfo":{"status":"ok","timestamp":1770444978451,"user_tz":-420,"elapsed":54,"user":{"displayName":"Htut Ko Ko","userId":"13068088192988605156"}},"outputId":"f48d07a6-5625-4afc-f4ed-ac999485ae4f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Models copied to app/models/\n"]}],"source":["# Copy to app\n","import shutil\n","os.makedirs('app/models', exist_ok=True)\n","shutil.copy('transformer_model_vi.pt', 'app/models/transformer_model_vi.pt')\n","shutil.copy('spm_vi.model', 'app/models/spm_vi.model')\n","shutil.copy('spm_en_vi.model', 
'app/models/spm_en_vi.model')\n","print(\"Models copied to app/models/\")"],"id":"cXFPvAxTqAJ7"}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"fb20b1e521b64aa1962f136fd63bbf9d":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_89d6648073bd422c9277abbe2fece8b8","IPY_MODEL_929200ce40424a8a98aae0611dc3777a","IPY_MODEL_cc1d34e078e544d1a5e5424aeca7f8b0"],"layout":"IPY_MODEL_1f4431edf035470b82cfe98efcff7695"}},"89d6648073bd422c9277abbe2fece8b8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_76b3fcfa58a64d24aefa15e4ecb2ce3b","placeholder":"​","style":"IPY_MODEL_d6e25207451a45fab08d448108835294","value":"README.md: 
"}},"929200ce40424a8a98aae0611dc3777a":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_4a1909474d084d68a660bcf2422c58d8","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_bb784334e7ce4b9399f25920f4807366","value":1}},"cc1d34e078e544d1a5e5424aeca7f8b0":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e398947cea754173ab48c3dfdaf0d87f","placeholder":"​","style":"IPY_MODEL_caf41e44597047429d0be0ec9e2067c5","value":" 13.2k/? 
[00:00&lt;00:00, 841kB/s]"}},"1f4431edf035470b82cfe98efcff7695":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"76b3fcfa58a64d24aefa15e4ecb2ce3b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"ov
erflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d6e25207451a45fab08d448108835294":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"4a1909474d084d68a660bcf2422c58d8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"bb784334e7ce4b9399f25920f4807366":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e398947cea754173ab48c3dfd
af0d87f":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"caf41e44597047429d0be0ec9e2067c5":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a15308a64214421bae1d2e0f0a862aa8":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ab8b4fae3fcf4140912a978b3323adb5","IPY_MODEL_0ce6f6b343d74a84ad68ad9488bd2a89","IPY_MODEL_49cc8ebaaa1e437cbcbc8acfe813b4ef"],"layout":"IPY_MODEL_103b3787e870426
fbc6ff55ba1e40e3b"}},"ab8b4fae3fcf4140912a978b3323adb5":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_5f30b380e9df4de4a08c3b0a04625e79","placeholder":"​","style":"IPY_MODEL_18d5b221667e4d8eb534f99832765915","value":"alt-parallel/train-00000-of-00001.parque(…): 100%"}},"0ce6f6b343d74a84ad68ad9488bd2a89":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_34b3dd2221554eb0b513d1d4853f04a3","max":31211167,"min":0,"orientation":"horizontal","style":"IPY_MODEL_7e940d9ee70449fb9556ff15bc7e2397","value":31211167}},"49cc8ebaaa1e437cbcbc8acfe813b4ef":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_845d27d0d0f84dce84df211e959cc823","placeholder":"​","style":"IPY_MODEL_b6463d48ff6d4202b41be959f9c50268","value":" 31.2M/31.2M [00:01&lt;00:00, 
11.6MB/s]"}},"103b3787e870426fbc6ff55ba1e40e3b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5f30b380e9df4de4a08c3b0a04625e79":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"18d5b221667e4d8eb534f99832765915":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"34b3dd2221554eb0b513d1d4853f04a3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7e940d9ee70449fb9556ff15bc7e2397":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"845d27d0d0f84dce84df211e959cc823":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b6463d48ff6d4202b41be959f9c50268":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"401b5d9999f043a98772faf9b8a8f197":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_93f592745f444769bc86e9f9dd875e3f","IPY_MODEL_dfa0355de7094119aa7465c6525dbe4a","IPY_MODEL_fda7062fa4104029839de98df15a391d"],"layout":"IPY_MODEL_7fa7685332fb41d79a6a61fbfea90ae0"
}},"93f592745f444769bc86e9f9dd875e3f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_83429e92713c453bbd36c47ed9150e44","placeholder":"​","style":"IPY_MODEL_4c2e26a4ddf742268c223fc17185919d","value":"alt-parallel/validation-00000-of-00001.p(…): 100%"}},"dfa0355de7094119aa7465c6525dbe4a":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_9499f61b2e044d8a9b196cc0f69e56c9","max":1710203,"min":0,"orientation":"horizontal","style":"IPY_MODEL_4988cd6403fe4808b685900ffd10a347","value":1710203}},"fda7062fa4104029839de98df15a391d":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c11064e79ad84bb6ac585401b9decbb5","placeholder":"​","style":"IPY_MODEL_5e7852f70d304162b5cbb1424813ad92","value":" 1.71M/1.71M [00:00&lt;00:00, 
2.43MB/s]"}},"7fa7685332fb41d79a6a61fbfea90ae0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"83429e92713c453bbd36c47ed9150e44":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4c2e26a4ddf742268c223fc17185919d":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9499f61b2e044d8a9b196cc0f69e56c9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"4988cd6403fe4808b685900ffd10a347":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"c11064e79ad84bb6ac585401b9decbb5":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5e7852f70d304162b5cbb1424813ad92":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"5928fc881dc54140b47f4a98ae76a3b4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_1770cefadd524409837537a1c2743153","IPY_MODEL_552cb64ab51b4c038f7065d5ece72ad4","IPY_MODEL_398ffcf356c14086a94fe4fc102ea618"],"layout":"IPY_MODEL_c3b786bbff9c485f9c74ac3dae7857be"
}},"1770cefadd524409837537a1c2743153":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2efa929d583b4716a5bdf5496583d6b5","placeholder":"​","style":"IPY_MODEL_ddb822e7b8d947b4a59430c805653c3b","value":"alt-parallel/test-00000-of-00001.parquet: 100%"}},"552cb64ab51b4c038f7065d5ece72ad4":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_e7cac3c07a8e47179c2ca3a308dda345","max":1786537,"min":0,"orientation":"horizontal","style":"IPY_MODEL_7961adc6d65d42308947ff3d1c6d8c8e","value":1786537}},"398ffcf356c14086a94fe4fc102ea618":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_dac510cdf0fc4e22866eadf9c0294f06","placeholder":"​","style":"IPY_MODEL_68148c6a30e34e96aa39aebb31be1c43","value":" 1.79M/1.79M [00:00&lt;00:00, 
2.16MB/s]"}},"c3b786bbff9c485f9c74ac3dae7857be":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2efa929d583b4716a5bdf5496583d6b5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ddb822e7b8d947b4a59430c805653c3b":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"e7cac3c07a8e47179c2ca3a308dda345":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7961adc6d65d42308947ff3d1c6d8c8e":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"dac510cdf0fc4e22866eadf9c0294f06":{"model_m
odule":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"68148c6a30e34e96aa39aebb31be1c43":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"0c9399c9b77b414db4ef4b2d3a34f46e":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_456000f0d012409db8c3077e7278e499","IPY_MODEL_08253a1253c3451b943560fc197352e2","IPY_MODEL_fa202bbd9af449d6b65ec9f2b590c2b1"],"layout":"IPY_MODEL_62d00b422ac44a07a2b72fbb22ecb1aa"
}},"456000f0d012409db8c3077e7278e499":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1303c6f4973947dda8bf925b9a2c6aa9","placeholder":"​","style":"IPY_MODEL_151cfdb17570418ba523cd6e06e78685","value":"Generating train split: 100%"}},"08253a1253c3451b943560fc197352e2":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_c96e992a7e22432a897c7cd239b930a8","max":18088,"min":0,"orientation":"horizontal","style":"IPY_MODEL_cfc12ece33a94fe4b34eae0c60d565a4","value":18088}},"fa202bbd9af449d6b65ec9f2b590c2b1":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9203996d133f4d50a821f122aea5f75a","placeholder":"​","style":"IPY_MODEL_70e20e6d36a54c719b1cf1b4e5674acf","value":" 18088/18088 [00:00&lt;00:00, 62254.98 
examples/s]"}},"62d00b422ac44a07a2b72fbb22ecb1aa":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1303c6f4973947dda8bf925b9a2c6aa9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"151cfdb17570418ba523cd6e06e78685":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c96e992a7e22432a897c7cd239b930a8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cfc12ece33a94fe4b34eae0c60d565a4":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"9203996d133f4d50a821f122aea5f75a":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"70e20e6d36a54c719b1cf1b4e5674acf":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"78b2cc67439d4cbf92e038eddd3f161b":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ae68c0a697e14aaa86c9d0882a0ad3dc","IPY_MODEL_f83e25cbf112462895c7bf73679bb347","IPY_MODEL_0446501b448f475ab2dd1ac31e271df8"],"layout":"IPY_MODEL_affae7b453604976aeee265fe28b337
d"}},"ae68c0a697e14aaa86c9d0882a0ad3dc":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_da49068b06c54390937853d411badb7a","placeholder":"​","style":"IPY_MODEL_329fbecbfe054fa6ab310f380e01779e","value":"Generating validation split: 100%"}},"f83e25cbf112462895c7bf73679bb347":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_691731bd2d8d42e694f0d43aec6fc0e5","max":1000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_c2ffb0dd03664863973ee3362a99b63b","value":1000}},"0446501b448f475ab2dd1ac31e271df8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e18a194feab84ff885559e97433c0406","placeholder":"​","style":"IPY_MODEL_b309512414e2439fbdc4e92208646853","value":" 1000/1000 [00:00&lt;00:00, 25237.25 
examples/s]"}},"affae7b453604976aeee265fe28b337d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"da49068b06c54390937853d411badb7a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"329fbecbfe054fa6ab310f380e01779e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"691731bd2d8d42e694f0d43aec6fc0e5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c2ffb0dd03664863973ee3362a99b63b":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e18a194feab84ff885559e97433c0406":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b309512414e2439fbdc4e92208646853":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1f113a7f71b84b2d89de2b461cfa4daa":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_e5bcb09ac5294ef8afed6e13d3297c13","IPY_MODEL_796c0bd2f45042f3aed902a50c7d8278","IPY_MODEL_0ff64bbe47a5407fa961e979c93e2cdc"],"layout":"IPY_MODEL_31e3616994124d17883f1361fd73363
3"}},"e5bcb09ac5294ef8afed6e13d3297c13":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f44ab67a089c464c95321c33cdaa2b88","placeholder":"​","style":"IPY_MODEL_aafe20126db3455495ac520639f536f1","value":"Generating test split: 100%"}},"796c0bd2f45042f3aed902a50c7d8278":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_74e314998e414ce9baba95bef1c09e56","max":1019,"min":0,"orientation":"horizontal","style":"IPY_MODEL_e0d16a103b944bd4841f62c47557bc59","value":1019}},"0ff64bbe47a5407fa961e979c93e2cdc":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b43dba75938d4af58da0e016d5193054","placeholder":"​","style":"IPY_MODEL_92b02abc4b2b42ccb5c095013df90af9","value":" 1019/1019 [00:00&lt;00:00, 27799.43 
examples/s]"}},"31e3616994124d17883f1361fd733633":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f44ab67a089c464c95321c33cdaa2b88":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"aafe20126db3455495ac520639f536f1":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"74e314998e414ce9baba95bef1c09e56":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e0d16a103b944bd4841f62c47557bc59":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"b43dba75938d4af58da0e016d5193054":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"92b02abc4b2b42ccb5c095013df90af9":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":5}
app/.dockerignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ nllb_model/
2
+ __pycache__/
3
+ *.pyc
4
+ .ipynb_checkpoints/
5
+ .DS_Store
6
+ venv/
7
+ env/
8
+ .env
app/Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python runtime as a parent image
2
+ FROM python:3.9-slim
3
+
4
+ # Set the working directory in the container
5
+ WORKDIR /code
6
+
7
+ # Copy the dependencies file
8
+ COPY requirements.txt .
9
+
10
+ # Install dependencies
11
+ # Upgrade pip to avoid install issues
12
+ RUN pip install --no-cache-dir --upgrade pip
13
+ RUN pip install --no-cache-dir -r requirements.txt
14
+
15
+ # Copy the rest of the application code
16
+ COPY . .
17
+
18
+ # Create a writable directory for cache/temp files if needed
19
+ # (Optional, but good practice for transformers cache if not pre-downloaded)
20
+ RUN mkdir -p /tmp/cache
21
+ ENV HF_HOME=/tmp/cache
22
+
23
+ # Expose the port that HuggingFace Spaces expects (7860)
24
+ EXPOSE 7860
25
+
26
+ # Run the application using Gunicorn
27
+ # Bind to 0.0.0.0:7860
28
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
app/app.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import sentencepiece as spm
5
+ import math
6
+ from flask import Flask, render_template, request, jsonify
7
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
8
+
9
+ app = Flask(__name__)
10
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
11
+
12
+ # --- 1. Transformer from Scratch Definition ---
13
+ # --- 1. Transformer from Scratch Definition ---
14
class TransformationModel(nn.Module):
    # Empty placeholder that only keeps this name defined. The class the
    # training notebook uses, and the one this module instantiates, is
    # TransformerModel.
    ...
18
+
19
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    The table is precomputed once and registered as the buffer ``pe`` with
    shape ``(max_len, d_model)``. The buffer name and shape are part of the
    module's state dict, so they must not change if saved checkpoints are to
    keep loading.

    Args:
        d_model: Size of the embedding dimension (even or odd).
        dropout: Dropout probability applied after the table is added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd-indexed column than even-indexed
        # ones, so the cosine half must be trimmed to fit. For an even
        # d_model the slice is a no-op and the table is unchanged.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, seq_len, d_model)``: the sequence length
        is read from dimension 1 and the table is broadcast over dimension 0.
        """
        x = x + self.pe[:x.size(1), :]
        return self.dropout(x)
34
+
35
class TransformerModel(nn.Module):
    """Sequence-to-sequence translation model wrapping ``nn.Transformer``.

    Source and target token ids are embedded separately, scaled by
    ``sqrt(d_model)``, passed through one shared positional encoder and fed
    to a batch-first ``nn.Transformer``. A linear layer then projects each
    decoder position onto the target vocabulary.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        trg_vocab_size: Number of rows in the target embedding table and
            width of the output projection.
        d_model: Embedding / model width.
        nhead: Number of attention heads.
        num_encoder_layers: Encoder depth.
        num_decoder_layers: Decoder depth.
        dim_feedforward: Width of the feed-forward sublayers.
        dropout: Dropout probability (positional encoder and transformer).
        pad_idx: Token id treated as padding in the source sequence.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 d_model=512, nhead=8, num_encoder_layers=3,
                 num_decoder_layers=3, dim_feedforward=2048, dropout=0.1, pad_idx=0):
        super().__init__()

        self.d_model = d_model
        self.pad_idx = pad_idx

        # Attribute names below are the keys of saved state dicts; keep them.
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        transformer_kwargs = {
            'd_model': d_model,
            'nhead': nhead,
            'num_encoder_layers': num_encoder_layers,
            'num_decoder_layers': num_decoder_layers,
            'dim_feedforward': dim_feedforward,
            'dropout': dropout,
            'batch_first': True,
        }
        self.transformer = nn.Transformer(**transformer_kwargs)

        self.fc_out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        """Return vocabulary logits for every target position.

        Args:
            src: Source token ids, ``[batch_size, src_len]``.
            trg: Target token ids produced so far, ``[batch_size, trg_len]``.

        Only the source padding mask and the causal target mask are applied;
        no target padding mask is passed to the transformer.
        """
        scale = math.sqrt(self.d_model)

        # True wherever the source holds the padding id.
        source_is_pad = (src == self.pad_idx)
        # Causal mask so a target position cannot attend to later positions.
        causal_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(src.device)

        scaled_src = self.src_embedding(src) * scale
        scaled_trg = self.trg_embedding(trg) * scale
        encoder_input = self.pos_encoder(scaled_src)
        decoder_input = self.pos_encoder(scaled_trg)

        decoded = self.transformer(
            src=encoder_input,
            tgt=decoder_input,
            tgt_mask=causal_mask,
            src_key_padding_mask=source_is_pad,
        )
        return self.fc_out(decoded)
93
+
94
+ # --- 2. Load Models ---
95
+ # Paths
96
BASE_DIR = os.path.dirname(__file__)

# NLLB checkpoint directory: next to this file, or two levels up.
NLLB_PATH = os.path.join(BASE_DIR, 'nllb_model')
NLLB_PATH_SYNC = os.path.join(BASE_DIR, '../../nllb_model')

# Artefacts of the default ('my') scratch model.
TRANSFORMER_PATH = os.path.join(BASE_DIR, 'models/transformer_model.pt')
SPM_MY_PATH = os.path.join(BASE_DIR, 'models/spm_my.model')
SPM_EN_PATH = os.path.join(BASE_DIR, 'models/spm_en.model')

# NLLB model and tokenizer; both stay None until load_nllb() succeeds.
nllb_model = None
nllb_tokenizer = None

# Per-language registries for the scratch transformers, keyed by the
# two-letter language code: the model, its source tokenizer and its
# target (English) tokenizer.
scratch_models = {}
sp_src_models = {}
sp_trg_models = {}

# Two-letter language code -> NLLB language tag.
NLLB_LANG_MAP = {
    'my': 'mya_Mymr',
    'th': 'tha_Thai',
    'zh': 'zho_Hans',
    'hi': 'hin_Deva',
    'ne': 'npi_Deva',
    'ur': 'urd_Arab',
    'vi': 'vie_Latn',
    'tl': 'tgl_Latn',
    'kk': 'kaz_Cyrl',
    'bn': 'ben_Beng',
    'de': 'deu_Latn',
}
125
+
126
def load_nllb():
    """Load the NLLB tokenizer and model into the module-level globals.

    A local copy (``NLLB_PATH``, then ``NLLB_PATH_SYNC``) is preferred. If
    neither directory exists, or the local copy cannot be loaded, the
    checkpoint ``facebook/nllb-200-distilled-600M`` is downloaded instead
    and a copy is written to ``NLLB_PATH`` on a best-effort basis.

    The function never raises: failures are printed and the globals are left
    untouched, which ``translate_nllb`` reports to the caller. The globals
    are assigned together, only after both objects loaded successfully.
    """
    global nllb_model, nllb_tokenizer
    hub_id = "facebook/nllb-200-distilled-600M"

    def _load(source):
        # Load the tokenizer/model pair from a directory or a hub id.
        tokenizer = AutoTokenizer.from_pretrained(source)
        model = AutoModelForSeq2SeqLM.from_pretrained(source).to(DEVICE)
        return tokenizer, model

    print("Loading NLLB Model...")
    local_dir = next(
        (path for path in (NLLB_PATH, NLLB_PATH_SYNC) if os.path.exists(path)),
        None,
    )

    tokenizer = model = None
    if local_dir is not None:
        print(f"Loading from {local_dir}...")
        try:
            tokenizer, model = _load(local_dir)
        except Exception as e:
            # An existing but incomplete directory (e.g. an interrupted save)
            # must not permanently block the download fallback.
            print(f"Could not load NLLB model from {local_dir}: {e}")
            tokenizer = model = None

    downloaded = False
    if model is None:
        print("NLLB model not found locally. Downloading facebook/nllb-200-distilled-600M...")
        try:
            tokenizer, model = _load(hub_id)
        except Exception as e:
            print(f"Failed to load NLLB Model: {e}")
            return
        downloaded = True

    nllb_tokenizer, nllb_model = tokenizer, model

    if downloaded:
        # Caching is an optimisation only: a read-only or full disk must not
        # be reported as a load failure, because the model is already usable.
        print(f"Saving NLLB model to {NLLB_PATH}...")
        try:
            tokenizer.save_pretrained(NLLB_PATH)
            model.save_pretrained(NLLB_PATH)
        except Exception as e:
            print(f"Could not save NLLB model to {NLLB_PATH}: {e}")

    print("NLLB Model Loaded.")
150
+
151
def translate_nllb(text, src_lang="mya_Mymr", tgt_lang="eng_Latn", max_length=128):
    """Translate ``text`` with the globally loaded NLLB model.

    Args:
        text: Text to translate.
        src_lang: NLLB language tag of ``text``.
        tgt_lang: NLLB language tag to translate into.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The translated string. Problems are reported as a string starting
        with ``"Error"`` instead of being raised, so the routes can show
        the message to the user.
    """
    # Compare against None explicitly: truthiness of these objects depends
    # on whatever __len__/__bool__ they define.
    if nllb_model is None or nllb_tokenizer is None:
        return "Error: NLLB Model not loaded. Please wait for the model to download or check logs."
    try:
        # The tokenizer needs to know the source language before encoding.
        nllb_tokenizer.src_lang = src_lang

        inputs = nllb_tokenizer(text, return_tensors="pt").to(DEVICE)
        # Forcing the first generated token to the target-language tag
        # selects the output language.
        target_token_id = nllb_tokenizer.convert_tokens_to_ids(tgt_lang)
        with torch.no_grad():
            translated_tokens = nllb_model.generate(
                **inputs,
                forced_bos_token_id=target_token_id,
                max_length=max_length,
            )
        return nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    except Exception as e:
        print(f"Error during NLLB translation: {e}")
        return f"Error translating: {str(e)}"
164
+
165
+ # Initial Load
166
+ load_nllb()
167
+
168
def load_scratch_transformer():
    """Load every available from-scratch transformer and its tokenizers.

    For each supported language the checkpoint, the source SentencePiece
    model and the English SentencePiece model are located and loaded. On
    success they are registered in ``scratch_models``, ``sp_src_models`` and
    ``sp_trg_models`` under the language code.

    Languages that are already registered are skipped, so repeated calls
    (``translate_scratch`` calls this lazily) only pay for languages that
    are still missing. Missing files and load errors are printed and that
    language is skipped; nothing is raised. A language is registered in all
    three dictionaries or in none.
    """
    languages = ['my', 'th', 'zh', 'hi', 'ne', 'ur', 'vi', 'tl', 'kk', 'bn', 'de']
    # Languages built with vocab_size=8000; the others use 4000.
    large_vocab_langs = {'hi', 'ne', 'ur', 'vi', 'tl', 'kk', 'bn', 'de'}

    for lang in languages:
        if lang in scratch_models:
            continue

        # 'my' uses the un-suffixed file names; every other language carries
        # its code in both the checkpoint and the English tokenizer name.
        if lang == 'my':
            t_name = 'transformer_model.pt'
            e_name = 'spm_en.model'
        else:
            t_name = f'transformer_model_{lang}.pt'
            e_name = f'spm_en_{lang}.model'
        s_name = f'spm_{lang}.model'

        # Standard layout is models/ next to this file.
        t_path = os.path.join(BASE_DIR, f'models/{t_name}')
        if not os.path.exists(t_path):
            t_path = os.path.join(BASE_DIR, f'../../models/{t_name}')

        s_path = os.path.join(BASE_DIR, f'models/{s_name}')
        e_path = os.path.join(BASE_DIR, f'models/{e_name}')

        # Development layout: fall back to ../../app/models for all three.
        if not os.path.exists(t_path):
            t_path = os.path.join(BASE_DIR, f'../../app/models/{t_name}')
            s_path = os.path.join(BASE_DIR, f'../../app/models/{s_name}')
            e_path = os.path.join(BASE_DIR, f'../../app/models/{e_name}')

        if not (os.path.exists(t_path) and os.path.exists(s_path) and os.path.exists(e_path)):
            print(f"Scratch Transformer files for {lang} not found. Skipping.")
            continue

        try:
            print(f"Loading Scratch Model for {lang}...")
            sp_src = spm.SentencePieceProcessor(model_file=s_path)
            sp_trg = spm.SentencePieceProcessor(model_file=e_path)

            # Model params must match the notebooks, otherwise the state
            # dict will not fit.
            vocab_size = 8000 if lang in large_vocab_langs else 4000
            model = TransformerModel(
                src_vocab_size=vocab_size,
                trg_vocab_size=vocab_size,
                d_model=256, nhead=4, num_encoder_layers=2,
                num_decoder_layers=2, dim_feedforward=512, dropout=0.1, pad_idx=0
            ).to(DEVICE)

            # NOTE(security): torch.load unpickles the file, so only load
            # checkpoints from a trusted source. Consider weights_only=True
            # where the installed torch version supports it.
            model.load_state_dict(torch.load(t_path, map_location=DEVICE))
            model.eval()
        except Exception as e:
            print(f"Failed to load Scratch Transformer ({lang}): {e}")
            continue

        # Register only after everything loaded, so the three dictionaries
        # can never disagree about which languages are available.
        sp_src_models[lang] = sp_src
        sp_trg_models[lang] = sp_trg
        scratch_models[lang] = model
        print(f"Scratch Transformer ({lang}) Loaded.")
221
+
222
def translate_scratch(text, lang='my', max_len=50):
    """Translate ``text`` into English with the from-scratch transformer.

    Decoding is greedy: starting from the target BOS id, the most likely
    next token is appended until the target EOS id is produced or
    ``max_len`` tokens have been generated.

    Args:
        text: Source-language text.
        lang: Two-letter code selecting the model and tokenizers.
        max_len: Maximum number of tokens to generate.

    Returns:
        The decoded translation. Problems (model unavailable, failure while
        decoding) are reported as a string starting with ``"Error"`` rather
        than raised, matching ``translate_nllb``.
    """
    # Lazy loading if model not found
    if lang not in scratch_models:
        print(f"Model for {lang} not found. Attempting to load...")
        load_scratch_transformer()

    if lang not in scratch_models:
        return f"Error: Model for {lang} not available. Please train it first."

    model = scratch_models[lang]
    sp_src = sp_src_models[lang]
    sp_trg = sp_trg_models[lang]

    try:
        src_ids = [sp_src.bos_id()] + sp_src.encode_as_ids(text) + [sp_src.eos_id()]
        src_tensor = torch.tensor(src_ids, dtype=torch.long, device=DEVICE).unsqueeze(0)

        eos_id = sp_trg.eos_id()
        outputs = [sp_trg.bos_id()]
        with torch.no_grad():
            for _ in range(max_len):
                trg_tensor = torch.tensor(outputs, dtype=torch.long, device=DEVICE).unsqueeze(0)
                logits = model(src_tensor, trg_tensor)
                # Only the last position predicts the next token.
                best_guess = logits[:, -1, :].argmax(dim=-1).item()
                if best_guess == eos_id:
                    break
                outputs.append(best_guess)

        # Drop the leading BOS before decoding.
        return sp_trg.decode(outputs[1:])
    except Exception as e:
        print(f"Error during scratch translation ({lang}): {e}")
        return f"Error translating: {str(e)}"
249
+
250
+ # --- 4. Routes ---
251
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the translation form and, on POST, translate the submitted text.

    The form fields ``source_text``, ``model_choice`` and ``lang_choice`` are
    read with defaults ``''``, ``'nllb'`` and ``'my'``. A ``model_choice`` of
    ``'nllb'`` translates with NLLB into English; any other value uses the
    scratch transformer. The submitted values are passed back to the template
    so the form keeps its state.
    """
    context = {
        'translation': "",
        'original': "",
        'model_choice': "nllb",
        'lang_choice': "my",
    }

    if request.method == 'POST':
        form = request.form
        source_text = form.get('source_text', '')
        chosen_model = form.get('model_choice', 'nllb')
        chosen_lang = form.get('lang_choice', 'my')
        context.update(original=source_text, model_choice=chosen_model, lang_choice=chosen_lang)

        if source_text and chosen_model == 'nllb':
            # Unknown language codes fall back to the 'mya_Mymr' tag.
            nllb_source = NLLB_LANG_MAP.get(chosen_lang, 'mya_Mymr')
            context['translation'] = translate_nllb(source_text, src_lang=nllb_source, tgt_lang='eng_Latn')
        elif source_text:
            context['translation'] = translate_scratch(source_text, lang=chosen_lang)

    return render_template('index.html', **context)
272
+
273
@app.route('/api/translate', methods=['POST'])
def api_translate():
    """JSON translation endpoint.

    Expects a JSON object with the keys:
        text: Text to translate (required, non-empty string).
        model: ``'nllb'`` (default) or any other value for the scratch model.
        lang: Two-letter code of the non-English language (default ``'my'``).
        direction: ``'f2e'`` (foreign to English, default) or ``'e2f'``.

    Returns:
        ``{'translation', 'model', 'lang', 'direction'}`` on success, or
        ``{'error': ...}`` with status 400 when the body is not a JSON object
        or the fields are unusable.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON
    # body; a non-object body (list, null, number) is rejected as well, so
    # the .get() calls below cannot fail.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('text', '')
    model_type = data.get('model', 'nllb')
    lang = data.get('lang', 'my')
    direction = data.get('direction', 'f2e')  # f2e (Foreign to English) or e2f (English to Foreign)

    if not text:
        return jsonify({'error': 'No text provided'}), 400
    if not isinstance(text, str) or not isinstance(lang, str):
        return jsonify({'error': "'text' and 'lang' must be strings"}), 400

    # Language mapping for NLLB (module-level table); unknown codes fall back
    # to the 'mya_Mymr' tag.
    target_code = NLLB_LANG_MAP.get(lang, 'mya_Mymr')
    english_code = 'eng_Latn'

    if model_type == 'nllb':
        if direction == 'f2e':
            # Foreign -> English
            translation = translate_nllb(text, src_lang=target_code, tgt_lang=english_code)
        else:
            # English -> Foreign
            translation = translate_nllb(text, src_lang=english_code, tgt_lang=target_code)
    else:
        # Scratch model: only the foreign -> English direction is supported.
        if direction == 'e2f':
            translation = f"Error: The Scratch Transformer model only supports {lang.upper()} -> English translation. Please use NLLB for English -> {lang.upper()}."
        else:
            translation = translate_scratch(text, lang=lang)

    return jsonify({'translation': translation, 'model': model_type, 'lang': lang, 'direction': direction})
303
+
304
+ # Load Scratch Models
305
+ load_scratch_transformer()
306
+
307
if __name__ == '__main__':
    # The server binds to every interface, so the interactive debugger must
    # not be on by default: it lets anyone who can reach the port run
    # arbitrary code. Opt in explicitly with FLASK_DEBUG=1 for local work.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5001)
app/models/spm_bn.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eca3987b35b4fc63c8195574d289ed4a38cb928d9ba98ee3ed907c61e6ea8f17
3
+ size 432288
app/models/spm_de.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bcfb1adf5c4d1ee77c4adf990b221b070eb9bae324eec5f304f1e1e5197c00c
3
+ size 363507
app/models/spm_en.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0682aa69df3a4b8759b1b84de76d420ac7bb0c1d3eb13654e893aa44e0fcf3fc
3
+ size 301789
app/models/spm_en_bn.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:215de7a00f9237d8d84f886f503fc4835c28e5e31bfea8132a1c4c267897db8e
3
+ size 367329
app/models/spm_en_de.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a0566dc48b91bd1a88a740adedb28d5aa6e98bfd55fab635fb392d0e5682f04
3
+ size 364310
app/models/spm_en_hi.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63103db31e76ca6114c7ca07e096519870b2453504cc7e4c261e9abf6656282a
3
+ size 367126
app/models/spm_en_kk.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:328fe08e93d2300a4e8275f0fae48603d4bea168fb4ecb0b802ea4d3bf920194
3
+ size 365178
app/models/spm_en_ne.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9d25c7975ef6934f32cade8b0a18ee152e68adc54177ea6933d5b59cc8d8b77
3
+ size 365442
app/models/spm_en_th.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26072cd673684251201725c9562f0399cb36147edcefa94e3d9f86f7fcefc48c
3
+ size 301805
app/models/spm_en_tl.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8d2ad373e42fd13404bbd3a93ff786c416032cff6cc589eafe15460706f719b
3
+ size 368786
app/models/spm_en_ur.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c2fd8602e6cfa0926d971e22706bb3139d204e61f9f272f289bf68eb3cdd250
3
+ size 371045
app/models/spm_en_vi.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d56b5384f055ac5775104317cdc1fe075c2b815247da69d16ae750457d7c37
3
+ size 368792
app/models/spm_en_zh.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fb3964e62398422318800e5cfa3c8e99129ff6dd18026061e0dd82dedca65ef
3
+ size 301801
app/models/spm_hi.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9ef78a411f6cdc495619b08d51a14cdb31e479f8c8d697ad6719d80ae0b0bfc
3
+ size 420423
app/models/spm_kk.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32d5dae6db9558ec7d2644b468c37cd56a6e7c57c1f4f3787f66eeb3dbf1fec5
3
+ size 394807
app/models/spm_my.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f84cebd908633ca6dfa8b2f3a68564e192820b3b85660ea95223c4a8a18f50ab
3
+ size 345249
app/models/spm_ne.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2f8a4fe5b75ae9d389708a925309863d1dfba4ca7f2a116621a4bdc026a0930
3
+ size 421149
app/models/spm_th.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42b112d07c7803b68e31f0fcec6a4e1efb980690130b09397053368a75c90d12
3
+ size 327768
app/models/spm_tl.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a7be22c5f9aab20883ddeb0d47317f4b265b2ff3429dcf17fb31205a2d76124
3
+ size 369093
app/models/spm_ur.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d68c3c1184864ab9df74b153dc5d5eed0c12223485c04684279a1cc5cf6552b
3
+ size 392567
app/models/spm_vi.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6f4a9a8673ddbb5274ba70aa17fa6c5bfb382b04fb6144485c5fd3bc1cc8eb8
3
+ size 362817
app/models/spm_zh.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c3920cdc6dda7d859164562bf9bfb1092bf3ebe6a25272fc58c48c8968e8266
3
+ size 291274
app/models/transformer_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:493c1cf6d24814251f4d690e5a247b955521fd98eebcbf0bc22730ad08c595ea
3
+ size 27998914
app/models/transformer_model_bn.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:858734606c7d355b405619fb011d240bc935bfc2cf5e6eee3f94ea201e5a102f
3
+ size 40303139
app/models/transformer_model_de.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2547249aa6c5a7a3a778b6c04e9892dd3c1d7025a51bb3dd9cbbda92c77846b7
3
+ size 40303139
app/models/transformer_model_hi.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1dad73c3e1108571bb623f5e8a74a85a6ee1abdeb518de7475aa9dee2d0f608
3
+ size 40303139
app/models/transformer_model_kk.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a82208803dc4726864fab0a3b87cea99c0a61a116bc1641f5a7fa8def8e9ce
3
+ size 40303139
app/models/transformer_model_ne.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ee2bf41a678beb9e895f66609e686861338fa214c3e9fdbbd1d4ecad3909be9
3
+ size 40303139
app/models/transformer_model_th.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3f5834dc9188f5af71da97f3e332b31e4664c3fff5dbc95bf8cf65479d8a3ad
3
+ size 27999139
app/models/transformer_model_tl.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ab6e54c69ac135b92861b7450bf5040fa0f0b8be8023d9bf38aabe04e73cf9
3
+ size 40303139