diff --git "a/DynamicPadding.ipynb" "b/DynamicPadding.ipynb" new file mode 100644--- /dev/null +++ "b/DynamicPadding.ipynb" @@ -0,0 +1,128394 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fff7f8c170ef477eabc03bb51111e63f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/3668 [00:003\u001b[0m train_dataloader \u001b[39m=\u001b[39m DataLoader(tokenized_datasets[\u001b[39m'\u001b[39m\u001b[39mtrain\u001b[39m\u001b[39m'\u001b[39m],\n\u001b[0;32m 4\u001b[0m batch_size\u001b[39m=\u001b[39m\u001b[39m16\u001b[39m,\n\u001b[0;32m 5\u001b[0m shuffle\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m 7\u001b[0m \u001b[39mfor\u001b[39;00m step, batch \u001b[39min\u001b[39;00m \u001b[39menumerate\u001b[39m(train_dataloader):\n\u001b[1;32m----> 8\u001b[0m \u001b[39mprint\u001b[39m(batch[\u001b[39m'\u001b[39;49m\u001b[39minput_ids\u001b[39;49m\u001b[39m'\u001b[39;49m]\u001b[39m.\u001b[39;49mshape)\n\u001b[0;32m 9\u001b[0m \u001b[39mif\u001b[39;00m step\u001b[39m>\u001b[39m\u001b[39m5\u001b[39m:\n\u001b[0;32m 10\u001b[0m \u001b[39mbreak\u001b[39;00m\n", + "\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'shape'" + ] + } + ], + "source": [ + "from torch.utils.data import DataLoader\n", + "\n", + "train_dataloader = DataLoader(tokenized_datasets['train'],\n", + " batch_size=16,\n", + " shuffle=True)\n", + "\n", + "for step, batch in enumerate(train_dataloader):\n", + " print(batch['input_ids'].shape)\n", + " if step>5:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'As you can see, the input_ids are a list at the moment and so we\\nare facing this issue. 
we need to convert this into a tensor!'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''As you can see, the input_ids are a list at the moment and so we\n", + "are facing this issue. we need to convert this into a tensor!'''" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n", + "tokenized_datasets = tokenized_datasets.remove_columns(['idx','sentence1','sentence2'])\n", + "tokenized_datasets = tokenized_datasets.rename_column('label','labels')\n", + "tokenized_datasets = tokenized_datasets.with_format('pytorch')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Now that you changed the format to pytorch, you will be able to input \\nthis tokenized dataset into the dataloader!'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''Now that you changed the format to pytorch, you will be able to input \n", + "this tokenized dataset into the dataloader!'''" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([16, 128])\n", + "torch.Size([16, 128])\n", + "torch.Size([16, 128])\n", + "torch.Size([16, 128])\n", + "torch.Size([16, 128])\n", + "torch.Size([16, 128])\n", + "torch.Size([16, 128])\n" + ] + } + ], + "source": [ + "from torch.utils.data import DataLoader\n", + "\n", + "train_dataloader = DataLoader(tokenized_datasets['train'], \n", + " batch_size=16,\n", + " shuffle=True)\n", + "\n", + "for step, batch in enumerate(train_dataloader):\n", + " print(batch['input_ids'].shape)\n", + " if step>5:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dynamic Padding" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'To apply dynamic padding:\\nwe must refer the part to the batch preparation!\\nSo we remove that part of padding from our tokenize function\\nbut leave the truncation part to True'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''To apply dynamic padding:\n", + "we must refer the part to the batch preparation!\n", + "So we remove that part of padding from our tokenize function\n", + "but leave the truncation part to True'''" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7591fbc9685244a48d641fb7501254d8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/408 [00:005:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}