pankaj10034 committed on
Commit
9682e0d
·
1 Parent(s): e18e38d

Upload 16 files

Browse files
Files changed (16) hide show
  1. Datathon_indoml_2023/1-step training/IndoML_Phase2_1st_stage_kfold_cross_validation.ipynb +0 -0
  2. Datathon_indoml_2023/1-step training/indoml_phase2_1st_stage_Ibm_pretrained_roberta_large_model.ipynb +0 -0
  3. Datathon_indoml_2023/1-step training/indoml_phase2_1st_stage_habana_xlm_r_large_pretrained_xlm_roberta_model.ipynb +0 -0
  4. Datathon_indoml_2023/1-step training/indoml_phase2_1st_stage_roberta_large_directly.ipynb +0 -0
  5. Datathon_indoml_2023/1-step training/indoml_phase2_1st_stage_vira_chatbot_pretrained_roberta_model.ipynb +0 -0
  6. Datathon_indoml_2023/1-step training/indoml_phase2_roberta_large_data_augmentation.ipynb +0 -0
  7. Datathon_indoml_2023/2-step training/indoml_roberta_large_2nd_stage_fine_tuning.ipynb +0 -0
  8. Datathon_indoml_2023/2-step training/intent_classificiation_indoml_pretraining_roberta_large_1st_stage.ipynb +1 -0
  9. Datathon_indoml_2023/extra layer training/Indoml_phase2_extra_layer_roberta_training.ipynb +0 -0
  10. Datathon_indoml_2023/requirements.txt +13 -0
  11. Datathon_indoml_2023/testing files/testing files for all training codes/test.ipynb +1 -0
  12. Datathon_indoml_2023/testing files/testing files for extra layer training/indoml_testing_extra_layer.ipynb +0 -0
  13. Datathon_indoml_2023/trainer setfit/trainer_Setfit_ibm_roberta_large.ipynb +0 -0
  14. Datathon_indoml_2023/trainer setfit/trainer_setfit_paraphrase_MiniLM_L6_v2.ipynb +0 -0
  15. Datathon_indoml_2023/voting Ensemble/Voting_ensemble_n_number_of_files.ipynb +0 -0
  16. Datathon_indoml_2023/voting Ensemble/final_11_best_models_voting_ensemble.ipynb +0 -0
Datathon_indoml_2023/1-step training/IndoML_Phase2_1st_stage_kfold_cross_validation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/1-step training/indoml_phase2_1st_stage_Ibm_pretrained_roberta_large_model.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/1-step training/indoml_phase2_1st_stage_habana_xlm_r_large_pretrained_xlm_roberta_model.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/1-step training/indoml_phase2_1st_stage_roberta_large_directly.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/1-step training/indoml_phase2_1st_stage_vira_chatbot_pretrained_roberta_model.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/1-step training/indoml_phase2_roberta_large_data_augmentation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/2-step training/indoml_roberta_large_2nd_stage_fine_tuning.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/2-step training/intent_classificiation_indoml_pretraining_roberta_large_1st_stage.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3UzIdidWXGB6","outputId":"4abe3b81-773e-48f0-b16a-7d96ce47c4c7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting transformers\n"," Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.4)\n","Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)\n"," Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.0/302.0 kB\u001b[0m \u001b[31m34.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n","Collecting tokenizers<0.15,>=0.14 (from transformers)\n"," Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n","\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m53.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n"," Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m53.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n","Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n","Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.12)\n","Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n","Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n","Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.27.6)\n","Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (17.0.2)\n","Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n","Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)\n"," Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m36.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) 
(2.1.3)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.0)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.6)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n","Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n","Installing collected packages: safetensors, huggingface-hub, tokenizers, transformers\n","Successfully installed huggingface-hub-0.17.3 safetensors-0.4.0 tokenizers-0.14.1 transformers-4.34.0\n"]}],"source":["!pip install transformers torch"]},{"cell_type":"code","source":["!pip install -U SentencePiece"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mD9GsQSKXIwd","outputId":"9fc95e07-5d31-4767-8820-1fad580e41d2"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting SentencePiece\n"," Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n","\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.3 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━\u001b[0m\u001b[90mβ•Ί\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.2/1.3 MB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m20.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hInstalling collected packages: SentencePiece\n","Successfully installed SentencePiece-0.1.99\n"]}]},{"cell_type":"code","source":["!pip 
install accelerate -U"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ffXjIdZJXUKE","outputId":"97e551c4-fa63-4e83-8d98-8705fcaea24f"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting accelerate\n"," Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)\n","\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/258.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91mβ•Έ\u001b[0m \u001b[32m256.0/258.1 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m258.1/258.1 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate) (1.23.5)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (23.2)\n","Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n","Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate) (6.0.1)\n","Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.0.1+cu118)\n","Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.17.3)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.12.4)\n","Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (4.5.0)\n","Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.12)\n","Requirement already satisfied: networkx in 
/usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1.2)\n","Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.0.0)\n","Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.10.0->accelerate) (3.27.6)\n","Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.10.0->accelerate) (17.0.2)\n","Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->accelerate) (2023.6.0)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->accelerate) (2.31.0)\n","Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->accelerate) (4.66.1)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.3)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->accelerate) (3.3.0)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->accelerate) (3.4)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->accelerate) (2.0.6)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->accelerate) (2023.7.22)\n","Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n","Installing collected packages: accelerate\n","Successfully installed 
accelerate-0.23.0\n"]}]},{"cell_type":"code","source":["!pip install datasets"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5XjttF3RXpG2","outputId":"46566274-2e31-4cb7-86f2-897db85e97b0"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting datasets\n"," Downloading datasets-2.14.5-py3-none-any.whl (519 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.6/519.6 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.23.5)\n","Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n","Collecting dill<0.3.8,>=0.3.0 (from datasets)\n"," Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n","Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n","Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.1)\n","Collecting xxhash (from datasets)\n"," Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting multiprocess (from datasets)\n"," Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: fsspec[http]<2023.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n","Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.6)\n","Requirement already satisfied: huggingface-hub<1.0.0,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.17.3)\n","Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.2)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n","Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (3.3.0)\n","Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n","Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n","Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n","Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.0)\n","Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.14.0->datasets) (3.12.4)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.14.0->datasets) (4.5.0)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) 
(3.4)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.6)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2023.7.22)\n","Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3.post1)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n","Installing collected packages: xxhash, dill, multiprocess, datasets\n","Successfully installed datasets-2.14.5 dill-0.3.7 multiprocess-0.70.15 xxhash-3.4.1\n"]}]},{"cell_type":"code","source":["CUDA_LAUNCH_BLOCKING=1"],"metadata":{"id":"Hob3_SZ2XV5q"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"lFmbjZxDXYpf","outputId":"61d4a7bd-71d2-4e48-adca-1a79af5c5469"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"code","source":["import json\n","import random\n","import numpy as np\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import classification_report\n","from datasets import load_dataset\n","import torch\n","from transformers import BertTokenizer, BertForSequenceClassification\n","from transformers import AutoTokenizer, AutoModelForSequenceClassification\n","from transformers import RobertaTokenizer, RobertaForSequenceClassification, T5Tokenizer, T5ForSequenceClassification,TrainingArguments, Trainer"],"metadata":{"id":"py0In3CxXd8P"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# 
**Loading the ntu adl intent dataset**"],"metadata":{"id":"fAnC0Jfawt-E"}},{"cell_type":"code","source":["ntu_adl_intent = load_dataset(\"xjlulu/ntu_adl_intent\")\n","print(ntu_adl_intent[\"train\"][0])\n","print(ntu_adl_intent)\n","\n","surprise_intent = [\"accept reservations\", \"account blocked\", \"alarm\", \"application status\", \"apr\", \"are you a bot\", \"balance\", \"bill balance\", \"bill due\", \"book flight\", \"book hotel\", \"calculator\", \"calendar\", \"calendar update\", \"calories\", \"cancel\", \"cancel reservation\", \"car rental\", \"card declined\", \"carry on\", \"change accent\", \"change ai name\", \"change language\", \"change speed\", \"change user name\", \"change volume\", \"confirm reservation\", \"cook time\", \"credit limit\", \"credit limit change\", \"credit score\", \"current location\", \"damaged card\", \"date\", \"definition\", \"direct deposit\", \"directions\", \"distance\", \"do you have pets\", \"exchange rate\", \"expiration date\", \"find phone\", \"flight status\", \"flip coin\", \"food last\", \"freeze account\", \"fun fact\", \"gas\", \"gas type\", \"goodbye\", \"greeting\", \"how busy\", \"how old are you\", \"improve credit score\", \"income\", \"ingredient substitution\", \"ingredients list\", \"insurance\", \"insurance change\", \"interest rate\", \"international fees\", \"international visa\", \"jump start\", \"last maintenance\", \"lost luggage\", \"make call\", \"maybe\", \"meal suggestion\", \"meaning of life\", \"measurement conversion\", \"meeting schedule\", \"min payment\", \"mpg\", \"new card\", \"next holiday\", \"next song\", \"no\", \"nutrition info\", \"oil change how\", \"oil change when\", \"order\", \"order checks\", \"order status\", \"pay bill\", \"payday\", \"pin change\", \"play music\", \"plug type\", \"pto balance\", \"pto request\", \"pto request status\", \"pto used\", \"recipe\", \"redeem rewards\", \"reminder\", \"reminder update\", \"repeat\", \"replacement card duration\", \"report 
fraud\", \"report lost card\", \"reset settings\", \"restaurant reservation\", \"restaurant reviews\", \"restaurant suggestion\", \"rewards balance\", \"roll dice\", \"rollover 401k\", \"routing\", \"schedule maintenance\", \"schedule meeting\", \"share location\", \"shopping list\", \"shopping list update\", \"smart home\", \"spelling\", \"spending history\", \"sync device\", \"taxes\", \"tell joke\", \"text\", \"thank you\", \"time\", \"timer\", \"timezone\", \"tire change\", \"tire pressure\", \"todo list\", \"todo list update\", \"traffic\", \"transactions\", \"transfer\", \"translate\", \"travel alert\", \"travel notification\", \"travel suggestion\", \"uber\", \"update playlist\", \"user name\", \"vaccines\", \"w2\", \"weather\", \"what are your hobbies\", \"what can i ask you\", \"what is your name\", \"what song\", \"where are you from\", \"whisper mode\", \"who do you work for\", \"who made you\", \"yes\"]\n","print(len(surprise_intent))\n","\n","new_intent = ['accept_reservations', 'account_blocked', 'alarm', 'application_status', 'apr', 'are_you_a_bot', 'balance', 'bill_balance', 'bill_due', 'book_flight', 'book_hotel', 'calculator', 'calendar', 'calendar_update', 'calories', 'cancel', 'cancel_reservation', 'car_rental', 'card_declined', 'carry_on', 'change_accent', 'change_ai_name', 'change_language', 'change_speed', 'change_user_name', 'change_volume', 'confirm_reservation', 'cook_time', 'credit_limit', 'credit_limit_change', 'credit_score', 'current_location', 'damaged_card', 'date', 'definition', 'direct_deposit', 'directions', 'distance', 'do_you_have_pets', 'exchange_rate', 'expiration_date', 'find_phone', 'flight_status', 'flip_coin', 'food_last', 'freeze_account', 'fun_fact', 'gas', 'gas_type', 'goodbye', 'greeting', 'how_busy', 'how_old_are_you', 'improve_credit_score', 'income', 'ingredient_substitution', 'ingredients_list', 'insurance', 'insurance_change', 'interest_rate', 'international_fees', 'international_visa', 'jump_start', 
'last_maintenance', 'lost_luggage', 'make_call', 'maybe', 'meal_suggestion', 'meaning_of_life', 'measurement_conversion', 'meeting_schedule', 'min_payment', 'mpg', 'new_card', 'next_holiday', 'next_song', 'no', 'nutrition_info', 'oil_change_how', 'oil_change_when', 'order', 'order_checks', 'order_status', 'pay_bill', 'payday', 'pin_change', 'play_music', 'plug_type', 'pto_balance', 'pto_request', 'pto_request_status', 'pto_used', 'recipe', 'redeem_rewards', 'reminder', 'reminder_update', 'repeat', 'replacement_card_duration', 'report_fraud', 'report_lost_card', 'reset_settings', 'restaurant_reservation', 'restaurant_reviews', 'restaurant_suggestion', 'rewards_balance', 'roll_dice', 'rollover_401k', 'routing', 'schedule_maintenance', 'schedule_meeting', 'share_location', 'shopping_list', 'shopping_list_update', 'smart_home', 'spelling', 'spending_history', 'sync_device', 'taxes', 'tell_joke', 'text', 'thank_you', 'time', 'timer', 'timezone', 'tire_change', 'tire_pressure', 'todo_list', 'todo_list_update', 'traffic', 'transactions', 'transfer', 'translate', 'travel_alert', 'travel_notification', 'travel_suggestion', 'uber', 'update_playlist', 'user_name', 'vaccines', 'w2', 'weather', 'what_are_your_hobbies', 'what_can_i_ask_you', 'what_is_your_name', 'what_song', 'where_are_you_from', 'whisper_mode', 'who_do_you_work_for', 'who_made_you', 'yes']\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Sluvva2rXis8","outputId":"3294a8ac-d843-47bf-f2f5-7ac159ee5689"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["{'id': 'train-0', 'intent': 'book_flight', 'text': 'i need you to book me a flight from ft lauderdale to houston on southwest'}\n","DatasetDict({\n"," train: Dataset({\n"," features: ['id', 'intent', 'text'],\n"," num_rows: 15000\n"," })\n"," validation: Dataset({\n"," features: ['id', 'intent', 'text'],\n"," num_rows: 3000\n"," })\n"," test: Dataset({\n"," features: ['id', 'intent', 'text'],\n"," num_rows: 
7500\n"," })\n","})\n","150\n"]}]},{"cell_type":"code","source":["train_data=ntu_adl_intent['train']['text']\n","valid_data=ntu_adl_intent['validation']['text']\n","test_data=ntu_adl_intent['test']['text']"],"metadata":{"id":"MeS0Wy4MXwPS"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_data[0:5], test_data[0:5], valid_data[0:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"xq-WTTaFYHCY","outputId":"8e0f0083-d8c2-4797-e989-aafe99ec0859"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(['i need you to book me a flight from ft lauderdale to houston on southwest',\n"," 'my check engine light is on and i need to take a look at it',\n"," 'is the company party on my list of reminders',\n"," 'are you a human',\n"," 'i need to do cleaning so add it to my to do list'],\n"," ['how much are my rent and cable',\n"," 'i need some suggestions for dinner places tonight',\n"," 'can you report a lost card for me',\n"," 'timezone currently in mobile',\n"," 'tell me what to call you'],\n"," ['how long should i cook steak for',\n"," 'please tell me how much money i have in my bank accounts',\n"," 'what is the gas level in my gas tank',\n"," 'how late is it now in ourense',\n"," 'i really need to get a volkswagen car rental for march 5th to march 8th in phoenix'])"]},"metadata":{},"execution_count":6}]},{"cell_type":"code","source":["train_labels=ntu_adl_intent['train']['intent']\n","valid_labels=ntu_adl_intent['validation']['intent']\n","test_labels=ntu_adl_intent['test']['intent']"],"metadata":{"id":"vfkQmDPXYEOE"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# **Since some of intents in the dataset have _ in between them so replacing that underscore with space**"],"metadata":{"id":"xiYL1Nclw0GP"}},{"cell_type":"code","source":["import re\n","def convert_label(label):\n"," # Use re.sub() to replace underscores with spaces and capitalize words\n"," # converted_label = 
re.sub(r'_', ' ', label).title()\n"," converted_label = re.sub(r'_', ' ', label) # .title() is for captializing the words\n"," return converted_label\n","\"\"\"We use the map function to apply the convert_label function to each element of the train labels.\n"," The result is a map object, so we convert it to a list to see the converted labels.\"\"\"\n","new_train_labels=list(map(convert_label, train_labels))\n","new_valid_labels=list(map(convert_label, valid_labels))\n","new_test_labels=list(map(convert_label, test_labels))\n","\n"],"metadata":{"id":"PpUR27KnYYXG"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# # Assuming you have train_data, train_labels, test_data, and test_labels as lists\n","# combined_train_data = train_data + test_data\n","# combined_train_labels = new_train_labels + new_test_labels"],"metadata":{"id":"eMaO3kFUaIk0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# combined_train_data[0:5], len(combined_train_data)"],"metadata":{"id":"NPxBclQXbh80"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# combined_train_labels[0:5], len(combined_train_labels)"],"metadata":{"id":"c6MiiasIbnnl"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["intents_set=set(surprise_intent)\n","print(intents_set)\n","len(intents_set)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ciJa8UTBbrfg","outputId":"27f62cd4-b760-4a77-e389-ca01abd2895d"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["{'min payment', 'who do you work for', 'redeem rewards', 'carry on', 'translate', 'change language', 'payday', 'bill due', 'food last', 'interest rate', 'directions', 'shopping list', 'gas type', 'maybe', 'calculator', 'restaurant reviews', 'tire pressure', 'application status', 'text', 'taxes', 'travel suggestion', 'timezone', 'order status', 'calendar update', 'last maintenance', 'travel alert', 'freeze account', 'restaurant reservation', 'do 
you have pets', 'plug type', 'car rental', 'book hotel', 'change user name', 'transfer', 'user name', 'goodbye', 'todo list', 'fun fact', 'sync device', 'cancel reservation', 'pto request', 'improve credit score', 'thank you', 'what are your hobbies', 'credit score', 'vaccines', 'mpg', 'next holiday', 'jump start', 'order checks', 'tell joke', 'time', 'replacement card duration', 'whisper mode', 'ingredients list', 'smart home', 'book flight', 'recipe', 'tire change', 'next song', 'reminder', 'pto balance', 'repeat', 'credit limit', 'w2', 'new card', 'apr', 'account blocked', 'international fees', 'current location', 'exchange rate', 'find phone', 'play music', 'meal suggestion', 'income', 'share location', 'international visa', 'measurement conversion', 'cook time', 'todo list update', 'yes', 'damaged card', 'lost luggage', 'expiration date', 'ingredient substitution', 'direct deposit', 'are you a bot', 'reminder update', 'timer', 'calendar', 'routing', 'who made you', 'spending history', 'weather', 'roll dice', 'distance', 'travel notification', 'insurance', 'bill balance', 'change accent', 'make call', 'update playlist', 'flip coin', 'gas', 'definition', 'credit limit change', 'schedule meeting', 'insurance change', 'report fraud', 'rewards balance', 'oil change how', 'date', 'oil change when', 'order', 'flight status', 'rollover 401k', 'confirm reservation', 'card declined', 'change ai name', 'what can i ask you', 'what is your name', 'balance', 'how old are you', 'greeting', 'how busy', 'transactions', 'meaning of life', 'shopping list update', 'report lost card', 'spelling', 'schedule maintenance', 'alarm', 'uber', 'where are you from', 'accept reservations', 'restaurant suggestion', 'change speed', 'pto used', 'what song', 'meeting schedule', 'nutrition info', 'change volume', 'pto request status', 'pin change', 'calories', 'pay bill', 'traffic', 'no', 'cancel', 'reset 
settings'}\n"]},{"output_type":"execute_result","data":{"text/plain":["150"]},"metadata":{},"execution_count":12}]},{"cell_type":"code","source":["id2label={}\n","label2id={}\n","for i, intent in enumerate(intents_set):\n"," id2label[i] = intent\n"," label2id[intent]=i\n","\n"],"metadata":{"id":"7DHWBt0_cP7V"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# **Loading the tokenizer and pretrained model**"],"metadata":{"id":"Ga2_qk9Ow9an"}},{"cell_type":"code","source":["from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification\n","\n","model_name = 'roberta-large'\n","\n","# Initialize the tokenizer\n","tokenizer = RobertaTokenizer.from_pretrained(model_name)\n"],"metadata":{"id":"2fCpwpqZcSYQ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Initialize the model\n","num_classes=150\n","model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)\n","model.label2id=label2id\n","model.id2label=id2label\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QvRKoOUjccO6","outputId":"93e5e909-0544-4773-a8b6-90c572279d8b"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']\n","You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"]}]},{"cell_type":"code","source":["# Tokenize the input data\n","train_encodings = tokenizer(\n"," train_data,\n"," truncation=True,\n"," padding=True,\n"," max_length=64,\n"," return_tensors='pt' #return type is pytorch tensor\n",")\n","\n","test_encodings = tokenizer(\n"," valid_data,\n"," truncation=True,\n"," padding=True,\n"," max_length=64,\n"," 
return_tensors='pt'\n",")"],"metadata":{"id":"9BoWDGT1cfa4"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["train_int_labels = [label2id[label] for label in new_train_labels]\n","test_int_labels = [label2id[label] for label in new_valid_labels]"],"metadata":{"id":"hLEBGtk7cqRs"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["len(train_data), len(valid_data)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"u_oBUeb6IzYM","outputId":"7ee9cc2e-dbd9-48bc-9cce-ec018df561b2"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(15000, 3000)"]},"metadata":{},"execution_count":18}]},{"cell_type":"code","source":["# Create PyTorch datasets\n","class IntentDataset(torch.utils.data.Dataset):\n"," def __init__(self, encodings, labels):\n"," self.encodings = encodings\n"," self.labels = labels\n","\n"," def __getitem__(self, idx):\n"," item = {key: val[idx] for key, val in self.encodings.items()}\n"," item['labels'] = torch.tensor(self.labels[idx])\n"," return item\n","\n"," def __len__(self):\n"," return len(self.labels)\n","\n","train_dataset = IntentDataset(train_encodings, train_int_labels)\n","test_dataset = IntentDataset(test_encodings, test_int_labels)"],"metadata":{"id":"otxKr5cjc6-z"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from torch.utils.data import DataLoader\n","batch_size = 64 # Adjust the batch size as needed\n","train_dataloader = DataLoader(\n"," train_dataset,\n"," batch_size=batch_size,\n"," shuffle=True # You can shuffle your data for randomness during training\n",")\n","eval_dataloader = DataLoader(\n"," test_dataset,\n"," batch_size=batch_size,\n"," shuffle=False # You can shuffle your data for randomness during training\n",")\n"],"metadata":{"id":"658xBTusdhP8"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# **Training the model**"],"metadata":{"id":"Tr09XS68xKrq"}},{"cell_type":"code","source":["# 
Training arguments\n","saved_dir='/content/drive/MyDrive/intent_classification_It_bombay/trained_model_roberta_large_pretraining'\n","training_args = TrainingArguments(\n"," output_dir=saved_dir,\n"," per_device_train_batch_size=16,\n"," per_device_eval_batch_size=16,\n"," # gradient_accumulation_steps=40,\n"," evaluation_strategy=\"steps\", # use 'epoch' for evaluating every epoch\n"," logging_steps=10,\n"," eval_steps=10,\n"," save_total_limit=5,\n"," learning_rate=1e-4,\n"," # warmup_steps=400,\n"," # weight_decay=0.10,\n"," # adam_epsilon=1e-7,\n"," # warmup_steps=400,\n"," num_train_epochs=25,\n"," logging_dir='./logs',\n",")"],"metadata":{"id":"_1kuAYLic_RN"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import os\n","import torch\n","from transformers import AdamW, get_linear_schedule_with_warmup\n","from transformers import TrainingArguments\n","from sklearn.model_selection import KFold\n","from tqdm import tqdm\n","\n","# Define your model, tokenizer, and other necessary components here\n","# Make sure your model is moved to the GPU if available\n","\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","model.to(device)\n","\n","# Define the learning rate as a float (e.g., 7e-6)\n","learning_rate = 2e-5\n","\n","# Define the number of training epochs as an integer (e.g., 40)\n","num_train_epochs = 40\n","\n","# Create the optimizer\n","optimizer = AdamW(model.parameters(), lr=learning_rate)\n","\n","# Define the number of training steps (you may need to adjust this based on your dataset)\n","total_train_steps = len(train_dataloader) * num_train_epochs\n","\n","# Define the number of warmup steps (e.g., 10% of the total training steps)\n","num_warmup_steps = int(0.1 * total_train_steps)\n","\n","# Create a learning rate scheduler\n","lr_scheduler = get_linear_schedule_with_warmup(\n"," optimizer,\n"," num_warmup_steps=num_warmup_steps,\n"," num_training_steps=total_train_steps,\n",")\n","\n","# Define the 
output directory for saving the model\n","# we will saved the model weights after every epoch\n","saved_dir='/content/drive/MyDrive/intent_classification_It_bombay/trained_final_step_roberta_large_pretraining'\n","\n","# Training loop with tqdm progress bar\n","for epoch in range(num_train_epochs):\n"," model.train()\n"," model.to(device)\n"," train_loss = 0.0\n","\n"," progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f\"Epoch {epoch}\")\n","\n"," for step, batch in progress_bar:\n"," # Move data to the GPU\n"," batch = {k: v.to(device) for k, v in batch.items()}\n","\n"," # Forward pass\n"," outputs = model(**batch)\n"," loss = outputs.loss\n","\n"," # Backward pass and optimization\n"," loss.backward()\n"," optimizer.step()\n"," optimizer.zero_grad()\n","\n"," # Update the learning rate\n"," lr_scheduler.step()\n","\n"," # Accumulate the total loss\n"," train_loss += loss.item()\n","\n"," # Log loss and other metrics if needed\n"," if step % training_args.logging_steps == 0:\n"," avg_loss = train_loss / (step + 1)\n"," progress_bar.set_postfix(loss=avg_loss)\n","\n"," # Save the model checkpoint at the end of each epoch\n"," checkpoint_dir = os.path.join(saved_dir, f\"epoch_{epoch}\")\n"," os.makedirs(checkpoint_dir, exist_ok=True)\n","\n"," # Save model weights\n"," model.save_pretrained(checkpoint_dir)\n","\n"," # Save optimizer state\n"," # torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, \"optimizer.pt\"))\n","\n"," # Save learning rate scheduler state\n"," # torch.save(lr_scheduler.state_dict(), os.path.join(checkpoint_dir, \"scheduler.pt\"))\n","\n"," # Save RNG state (if needed)\n"," # torch.save(torch.get_rng_state(), os.path.join(checkpoint_dir, \"rng_state.pth\"))\n","\n"," # Save config.json (if needed)\n"," model.config.save_pretrained(checkpoint_dir)\n","\n"," # Save tokenizer (if needed)\n"," tokenizer.save_pretrained(checkpoint_dir)\n","\n"," # Evaluation loop with tqdm progress bar\n"," 
model.eval()\n"," eval_loss = 0.0\n","\n"," progress_bar_eval = tqdm(enumerate(eval_dataloader), total=len(eval_dataloader), desc=f\"Evaluation\")\n","\n"," for step, batch in progress_bar_eval:\n"," batch = {k: v.to(device) for k, v in batch.items()}\n"," with torch.no_grad():\n"," outputs = model(**batch)\n"," loss = outputs.loss\n"," eval_loss += loss.item()\n","\n"," avg_eval_loss = eval_loss / len(eval_dataloader)\n"," print(f\"Epoch {epoch}: Evaluation Loss: {avg_eval_loss}\")\n"," print(f'learning rate is : {learning_rate}')\n","\n"," # if (epoch + 1) % 3 == 0:\n"," # learning_rate *= 0.75\n"," # for param_group in optimizer.param_groups:\n"," # param_group['lr'] = learning_rate\n"],"metadata":{"id":"1bzRBJh-dIoS"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# raw_predictions = model.predict(test_dataset)\n","# predicted_labels = np.argmax(raw_predictions.predictions)\n","# predicted_labels, raw_predictions\n","import numpy as np\n","import torch\n","# use best model weights(checkpoint) from above on the basis of best validation score to get the predictions on the ntu adl validation dataset\n","model_name=\"/content/drive/MyDrive/intent_classification_It_bombay/trained_final_step_roberta_large_pretraining/epoch_12\"\n","model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)\n","device=torch.device('cuda') if torch.cuda.is_available() else \"cpu\"\n","# Put your test data into a DataLoader or batch format if it's not already\n","test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)\n","\n","# Set the model to evaluation mode\n","model.to(device)\n","model.eval()\n","\n","predicted_labels = []\n","raw_predictions = []\n","\n","with torch.no_grad():\n"," for batch in test_dataloader:\n"," # Move batch to the GPU if available\n"," batch = {k: v.to(device) for k, v in batch.items()}\n","\n"," # Forward pass\n"," outputs = model(**batch)\n"," logits = outputs.logits\n","\n"," 
# Get predicted labels (argmax)\n"," batch_predictions = np.argmax(logits.cpu().numpy(), axis=1)\n","\n"," # Append batch predictions to the list\n"," predicted_labels.extend(batch_predictions)\n","\n"," # Append raw logits if needed\n"," raw_predictions.extend(logits.cpu().numpy())\n","\n","# Convert the results to numpy arrays\n","predicted_labels = np.array(predicted_labels)\n","raw_predictions = np.array(raw_predictions)\n","\n","# Now, you have predicted labels and raw logits.\n"],"metadata":{"id":"UMRvyBmQd1jk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Calculate accuracy, precision, recall, and F1-score\n","report = classification_report(test_int_labels, predicted_labels, target_names=list(label2id.keys()), output_dict=True)\n","print(report)\n","print('\\n')\n","accuracy = report['accuracy']\n","precision = report['macro avg']['precision']\n","recall = report['macro avg']['recall']\n","f1_score = report['macro avg']['f1-score']\n","\n","print(f\"Accuracy: {accuracy:.4f}\")\n","print(f\"Precision: {precision:.4f}\")\n","print(f\"Recall: {recall:.4f}\")\n","print(f\"F1 Score: {f1_score:.4f}\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"FiRjdSp3mSGY","outputId":"9e0cf6cf-7239-4644-cc0e-8316ece9ca2e"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["{'min payment': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'who do you work for': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'redeem rewards': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'carry on': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'translate': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'change language': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'payday': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'bill due': {'precision': 0.0, 'recall': 0.0, 'f1-score': 
0.0, 'support': 20}, 'food last': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'interest rate': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'directions': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'shopping list': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'gas type': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'maybe': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'calculator': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'restaurant reviews': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'tire pressure': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'application status': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'text': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'taxes': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'travel suggestion': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'timezone': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'order status': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'calendar update': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'last maintenance': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'travel alert': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'freeze account': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'restaurant reservation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'do you have pets': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'plug type': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'car rental': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'book hotel': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 
'support': 20}, 'change user name': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'transfer': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'user name': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'goodbye': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'todo list': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'fun fact': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'sync device': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'cancel reservation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'pto request': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'improve credit score': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'thank you': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'what are your hobbies': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'credit score': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'vaccines': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'mpg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'next holiday': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'jump start': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'order checks': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'tell joke': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'time': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'replacement card duration': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'whisper mode': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'ingredients list': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'smart home': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 
'book flight': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'recipe': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'tire change': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'next song': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'reminder': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'pto balance': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'repeat': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'credit limit': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'w2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'new card': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'apr': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'account blocked': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'international fees': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'current location': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'exchange rate': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'find phone': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'play music': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'meal suggestion': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'income': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'share location': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'international visa': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'measurement conversion': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'cook time': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'todo list update': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'yes': {'precision': 0.0, 
'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'damaged card': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'lost luggage': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'expiration date': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'ingredient substitution': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'direct deposit': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'are you a bot': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'reminder update': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'timer': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'calendar': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'routing': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'who made you': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'spending history': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'weather': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'roll dice': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'distance': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'travel notification': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'insurance': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'bill balance': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'change accent': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'make call': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'update playlist': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'flip coin': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'gas': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'definition': {'precision': 0.0, 'recall': 0.0, 
'f1-score': 0.0, 'support': 20}, 'credit limit change': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'schedule meeting': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'insurance change': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'report fraud': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'rewards balance': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'oil change how': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'date': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'oil change when': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'order': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'flight status': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'rollover 401k': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'confirm reservation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'card declined': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'change ai name': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'what can i ask you': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'what is your name': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'balance': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'how old are you': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'greeting': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'how busy': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'transactions': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'meaning of life': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'shopping list update': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'report lost card': 
{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'spelling': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'schedule maintenance': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'alarm': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'uber': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'where are you from': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'accept reservations': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'restaurant suggestion': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'change speed': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'pto used': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'what song': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'meeting schedule': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 20}, 'nutrition info': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'change volume': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'pto request status': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'pin change': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'calories': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'pay bill': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'traffic': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'no': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'cancel': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'reset settings': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'accuracy': 0.006666666666666667, 'macro avg': {'precision': 0.006666666666666667, 'recall': 0.006666666666666667, 'f1-score': 0.006666666666666667, 'support': 3000}, 'weighted avg': {'precision': 
0.006666666666666667, 'recall': 0.006666666666666667, 'f1-score': 0.006666666666666667, 'support': 3000}}\n","\n","\n","Accuracy: 0.0067\n","Precision: 0.0067\n","Recall: 0.0067\n","F1 Score: 0.0067\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"s-4-JhJNmaFU"},"execution_count":null,"outputs":[]}]}
Datathon_indoml_2023/extra layer training/Indoml_phase2_extra_layer_roberta_training.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers==4.34.1
2
+ torch==2.1.0
3
+ accelerate==0.23.0
4
+ scikit-learn==0.24.2
5
+ setfit==0.7.0
6
+ nlpaug==1.1.11
7
+ sentence_transformers
8
+ numpy==1.26.1
9
+ pandas==2.1.1
10
+ sentencepiece==0.1.99
11
+ datasets==2.14.5
12
+ huggingface_hub==0.18.0
13
+
Datathon_indoml_2023/testing files/testing files for all training codes/test.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells":[{"cell_type":"markdown","metadata":{"id":"E-OaLrjOvAP3"},"source":["# **Downloading the dependencies**"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"XOvfSP9gvAP8","outputId":"2bd3517c-2795-49f4-d605-a7320cf389d7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting transformers\n"," Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m20.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.4)\n","Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)\n"," Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.0/302.0 kB\u001b[0m \u001b[31m41.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n","Collecting tokenizers<0.15,>=0.14 (from transformers)\n"," Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m51.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting 
safetensors>=0.3.1 (from transformers)\n"," Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m50.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n","Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n","Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)\n"," Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m38.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.0)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n","Installing collected packages: safetensors, huggingface-hub, tokenizers, transformers\n","Successfully installed huggingface-hub-0.17.3 safetensors-0.4.0 tokenizers-0.14.1 transformers-4.34.1\n"]}],"source":["!pip install transformers"]},{"cell_type":"markdown","source":["# **Connecting google colab to drive**"],"metadata":{"id":"COar6kq8vETt"}},{"cell_type":"code","source":["from 
google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pz7WesrSvDr_","outputId":"9675dc56-71ca-485f-bf24-ef5a3795c0f6"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"markdown","metadata":{"id":"bYSVMm9JvAP9"},"source":["# **Importing the necessary modules**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-GwjyIugvAP-"},"outputs":[],"source":["import json\n","import pandas as pd\n","import os\n","import torch\n","from transformers import RobertaTokenizer, RobertaForSequenceClassification\n","from torch.utils.data import DataLoader, TensorDataset"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"iooZvgNqvAP-"},"outputs":[],"source":["device=torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n","num_classe=150"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"hBXbEu-wvAP-"},"outputs":[],"source":["def test(model_path, data_path):\n"," solution_file_path=os.path.join(data_path,'surprise.solution')\n"," test_data_path=os.path.join(data_path,'massive_test.data')\n"," # loading surprise.solution file for getting id2label and label2id mapping\n"," with open(solution_file_path,'r') as solutions_file:\n"," solutions=[json.loads(line) for line in solutions_file] # reading json data from data_path and parse it into a test_data list\n","\n"," labels_list=[]\n"," for label in solutions:\n"," labels_list.append(label['intent'])\n"," unique_labels_list=[]\n"," for x in labels_list:\n"," if x not in unique_labels_list:\n"," unique_labels_list.append(x)\n"," # unique_labels_list, len(unique_labels_list)\n","\n"," label2id={}\n"," id2label={}\n"," for i, label in enumerate(unique_labels_list):\n"," label2id[label]=i\n"," id2label[i]=label\n"," # print(list(id2label.items())[:5])\n"," # print('\\n')\n"," # print(list(label2id.items())[:5])\n"," # loading 
testing data file\n"," with open(test_data_path,'r') as test_file:\n"," test_data=[json.loads(line) for line in test_file] # reading json data from data_path and parse it into a test_data list\n","\n"," num_classes=150\n"," # loading pretrained tokenizer\n"," tokenizer=RobertaTokenizer.from_pretrained(model_path)\n"," test_utt=[item['utt'] for item in test_data]\n"," test_data_encodings=tokenizer(test_utt, padding=True, truncation=True, return_tensors=\"pt\") # getting the encodings of testing data\n","\n"," # Convert the encodings into tensors\n"," input_ids = test_data_encodings['input_ids']\n"," attention_mask = test_data_encodings['attention_mask']\n"," # token_type_ids = test_set_encodings['token_type_ids'] # may be useful if our pretrained model is of type then roberta like BERT\n","\n"," # Create a TensorDataset\n"," test_dataset = TensorDataset(input_ids, attention_mask,)\n","\n"," # Define batch size\n"," batch_size = 32\n","\n"," # Create a DataLoader\n"," test_dataloader = DataLoader(test_dataset, batch_size=batch_size)\n","\n"," # loading the pretrained model\n"," model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=num_classes, ignore_mismatched_sizes=True)\n"," model.to(device)\n","\n"," # Initialize an empty list to store predictions\n"," predictions = []\n","\n"," # Set the model in evaluation mode\n"," model.eval()\n","\n"," # Iterate through the batches in the DataLoader\n"," for batch in test_dataloader:\n"," # Unpack the batch\n"," input_ids, attention_mask = batch\n","\n"," # Move tensors to the device (e.g., GPU if available)\n"," input_ids = input_ids.to(device)\n"," attention_mask = attention_mask.to(device)\n","\n","\n"," # Forward pass to get logits\n"," with torch.no_grad():\n"," outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n","\n"," # Extract the logits tensor from the outputs\n"," logits = outputs.logits\n","\n"," # Apply softmax to get class probabilities\n"," probabilities = 
torch.softmax(logits, dim=1)\n","\n"," # Get the predicted class (index with the highest probability)\n"," predicted_class = torch.argmax(probabilities, dim=1)\n","\n"," # Append the predicted class to the list of predictions\n"," predictions.extend(predicted_class.tolist())\n","\n"," predictions=torch.tensor(predictions) # predicted id for all the utterance of the testing data\n","\n"," predicted_labels=[id2label[int(id)] for id in list(predictions)] # converting those id into labels using id2label mapping made above\n","\n"," # converting these label with their id into pandas Dataframe\n"," my_id=[]\n"," my_intent=[]\n"," my_dict1={}\n","\n"," for i, entry in enumerate(test_data):\n"," my_id.append({'indoml_id':i+1,'intent':predicted_labels[i]})\n"," my_intent.append(predicted_labels[i])\n"," my_dict1['id']=my_id\n"," my_dict1['intent']=my_intent\n","\n","\n"," my_dict1_pd=pd.DataFrame.from_dict(my_dict1)\n"," print(my_dict1_pd)\n","\n"," # Converting the predictions into the desired format taken from the 1st column of my_dict1_pd dataframe\n"," # This output.predict file will got saved in the same directory where this jupyter file is present, you can also change the path of this where you want to save it accordingly.\n"," with open('output.predict', 'w') as out_file:\n"," for entry in my_dict1_pd['id']:\n"," out_file.write(str(entry))\n"," out_file.write('\\n')\n"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ib2sgAGyvAP_","outputId":"4c74de42-7ac9-4cd7-c88a-ff6e6cb58aee"},"outputs":[{"output_type":"stream","name":"stdout","text":[" id \\\n","0 {'indoml_id': 1, 'intent': 'what song'} \n","1 {'indoml_id': 2, 'intent': 'change volume'} \n","2 {'indoml_id': 3, 'intent': 'time'} \n","3 {'indoml_id': 4, 'intent': 'smart home'} \n","4 {'indoml_id': 5, 'intent': 'carry on'} \n","... ... 
\n","5995 {'indoml_id': 5996, 'intent': 'cancel'} \n","5996 {'indoml_id': 5997, 'intent': 'timezone'} \n","5997 {'indoml_id': 5998, 'intent': 'roll dice'} \n","5998 {'indoml_id': 5999, 'intent': 'carry on'} \n","5999 {'indoml_id': 6000, 'intent': 'restaurant rese... \n","\n"," intent \n","0 what song \n","1 change volume \n","2 time \n","3 smart home \n","4 carry on \n","... ... \n","5995 cancel \n","5996 timezone \n","5997 roll dice \n","5998 carry on \n","5999 restaurant reservation \n","\n","[6000 rows x 2 columns]\n"]}],"source":["# model_path=\"C:/Users/panka/Downloads/epoch_16-20231021T144106Z-001/epoch_16/\"\n","# data_path=\"C:/Users/panka/Desktop/IndoML/input_data_latest/indoml_phase2_data/\"\n","model_path=\"/content/drive/MyDrive/massive_accuracy_files_in_descending_order/intent_classification_It_bombay/trained_model_11_0.25_data_split_lr_4e_5_checkpoints/epoch_16\"\n","data_path=\"/content/drive/MyDrive/massive_accuracy_files_in_descending_order/intent_classification_It_bombay (1)/indoml_iit_bombay/surprise_data/\" # Directory or folder containing paths of all the files related to surprise data and massive testing data.\n","out_file=test(model_path, data_path)\n","# Convert the list of predictions to a tensor\n","\n"]},{"cell_type":"code","source":[],"metadata":{"id":"-hy_wAQm0hd3"},"execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":0}
Datathon_indoml_2023/testing files/testing files for extra layer training/indoml_testing_extra_layer.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/trainer setfit/trainer_Setfit_ibm_roberta_large.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/trainer setfit/trainer_setfit_paraphrase_MiniLM_L6_v2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/voting Ensemble/Voting_ensemble_n_number_of_files.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Datathon_indoml_2023/voting Ensemble/final_11_best_models_voting_ensemble.ipynb ADDED
The diff for this file is too large to render. See raw diff