ADD pos

Files changed (5) hide show

Test.ipynb +338 -0
models/bert_out_model/en09/config.json +87 -0
models/bert_out_model/en09/eval_results.txt +37 -0
models/bert_out_model/en09/pytorch_model.bin +3 -0
models/bert_out_model/en09/vocab.txt +0 -0

Test.ipynb ADDED Viewed

	@@ -0,0 +1,338 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import numpy as np\n",
+    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+    "max_len  = 45"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# POS tag names in class-index order: a tag's position in this list is its index\n",
+    "tag2idx = {tag: idx for idx, tag in enumerate([\n",
+    "    'X', 'YM', '[CLS]', 'DUM', 'VBF', 'RP', 'VBKO', 'CS',\n",
+    "    'VBX', 'VBNE', 'CC', 'Unknown', 'PKO', 'JJM', 'PLE', 'VBO',\n",
+    "    'HRU', 'YF', 'NN', 'YQ', 'VBI', '[SEP]', 'JJ', 'POP',\n",
+    "    'PLAI', 'RBO', 'PP', 'CD', 'NNP',\n",
+    "])}\n",
+    "\n",
+    "# Reverse lookup: class index -> tag name\n",
+    "tag2name = {idx: tag for tag, idx in tag2idx.items()}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Human-readable description of each POS tag, keyed by the tag names used in tag2idx\n",
+    "tag_2_nees = {\n",
+    "    'NN': 'Noun',\n",
+    "    'JJ': 'Normal/Unmarked Adjective',\n",
+    "    # NNP is the proper-noun tag; plurality is carried by the separate HRU tag\n",
+    "    'NNP': 'Proper Noun',\n",
+    "    'POP': 'Other Postpositions',\n",
+    "    'PKO': 'Ko-Postpositions',\n",
+    "    'YF': 'Sentence-final Punctuation',\n",
+    "    'CD': 'Cardinal Digits',\n",
+    "    'PLE': 'Postpositions(Le- postpositions)',\n",
+    "    'VBF': 'Finite Verb',\n",
+    "    'HRU': 'Plural Marker',\n",
+    "    'YM': 'Sentence-medial punctuation',\n",
+    "    'VBX': 'Auxiliary Verb',\n",
+    "    'VBKO': 'Verb aspectual participle',\n",
+    "    'CC': 'Coordinating conjunction',\n",
+    "    'DUM': 'Pronoun unmarked demonstrative',\n",
+    "    'VBNE': 'Verb(Prospective participle)',\n",
+    "    'VBO': 'Other participle verb',\n",
+    "    'PLAI': 'Postpositions(Lai-Postpositions)',\n",
+    "    'RBO': 'Adverb(Other Adverb)',\n",
+    "    'VBI': 'Verb Infinitive',\n",
+    "    'YQ': 'Quotation Marks',\n",
+    "    'PP': 'Possessive pronoun',\n",
+    "    'JJM': 'Marked adjective',\n",
+    "    'CS': 'Subordinating conjunction appearing before/after the clause it subordinates',\n",
+    "    'RP': 'Particle',\n",
+    "    # Tags present in tag2idx that previously had no description, so looking\n",
+    "    # them up raised KeyError. In the stored run 'X' is the tag predicted for\n",
+    "    # '##' word-piece continuations.\n",
+    "    'X': 'Word-piece continuation',\n",
+    "    'Unknown': 'Unknown',\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ! pip install transformers\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import BertForMaskedLM, BertTokenizer\n",
+    "\n",
+    "# Directory that holds the fine-tuned weights, config.json and vocab.txt\n",
+    "vocab_file_dir = './models/bert_out_model/en09'\n",
+    "\n",
+    "# Load the fine-tuned model; attentions and hidden states are not returned\n",
+    "model = BertForMaskedLM.from_pretrained(\n",
+    "    vocab_file_dir,\n",
+    "    num_labels=len(tag2idx),\n",
+    "    output_attentions=False,\n",
+    "    output_hidden_states=False,\n",
+    ")\n",
+    "\n",
+    "# Tokenizer built from the vocabulary in the same directory\n",
+    "tokenizer = BertTokenizer.from_pretrained(\n",
+    "    vocab_file_dir,\n",
+    "    strip_accents=False,\n",
+    "    clean_text=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def Get_POS(test_query):\n",
+    "    \"\"\"Predict a part-of-speech tag for every word of `test_query`.\n",
+    "\n",
+    "    Relies on the notebook-level `tokenizer`, `model`, `max_len`, `tag2name`\n",
+    "    and `tag_2_nees`. Each token and its predicted tag are printed as a\n",
+    "    side effect.\n",
+    "\n",
+    "    Args:\n",
+    "        test_query: Sentence to tag, as a single string.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple `(words, tag_names)` of two equal-length lists: the words with\n",
+    "        their '##' word pieces merged back together, and the description of\n",
+    "        the tag predicted for the first piece of each word. A tag without an\n",
+    "        entry in `tag_2_nees` is returned under its raw tag name.\n",
+    "    \"\"\"\n",
+    "    # [CLS] + word pieces, trimmed so that the closing [SEP] still fits\n",
+    "    # within max_len positions.\n",
+    "    temp_token = ['[CLS]'] + tokenizer.tokenize(test_query)\n",
+    "    temp_token = temp_token[:max_len - 1]\n",
+    "    temp_token.append('[SEP]')\n",
+    "\n",
+    "    # Token ids, right-padded with 0 up to max_len\n",
+    "    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(temp_token)],\n",
+    "                              maxlen=max_len, dtype='long',\n",
+    "                              truncating='post', padding='post')\n",
+    "    # 1 marks a real token, 0 marks padding (id 0)\n",
+    "    attention_masks = [[int(i > 0) for i in ii] for ii in input_ids]\n",
+    "\n",
+    "    input_ids = torch.tensor(input_ids)\n",
+    "    attention_masks = torch.tensor(attention_masks)\n",
+    "\n",
+    "    # Inference only: switch the model to eval mode and skip gradient tracking\n",
+    "    model.eval()\n",
+    "    with torch.no_grad():\n",
+    "        # Pass the mask so padding positions are ignored by attention; with\n",
+    "        # attention_mask=None every padded position would be attended to.\n",
+    "        outputs = model(input_ids, token_type_ids=None,\n",
+    "                        attention_mask=attention_masks)\n",
+    "        # The first element of the outputs holds the logits\n",
+    "        logits = outputs[0]\n",
+    "\n",
+    "    # Highest-scoring tag index per position. Softmax is monotonic, so the\n",
+    "    # argmax of the raw logits selects the same tags.\n",
+    "    result_list = np.argmax(logits[0].detach().cpu().numpy(), axis=-1)\n",
+    "\n",
+    "    # Collect (and print) the prediction for every non-padding position\n",
+    "    pieces, piece_tags = [], []\n",
+    "    for i, mark in enumerate(attention_masks[0]):\n",
+    "        if mark > 0:\n",
+    "            print('Token:%s' % (temp_token[i]))\n",
+    "            pieces.append(temp_token[i])\n",
+    "            print('Predict_Tag:%s' % (tag2name[result_list[i]]))\n",
+    "            piece_tags.append(result_list[i])\n",
+    "\n",
+    "    # Glue '##' continuation pieces back onto the preceding word; a word keeps\n",
+    "    # the tag predicted for its first piece.\n",
+    "    new_tokens, new_labels = [], []\n",
+    "    for token, label_idx in zip(pieces, piece_tags):\n",
+    "        if token.startswith('##'):\n",
+    "            new_tokens[-1] = new_tokens[-1] + token[2:]\n",
+    "        else:\n",
+    "            new_labels.append(tag2name[label_idx])\n",
+    "            new_tokens.append(token)\n",
+    "\n",
+    "    # Drop the [CLS]/[SEP] entries at both ends and translate the remaining\n",
+    "    # tags. .get() falls back to the raw tag name for any tag that has no\n",
+    "    # description in tag_2_nees instead of raising KeyError.\n",
+    "    tag_names = [tag_2_nees.get(label, label) for label in new_labels[1:-1]]\n",
+    "\n",
+    "    return new_tokens[1:-1], tag_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Token:[CLS]\n",
+      "Predict_Tag:[CLS]\n",
+      "Token:हाल\n",
+      "Predict_Tag:RBO\n",
+      "Token:नेपालका\n",
+      "Predict_Tag:JJ\n",
+      "Token:विभिन्न\n",
+      "Predict_Tag:JJ\n",
+      "Token:राजनैतिक\n",
+      "Predict_Tag:JJ\n",
+      "Token:दलहरूबीच\n",
+      "Predict_Tag:JJ\n",
+      "Token:एमसीसी\n",
+      "Predict_Tag:JJ\n",
+      "Token:कार्यक्रमबारे\n",
+      "Predict_Tag:NN\n",
+      "Token:मतैक्य\n",
+      "Predict_Tag:NN\n",
+      "Token:##ता\n",
+      "Predict_Tag:X\n",
+      "Token:हुन\n",
+      "Predict_Tag:VBI\n",
+      "Token:नसकेका\n",
+      "Predict_Tag:VBKO\n",
+      "Token:कारण\n",
+      "Predict_Tag:NN\n",
+      "Token:आन्दोलन\n",
+      "Predict_Tag:NN\n",
+      "Token:पनि\n",
+      "Predict_Tag:RP\n",
+      "Token:चर्क\n",
+      "Predict_Tag:VBO\n",
+      "Token:##िरहेको\n",
+      "Predict_Tag:X\n",
+      "Token:छ\n",
+      "Predict_Tag:VBX\n",
+      "Token:।\n",
+      "Predict_Tag:YF\n",
+      "Token:[SEP]\n",
+      "Predict_Tag:[SEP]\n"
+     ]
+    }
+   ],
+   "source": [
+    "x,y = Get_POS(\"हाल नेपालका विभिन्न राजनैतिक दलहरूबीच एमसीसी कार्यक्रमबारे मतैक्यता हुन नसकेका कारण आन्दोलन पनि चर्किरहेको छ।\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(['हाल',\n",
+       "  'नेपालका',\n",
+       "  'विभिन्न',\n",
+       "  'राजनैतिक',\n",
+       "  'दलहरूबीच',\n",
+       "  'एमसीसी',\n",
+       "  'कार्यक्रमबारे',\n",
+       "  'मतैक्यता',\n",
+       "  'हुन',\n",
+       "  'नसकेका',\n",
+       "  'कारण',\n",
+       "  'आन्दोलन',\n",
+       "  'पनि',\n",
+       "  'चर्किरहेको',\n",
+       "  'छ',\n",
+       "  '।'],\n",
+       " ['Adverb(Other Adverb)',\n",
+       "  'Normal/Unmarked Adjective',\n",
+       "  'Normal/Unmarked Adjective',\n",
+       "  'Normal/Unmarked Adjective',\n",
+       "  'Normal/Unmarked Adjective',\n",
+       "  'Normal/Unmarked Adjective',\n",
+       "  'Noun',\n",
+       "  'Noun',\n",
+       "  'Verb Infinitive',\n",
+       "  'Verb aspectual participle',\n",
+       "  'Noun',\n",
+       "  'Noun',\n",
+       "  'Particle',\n",
+       "  'Other participle verb',\n",
+       "  'Auxiliary Verb',\n",
+       "  'Sentence-final Punctuation'])"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x,y"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "ca894e04cc6fd3e8c60826e0ca22793858ad83aa785622f3d49ff6f88f1ccbf8"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.7.0 64-bit ('pt3.7': conda)",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

models/bert_out_model/en09/config.json ADDED Viewed

	@@ -0,0 +1,87 @@

+{
+  "_name_or_path": "../input/nepalibert",
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4",
+    "5": "LABEL_5",
+    "6": "LABEL_6",
+    "7": "LABEL_7",
+    "8": "LABEL_8",
+    "9": "LABEL_9",
+    "10": "LABEL_10",
+    "11": "LABEL_11",
+    "12": "LABEL_12",
+    "13": "LABEL_13",
+    "14": "LABEL_14",
+    "15": "LABEL_15",
+    "16": "LABEL_16",
+    "17": "LABEL_17",
+    "18": "LABEL_18",
+    "19": "LABEL_19",
+    "20": "LABEL_20",
+    "21": "LABEL_21",
+    "22": "LABEL_22",
+    "23": "LABEL_23",
+    "24": "LABEL_24",
+    "25": "LABEL_25",
+    "26": "LABEL_26",
+    "27": "LABEL_27",
+    "28": "LABEL_28"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_10": 10,
+    "LABEL_11": 11,
+    "LABEL_12": 12,
+    "LABEL_13": 13,
+    "LABEL_14": 14,
+    "LABEL_15": 15,
+    "LABEL_16": 16,
+    "LABEL_17": 17,
+    "LABEL_18": 18,
+    "LABEL_19": 19,
+    "LABEL_2": 2,
+    "LABEL_20": 20,
+    "LABEL_21": 21,
+    "LABEL_22": 22,
+    "LABEL_23": 23,
+    "LABEL_24": 24,
+    "LABEL_25": 25,
+    "LABEL_26": 26,
+    "LABEL_27": 27,
+    "LABEL_28": 28,
+    "LABEL_3": 3,
+    "LABEL_4": 4,
+    "LABEL_5": 5,
+    "LABEL_6": 6,
+    "LABEL_7": 7,
+    "LABEL_8": 8,
+    "LABEL_9": 9
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.15.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 50000
+}

models/bert_out_model/en09/eval_results.txt ADDED Viewed

	@@ -0,0 +1,37 @@

+f1 score:
+0.9330855682813086
+Accuracy score:
+0.9458905242268894
+              precision    recall  f1-score   support
+          BF     0.9538    0.9253    0.9393       937
+          BI     0.9129    0.9402    0.9263       468
+         BKO     0.9785    0.9287    0.9529       785
+         BNE     0.9429    0.9319    0.9374       514
+          BO     0.8293    0.8872    0.8573       931
+          BX     0.9570    0.9547    0.9558       816
+           C     0.9943    0.9914    0.9929       701
+           D     0.9007    0.8772    0.8888       920
+           F     0.9963    0.9945    0.9954      1083
+           J     0.8835    0.8817    0.8826      2520
+          JM     0.8914    0.8914    0.8914       221
+          KO     0.9942    0.9976    0.9959      2070
+         LAI     0.9980    0.9980    0.9980       496
+          LE     0.9972    0.9945    0.9959      1088
+           M     0.9265    0.8164    0.8680       757
+           N     0.9304    0.9202    0.9253      6655
+          NP     0.8689    0.9005    0.8844      1648
+          OP     0.9880    0.9816    0.9848      2015
+           P     0.9833    0.9883    0.9858       597
+           Q     0.9513    0.8729    0.9104       425
+          RU     0.9977    0.9953    0.9965       859
+           S     0.9482    0.9337    0.9409       196
+          UM     0.9709    0.9799    0.9754       647
+           _     0.0000    0.0000    0.0000         0
+      nknown     0.8970    0.8172    0.8552       629
+   micro avg     0.9329    0.9333    0.9331     27978
+   macro avg     0.9077    0.8960    0.9015     27978
+weighted avg     0.9413    0.9333    0.9370     27978

models/bert_out_model/en09/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:779b44ae9309548a82a0f7631bde4e740cdeaf1c7117d200db157da46222c6ef
+size 327908843

models/bert_out_model/en09/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff