Upload 3 files
- app.py +132 -0
- infer.ipynb +171 -0
- requirements.txt +3 -0

app.py
ADDED
@@ -0,0 +1,132 @@
import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

class MultiLabelDataset(Dataset):
    """Wraps a dataframe with a `comment_text` column.
    `new_data=True` skips the `labels` column for unlabeled inference data."""

    def __init__(self, dataframe, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment_text
        self.new_data = new_data

        if not new_data:
            self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',   # `pad_to_max_length=True` is deprecated
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        out = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

        if not self.new_data:
            out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)

        return out

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()

        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 6)   # six toxicity labels
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        out = hidden_state[:, 0]      # [CLS] token representation
        out = self.classifier(out)
        return out

model = DistilBERTClass()
model.to(DEVICE)

# map_location lets the checkpoint load on CPU-only machines as well
model_loaded = torch.load(
    'model/inference_models_output_4fold_distilbert_fold_best_model.pth',
    map_location=torch.device(DEVICE))
model.load_state_dict(model_loaded['model'])

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
}

def give_toxic(text):
    test_data = pd.DataFrame([text], columns=['comment_text'])
    test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)
    test_loader = DataLoader(test_set, **val_params)

    all_test_pred = []

    def test():
        model.eval()

        with torch.inference_mode():
            for _, data in tqdm(enumerate(test_loader, 0)):
                ids = data['ids'].to(DEVICE, dtype=torch.long)
                mask = data['mask'].to(DEVICE, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
                outputs = model(ids, mask, token_type_ids)
                probas = torch.sigmoid(outputs)   # independent per-label probabilities

                all_test_pred.append(probas)

    test()   # was `probas = test(model)`; test() takes no arguments and returns nothing

    all_test_pred = torch.cat(all_test_pred)

    label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    preds = all_test_pred.detach().cpu().numpy()[0]

    final_dict = dict(zip(label_columns, preds))
    return final_dict

def device():
    return DEVICE

print(give_toxic("fuck"))
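Despite the name, app.py exposes give_toxic() with no UI layer (only the notebook imports streamlit). A minimal sketch of a Streamlit front end, assuming this file is importable as a module named `app` and that streamlit is installed (it is not in requirements.txt):

import streamlit as st            # hypothetical wrapper, not part of this commit
from app import give_toxic        # assumes the file above is saved as app.py

st.title("Toxic comment classifier")
comment = st.text_area("Comment to score")
if st.button("Classify") and comment:
    scores = give_toxic(comment)
    for label, proba in scores.items():
        st.write(f"{label}: {float(proba):.3f}")

Note that importing `app` runs the module-level model load (and the demo print call at the bottom of the file), so the first request would be slow.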
infer.ipynb
ADDED
@@ -0,0 +1,171 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "d136f503-bb1b-404e-8657-ce3168eae54b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n",
    "from transformers import DistilBertTokenizer, DistilBertModel\n",
    "import streamlit as st\n",
    "\n",
    "MAX_LEN = 512\n",
    "TRAIN_BATCH_SIZE = 16\n",
    "VALID_BATCH_SIZE = 16\n",
    "EPOCHS = 3\n",
    "LEARNING_RATE = 1e-05\n",
    "DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'\n",
    "print(DEVICE)\n",
    "\n",
    "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)\n",
    "\n",
    "class MultiLabelDataset(Dataset):\n",
    "\n",
    "    def __init__(self, dataframe, tokenizer, max_len, new_data=False):\n",
    "        self.tokenizer = tokenizer\n",
    "        self.data = dataframe\n",
    "        self.text = dataframe.comment_text\n",
    "        self.new_data = new_data\n",
    "\n",
    "        if not new_data:\n",
    "            self.targets = self.data.labels\n",
    "        self.max_len = max_len\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.text)\n",
    "\n",
    "    def __getitem__(self, index):\n",
    "        text = str(self.text[index])\n",
    "        text = \" \".join(text.split())\n",
    "\n",
    "        inputs = self.tokenizer.encode_plus(\n",
    "            text,\n",
    "            None,\n",
    "            add_special_tokens=True,\n",
    "            max_length=self.max_len,\n",
    "            padding='max_length',\n",
    "            truncation=True,\n",
    "            return_token_type_ids=True\n",
    "        )\n",
    "        ids = inputs['input_ids']\n",
    "        mask = inputs['attention_mask']\n",
    "        token_type_ids = inputs[\"token_type_ids\"]\n",
    "\n",
    "        out = {\n",
    "            'ids': torch.tensor(ids, dtype=torch.long),\n",
    "            'mask': torch.tensor(mask, dtype=torch.long),\n",
    "            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n",
    "        }\n",
    "\n",
    "        if not self.new_data:\n",
    "            out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)\n",
    "\n",
    "        return out\n",
    "\n",
    "class DistilBERTClass(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(DistilBERTClass, self).__init__()\n",
    "\n",
    "        self.bert = DistilBertModel.from_pretrained(\"distilbert-base-uncased\")\n",
    "        self.classifier = torch.nn.Sequential(\n",
    "            torch.nn.Linear(768, 768),\n",
    "            torch.nn.ReLU(),\n",
    "            torch.nn.Dropout(0.1),\n",
    "            torch.nn.Linear(768, 6)\n",
    "        )\n",
    "\n",
    "    def forward(self, input_ids, attention_mask, token_type_ids):\n",
    "        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
    "        hidden_state = output_1[0]\n",
    "        out = hidden_state[:, 0]\n",
    "        out = self.classifier(out)\n",
    "        return out\n",
    "\n",
    "model = DistilBERTClass()\n",
    "model.to(DEVICE);\n",
    "\n",
    "model_loaded = torch.load('model/inference_models_output_4fold_distilbert_fold_best_model.pth', map_location=torch.device('cpu'))\n",
    "\n",
    "model.load_state_dict(model_loaded['model'])\n",
    "\n",
    "val_params = {'batch_size': VALID_BATCH_SIZE,\n",
    "              'shuffle': False,\n",
    "              'num_workers': 8\n",
    "              }\n",
    "\n",
    "def give_toxic(text):\n",
    "    # text = \"You fucker \"  # debug override, disabled so the argument is used\n",
    "    test_data = pd.DataFrame([text], columns=['comment_text'])\n",
    "    test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)\n",
    "    test_loader = DataLoader(test_set, **val_params)\n",
    "\n",
    "    all_test_pred = []\n",
    "\n",
    "    def test():\n",
    "        model.eval()\n",
    "\n",
    "        with torch.inference_mode():\n",
    "            for _, data in tqdm(enumerate(test_loader, 0)):\n",
    "                ids = data['ids'].to(DEVICE, dtype=torch.long)\n",
    "                mask = data['mask'].to(DEVICE, dtype=torch.long)\n",
    "                token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)\n",
    "                outputs = model(ids, mask, token_type_ids)\n",
    "                probas = torch.sigmoid(outputs)\n",
    "\n",
    "                all_test_pred.append(probas)\n",
    "\n",
    "    test()\n",
    "\n",
    "    all_test_pred = torch.cat(all_test_pred)\n",
    "\n",
    "    label_columns = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n",
    "\n",
    "    preds = all_test_pred.detach().cpu().numpy()[0]\n",
    "\n",
    "    final_dict = dict(zip(label_columns, preds))\n",
    "    return final_dict\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db651873-60cd-4cd7-8ba0-da6c62e22ca8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
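Both files expect the checkpoint to be a dict holding the weights under the 'model' key. For reference, a checkpoint in that layout would be written by something like the sketch below; the training code itself is not part of this commit, and the path simply mirrors the one loaded above.

# Hypothetical save side, matching the {'model': state_dict} layout read above
torch.save({'model': model.state_dict()},
           'model/inference_models_output_4fold_distilbert_fold_best_model.pth')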
requirements.txt
ADDED
@@ -0,0 +1,3 @@
torch
pandas
transformers
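app.py also imports tqdm, and the notebook imports streamlit; neither is listed here. tqdm is pulled in transitively as a dependency of transformers, but streamlit is not, so a fuller set covering both files would presumably be:

torch
pandas
transformers
tqdm
streamlit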