Spaces:

jbraha
/

aiproject

Runtime error

App Files Files Community

jbraha commited on Apr 24, 2023

Commit

f2a478c

1 Parent(s): 2c55221

st changes

Browse files

Files changed (10) hide show

.github/workflows/main.yml +2 -2
.ipynb_checkpoints/Copy of training-checkpoint.ipynb +334 -0
Copy of training.ipynb +334 -0
README.md +1 -4
app.py +13 -4
data/.~lock.test.csv# +0 -1
data/.~lock.test_labels.csv# +0 -1
data/.~lock.train.csv# +0 -1
train.py +143 -0
traintokens.txt +0 -0

.github/workflows/main.yml CHANGED Viewed

@@ -1,7 +1,7 @@
 name: Sync to Hugging Face hub
 on:
   push:
-    branches: [milestone-2]
   # to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -21,6 +21,6 @@ jobs:
           git config user.name "$GITHUB_ACTOR" &&
           git config user.email "<>"
           && git switch main
-          && git merge origin/milestone-2
           && git push
           && git push https://jbraha:$HF_TOKEN@huggingface.co/spaces/jbraha/aiproject

 name: Sync to Hugging Face hub
 on:
   push:
+    branches: [milestone-3]
   # to run this workflow manually from the Actions tab
   workflow_dispatch:
           git config user.name "$GITHUB_ACTOR" &&
           git config user.email "<>"
           && git switch main
+          && git merge origin/milestone-3
           && git push
           && git push https://jbraha:$HF_TOKEN@huggingface.co/spaces/jbraha/aiproject

.ipynb_checkpoints/Copy of training-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,334 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "215a1aae",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 128,
+     "status": "ok",
+     "timestamp": 1682285319377,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "215a1aae"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-04-23 18:07:24.557548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2023-04-23 18:07:25.431969: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from transformers import BertTokenizerFast, BertForSequenceClassification\n",
+    "from transformers import Trainer, TrainingArguments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "J5Tlgp4tNd0U",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "executionInfo": {
+     "elapsed": 1897,
+     "status": "ok",
+     "timestamp": 1682285321454,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "J5Tlgp4tNd0U",
+    "outputId": "3c9f0c5b-7bc3-4c15-c5ff-0a77d3b3b607"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
+      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model_name = \"bert-base-uncased\"\n",
+    "tokenizer = BertTokenizerFast.from_pretrained(model_name)\n",
+    "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
+    "max_len = 200\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"results\",\n",
+    "    num_train_epochs=1,\n",
+    "    per_device_train_batch_size=16,\n",
+    "    per_device_eval_batch_size=64,\n",
+    "    warmup_steps=500,\n",
+    "    learning_rate=5e-5,\n",
+    "    weight_decay=0.01,\n",
+    "    logging_dir=\"./logs\",\n",
+    "    logging_steps=10\n",
+    "    )\n",
+    "\n",
+    "# dataset class that inherits from torch.utils.data.Dataset\n",
+    "class TweetDataset(Dataset):\n",
+    "    def __init__(self, encodings, labels):\n",
+    "        self.encodings = encodings\n",
+    "        self.labels = labels\n",
+    "        self.tok = tokenizer\n",
+    "    \n",
+    "    def __getitem__(self, idx):\n",
+    "        # encoding = self.tok(self.encodings[idx], truncation=True, padding=\"max_length\", max_length=max_len)\n",
+    "        item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }\n",
+    "        item['labels'] = torch.tensor(self.labels[idx])\n",
+    "        return item\n",
+    "    \n",
+    "    def __len__(self):\n",
+    "        return len(self.labels)\n",
+    "    \n",
+    "class TokenizerDataset(Dataset):\n",
+    "    def __init__(self, strings):\n",
+    "        self.strings = strings\n",
+    "    \n",
+    "    def __getitem__(self, idx):\n",
+    "        return self.strings[idx]\n",
+    "    \n",
+    "    def __len__(self):\n",
+    "        return len(self.strings)\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9969c58c",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 5145,
+     "status": "ok",
+     "timestamp": 1682285326593,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "9969c58c",
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "train_data = pd.read_csv(\"data/train.csv\")\n",
+    "train_text = train_data[\"comment_text\"]\n",
+    "train_labels = train_data[[\"toxic\", \"severe_toxic\", \n",
+    "                           \"obscene\", \"threat\", \n",
+    "                           \"insult\", \"identity_hate\"]]\n",
+    "\n",
+    "test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"]\n",
+    "test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
+    "                           \"toxic\", \"severe_toxic\", \n",
+    "                           \"obscene\", \"threat\", \n",
+    "                           \"insult\", \"identity_hate\"]]\n",
+    "\n",
+    "# data preprocessing\n",
+    "\n",
+    "\n",
+    "\n",
+    "train_text = train_text.values.tolist()\n",
+    "train_labels = train_labels.values.tolist()\n",
+    "test_text = test_text.values.tolist()\n",
+    "test_labels = test_labels.values.tolist()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1n56TME9Njde",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 12,
+     "status": "ok",
+     "timestamp": 1682285326594,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "1n56TME9Njde"
+   },
+   "outputs": [],
+   "source": [
+    "# prepare tokenizer and dataset\n",
+    "\n",
+    "train_strings = TokenizerDataset(train_text)\n",
+    "test_strings = TokenizerDataset(test_text)\n",
+    "\n",
+    "train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)\n",
+    "test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "# train_encodings = tokenizer.batch_encode_plus(train_text, \\\n",
+    "#                             max_length=200, pad_to_max_length=True, \\\n",
+    "#                             truncation=True, return_token_type_ids=False \\\n",
+    "#                             )\n",
+    "# test_encodings = tokenizer.batch_encode_plus(test_text, \\\n",
+    "#                             max_length=200, pad_to_max_length=True, \\\n",
+    "#                             truncation=True, return_token_type_ids=False \\\n",
+    "#                             )\n",
+    "\n",
+    "\n",
+    "train_encodings = tokenizer(train_text, truncation=True, padding=True)\n",
+    "test_encodings = tokenizer(test_text, truncation=True, padding=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a5c7a657",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "f = open(\"traintokens.txt\", 'a')\n",
+    "f.write(train_encodings)\n",
+    "f.write('\\n\\n\\n\\n\\n')\n",
+    "f.close()\n",
+    "\n",
+    "g = open(\"testtokens.txt\", 'a')\n",
+    "g.write(test_encodings)\n",
+    "g.write('\\n\\n\\n\\n\\n')\n",
+    "\n",
+    "g.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4kwydz67qjW9",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 10,
+     "status": "ok",
+     "timestamp": 1682285326595,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "4kwydz67qjW9"
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset = TweetDataset(train_encodings, train_labels)\n",
+    "test_dataset = TweetDataset(test_encodings, test_labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "krZKjDVwNnWI",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 10,
+     "status": "ok",
+     "timestamp": 1682285326596,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "krZKjDVwNnWI"
+   },
+   "outputs": [],
+   "source": [
+    "# training\n",
+    "trainer = Trainer(\n",
+    "    model=model, \n",
+    "    args=training_args, \n",
+    "    train_dataset=train_dataset, \n",
+    "    eval_dataset=test_dataset\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "VwsyMZg_tgTg",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 416
+    },
+    "executionInfo": {
+     "elapsed": 27193,
+     "status": "error",
+     "timestamp": 1682285353779,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "VwsyMZg_tgTg",
+    "outputId": "49c3f5c8-0342-45c5-8d0f-5cd5d2d1f9e9"
+   },
+   "outputs": [],
+   "source": [
+    "trainer.train()"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": [
+    {
+     "file_id": "https://github.com/joebraha/aiproject/blob/milestone-3/training.ipynb",
+     "timestamp": 1682285843150
+    }
+   ]
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Copy of training.ipynb ADDED Viewed

	@@ -0,0 +1,334 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "215a1aae",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 128,
+     "status": "ok",
+     "timestamp": 1682285319377,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "215a1aae"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-04-23 18:07:24.557548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2023-04-23 18:07:25.431969: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from transformers import BertTokenizerFast, BertForSequenceClassification\n",
+    "from transformers import Trainer, TrainingArguments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "J5Tlgp4tNd0U",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "executionInfo": {
+     "elapsed": 1897,
+     "status": "ok",
+     "timestamp": 1682285321454,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "J5Tlgp4tNd0U",
+    "outputId": "3c9f0c5b-7bc3-4c15-c5ff-0a77d3b3b607"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
+      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model_name = \"bert-base-uncased\"\n",
+    "tokenizer = BertTokenizerFast.from_pretrained(model_name)\n",
+    "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
+    "max_len = 200\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"results\",\n",
+    "    num_train_epochs=1,\n",
+    "    per_device_train_batch_size=16,\n",
+    "    per_device_eval_batch_size=64,\n",
+    "    warmup_steps=500,\n",
+    "    learning_rate=5e-5,\n",
+    "    weight_decay=0.01,\n",
+    "    logging_dir=\"./logs\",\n",
+    "    logging_steps=10\n",
+    "    )\n",
+    "\n",
+    "# dataset class that inherits from torch.utils.data.Dataset\n",
+    "class TweetDataset(Dataset):\n",
+    "    def __init__(self, encodings, labels):\n",
+    "        self.encodings = encodings\n",
+    "        self.labels = labels\n",
+    "        self.tok = tokenizer\n",
+    "    \n",
+    "    def __getitem__(self, idx):\n",
+    "        # encoding = self.tok(self.encodings[idx], truncation=True, padding=\"max_length\", max_length=max_len)\n",
+    "        item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }\n",
+    "        item['labels'] = torch.tensor(self.labels[idx])\n",
+    "        return item\n",
+    "    \n",
+    "    def __len__(self):\n",
+    "        return len(self.labels)\n",
+    "    \n",
+    "class TokenizerDataset(Dataset):\n",
+    "    def __init__(self, strings):\n",
+    "        self.strings = strings\n",
+    "    \n",
+    "    def __getitem__(self, idx):\n",
+    "        return self.strings[idx]\n",
+    "    \n",
+    "    def __len__(self):\n",
+    "        return len(self.strings)\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9969c58c",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 5145,
+     "status": "ok",
+     "timestamp": 1682285326593,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "9969c58c",
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "train_data = pd.read_csv(\"data/train.csv\")\n",
+    "train_text = train_data[\"comment_text\"]\n",
+    "train_labels = train_data[[\"toxic\", \"severe_toxic\", \n",
+    "                           \"obscene\", \"threat\", \n",
+    "                           \"insult\", \"identity_hate\"]]\n",
+    "\n",
+    "test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"]\n",
+    "test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
+    "                           \"toxic\", \"severe_toxic\", \n",
+    "                           \"obscene\", \"threat\", \n",
+    "                           \"insult\", \"identity_hate\"]]\n",
+    "\n",
+    "# data preprocessing\n",
+    "\n",
+    "\n",
+    "\n",
+    "train_text = train_text.values.tolist()\n",
+    "train_labels = train_labels.values.tolist()\n",
+    "test_text = test_text.values.tolist()\n",
+    "test_labels = test_labels.values.tolist()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1n56TME9Njde",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 12,
+     "status": "ok",
+     "timestamp": 1682285326594,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "1n56TME9Njde"
+   },
+   "outputs": [],
+   "source": [
+    "# prepare tokenizer and dataset\n",
+    "\n",
+    "train_strings = TokenizerDataset(train_text)\n",
+    "test_strings = TokenizerDataset(test_text)\n",
+    "\n",
+    "train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)\n",
+    "test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "# train_encodings = tokenizer.batch_encode_plus(train_text, \\\n",
+    "#                             max_length=200, pad_to_max_length=True, \\\n",
+    "#                             truncation=True, return_token_type_ids=False \\\n",
+    "#                             )\n",
+    "# test_encodings = tokenizer.batch_encode_plus(test_text, \\\n",
+    "#                             max_length=200, pad_to_max_length=True, \\\n",
+    "#                             truncation=True, return_token_type_ids=False \\\n",
+    "#                             )\n",
+    "\n",
+    "\n",
+    "train_encodings = tokenizer(train_text, truncation=True, padding=True)\n",
+    "test_encodings = tokenizer(test_text, truncation=True, padding=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a5c7a657",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "f = open(\"traintokens.txt\", 'a')\n",
+    "f.write(train_encodings)\n",
+    "f.write('\\n\\n\\n\\n\\n')\n",
+    "f.close()\n",
+    "\n",
+    "g = open(\"testtokens.txt\", 'a')\n",
+    "g.write(test_encodings)\n",
+    "g.write('\\n\\n\\n\\n\\n')\n",
+    "\n",
+    "g.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4kwydz67qjW9",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 10,
+     "status": "ok",
+     "timestamp": 1682285326595,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "4kwydz67qjW9"
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset = TweetDataset(train_encodings, train_labels)\n",
+    "test_dataset = TweetDataset(test_encodings, test_labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "krZKjDVwNnWI",
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 10,
+     "status": "ok",
+     "timestamp": 1682285326596,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "krZKjDVwNnWI"
+   },
+   "outputs": [],
+   "source": [
+    "# training\n",
+    "trainer = Trainer(\n",
+    "    model=model, \n",
+    "    args=training_args, \n",
+    "    train_dataset=train_dataset, \n",
+    "    eval_dataset=test_dataset\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "VwsyMZg_tgTg",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 416
+    },
+    "executionInfo": {
+     "elapsed": 27193,
+     "status": "error",
+     "timestamp": 1682285353779,
+     "user": {
+      "displayName": "",
+      "userId": ""
+     },
+     "user_tz": 240
+    },
+    "id": "VwsyMZg_tgTg",
+    "outputId": "49c3f5c8-0342-45c5-8d0f-5cd5d2d1f9e9"
+   },
+   "outputs": [],
+   "source": [
+    "trainer.train()"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": [
+    {
+     "file_id": "https://github.com/joebraha/aiproject/blob/milestone-3/training.ipynb",
+     "timestamp": 1682285843150
+    }
+   ]
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

README.md CHANGED Viewed

@@ -10,11 +10,8 @@ pinned: false
 ---
-# Milestone 2
 Here is the link to the HF space:
 https://huggingface.co/spaces/jbraha/aiproject
-Other notes:
-- the docker image was changed to python 3.8.9 to align with the HF deployment, so tensorflow was imported manually
-- Git actions got weird: to use a milestone branch while also deploying to HF successfully, I have a git action automatically merging milestone-2 to the main branch and then pushing to the HF space

 ---
+# Milestone 3
 Here is the link to the HF space:
 https://huggingface.co/spaces/jbraha/aiproject

app.py CHANGED Viewed

@@ -10,12 +10,21 @@ st.title("Sentiment Analysis")
 def analyze(input, model):
     return "This is a sample output"
 #text insert
 input = st.text_area("insert text to be analyzed", value="Nice to see you today.", height=None, max_chars=None, key=None, help=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
-model_name = st.text_input("choose a transformer model (nothing for default)", value="")
-if model_name:
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
     classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
 else:
     classifier = pipeline('sentiment-analysis')

 def analyze(input, model):
     return "This is a sample output"
+# load my fine-tuned model
+fine_tuned = None
 #text insert
 input = st.text_area("insert text to be analyzed", value="Nice to see you today.", height=None, max_chars=None, key=None, help=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
+option = st.selectbox(
+    'Choose a transformer model:',
+    ('Default', 'Fine-Tuned' , 'Custom'))
+if option == 'Fine-Tuned':
+    model = TFAutoModelForSequenceClassification.from_pretrained(fine_tuned)
+    tokenizer = AutoTokenizer.from_pretrained(fine_tuned)
     classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
 else:
     classifier = pipeline('sentiment-analysis')

data/.~lock.test.csv# DELETED Viewed

	@@ -1 +0,0 @@
1	- ,joe,mint,23.04.2023 12:27,file:///home/joe/.config/libreoffice/4;

data/.~lock.test_labels.csv# DELETED Viewed

	@@ -1 +0,0 @@
1	- ,joe,mint,23.04.2023 11:48,file:///home/joe/.config/libreoffice/4;

data/.~lock.train.csv# DELETED Viewed

	@@ -1 +0,0 @@
1	- ,joe,mint,23.04.2023 11:51,file:///home/joe/.config/libreoffice/4;

train.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import torch
+from torch.utils.data import Dataset, DataLoader
+import pandas as pd
+from transformers import BertTokenizerFast, BertForSequenceClassification
+from transformers import Trainer, TrainingArguments
+# Checkpoint name shared by the tokenizer and the classifier below.
+model_name = "bert-base-uncased"
+# Not read by any active code in this file.
+max_len = 200
+tokenizer = BertTokenizerFast.from_pretrained(model_name)
+# Classification head sized for six labels.
+model = BertForSequenceClassification.from_pretrained(
+    model_name,
+    num_labels=6,
+)
+# Hyper-parameters for the Hugging Face Trainer run.
+training_args = TrainingArguments(
+    output_dir="results",
+    logging_dir="./logs",
+    logging_steps=10,
+    num_train_epochs=1,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=64,
+    learning_rate=5e-5,
+    warmup_steps=500,
+    weight_decay=0.01,
+)
+# dataset class that inherits from torch.utils.data.Dataset
+class TweetDataset(Dataset):
+    """Pair pre-computed tokenizer encodings with their label rows.
+
+    Each item is a dict of tensors: one entry per key found in
+    ``encodings`` plus a ``labels`` entry.
+    """
+
+    def __init__(self, encodings, labels):
+        """Store the encodings and labels for later indexing.
+
+        Args:
+            encodings: Object exposing ``.items()`` whose values can be
+                indexed by example position.
+            labels: Sized, indexable collection with one entry per example.
+        """
+        self.encodings = encodings
+        self.labels = labels
+        # Module-level tokenizer kept on the instance; the methods below
+        # do not use it.
+        self.tok = tokenizer
+
+    def __getitem__(self, idx):
+        """Return example ``idx`` as a dict of tensors, including ``labels``."""
+        # encoding = self.tok(self.encodings[idx], truncation=True, padding="max_length", max_length=max_len)
+        # The attribute set in __init__ is ``encodings``; reading
+        # ``self.encoding`` raised AttributeError on every access.
+        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+        # NOTE(review): integer label rows produce an integer tensor here;
+        # a multi-label loss presumably wants floats - confirm before training.
+        item['labels'] = torch.tensor(self.labels[idx])
+        return item
+
+    def __len__(self):
+        """Return the number of examples, taken from ``labels``."""
+        return len(self.labels)
+class TokenizerDataset(Dataset):
+    """Minimal Dataset view over a collection of raw strings."""
+
+    def __init__(self, strings):
+        """Keep a reference to ``strings`` (must be sized and indexable)."""
+        self.strings = strings
+
+    def __len__(self):
+        """Return how many strings are stored."""
+        return len(self.strings)
+
+    def __getitem__(self, idx):
+        """Return the entry at position ``idx`` unchanged."""
+        text = self.strings[idx]
+        return text
+# Training split: comment text and the six label columns share one CSV.
+train_data = pd.read_csv("data/train.csv")
+# data preprocessing: everything downstream works on plain Python lists.
+train_text = train_data["comment_text"].values.tolist()
+train_labels = train_data[
+    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
+].values.tolist()
+# Test split: text and labels are read from two separate files.
+test_text = pd.read_csv("data/test.csv")["comment_text"].values.tolist()
+test_labels = pd.read_csv("data/test_labels.csv")[
+    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
+].values.tolist()
+# prepare tokenizer and dataset
+train_strings = TokenizerDataset(train_text)
+test_strings = TokenizerDataset(test_text)
+train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)
+test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)
+# train_encodings = tokenizer.batch_encode_plus(train_text, \
+#                             max_length=200, pad_to_max_length=True, \
+#                             truncation=True, return_token_type_ids=False \
+#                             )
+# test_encodings = tokenizer.batch_encode_plus(test_text, \
+#                             max_length=200, pad_to_max_length=True, \
+#                             truncation=True, return_token_type_ids=False \
+#                             )
+# Call the tokenizer directly to encode the whole list as a batch.
+# ``tokenizer.encode`` is the single-sequence API, and its result has no
+# ``.items()``, which TweetDataset.__getitem__ iterates over.
+train_encodings = tokenizer(train_text, truncation=True, padding=True)
+test_encodings = tokenizer(test_text, truncation=True, padding=True)
+# Append a text dump of each encoding to its own file, followed by five
+# newlines as a separator between runs.
+# ``write`` only accepts str, so the encodings are converted explicitly;
+# passing the encoding object itself raised TypeError.
+# ``with`` closes each file even if the write fails.
+with open("traintokens.txt", 'a') as f:
+    f.write(str(train_encodings))
+    f.write('\n\n\n\n\n')
+with open("testtokens.txt", 'a') as g:
+    g.write(str(test_encodings))
+    g.write('\n\n\n\n\n')
+# train_dataset = TweetDataset(train_encodings, train_labels)
+# test_dataset = TweetDataset(test_encodings, test_labels)
+# # training
+# trainer = Trainer(
+#     model=model,
+#     args=training_args,
+#     train_dataset=train_dataset,
+#     eval_dataset=test_dataset
+#     )
+# trainer.train()

traintokens.txt ADDED Viewed

File without changes