{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Small-talk utterances; rename the source columns to the text/intent names written to train.csv.\n",
    "smalltalkintent = pd.read_csv(\"intentdata/Small_talk_Intent.csv\")\n",
    "smalltalkintent = smalltalkintent.rename(\n",
    "    columns={\"Utterances\": \"text\", \"Intent\": \"intent\"}\n",
    ")\n",
    "\n",
    "# Command utterances, loaded as-is.\n",
    "training = pd.read_csv(\"intentdata/train_command.csv\")\n",
    "\n",
    "# Stack commands first, then small talk, renumber the rows, and overwrite train.csv.\n",
    "df3 = pd.concat([training, smalltalkintent], ignore_index=True)\n",
    "df3.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)\n",
    "# TODO Do NLPAugmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Produce responses\n",
    "import json\n",
    "\n",
    "# Load the full intent definitions.\n",
    "with open(\"intentdata/Intent.json\", \"r\") as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "# Keep only each intent's name and its list of responses.\n",
    "intents = [\n",
    "    {\"intent\": entry[\"intent\"], \"responses\": entry[\"responses\"]}\n",
    "    for entry in data[\"intents\"]\n",
    "]\n",
    "\n",
    "# NOTE: the list is stored under the top-level key \"response\" (singular).\n",
    "with open(\"intentdata/responses.json\", \"w+\") as f:\n",
    "    json.dump({\"response\": intents}, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open(\"intentdata/responses.json\", \"r\") as f:\n",
    "    # responses.json is written in this notebook as {\"response\": [...]} (singular key),\n",
    "    # so reading the key \"responses\" raised KeyError.\n",
    "    response_entries = json.load(f)[\"response\"]\n",
    "\n",
    "# Each entry looks like {\"intent\": name, \"responses\": [...]}; index the replies by\n",
    "# intent name so they can be looked up with a string instead of a list position.\n",
    "responses = {entry[\"intent\"]: entry[\"responses\"] for entry in response_entries}\n",
    "\n",
    "# Assumes an intent named \"Greeting\" exists in the file (KeyError otherwise) - TODO confirm.\n",
    "print(responses[\"Greeting\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from IPython.display import display\n",
    "\n",
    "with open(\"intentdata/aug/intent_aug_text.json\", \"r\") as f:\n",
    "    data = json.load(f)[\"intents\"]\n",
    "\n",
    "# One (text, intent) record per augmented utterance. Building the frame once from\n",
    "# records also works when the JSON holds no intents, where pd.concat([]) would raise.\n",
    "records = [\n",
    "    {\"text\": text, \"intent\": entry[\"intent\"]}\n",
    "    for entry in data\n",
    "    for text in entry[\"text\"]\n",
    "]\n",
    "intentdf = pd.DataFrame(records, columns=[\"text\", \"intent\"])\n",
    "# A bare expression in the middle of a cell shows nothing, so preview explicitly.\n",
    "display(intentdf.head())\n",
    "\n",
    "# WARNING: train.csv is read and then overwritten with the appended rows, so running\n",
    "# this cell again appends the same augmented rows a second time.\n",
    "df = pd.read_csv(\"intentdata/train.csv\")\n",
    "newdf = pd.concat([df, intentdf], axis=0, ignore_index=True)\n",
    "newdf.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the current contents of train.csv into `data`.\n",
    "train_csv_path = \"intentdata/train.csv\"\n",
    "data = pd.read_csv(train_csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Small-talk label -> label it is folded into.\n",
    "label_map = {\n",
    "    \"smalltalk_agent_acquaintance\": \"CourtesyGreeting\",\n",
    "    \"smalltalk_agent_age\": \"CurrentHumanQuery\",\n",
    "    \"smalltalk_agent_annoying\": \"NotTalking2U\",\n",
    "    \"smalltalk_agent_bad\": \"Swearing\",\n",
    "    \"smalltalk_agent_boss\": \"CurrentHumanQuery\",\n",
    "    \"smalltalk_agent_clever\": \"Clever\",\n",
    "    \"smalltalk_agent_beautiful\": \"Clever\",\n",
    "    \"smalltalk_agent_fired\": \"Shutup\",\n",
    "    \"smalltalk_agent_good\": \"Thanks\",\n",
    "    \"smalltalk_agent_chatbot\": \"SelfAware\",\n",
    "    \"smalltalk_agent_real\": \"SelfAware\",\n",
    "}\n",
    "\n",
    "data = pd.read_csv(\"intentdata/train_no_half_response.csv\")\n",
    "# Any cell whose whole value equals a key is rewritten (in every column, not only `intent`).\n",
    "data = data.replace(label_map)\n",
    "\n",
    "# Overwrites train.csv with the relabelled copy of train_no_half_response.csv.\n",
    "data.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO Checks label data balance\n",
    "import pandas as pd\n",
    "\n",
    "data = pd.read_csv(\"intentdata/train.csv\")\n",
    "\n",
    "# Separate the design matrix (every column except the last) from the target labels.\n",
    "X = data.iloc[:, :-1]\n",
    "y = data[\"intent\"]\n",
    "\n",
    "# Rows per intent, ordered alphabetically by label. On a Series the x=/y= keywords of\n",
    "# plot.bar do not label the axes, so the labels are set on the returned Axes instead.\n",
    "ax = y.value_counts().sort_index().plot.bar(figsize=(20, 20))\n",
    "ax.set_xlabel(\"Target Value\")\n",
    "ax.set_ylabel(\"Number of Occurrences\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "# Ask PyTorch whether CUDA is available; the boolean is the cell's displayed result.\n",
    "cuda_ready = torch.cuda.is_available()\n",
    "cuda_ready"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import nlpaug.augmenter.word as naw\n",
    "import nlpaug.flow as naf\n",
    "print(\"Loading Models...\")\n",
    "import nltk\n",
    "nltk.download('wordnet')\n",
    "nltk.download('omw-1.4')\n",
    "TOPK = 20  # default=100\n",
    "ACT = 'insert'  # \"substitute\"\n",
    "N_VARIANTS = 20  # augmentation attempts per utterance / response\n",
    "\n",
    "\n",
    "def tst():\n",
    "    \"\"\"Build the GloVe and DistilBERT augmenters and chain them (experimental).\n",
    "\n",
    "    Nothing in this cell calls it, and the flow it builds is not returned.\n",
    "    \"\"\"\n",
    "    aug_w2v = naw.WordEmbsAug(\n",
    "        model_type='glove', model_path='glove/glove.6B.300d.txt',\n",
    "        action=\"substitute\")\n",
    "    aug_bert = naw.ContextualWordEmbsAug(\n",
    "        model_path='distilbert-base-uncased',\n",
    "        action=ACT, top_k=TOPK)\n",
    "    aug = naf.Sequential([\n",
    "            aug_bert, aug_w2v\n",
    "        ])\n",
    "\n",
    "\n",
    "def _augment_once(augmenter, text):\n",
    "    \"\"\"Return a single augmented string for `text`.\n",
    "\n",
    "    augment() may hand back a plain str or a list of str depending on the installed\n",
    "    nlpaug release; calling str() on a list would store its repr (\"['...']\") as a\n",
    "    training utterance, so the list is unwrapped first.\n",
    "    \"\"\"\n",
    "    result = augmenter.augment(text)\n",
    "    if isinstance(result, (list, tuple)):\n",
    "        # An empty result means nothing was produced; fall back to the input text.\n",
    "        result = result[0] if result else text\n",
    "    return str(result)\n",
    "\n",
    "\n",
    "def test():\n",
    "    \"\"\"Augment the responses of the global `intent` using the global `aug` (experimental).\n",
    "\n",
    "    Nothing in this cell calls it. The \"Jokes\" intent is left untouched.\n",
    "    \"\"\"\n",
    "    if intent[\"intent\"] != \"Jokes\":\n",
    "        augmented_responses = set(intent[\"responses\"])\n",
    "        for response in intent[\"responses\"]:\n",
    "            for i in range(N_VARIANTS):\n",
    "                #aug = naw.SynonymAug(aug_src='wordnet',aug_min=1, aug_max=10, aug_p=i/50,stopwords=[\"<HUMAN>\",\"<HUMAN>,\",\"<HUMAN>!\"])\n",
    "                augmented_response = _augment_once(aug, response)\n",
    "                try:\n",
    "                    # Put \"<HUMAN\" back between the first \"<\" and the first \">\".\n",
    "                    start = augmented_response.index(\"<\")\n",
    "                    end = augmented_response.index(\">\")\n",
    "                    augmented_response = augmented_response[:start] + \"<HUMAN\" + augmented_response[end:]\n",
    "                except ValueError:\n",
    "                    # No \"<\" or no \">\" in this response: nothing to restore.\n",
    "                    pass\n",
    "                augmented_responses.add(augmented_response)\n",
    "        intent[\"responses\"] = list(augmented_responses)\n",
    "\n",
    "\n",
    "print(\"Models Loaded.\")\n",
    "# NOTE(review): other cells in this notebook open \"intentdata/Intent.json\" (capital I);\n",
    "# on a case-sensitive filesystem that is a different file - confirm which is intended.\n",
    "with open(\"intentdata/intent.json\") as f:\n",
    "    intentwhole = json.load(f)[\"intents\"]\n",
    "\n",
    "# One synonym augmenter per aug_p value, built once instead of once per utterance.\n",
    "# NOTE(review): aug_p runs from 0.0 up to 1.9 here; values above 1 look unintended - confirm.\n",
    "augmenters = [\n",
    "    naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=10, aug_p=i / 10)\n",
    "    for i in range(N_VARIANTS)\n",
    "]\n",
    "\n",
    "for intent in intentwhole:\n",
    "    # Start from the original utterances so they are always kept; the set removes duplicates.\n",
    "    augmented_texts = set(intent[\"text\"])\n",
    "    for text in intent[\"text\"]:\n",
    "        # `aug` stays bound to the last augmenter after the loops; test() reads it.\n",
    "        for aug in augmenters:\n",
    "            augmented_texts.add(_augment_once(aug, text))\n",
    "    intent[\"text\"] = list(augmented_texts)\n",
    "    # One progress line per intent instead of one line per generated variant.\n",
    "    print(intent[\"intent\"], len(intent[\"text\"]))\n",
    "\n",
    "with open(\"intentdata/intent_aug_text_test.json\", \"w+\") as f:\n",
    "    json.dump({\"intents\": intentwhole}, f)\n",
    "\n",
    "\n",
    "print(intentwhole[1][\"responses\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the distinct values in the intent column of train.csv.\n",
    "df = pd.read_csv(\"intentdata/train.csv\")\n",
    "distinct_intents = df[\"intent\"].unique()\n",
    "len(distinct_intents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Names of the entries under new_intent_data/; the list is the cell's displayed result.\n",
    "intent_dir_entries = os.listdir(\"new_intent_data\")\n",
    "intent_dir_entries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n",
    "\n",
    "# Score how similar two phrasings of the same command are; the score is the cell result.\n",
    "# NOTE(review): spaCy's small pipelines reportedly ship without static word vectors -\n",
    "# confirm whether a larger model is intended for similarity scoring.\n",
    "doc_start = nlp(\"start the fan\")\n",
    "doc_turn_on = nlp(\"turn the fan on\")\n",
    "doc_start.similarity(doc_turn_on)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Reload train.csv from disk; the frame itself is the cell's displayed result.\n",
    "train_csv_path = \"intentdata/train.csv\"\n",
    "traindf = pd.read_csv(train_csv_path)\n",
    "traindf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "intentdf = pd.read_csv(\"new_intent_data/intent_classification.csv\")\n",
    "traindf = pd.read_csv(\"intentdata/train.csv\")\n",
    "\n",
    "# Keep only the text-en / intent-en pair and rename it to the text / intent column names.\n",
    "smart_home_intent = intentdf.loc[:, [\"text-en\", \"intent-en\"]].rename(\n",
    "    columns={\"text-en\": \"text\", \"intent-en\": \"intent\"}\n",
    ")\n",
    "\n",
    "# Append below the rows read from train.csv, then overwrite train.csv with the result.\n",
    "frames = [traindf, smart_home_intent]\n",
    "newdf = pd.concat(frames, axis=0)\n",
    "newdf.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"new_intent_data/Dataset-train_old.txt\") as f:\n",
    "    textdata = f.readlines()\n",
    "\n",
    "kept_lines = []\n",
    "for text in textdata:\n",
    "    # readlines() keeps the line ending, so a blank line is never the empty string;\n",
    "    # strip before testing so blank lines are really skipped.\n",
    "    if text.strip() == \"\":\n",
    "        continue\n",
    "    # A line with exactly two commas gets its first comma turned into \";\".\n",
    "    if text.count(\",\") == 2:\n",
    "        kept_lines.append(text.replace(\",\", \";\", 1))\n",
    "    else:\n",
    "        kept_lines.append(text)\n",
    "\n",
    "# Join once instead of growing the string line by line.\n",
    "sent = \"\".join(kept_lines)\n",
    "print(sent)\n",
    "with open(\"new_intent_data/Dataset-train.txt\", \"w+\") as f:\n",
    "    f.write(sent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "data = pd.read_csv(\"new_intent_data/Dataset-train.csv\", delimiter=\",\")\n",
    "# errors=\"ignore\" keeps the cell re-runnable: once the file has been rewritten without\n",
    "# the \"Unnamed: 0\" column, a plain drop would raise KeyError on the next run.\n",
    "data = data.drop(columns=\"Unnamed: 0\", errors=\"ignore\")\n",
    "data.to_csv(\"new_intent_data/Dataset-train.csv\", index=False)\n",
    "\n",
    "# Preview as the last expression so it is actually displayed.\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "data = pd.read_csv(\"new_intent_data/Dataset-train.csv\", delimiter=\",\")\n",
    "# Remove every double-quote character inside string cells. Without regex=True,\n",
    "# replace() only rewrites cells whose entire value is '\"', leaving quoted text untouched.\n",
    "data = data.replace('\"', '', regex=True)\n",
    "# index=False: otherwise the row index is written as an extra unnamed first column.\n",
    "data.to_csv(\"new_intent_data/Dataset-train.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Dataset-train.csv is parsed here with \"|\" as the field separator.\n",
    "data = pd.read_csv(\"new_intent_data/Dataset-train.csv\", sep=\"|\")\n",
    "traindf = pd.read_csv(\"intentdata/train.csv\")\n",
    "\n",
    "# Existing rows first, new rows after; train.csv is overwritten with the result.\n",
    "frames = [traindf, data]\n",
    "new_data = pd.concat(frames, axis=0)\n",
    "new_data.to_csv(\"intentdata/train.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>intent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>listen to westbam alumb allergic on google music</td>\n",
       "      <td>PlayMusic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>add step to me to the 50 clásicos playlist</td>\n",
       "      <td>AddToPlaylist</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>i give this current textbook a rating value of...</td>\n",
       "      <td>RateBook</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>play the song little robin redbreast</td>\n",
       "      <td>PlayMusic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>please add iris dement to my playlist this is ...</td>\n",
       "      <td>AddToPlaylist</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37306</th>\n",
       "      <td>what is your date of birth</td>\n",
       "      <td>how_old_are_you</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37307</th>\n",
       "      <td>what year were you born in</td>\n",
       "      <td>how_old_are_you</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37308</th>\n",
       "      <td>what is the year that were you born</td>\n",
       "      <td>how_old_are_you</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37309</th>\n",
       "      <td>how old are you ai</td>\n",
       "      <td>how_old_are_you</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37310</th>\n",
       "      <td>are you 16 years old</td>\n",
       "      <td>how_old_are_you</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>37311 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    text           intent\n",
       "0       listen to westbam alumb allergic on google music        PlayMusic\n",
       "1             add step to me to the 50 clásicos playlist    AddToPlaylist\n",
       "2      i give this current textbook a rating value of...         RateBook\n",
       "3                   play the song little robin redbreast        PlayMusic\n",
       "4      please add iris dement to my playlist this is ...    AddToPlaylist\n",
       "...                                                  ...              ...\n",
       "37306                         what is your date of birth  how_old_are_you\n",
       "37307                         what year were you born in  how_old_are_you\n",
       "37308                what is the year that were you born  how_old_are_you\n",
       "37309                                 how old are you ai  how_old_are_you\n",
       "37310                               are you 16 years old  how_old_are_you\n",
       "\n",
       "[37311 rows x 2 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Reload train.csv from disk; the frame itself is the cell's displayed result.\n",
    "train_csv_path = \"intentdata/train.csv\"\n",
    "traindf = pd.read_csv(train_csv_path)\n",
    "traindf"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6.13 ('caesarnlradeon')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "bfcbd6a2138b43c88138636b277dc6540d170334c8840148725250eab505a128"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}