{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the base training set: command utterances plus small-talk utterances.\n",
    "import json\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# The small-talk CSV names its columns \"Utterances\"/\"Intent\"; rename them to\n",
    "# \"text\"/\"intent\" (presumably the columns of train_command.csv - TODO confirm).\n",
    "smalltalkintent = pd.read_csv(\"intentdata/Small_talk_Intent.csv\").rename(\n",
    "    columns={\"Utterances\": \"text\", \"Intent\": \"intent\"}\n",
    ")\n",
    "training = pd.read_csv(\"intentdata/train_command.csv\")\n",
    "df3 = pd.concat([training, smalltalkintent], ignore_index=True)\n",
    "df3.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)\n",
    "# TODO Do NLPAugmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Produce responses: pull every intent's canned responses out of Intent.json.\n",
    "import json\n",
    "\n",
    "with open(\"intentdata/Intent.json\", \"r\") as f:\n",
    "    data = json.load(f)\n",
    "intents = [\n",
    "    {\"intent\": intent[\"intent\"], \"responses\": intent[\"responses\"]}\n",
    "    for intent in data[\"intents\"]\n",
    "]\n",
    "# On-disk layout: {\"response\": [{\"intent\": ..., \"responses\": [...]}, ...]}\n",
    "with open(\"intentdata/responses.json\", \"w+\") as f:\n",
    "    json.dump({\"response\": intents}, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: reload responses.json and look up the responses of one intent.\n",
    "import json\n",
    "\n",
    "with open(\"intentdata/responses.json\", \"r\") as f:\n",
    "    response_entries = json.load(f)[\"response\"]\n",
    "# The file holds a list of {\"intent\", \"responses\"} records under the\n",
    "# \"response\" key, so index it by intent name before looking anything up.\n",
    "responses = {entry[\"intent\"]: entry[\"responses\"] for entry in response_entries}\n",
    "print(responses[\"Greeting\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the augmented utterances to the training CSV.\n",
    "import json\n",
    "\n",
    "import pandas as pd\n",
    "from IPython.display import display\n",
    "\n",
    "with open(\"intentdata/aug/intent_aug_text.json\", \"r\") as f:\n",
    "    data = json.load(f)[\"intents\"]\n",
    "\n",
    "# One (text, intent) row per utterance. Building the frame in a single step\n",
    "# also copes with an empty intent list, which pd.concat([]) would reject.\n",
    "rows = [\n",
    "    {\"text\": text, \"intent\": entry[\"intent\"]}\n",
    "    for entry in data\n",
    "    for text in entry[\"text\"]\n",
    "]\n",
    "intentdf = pd.DataFrame(rows, columns=[\"text\", \"intent\"])\n",
    "display(intentdf.head())\n",
    "\n",
    "# NOTE: this cell is not idempotent - every run appends the augmented rows\n",
    "# to train.csv again.\n",
    "df = pd.read_csv(\"intentdata/train.csv\")\n",
    "newdf = pd.concat([df, intentdf], axis=0)\n",
    "newdf.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "data = pd.read_csv(\"intentdata/train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fold fine-grained small-talk labels into the chatbot's own intent names.\n",
    "import pandas as pd\n",
    "\n",
    "SMALLTALK_TO_INTENT = {\n",
    "    \"smalltalk_agent_acquaintance\": \"CourtesyGreeting\",\n",
    "    \"smalltalk_agent_age\": \"CurrentHumanQuery\",\n",
    "    \"smalltalk_agent_annoying\": \"NotTalking2U\",\n",
    "    \"smalltalk_agent_bad\": \"Swearing\",\n",
    "    \"smalltalk_agent_boss\": \"CurrentHumanQuery\",\n",
    "    \"smalltalk_agent_clever\": \"Clever\",\n",
    "    \"smalltalk_agent_beautiful\": \"Clever\",\n",
    "    \"smalltalk_agent_fired\": \"Shutup\",\n",
    "    \"smalltalk_agent_good\": \"Thanks\",\n",
    "    \"smalltalk_agent_chatbot\": \"SelfAware\",\n",
    "    \"smalltalk_agent_real\": \"SelfAware\",\n",
    "}\n",
    "\n",
    "data = pd.read_csv(\"intentdata/train_no_half_response.csv\")\n",
    "# replace() matches whole cell values in every column, not only `intent`.\n",
    "data = data.replace(SMALLTALK_TO_INTENT)\n",
    "# NOTE: this overwrites any train.csv written by earlier cells.\n",
    "data.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO Checks label data balance\n",
    "import pandas as pd\n",
    "\n",
    "data = pd.read_csv(\"intentdata/train.csv\")\n",
    "\n",
    "# Separate the design matrix from the target labels.\n",
    "# NOTE(review): X simply drops the last column; this assumes `intent` is the\n",
    "# last column of train.csv - TODO confirm.\n",
    "X = data.iloc[:, :-1]\n",
    "y = data[\"intent\"]\n",
    "\n",
    "# x=/y= pick columns of a DataFrame and do not caption a Series plot, so the\n",
    "# axis labels are set on the returned Axes instead.\n",
    "ax = y.value_counts().sort_index().plot.bar(figsize=(20, 20))\n",
    "ax.set_xlabel(\"Target Value\")\n",
    "ax.set_ylabel(\"Number of Occurrences\")\n",
    "ax"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "torch.cuda.is_available()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Augment every intent's utterances with nlpaug and save the enlarged corpus.\n",
    "import json\n",
    "\n",
    "import nlpaug.augmenter.word as naw\n",
    "import nlpaug.flow as naf\n",
    "\n",
    "TOPK = 20  # default=100\n",
    "ACT = \"insert\"  # \"substitute\"\n",
    "N_VARIANTS = 20  # augmentation attempts per source sentence\n",
    "# Response augmentation was defined but never invoked here; keep it opt-in.\n",
    "AUGMENT_RESPONSES = False\n",
    "\n",
    "print(\"Loading Models...\")\n",
    "aug_w2v = naw.WordEmbsAug(\n",
    "    model_type=\"glove\", model_path=\"glove/glove.6B.300d.txt\",\n",
    "    action=\"substitute\")\n",
    "aug_bert = naw.ContextualWordEmbsAug(\n",
    "    model_path=\"distilbert-base-uncased\",\n",
    "    action=ACT, top_k=TOPK)\n",
    "# Contextual insertion runs first, then GloVe substitution.\n",
    "aug = naf.Sequential([aug_bert, aug_w2v])\n",
    "print(\"Models Loaded.\")\n",
    "\n",
    "\n",
    "def restore_placeholder(sentence):\n",
    "    \"\"\"Rewrite the first <...> span of `sentence` so it opens with <HUMAN.\n",
    "\n",
    "    Returns `sentence` unchanged when it has no \"<\" followed by a \">\".\n",
    "    NOTE(review): the original expression was truncated in the notebook;\n",
    "    the \"<HUMAN\" literal is a reconstruction - confirm it matches the\n",
    "    placeholder used in the response templates.\n",
    "    \"\"\"\n",
    "    start = sentence.find(\"<\")\n",
    "    end = sentence.find(\">\", start + 1) if start != -1 else -1\n",
    "    if end == -1:\n",
    "        return sentence\n",
    "    return sentence[:start] + \"<HUMAN\" + sentence[end:]\n",
    "\n",
    "\n",
    "def augment_sentences(sentences, n_variants=N_VARIANTS, postprocess=None):\n",
    "    \"\"\"Return `sentences` plus their augmented variants, de-duplicated.\n",
    "\n",
    "    Each sentence is passed through `aug` `n_variants` times; `postprocess`,\n",
    "    when given, is applied to every variant before it is stored. The result\n",
    "    is sorted so the saved JSON does not depend on set iteration order.\n",
    "    \"\"\"\n",
    "    # Start from the originals and accumulate across ALL sentences; resetting\n",
    "    # the set per sentence would keep only the last sentence's variants.\n",
    "    variants = set(sentences)\n",
    "    for sentence in sentences:\n",
    "        for _ in range(n_variants):\n",
    "            candidate = str(aug.augment(sentence)[0])\n",
    "            print(candidate)\n",
    "            if postprocess is not None:\n",
    "                candidate = postprocess(candidate)\n",
    "            variants.add(candidate)\n",
    "    return sorted(variants)\n",
    "\n",
    "\n",
    "# NOTE(review): lower-case \"intent.json\" here vs \"Intent.json\" in the\n",
    "# responses cell - confirm both names exist on a case-sensitive filesystem.\n",
    "with open(\"intentdata/intent.json\") as f:\n",
    "    intentwhole = json.load(f)[\"intents\"]\n",
    "\n",
    "for intent in intentwhole:\n",
    "    intent[\"text\"] = augment_sentences(intent[\"text\"])\n",
    "    if AUGMENT_RESPONSES and intent[\"intent\"] != \"Jokes\":\n",
    "        intent[\"responses\"] = augment_sentences(\n",
    "            intent[\"responses\"], postprocess=restore_placeholder\n",
    "        )\n",
    "\n",
    "with open(\"intentdata/intent_aug_text_test.json\", \"w+\") as f:\n",
    "    json.dump({\"intents\": intentwhole}, f)\n",
    "\n",
    "print(intentwhole[1][\"responses\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"intentdata/train.csv\")\n",
    "# Number of distinct intent labels (a missing label counts as one value).\n",
    "df[\"intent\"].nunique(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample',\n",
       " 'AskUbuntu Corpus.json',\n",
       " 'Bitext_Sample_Customer_Service_Training_Dataset',\n",
       " 'Chatbot Corpus.json',\n",
       " 'Dataset-train.csv',\n",
       " 'intent-corpus-basic.json',\n",
       " 'intent-corpus-enrich-limit-20.json',\n",
       " 'intents.json',\n",
       " 'intent_classification.csv',\n",
       " 'music_intent_entities.json',\n",
       " 'restaurant_intent_entities.json',\n",
       " 'Web App Corpus.json']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import json \n",
    "import pandas as pd\n",
    "os.listdir(\"new_intent_data\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}