{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Build the base training set from train_command.csv plus the small-talk CSV.\n",
    "# The small-talk CSV has its own headers, so rename them to text/intent first.\n",
    "smalltalkintent = pd.read_csv(\"intentdata/Small_talk_Intent.csv\").rename(\n",
    "    columns={\"Utterances\": \"text\", \"Intent\": \"intent\"}\n",
    ")\n",
    "training = pd.read_csv(\"intentdata/train_command.csv\")\n",
    "df3 = pd.concat([training, smalltalkintent], ignore_index=True)\n",
    "df3.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)\n",
    "# TODO Do NLPAugmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Produce responses\n",
    "import json\n",
    "\n",
    "# Keep only the intent name and its responses from the full intent file.\n",
    "with open(\"intentdata/Intent.json\", \"r\") as f:\n",
    "    data = json.load(f)\n",
    "intents = [\n",
    "    {\"intent\": intent[\"intent\"], \"responses\": intent[\"responses\"]}\n",
    "    for intent in data[\"intents\"]\n",
    "]\n",
    "# Stored layout: {\"response\": [{\"intent\": ..., \"responses\": [...]}, ...]}\n",
    "with open(\"intentdata/responses.json\", \"w\") as f:\n",
    "    json.dump({\"response\": intents}, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# responses.json holds a list of {intent, responses} records under the key\n",
    "# \"response\", so index the records by intent name before looking one up.\n",
    "with open(\"intentdata/responses.json\", \"r\") as f:\n",
    "    responses = {\n",
    "        entry[\"intent\"]: entry[\"responses\"]\n",
    "        for entry in json.load(f)[\"response\"]\n",
    "    }\n",
    "print(responses[\"Greeting\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from IPython.display import display\n",
    "\n",
    "# Flatten the augmented intent file into one (text, intent) row per utterance.\n",
    "with open(\"intentdata/aug/intent_aug_text.json\", \"r\") as f:\n",
    "    data = json.load(f)[\"intents\"]\n",
    "intentdf = pd.DataFrame(\n",
    "    [(text, entry[\"intent\"]) for entry in data for text in entry[\"text\"]],\n",
    "    columns=[\"text\", \"intent\"],\n",
    ")\n",
    "\n",
    "# NOTE: train.csv is read and rewritten in place, so re-running this cell\n",
    "# appends the augmented rows again.\n",
    "df = pd.read_csv(\"intentdata/train.csv\")\n",
    "newdf = pd.concat([df, intentdf], axis=0)\n",
    "newdf.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "data = pd.read_csv(\"intentdata/train.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Fold the small-talk labels into the chatbot intent names.\n",
    "SMALLTALK_TO_INTENT = {\n",
    "    \"smalltalk_agent_acquaintance\": \"CourtesyGreeting\",\n",
    "    \"smalltalk_agent_age\": \"CurrentHumanQuery\",\n",
    "    \"smalltalk_agent_annoying\": \"NotTalking2U\",\n",
    "    \"smalltalk_agent_bad\": \"Swearing\",\n",
    "    \"smalltalk_agent_boss\": \"CurrentHumanQuery\",\n",
    "    \"smalltalk_agent_clever\": \"Clever\",\n",
    "    \"smalltalk_agent_beautiful\": \"Clever\",\n",
    "    \"smalltalk_agent_fired\": \"Shutup\",\n",
    "    \"smalltalk_agent_good\": \"Thanks\",\n",
    "    \"smalltalk_agent_chatbot\": \"SelfAware\",\n",
    "    \"smalltalk_agent_real\": \"SelfAware\",\n",
    "}\n",
    "\n",
    "# NOTE(review): this overwrites train.csv from train_no_half_response.csv, not\n",
    "# from the train.csv written by the cells above - confirm that is intended.\n",
    "data = pd.read_csv(\"intentdata/train_no_half_response.csv\")\n",
    "# A flat {old: new} dict replaces matching values in every column.\n",
    "data = data.replace(SMALLTALK_TO_INTENT)\n",
    "data.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO Checks label data balance\n",
    "import pandas as pd\n",
    "data = pd.read_csv(\"intentdata/train.csv\")\n",
    "\n",
    "# Separate everything before the last column from the target labels.\n",
    "X = data.iloc[:, :-1]\n",
    "y = data['intent']\n",
    "\n",
    "# Series.plot.bar ignores x=/y=, so the axis names are set on the Axes instead.\n",
    "ax = y.value_counts().sort_index().plot.bar(figsize=(20, 20))\n",
    "ax.set_xlabel('Target Value')\n",
    "ax.set_ylabel('Number of Occurrences');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "torch.cuda.is_available()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "import nlpaug.augmenter.word as naw\n",
    "import nlpaug.flow as naf\n",
    "print(\"Loading Models...\")\n",
    "import nltk\n",
    "nltk.download('wordnet')\n",
    "nltk.download('omw-1.4')\n",
    "TOPK=20 #default=100\n",
    "ACT = 'insert' #\"substitute\"\n",
    "N_VARIANTS = 20  # augmentation attempts per utterance / response\n",
    "\n",
    "\n",
    "def tst():\n",
    "    \"\"\"Build the GloVe + DistilBERT augmentation flow (not called in this notebook).\"\"\"\n",
    "    aug_w2v = naw.WordEmbsAug(\n",
    "        model_type='glove', model_path='glove/glove.6B.300d.txt',\n",
    "        action=\"substitute\")\n",
    "    aug_bert = naw.ContextualWordEmbsAug(\n",
    "        model_path='distilbert-base-uncased',\n",
    "        action=ACT, top_k=TOPK)\n",
    "    aug = naf.Sequential([\n",
    "        aug_bert, aug_w2v\n",
    "    ])\n",
    "    return aug\n",
    "\n",
    "\n",
    "def _augment_once(augmenter, sentence):\n",
    "    \"\"\"Return a single augmented string for `sentence`.\n",
    "\n",
    "    The original cell used both str(augment(...)) and augment(...)[0], so\n",
    "    accept either a plain string or a list result from augment().\n",
    "    \"\"\"\n",
    "    result = augmenter.augment(sentence)\n",
    "    if isinstance(result, (list, tuple)):\n",
    "        result = result[0] if result else sentence\n",
    "    return str(result)\n",
    "\n",
    "\n",
    "def test():\n",
    "    \"\"\"Augment the responses of the global `intent` with the global `aug`.\n",
    "\n",
    "    Never called in this notebook. Skips the \"Jokes\" intent.\n",
    "    \"\"\"\n",
    "    if intent[\"intent\"] == \"Jokes\":\n",
    "        return\n",
    "    original_responses = list(intent[\"responses\"])\n",
    "    augmented_responses = set(original_responses)\n",
    "    for response in original_responses:\n",
    "        # NOTE(review): the original statement here was syntactically broken; it\n",
    "        # sliced around \"<\" and \">\". Reconstructed as: put the response's own\n",
    "        # <...> placeholder back after augmentation - confirm the intent.\n",
    "        placeholder = re.search(\"<[^>]*>\", response)\n",
    "        for _ in range(N_VARIANTS):\n",
    "            augmented_response = _augment_once(aug, response)\n",
    "            if placeholder is not None:\n",
    "                augmented_response = re.sub(\n",
    "                    \"<[^>]*>\", lambda _m: placeholder.group(0), augmented_response, count=1\n",
    "                )\n",
    "            augmented_responses.add(augmented_response)\n",
    "    intent[\"responses\"] = list(augmented_responses)\n",
    "\n",
    "\n",
    "# aug_p is a fraction of the words, so cap it at 1.0 (i/10 used to reach 1.9).\n",
    "# The augmenters are built once here instead of once per utterance.\n",
    "synonym_augmenters = [\n",
    "    naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=10, aug_p=min(i / 10, 1.0))\n",
    "    for i in range(N_VARIANTS)\n",
    "]\n",
    "print(\"Models Loaded.\")\n",
    "\n",
    "# NOTE(review): other cells open \"intentdata/Intent.json\" (capital I) - confirm\n",
    "# which spelling exists on a case-sensitive filesystem.\n",
    "with open(\"intentdata/intent.json\") as f:\n",
    "    intentwhole = json.load(f)[\"intents\"]\n",
    "\n",
    "for intent in intentwhole:\n",
    "    # Only the original utterances are augmented; the result keeps them and\n",
    "    # adds every distinct variant.\n",
    "    original_texts = list(intent[\"text\"])\n",
    "    augmented_texts = set(original_texts)\n",
    "    for text in original_texts:\n",
    "        for aug in synonym_augmenters:\n",
    "            augmented_texts.add(_augment_once(aug, text))\n",
    "    intent[\"text\"] = list(augmented_texts)\n",
    "\n",
    "with open(\"intentdata/intent_aug_text_test.json\", \"w\") as f:\n",
    "    json.dump({\"intents\": intentwhole}, f)\n",
    "\n",
    "\n",
    "print(intentwhole[1][\"responses\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Number of distinct intent labels (a missing label counts as one value).\n",
    "df = pd.read_csv(\"intentdata/train.csv\")\n",
    "df[\"intent\"].nunique(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json \n",
    "import pandas as pd\n",
    "os.listdir(\"new_intent_data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n",
    "nlp(\"start the fan\").similarity(nlp(\"turn the fan on\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "traindf = pd.read_csv(\"intentdata/train.csv\")\n",
    "traindf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Append the English text/intent columns of the smart-home data to train.csv.\n",
    "intentdf = pd.read_csv(\"new_intent_data/intent_classification.csv\")\n",
    "traindf = pd.read_csv(\"intentdata/train.csv\")\n",
    "smart_home_intent = intentdf[[\"text-en\", \"intent-en\"]].rename(\n",
    "    columns={\"text-en\": \"text\", \"intent-en\": \"intent\"}\n",
    ")\n",
    "newdf = pd.concat([traindf, smart_home_intent], axis=0)\n",
    "\n",
    "newdf.to_csv(\"intentdata/train.csv\", mode=\"w\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"new_intent_data/Dataset-train_old.txt\") as f:\n",
    "    textdata = f.readlines()\n",
    "\n",
    "fixed_lines = []\n",
    "for text in textdata:\n",
    "    # readlines() keeps the line ending, so a blank line is never the empty\n",
    "    # string; test the stripped text instead.\n",
    "    if not text.strip():\n",
    "        continue\n",
    "    # A line with exactly two commas gets its first comma turned into \";\".\n",
    "    if text.count(\",\") == 2:\n",
    "        text = text.replace(\",\", \";\", 1)\n",
    "    fixed_lines.append(text)\n",
    "sent = \"\".join(fixed_lines)\n",
    "\n",
    "# Preview only; the full text goes to the output file.\n",
    "print(sent[:1000])\n",
    "with open(\"new_intent_data/Dataset-train.txt\", \"w\") as f:\n",
    "    f.write(sent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "data = pd.read_csv(\"new_intent_data/Dataset-train.csv\", delimiter=\",\")\n",
    "# errors=\"ignore\" keeps the cell re-runnable once the stray index column is gone.\n",
    "data = data.drop(columns=\"Unnamed: 0\", errors=\"ignore\")\n",
    "data.to_csv(\"new_intent_data/Dataset-train.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "data = pd.read_csv(\"new_intent_data/Dataset-train.csv\", delimiter=\",\")\n",
    "# regex=True removes the quote character inside strings; a plain value replace\n",
    "# only matches cells that are exactly a lone quote.\n",
    "data = data.replace('\"', '', regex=True)\n",
    "# index=False so no \"Unnamed: 0\" column is created on the next read.\n",
    "data.to_csv(\"new_intent_data/Dataset-train.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# NOTE(review): read here as pipe-delimited, while the cells above read and\n",
    "# write this same file comma-delimited - confirm the delimiter on disk.\n",
    "data = pd.read_csv(\"new_intent_data/Dataset-train.csv\", delimiter=\"|\")\n",
    "traindf = pd.read_csv(\"intentdata/train.csv\")\n",
    "new_data = pd.concat([traindf, data], axis=0)\n",
    "new_data.to_csv(\"intentdata/train.csv\", index=False)"
   ]
  },
  { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textintent
0listen to westbam alumb allergic on google musicPlayMusic
1add step to me to the 50 clásicos playlistAddToPlaylist
2i give this current textbook a rating value of...RateBook
3play the song little robin redbreastPlayMusic
4please add iris dement to my playlist this is ...AddToPlaylist
.........
37306what is your date of birthhow_old_are_you
37307what year were you born inhow_old_are_you
37308what is the year that were you bornhow_old_are_you
37309how old are you aihow_old_are_you
37310are you 16 years oldhow_old_are_you
\n", "

37311 rows × 2 columns

\n", "
" ], "text/plain": [ " text intent\n", "0 listen to westbam alumb allergic on google music PlayMusic\n", "1 add step to me to the 50 clásicos playlist AddToPlaylist\n", "2 i give this current textbook a rating value of... RateBook\n", "3 play the song little robin redbreast PlayMusic\n", "4 please add iris dement to my playlist this is ... AddToPlaylist\n", "... ... ...\n", "37306 what is your date of birth how_old_are_you\n", "37307 what year were you born in how_old_are_you\n", "37308 what is the year that were you born how_old_are_you\n", "37309 how old are you ai how_old_are_you\n", "37310 are you 16 years old how_old_are_you\n", "\n", "[37311 rows x 2 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "traindf = pd.read_csv(\"intentdata/train.csv\")\n", "traindf" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.6.13 ('caesarnlradeon')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "bfcbd6a2138b43c88138636b277dc6540d170334c8840148725250eab505a128" } } }, "nbformat": 4, "nbformat_minor": 2 }