{ "cells": [ { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import train_test_split\n", "from transformers import AutoTokenizer\n", "from datasets import Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load Dataset" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
jutsu_namejutsu_typejutsu_description
0Adamantine Sealing Chains: Spiral FormationHiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...Kushina uses her chains to form a barrier whil...
1Adamantine Power: AcalaKekkei Genkai, Ninjutsu, TaijutsuHashirama kicks the opponent away and raises s...
2Adamantine Prison WallNinjutsu, Clone Techniques, BukijutsuAfter using Transformation: Adamantine Staff, ...
3Adamantine Seal: Monkey Yang SuppressionNinjutsu, Fūinjutsu, Cooperation NinjutsuAfter placing fūinjutsu tags in an area, the u...
4AcrobatTaijutsu, KenjutsuThe Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ...
\n", "
" ], "text/plain": [ " jutsu_name \\\n", "0 Adamantine Sealing Chains: Spiral Formation \n", "1 Adamantine Power: Acala \n", "2 Adamantine Prison Wall \n", "3 Adamantine Seal: Monkey Yang Suppression \n", "4 Acrobat \n", "\n", " jutsu_type \\\n", "0 Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... \n", "1 Kekkei Genkai, Ninjutsu, Taijutsu \n", "2 Ninjutsu, Clone Techniques, Bukijutsu \n", "3 Ninjutsu, Fūinjutsu, Cooperation Ninjutsu \n", "4 Taijutsu, Kenjutsu \n", "\n", " jutsu_description \n", "0 Kushina uses her chains to form a barrier whil... \n", "1 Hashirama kicks the opponent away and raises s... \n", "2 After using Transformation: Adamantine Staff, ... \n", "3 After placing fūinjutsu tags in an area, the u... \n", "4 The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_path = \"../data/jutsus.jsonl\"\n", "df = pd.read_json(data_path, lines=True)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def simplify_jutsu(jutsu):\n", " if \"Genjutsu\" in jutsu:\n", " return \"Genjutsu\"\n", " if \"Ninjutsu\" in jutsu:\n", " return \"Ninjutsu\"\n", " if \"Taijutsu\" in jutsu:\n", " return \"Taijutsu\"" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
jutsu_namejutsu_typejutsu_descriptionjutsu_type_simplified
0Adamantine Sealing Chains: Spiral FormationHiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...Kushina uses her chains to form a barrier whil...Ninjutsu
1Adamantine Power: AcalaKekkei Genkai, Ninjutsu, TaijutsuHashirama kicks the opponent away and raises s...Ninjutsu
2Adamantine Prison WallNinjutsu, Clone Techniques, BukijutsuAfter using Transformation: Adamantine Staff, ...Ninjutsu
3Adamantine Seal: Monkey Yang SuppressionNinjutsu, Fūinjutsu, Cooperation NinjutsuAfter placing fūinjutsu tags in an area, the u...Ninjutsu
4AcrobatTaijutsu, KenjutsuThe Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ...Taijutsu
\n", "
" ], "text/plain": [ " jutsu_name \\\n", "0 Adamantine Sealing Chains: Spiral Formation \n", "1 Adamantine Power: Acala \n", "2 Adamantine Prison Wall \n", "3 Adamantine Seal: Monkey Yang Suppression \n", "4 Acrobat \n", "\n", " jutsu_type \\\n", "0 Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... \n", "1 Kekkei Genkai, Ninjutsu, Taijutsu \n", "2 Ninjutsu, Clone Techniques, Bukijutsu \n", "3 Ninjutsu, Fūinjutsu, Cooperation Ninjutsu \n", "4 Taijutsu, Kenjutsu \n", "\n", " jutsu_description jutsu_type_simplified \n", "0 Kushina uses her chains to form a barrier whil... Ninjutsu \n", "1 Hashirama kicks the opponent away and raises s... Ninjutsu \n", "2 After using Transformation: Adamantine Staff, ... Ninjutsu \n", "3 After placing fūinjutsu tags in an area, the u... Ninjutsu \n", "4 The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... Taijutsu " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "jutsu_type_simplified\n", "Ninjutsu 2255\n", "Taijutsu 397\n", "Genjutsu 101\n", "Name: count, dtype: int64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['jutsu_type_simplified'].value_counts()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "df['text'] = df['jutsu_name'] + \". \" + df['jutsu_description']\n", "df['jutsus'] = df['jutsu_type_simplified']\n", "df = df[['text', 'jutsus']]\n", "df = df.dropna()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
jutsu_namejutsu_typejutsu_descriptionjutsu_type_simplifiedtextjutus
0Adamantine Sealing Chains: Spiral FormationHiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...Kushina uses her chains to form a barrier whil...NinjutsuAdamantine Sealing Chains: Spiral Formation. K...Ninjutsu
1Adamantine Power: AcalaKekkei Genkai, Ninjutsu, TaijutsuHashirama kicks the opponent away and raises s...NinjutsuAdamantine Power: Acala. Hashirama kicks the o...Ninjutsu
2Adamantine Prison WallNinjutsu, Clone Techniques, BukijutsuAfter using Transformation: Adamantine Staff, ...NinjutsuAdamantine Prison Wall. After using Transforma...Ninjutsu
3Adamantine Seal: Monkey Yang SuppressionNinjutsu, Fūinjutsu, Cooperation NinjutsuAfter placing fūinjutsu tags in an area, the u...NinjutsuAdamantine Seal: Monkey Yang Suppression. Afte...Ninjutsu
4AcrobatTaijutsu, KenjutsuThe Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ...TaijutsuAcrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...Taijutsu
\n", "
" ], "text/plain": [ " jutsu_name \\\n", "0 Adamantine Sealing Chains: Spiral Formation \n", "1 Adamantine Power: Acala \n", "2 Adamantine Prison Wall \n", "3 Adamantine Seal: Monkey Yang Suppression \n", "4 Acrobat \n", "\n", " jutsu_type \\\n", "0 Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... \n", "1 Kekkei Genkai, Ninjutsu, Taijutsu \n", "2 Ninjutsu, Clone Techniques, Bukijutsu \n", "3 Ninjutsu, Fūinjutsu, Cooperation Ninjutsu \n", "4 Taijutsu, Kenjutsu \n", "\n", " jutsu_description jutsu_type_simplified \\\n", "0 Kushina uses her chains to form a barrier whil... Ninjutsu \n", "1 Hashirama kicks the opponent away and raises s... Ninjutsu \n", "2 After using Transformation: Adamantine Staff, ... Ninjutsu \n", "3 After placing fūinjutsu tags in an area, the u... Ninjutsu \n", "4 The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... Taijutsu \n", "\n", " text jutus \n", "0 Adamantine Sealing Chains: Spiral Formation. K... Ninjutsu \n", "1 Adamantine Power: Acala. Hashirama kicks the o... Ninjutsu \n", "2 Adamantine Prison Wall. After using Transforma... Ninjutsu \n", "3 Adamantine Seal: Monkey Yang Suppression. Afte... Ninjutsu \n", "4 Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... Taijutsu " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "class Cleaner():\n", " def __init__(self):\n", " pass \n", " \n", " def put_line_breaks(self, text):\n", " return text.replace(\"<\\p>\", \"<\\p>\\n\")\n", " \n", " def remove_html_tags(self, text):\n", " clean_text = BeautifulSoup(text, \"lxml\").text\n", " return clean_text\n", "\n", " def clean(self, text):\n", " text = self.put_line_breaks(text)\n", " text = self.remove_html_tags(text)\n", " text = text.strip()\n", " return text" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "text_column_name = 'text'\n", "label_column_name = \"jutsus\"" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_25661/3655626712.py:10: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.\n", " clean_text = BeautifulSoup(text, \"lxml\").text\n" ] } ], "source": [ "# Clean Text\n", "cleaner = Cleaner()\n", "df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textjutsustext_cleaned
0Adamantine Sealing Chains: Spiral Formation. K...NinjutsuAdamantine Sealing Chains: Spiral Formation. K...
1Adamantine Power: Acala. Hashirama kicks the o...NinjutsuAdamantine Power: Acala. Hashirama kicks the o...
\n", "
" ], "text/plain": [ " text jutsus \\\n", "0 Adamantine Sealing Chains: Spiral Formation. K... Ninjutsu \n", "1 Adamantine Power: Acala. Hashirama kicks the o... Ninjutsu \n", "\n", " text_cleaned \n", "0 Adamantine Sealing Chains: Spiral Formation. K... \n", "1 Adamantine Power: Acala. Hashirama kicks the o... " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(2)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LabelEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LabelEncoder()" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Encode Labels \n", "le = preprocessing.LabelEncoder()\n", "le.fit(df[label_column_name].tolist())" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label_dict = {index:label_name for index, label_name in enumerate(le.__dict__['classes_'].tolist())}\n", "label_dict" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "df['label'] = le.transform(df[label_column_name].tolist())" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textjutsustext_cleanedlabel
0Adamantine Sealing Chains: Spiral Formation. K...NinjutsuAdamantine Sealing Chains: Spiral Formation. K...1
1Adamantine Power: Acala. Hashirama kicks the o...NinjutsuAdamantine Power: Acala. Hashirama kicks the o...1
2Adamantine Prison Wall. After using Transforma...NinjutsuAdamantine Prison Wall. After using Transforma...1
3Adamantine Seal: Monkey Yang Suppression. Afte...NinjutsuAdamantine Seal: Monkey Yang Suppression. Afte...1
4Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...TaijutsuAcrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...2
\n", "
" ], "text/plain": [ " text jutsus \\\n", "0 Adamantine Sealing Chains: Spiral Formation. K... Ninjutsu \n", "1 Adamantine Power: Acala. Hashirama kicks the o... Ninjutsu \n", "2 Adamantine Prison Wall. After using Transforma... Ninjutsu \n", "3 Adamantine Seal: Monkey Yang Suppression. Afte... Ninjutsu \n", "4 Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... Taijutsu \n", "\n", " text_cleaned label \n", "0 Adamantine Sealing Chains: Spiral Formation. K... 1 \n", "1 Adamantine Power: Acala. Hashirama kicks the o... 1 \n", "2 Adamantine Prison Wall. After using Transforma... 1 \n", "3 Adamantine Seal: Monkey Yang Suppression. Afte... 1 \n", "4 Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... 2 " ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "test_size = 0.2\n", "df_train, df_test = train_test_split(df, \n", " test_size=test_size, \n", " stratify=df['label'],)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "jutsus\n", "Ninjutsu 1804\n", "Taijutsu 317\n", "Genjutsu 81\n", "Name: count, dtype: int64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train['jutsus'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "model_name = \"distilbert/distilbert-base-uncased\"" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/abdullah/python_virtual_env/gradio_env/lib/python3.8/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "tokenizer = AutoTokenizer.from_pretrained(model_name)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "def preprocess_function(tokenizer,examples):\n", " return tokenizer(examples['text_cleaned'],truncation=True)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0b2c1afee2ac44c987020a4fed8888b6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/2202 [00:00