{
"cells": [
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn import preprocessing\n",
"from sklearn.model_selection import train_test_split\n",
"from transformers import AutoTokenizer\n",
"from datasets import Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Dataset"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" jutsu_name | \n",
" jutsu_type | \n",
" jutsu_description | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Adamantine Sealing Chains: Spiral Formation | \n",
" Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... | \n",
" Kushina uses her chains to form a barrier whil... | \n",
"
\n",
" \n",
" | 1 | \n",
" Adamantine Power: Acala | \n",
" Kekkei Genkai, Ninjutsu, Taijutsu | \n",
" Hashirama kicks the opponent away and raises s... | \n",
"
\n",
" \n",
" | 2 | \n",
" Adamantine Prison Wall | \n",
" Ninjutsu, Clone Techniques, Bukijutsu | \n",
" After using Transformation: Adamantine Staff, ... | \n",
"
\n",
" \n",
" | 3 | \n",
" Adamantine Seal: Monkey Yang Suppression | \n",
" Ninjutsu, Fūinjutsu, Cooperation Ninjutsu | \n",
" After placing fūinjutsu tags in an area, the u... | \n",
"
\n",
" \n",
" | 4 | \n",
" Acrobat | \n",
" Taijutsu, Kenjutsu | \n",
" The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" jutsu_name \\\n",
"0 Adamantine Sealing Chains: Spiral Formation \n",
"1 Adamantine Power: Acala \n",
"2 Adamantine Prison Wall \n",
"3 Adamantine Seal: Monkey Yang Suppression \n",
"4 Acrobat \n",
"\n",
" jutsu_type \\\n",
"0 Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... \n",
"1 Kekkei Genkai, Ninjutsu, Taijutsu \n",
"2 Ninjutsu, Clone Techniques, Bukijutsu \n",
"3 Ninjutsu, Fūinjutsu, Cooperation Ninjutsu \n",
"4 Taijutsu, Kenjutsu \n",
"\n",
" jutsu_description \n",
"0 Kushina uses her chains to form a barrier whil... \n",
"1 Hashirama kicks the opponent away and raises s... \n",
"2 After using Transformation: Adamantine Staff, ... \n",
"3 After placing fūinjutsu tags in an area, the u... \n",
"4 The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_path = \"../data/jutsus.jsonl\"\n",
"df = pd.read_json(data_path, lines=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def simplify_jutsu(jutsu):\n",
" if \"Genjutsu\" in jutsu:\n",
" return \"Genjutsu\"\n",
" if \"Ninjutsu\" in jutsu:\n",
" return \"Ninjutsu\"\n",
" if \"Taijutsu\" in jutsu:\n",
" return \"Taijutsu\""
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" jutsu_name | \n",
" jutsu_type | \n",
" jutsu_description | \n",
" jutsu_type_simplified | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Adamantine Sealing Chains: Spiral Formation | \n",
" Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... | \n",
" Kushina uses her chains to form a barrier whil... | \n",
" Ninjutsu | \n",
"
\n",
" \n",
" | 1 | \n",
" Adamantine Power: Acala | \n",
" Kekkei Genkai, Ninjutsu, Taijutsu | \n",
" Hashirama kicks the opponent away and raises s... | \n",
" Ninjutsu | \n",
"
\n",
" \n",
" | 2 | \n",
" Adamantine Prison Wall | \n",
" Ninjutsu, Clone Techniques, Bukijutsu | \n",
" After using Transformation: Adamantine Staff, ... | \n",
" Ninjutsu | \n",
"
\n",
" \n",
" | 3 | \n",
" Adamantine Seal: Monkey Yang Suppression | \n",
" Ninjutsu, Fūinjutsu, Cooperation Ninjutsu | \n",
" After placing fūinjutsu tags in an area, the u... | \n",
" Ninjutsu | \n",
"
\n",
" \n",
" | 4 | \n",
" Acrobat | \n",
" Taijutsu, Kenjutsu | \n",
" The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... | \n",
" Taijutsu | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" jutsu_name \\\n",
"0 Adamantine Sealing Chains: Spiral Formation \n",
"1 Adamantine Power: Acala \n",
"2 Adamantine Prison Wall \n",
"3 Adamantine Seal: Monkey Yang Suppression \n",
"4 Acrobat \n",
"\n",
" jutsu_type \\\n",
"0 Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... \n",
"1 Kekkei Genkai, Ninjutsu, Taijutsu \n",
"2 Ninjutsu, Clone Techniques, Bukijutsu \n",
"3 Ninjutsu, Fūinjutsu, Cooperation Ninjutsu \n",
"4 Taijutsu, Kenjutsu \n",
"\n",
" jutsu_description jutsu_type_simplified \n",
"0 Kushina uses her chains to form a barrier whil... Ninjutsu \n",
"1 Hashirama kicks the opponent away and raises s... Ninjutsu \n",
"2 After using Transformation: Adamantine Staff, ... Ninjutsu \n",
"3 After placing fūinjutsu tags in an area, the u... Ninjutsu \n",
"4 The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... Taijutsu "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"jutsu_type_simplified\n",
"Ninjutsu 2255\n",
"Taijutsu 397\n",
"Genjutsu 101\n",
"Name: count, dtype: int64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['jutsu_type_simplified'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"df['text'] = df['jutsu_name'] + \". \" + df['jutsu_description']\n",
"df['jutsus'] = df['jutsu_type_simplified']\n",
"df = df[['text', 'jutsus']]\n",
"df = df.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" jutsu_name | \n",
" jutsu_type | \n",
" jutsu_description | \n",
" jutsu_type_simplified | \n",
" text | \n",
" jutus | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Adamantine Sealing Chains: Spiral Formation | \n",
" Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... | \n",
" Kushina uses her chains to form a barrier whil... | \n",
" Ninjutsu | \n",
" Adamantine Sealing Chains: Spiral Formation. K... | \n",
" Ninjutsu | \n",
"
\n",
" \n",
" | 1 | \n",
" Adamantine Power: Acala | \n",
" Kekkei Genkai, Ninjutsu, Taijutsu | \n",
" Hashirama kicks the opponent away and raises s... | \n",
" Ninjutsu | \n",
" Adamantine Power: Acala. Hashirama kicks the o... | \n",
" Ninjutsu | \n",
"
\n",
" \n",
" | 2 | \n",
" Adamantine Prison Wall | \n",
" Ninjutsu, Clone Techniques, Bukijutsu | \n",
" After using Transformation: Adamantine Staff, ... | \n",
" Ninjutsu | \n",
" Adamantine Prison Wall. After using Transforma... | \n",
" Ninjutsu | \n",
"
\n",
" \n",
" | 3 | \n",
" Adamantine Seal: Monkey Yang Suppression | \n",
" Ninjutsu, Fūinjutsu, Cooperation Ninjutsu | \n",
" After placing fūinjutsu tags in an area, the u... | \n",
" Ninjutsu | \n",
" Adamantine Seal: Monkey Yang Suppression. Afte... | \n",
" Ninjutsu | \n",
"
\n",
" \n",
" | 4 | \n",
" Acrobat | \n",
" Taijutsu, Kenjutsu | \n",
" The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... | \n",
" Taijutsu | \n",
" Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... | \n",
" Taijutsu | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" jutsu_name \\\n",
"0 Adamantine Sealing Chains: Spiral Formation \n",
"1 Adamantine Power: Acala \n",
"2 Adamantine Prison Wall \n",
"3 Adamantine Seal: Monkey Yang Suppression \n",
"4 Acrobat \n",
"\n",
" jutsu_type \\\n",
"0 Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ... \n",
"1 Kekkei Genkai, Ninjutsu, Taijutsu \n",
"2 Ninjutsu, Clone Techniques, Bukijutsu \n",
"3 Ninjutsu, Fūinjutsu, Cooperation Ninjutsu \n",
"4 Taijutsu, Kenjutsu \n",
"\n",
" jutsu_description jutsu_type_simplified \\\n",
"0 Kushina uses her chains to form a barrier whil... Ninjutsu \n",
"1 Hashirama kicks the opponent away and raises s... Ninjutsu \n",
"2 After using Transformation: Adamantine Staff, ... Ninjutsu \n",
"3 After placing fūinjutsu tags in an area, the u... Ninjutsu \n",
"4 The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ... Taijutsu \n",
"\n",
" text jutus \n",
"0 Adamantine Sealing Chains: Spiral Formation. K... Ninjutsu \n",
"1 Adamantine Power: Acala. Hashirama kicks the o... Ninjutsu \n",
"2 Adamantine Prison Wall. After using Transforma... Ninjutsu \n",
"3 Adamantine Seal: Monkey Yang Suppression. Afte... Ninjutsu \n",
"4 Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... Taijutsu "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"class Cleaner():\n",
" def __init__(self):\n",
" pass \n",
" \n",
" def put_line_breaks(self, text):\n",
" return text.replace(\"<\\p>\", \"<\\p>\\n\")\n",
" \n",
" def remove_html_tags(self, text):\n",
" clean_text = BeautifulSoup(text, \"lxml\").text\n",
" return clean_text\n",
"\n",
" def clean(self, text):\n",
" text = self.put_line_breaks(text)\n",
" text = self.remove_html_tags(text)\n",
" text = text.strip()\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"text_column_name = 'text'\n",
"label_column_name = \"jutsus\""
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_25661/3655626712.py:10: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.\n",
" clean_text = BeautifulSoup(text, \"lxml\").text\n"
]
}
],
"source": [
"# Clean Text\n",
"cleaner = Cleaner()\n",
"df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" jutsus | \n",
" text_cleaned | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Adamantine Sealing Chains: Spiral Formation. K... | \n",
" Ninjutsu | \n",
" Adamantine Sealing Chains: Spiral Formation. K... | \n",
"
\n",
" \n",
" | 1 | \n",
" Adamantine Power: Acala. Hashirama kicks the o... | \n",
" Ninjutsu | \n",
" Adamantine Power: Acala. Hashirama kicks the o... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text jutsus \\\n",
"0 Adamantine Sealing Chains: Spiral Formation. K... Ninjutsu \n",
"1 Adamantine Power: Acala. Hashirama kicks the o... Ninjutsu \n",
"\n",
" text_cleaned \n",
"0 Adamantine Sealing Chains: Spiral Formation. K... \n",
"1 Adamantine Power: Acala. Hashirama kicks the o... "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"LabelEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"LabelEncoder()"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Encode Labels \n",
"le = preprocessing.LabelEncoder()\n",
"le.fit(df[label_column_name].tolist())"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label_dict = {index:label_name for index, label_name in enumerate(le.__dict__['classes_'].tolist())}\n",
"label_dict"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"df['label'] = le.transform(df[label_column_name].tolist())"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" jutsus | \n",
" text_cleaned | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Adamantine Sealing Chains: Spiral Formation. K... | \n",
" Ninjutsu | \n",
" Adamantine Sealing Chains: Spiral Formation. K... | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" Adamantine Power: Acala. Hashirama kicks the o... | \n",
" Ninjutsu | \n",
" Adamantine Power: Acala. Hashirama kicks the o... | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Adamantine Prison Wall. After using Transforma... | \n",
" Ninjutsu | \n",
" Adamantine Prison Wall. After using Transforma... | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" Adamantine Seal: Monkey Yang Suppression. Afte... | \n",
" Ninjutsu | \n",
" Adamantine Seal: Monkey Yang Suppression. Afte... | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... | \n",
" Taijutsu | \n",
" Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text jutsus \\\n",
"0 Adamantine Sealing Chains: Spiral Formation. K... Ninjutsu \n",
"1 Adamantine Power: Acala. Hashirama kicks the o... Ninjutsu \n",
"2 Adamantine Prison Wall. After using Transforma... Ninjutsu \n",
"3 Adamantine Seal: Monkey Yang Suppression. Afte... Ninjutsu \n",
"4 Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... Taijutsu \n",
"\n",
" text_cleaned label \n",
"0 Adamantine Sealing Chains: Spiral Formation. K... 1 \n",
"1 Adamantine Power: Acala. Hashirama kicks the o... 1 \n",
"2 Adamantine Prison Wall. After using Transforma... 1 \n",
"3 Adamantine Seal: Monkey Yang Suppression. Afte... 1 \n",
"4 Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ... 2 "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"test_size = 0.2\n",
"df_train, df_test = train_test_split(df, \n",
" test_size=test_size, \n",
" stratify=df['label'],)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"jutsus\n",
"Ninjutsu 1804\n",
"Taijutsu 317\n",
"Genjutsu 81\n",
"Name: count, dtype: int64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train['jutsus'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"model_name = \"distilbert/distilbert-base-uncased\""
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/abdullah/python_virtual_env/gradio_env/lib/python3.8/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
]
}
],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(model_name)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_function(tokenizer,examples):\n",
" return tokenizer(examples['text_cleaned'],truncation=True)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0b2c1afee2ac44c987020a4fed8888b6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/2202 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ab776400265b416daf7c2b40262856a7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/551 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Conver Pandas to a hugging face dataset\n",
"train_dataset = Dataset.from_pandas(df_train)\n",
"test_dataset = Dataset.from_pandas(df_test)\n",
"\n",
"# tokenize the dataset\n",
"tokenized_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples),\n",
" batched=True)\n",
"tokenized_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples),\n",
" batched=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "gradio_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}