{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"from datasets import Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Dataset"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Location of the transcript CSV, relative to this notebook.\n",
"data_path = \"../data/naruto.csv\"\n",
"naruto_transcript_df = pd.read_csv(filepath_or_buffer=data_path)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" line | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Naruto | \n",
" (Laughing) Give it up. (Shows the stone faces... | \n",
"
\n",
" \n",
" | 1 | \n",
" Hiruzen | \n",
" (Turns away from his writing) I hope you’re n... | \n",
"
\n",
" \n",
" | 2 | \n",
" Ninja | \n",
" Naseer Sabah | \n",
"
\n",
" \n",
" | 3 | \n",
" Ninja | \n",
" is the best person on earth | \n",
"
\n",
" \n",
" | 4 | \n",
" Naruto | \n",
" muah | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name line\n",
"0 Naruto (Laughing) Give it up. (Shows the stone faces...\n",
"1 Hiruzen (Turns away from his writing) I hope you’re n...\n",
"2 Ninja Naseer Sabah\n",
"3 Ninja is the best person on earth\n",
"4 Naruto muah"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"naruto_transcript_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Remove actions from transcript\n",
"def remove_paranthesis(text):\n",
"    \"\"\"Return ``text`` with every parenthesised span deleted.\n",
"\n",
"    Each match runs from a ``(`` to the nearest following ``)`` on the same\n",
"    line (``.`` does not match newlines here). Whitespace around a removed\n",
"    span is left exactly as it was.\n",
"    \"\"\"\n",
"    action_pattern = re.compile(r'\\(.*?\\)')\n",
"    return action_pattern.sub('', text)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Run remove_paranthesis over every entry of the 'line' column.\n",
"naruto_transcript_df['line'] = naruto_transcript_df['line'].map(remove_paranthesis)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" line | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Naruto | \n",
" Give it up. You’re just bent, because you d... | \n",
"
\n",
" \n",
" | 1 | \n",
" Hiruzen | \n",
" I hope you’re not bothering me with some tri... | \n",
"
\n",
" \n",
" | 2 | \n",
" Ninja | \n",
" Naseer Sabah | \n",
"
\n",
" \n",
" | 3 | \n",
" Ninja | \n",
" is the best person on earth | \n",
"
\n",
" \n",
" | 4 | \n",
" Naruto | \n",
" muah | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name line\n",
"0 Naruto Give it up. You’re just bent, because you d...\n",
"1 Hiruzen I hope you’re not bothering me with some tri...\n",
"2 Ninja Naseer Sabah\n",
"3 Ninja is the best person on earth\n",
"4 Naruto muah"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"naruto_transcript_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Count words by splitting on runs of whitespace rather than on a single\n",
"# space: a single-space split yields an empty token for every doubled space\n",
"# left behind when a parenthesised span is removed mid-sentence, and reports\n",
"# 1 word for an empty line. Whitespace splitting ignores leading/trailing\n",
"# blanks and gives 0 for an empty line.\n",
"naruto_transcript_df['number_of_words'] = (\n",
"    naruto_transcript_df['line'].str.split().str.len()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" line | \n",
" number_of_words | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Naruto | \n",
" Give it up. You’re just bent, because you d... | \n",
" 26 | \n",
"
\n",
" \n",
" | 1 | \n",
" Hiruzen | \n",
" I hope you’re not bothering me with some tri... | \n",
" 16 | \n",
"
\n",
" \n",
" | 2 | \n",
" Ninja | \n",
" Naseer Sabah | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" Ninja | \n",
" is the best person on earth | \n",
" 6 | \n",
"
\n",
" \n",
" | 4 | \n",
" Naruto | \n",
" muah | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name line number_of_words\n",
"0 Naruto Give it up. You’re just bent, because you d... 26\n",
"1 Hiruzen I hope you’re not bothering me with some tri... 16\n",
"2 Ninja Naseer Sabah 2\n",
"3 Ninja is the best person on earth 6\n",
"4 Naruto muah 1"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"naruto_transcript_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# 1 for rows where name is \"Naruto\" and the line has more than 5 words, 0 otherwise.\n",
"is_naruto = naruto_transcript_df['name'] == \"Naruto\"\n",
"is_long_line = naruto_transcript_df['number_of_words'] > 5\n",
"naruto_transcript_df['naruto_response_flag'] = (is_naruto & is_long_line).astype('int64')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" line | \n",
" number_of_words | \n",
" naruto_response_flag | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Naruto | \n",
" Give it up. You’re just bent, because you d... | \n",
" 26 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" Hiruzen | \n",
" I hope you’re not bothering me with some tri... | \n",
" 16 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Ninja | \n",
" Naseer Sabah | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" Ninja | \n",
" is the best person on earth | \n",
" 6 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" Naruto | \n",
" muah | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 158 | \n",
" Iruka | \n",
" Congratulations. You graduate. Naruto’s stand... | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" | 159 | \n",
" Iruka | \n",
" Huh? | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 160 | \n",
" Naruto | \n",
" Iruka Sensei! | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" | 161 | \n",
" Iruka | \n",
" Ah! That hurts! | \n",
" 3 | \n",
" 0 | \n",
"
\n",
" \n",
" | 162 | \n",
" Iruka | \n",
" Naruto. This is only the beginning. The road... | \n",
" 38 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
163 rows × 4 columns
\n",
"
"
],
"text/plain": [
" name line \\\n",
"0 Naruto Give it up. You’re just bent, because you d... \n",
"1 Hiruzen I hope you’re not bothering me with some tri... \n",
"2 Ninja Naseer Sabah \n",
"3 Ninja is the best person on earth \n",
"4 Naruto muah \n",
".. ... ... \n",
"158 Iruka Congratulations. You graduate. Naruto’s stand... \n",
"159 Iruka Huh? \n",
"160 Naruto Iruka Sensei! \n",
"161 Iruka Ah! That hurts! \n",
"162 Iruka Naruto. This is only the beginning. The road... \n",
"\n",
" number_of_words naruto_response_flag \n",
"0 26 1 \n",
"1 16 0 \n",
"2 2 0 \n",
"3 6 0 \n",
"4 1 0 \n",
".. ... ... \n",
"158 27 0 \n",
"159 1 0 \n",
"160 2 0 \n",
"161 3 0 \n",
"162 38 0 \n",
"\n",
"[163 rows x 4 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"naruto_transcript_df"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Index labels of flagged rows, keeping only labels greater than 0.\n",
"has_response = naruto_transcript_df['naruto_response_flag'] == 1\n",
"positive_label = naruto_transcript_df.index > 0\n",
"indexes_to_take = naruto_transcript_df.loc[has_response & positive_label].index.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[6, 28, 30]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indexes_to_take[:3]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Persona instruction prepended to every prompt. The literal must open with\n",
"# exactly three quotes: a fourth one becomes a stray '\"' at the start of\n",
"# every prompt.\n",
"system_promt = \"\"\" You are Naruto from the anime \"Naruto\". Your responses should reflect his personality and speech patterns \\n\"\"\"\n",
"\n",
"# Each prompt is: the system prompt, the line of the row just before a\n",
"# flagged row, a newline, then the flagged row's own line.\n",
"# NOTE: indexes_to_take holds index labels but is used positionally via\n",
"# .iloc; the two agree only while the frame keeps the default RangeIndex\n",
"# it gets from read_csv.\n",
"transcript_lines = naruto_transcript_df['line']\n",
"prompts = [\n",
"    system_promt + transcript_lines.iloc[ind - 1] + '\\n' + transcript_lines.iloc[ind]\n",
"    for ind in indexes_to_take\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\" Your are Naruto from the anime \"Naruto\". Your responses should reflect his personality and speech patterns \n",
" Well, I was just thinking. Maybe after you clean this all up I can take you out for some ramen. The good stuff. What do you think? \n",
" Now that’s some serious motivation! I’ll have this clean in no time!\n"
]
}
],
"source": [
"print(prompts[3])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" prompt | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" \" Your are Naruto from the anime \"Naruto\". You... | \n",
"
\n",
" \n",
" | 1 | \n",
" \" Your are Naruto from the anime \"Naruto\". You... | \n",
"
\n",
" \n",
" | 2 | \n",
" \" Your are Naruto from the anime \"Naruto\". You... | \n",
"
\n",
" \n",
" | 3 | \n",
" \" Your are Naruto from the anime \"Naruto\". You... | \n",
"
\n",
" \n",
" | 4 | \n",
" \" Your are Naruto from the anime \"Naruto\". You... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" prompt\n",
"0 \" Your are Naruto from the anime \"Naruto\". You...\n",
"1 \" Your are Naruto from the anime \"Naruto\". You...\n",
"2 \" Your are Naruto from the anime \"Naruto\". You...\n",
"3 \" Your are Naruto from the anime \"Naruto\". You...\n",
"4 \" Your are Naruto from the anime \"Naruto\". You..."
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-column frame holding the assembled prompts.\n",
"df = pd.DataFrame(prompts, columns=[\"prompt\"])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Wrap the prompt frame in a datasets.Dataset.\n",
"dataset = Dataset.from_pandas(df=df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "gradio_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}