{ "cells": [ { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import re\n", "from datasets import Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load Dataset" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "data_path = \"../data/naruto.csv\"\n", "naruto_transcript_df = pd.read_csv(data_path)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameline
0Naruto(Laughing) Give it up. (Shows the stone faces...
1Hiruzen(Turns away from his writing) I hope you’re n...
2NinjaNaseer Sabah
3Ninjais the best person on earth
4Narutomuah
\n", "
" ], "text/plain": [ " name line\n", "0 Naruto (Laughing) Give it up. (Shows the stone faces...\n", "1 Hiruzen (Turns away from his writing) I hope you’re n...\n", "2 Ninja Naseer Sabah\n", "3 Ninja is the best person on earth\n", "4 Naruto muah" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "naruto_transcript_df.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Remove actions from transcript\n", "def remove_paranthesis(text):\n", " result = re.sub(r'\\(.*?\\)','',text)\n", " return result" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "naruto_transcript_df['line'] = naruto_transcript_df['line'].apply(remove_paranthesis)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameline
0NarutoGive it up. You’re just bent, because you d...
1HiruzenI hope you’re not bothering me with some tri...
2NinjaNaseer Sabah
3Ninjais the best person on earth
4Narutomuah
\n", "
" ], "text/plain": [ " name line\n", "0 Naruto Give it up. You’re just bent, because you d...\n", "1 Hiruzen I hope you’re not bothering me with some tri...\n", "2 Ninja Naseer Sabah\n", "3 Ninja is the best person on earth\n", "4 Naruto muah" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "naruto_transcript_df.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "naruto_transcript_df['number_of_words'] = naruto_transcript_df['line'].str.strip().str.split(\" \")\n", "naruto_transcript_df['number_of_words'] = naruto_transcript_df['number_of_words'].apply(lambda x: len(x))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelinenumber_of_words
0NarutoGive it up. You’re just bent, because you d...26
1HiruzenI hope you’re not bothering me with some tri...16
2NinjaNaseer Sabah2
3Ninjais the best person on earth6
4Narutomuah1
\n", "
" ], "text/plain": [ " name line number_of_words\n", "0 Naruto Give it up. You’re just bent, because you d... 26\n", "1 Hiruzen I hope you’re not bothering me with some tri... 16\n", "2 Ninja Naseer Sabah 2\n", "3 Ninja is the best person on earth 6\n", "4 Naruto muah 1" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "naruto_transcript_df.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "naruto_transcript_df['naruto_response_flag'] = 0\n", "naruto_transcript_df.loc[(naruto_transcript_df['name']==\"Naruto\")&(naruto_transcript_df['number_of_words']>5),'naruto_response_flag']=1" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelinenumber_of_wordsnaruto_response_flag
0NarutoGive it up. You’re just bent, because you d...261
1HiruzenI hope you’re not bothering me with some tri...160
2NinjaNaseer Sabah20
3Ninjais the best person on earth60
4Narutomuah10
...............
158IrukaCongratulations. You graduate. Naruto’s stand...270
159IrukaHuh?10
160NarutoIruka Sensei!20
161IrukaAh! That hurts!30
162IrukaNaruto. This is only the beginning. The road...380
\n", "

163 rows × 4 columns

\n", "
" ], "text/plain": [ " name line \\\n", "0 Naruto Give it up. You’re just bent, because you d... \n", "1 Hiruzen I hope you’re not bothering me with some tri... \n", "2 Ninja Naseer Sabah \n", "3 Ninja is the best person on earth \n", "4 Naruto muah \n", ".. ... ... \n", "158 Iruka Congratulations. You graduate. Naruto’s stand... \n", "159 Iruka Huh? \n", "160 Naruto Iruka Sensei! \n", "161 Iruka Ah! That hurts! \n", "162 Iruka Naruto. This is only the beginning. The road... \n", "\n", " number_of_words naruto_response_flag \n", "0 26 1 \n", "1 16 0 \n", "2 2 0 \n", "3 6 0 \n", "4 1 0 \n", ".. ... ... \n", "158 27 0 \n", "159 1 0 \n", "160 2 0 \n", "161 3 0 \n", "162 38 0 \n", "\n", "[163 rows x 4 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "naruto_transcript_df" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "indexes_to_take = list(naruto_transcript_df[(naruto_transcript_df['naruto_response_flag']==1)&(naruto_transcript_df.index>0)].index)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[6, 28, 30]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "indexes_to_take[:3]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "system_promt = \"\"\"\" Your are Naruto from the anime \"Naruto\". Your responses should reflect his personality and speech patterns \\n\"\"\"\n", "\n", "prompts = []\n", "for ind in indexes_to_take:\n", " prompt = system_promt\n", "\n", " prompt += naruto_transcript_df.iloc[ind -1]['line']\n", " prompt += '\\n'\n", " prompt += naruto_transcript_df.iloc[ind]['line']\n", " prompts.append(prompt)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\" Your are Naruto from the anime \"Naruto\". Your responses should reflect his personality and speech patterns \n", " Well, I was just thinking. Maybe after you clean this all up I can take you out for some ramen. The good stuff. What do you think? \n", " Now that’s some serious motivation! I’ll have this clean in no time!\n" ] } ], "source": [ "print(prompts[3])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prompt
0\" Your are Naruto from the anime \"Naruto\". You...
1\" Your are Naruto from the anime \"Naruto\". You...
2\" Your are Naruto from the anime \"Naruto\". You...
3\" Your are Naruto from the anime \"Naruto\". You...
4\" Your are Naruto from the anime \"Naruto\". You...
\n", "
" ], "text/plain": [ " prompt\n", "0 \" Your are Naruto from the anime \"Naruto\". You...\n", "1 \" Your are Naruto from the anime \"Naruto\". You...\n", "2 \" Your are Naruto from the anime \"Naruto\". You...\n", "3 \" Your are Naruto from the anime \"Naruto\". You...\n", "4 \" Your are Naruto from the anime \"Naruto\". You..." ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\"prompt\":prompts})\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "dataset = Dataset.from_pandas(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "gradio_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 2 }