# (Hugging Face Spaces page residue — Space status: Sleeping)
| import pandas as pd | |
| import torch | |
| import re | |
| import huggingface_hub | |
| from datasets import Dataset | |
| import transformers | |
| from transformers import ( | |
| BitsAndBytesConfig, | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| ) | |
| from peft import LoraConfig, PeftModel | |
| from trl import SFTConfig, SFTTrainer | |
| import gc | |
def remove_paranthesis(text):
    """Return *text* with every parenthesised span removed.

    Transcript lines embed stage directions such as "(laughs)"; this strips
    them (non-greedily, parentheses included) before the text is used for
    training prompts.
    """
    return re.sub(r'\(.*?\)', '', text)
class CharacterChatBot():
    """Chat bot that role-plays Naruto, backed by a LoRA-fine-tuned Llama-3.

    On construction it either loads an already-fine-tuned model from the
    Hugging Face Hub (repo ``model_path``) or, if that repo does not exist,
    fine-tunes the base model on the transcript CSV at ``data_path`` and
    pushes the result to the Hub before loading it.
    """

    # Single source of truth for the persona prompt, shared by training and
    # chat (the original code had two slightly different, mis-quoted copies).
    SYSTEM_PROMPT = """You are Naruto from the anime "Naruto". Your responses should reflect his personality and speech patterns \n"""

    def __init__(self,
                 model_path,
                 data_path="/content/data/naruto.csv",
                 huggingface_token=None,
                 ):
        """
        Args:
            model_path: Hub repo id of the fine-tuned model (loaded if it
                exists, otherwise trained and pushed there).
            data_path: Path to the transcript CSV; expected to have at least
                ``name`` and ``line`` columns.
            huggingface_token: Optional HF token, required for the gated
                Llama-3 base model and for pushing to the Hub.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.huggingface_token = huggingface_token
        self.base_model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        if self.huggingface_token is not None:
            huggingface_hub.login(self.huggingface_token)

        if huggingface_hub.repo_exists(self.model_path):
            self.model = self.load_model(self.model_path)
        else:
            print("Model not found in the Hugging Face Hub; we will train our own model")
            train_dataset = self.load_data()
            self.train(self.base_model_path, train_dataset)
            self.model = self.load_model(self.model_path)

    def chat(self, message, history):
        """Generate an in-character reply to *message*.

        Args:
            message: The new user message.
            history: Prior turns as (user_message, assistant_reply) pairs
                (Gradio-style chat history).

        Returns:
            The assistant's reply — the last message dict produced by the
            text-generation pipeline.
        """
        messages = [{"role": "system", "content": self.SYSTEM_PROMPT}]
        for user_turn, assistant_turn in history:
            messages.append({"role": "user", "content": user_turn})
            messages.append({"role": "assistant", "content": assistant_turn})
        messages.append({"role": "user", "content": message})

        # Stop on either the tokenizer's EOS or Llama-3's end-of-turn token.
        terminators = [
            self.model.tokenizer.eos_token_id,
            self.model.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]
        output = self.model(
            messages,
            max_length=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
        # The pipeline echoes the whole conversation; the last entry is the
        # newly generated assistant message.
        return output[0]['generated_text'][-1]

    def load_model(self, model_path):
        """Load *model_path* as a 4-bit (NF4) quantized text-generation pipeline."""
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        pipeline = transformers.pipeline(
            "text-generation",
            model=model_path,
            model_kwargs={
                "torch_dtype": torch.float16,
                "quantization_config": bnb_config,
            },
        )
        return pipeline

    def train(self,
              base_model_name_or_path,
              dataset,
              output_dir="./results",
              per_device_train_batch_size=1,
              gradient_accumulation_steps=1,
              optim="paged_adamw_32bit",
              save_steps=200,
              logging_steps=10,
              learning_rate=2e-4,
              max_grad_norm=0.3,
              max_steps=300,
              warmup_ratio=0.3,
              lr_scheduler_type="constant",
              ):
        """LoRA-fine-tune the base model on *dataset* and push the adapter
        (merged onto a fresh base model) plus tokenizer to ``self.model_path``.

        Args:
            base_model_name_or_path: HF id or local path of the base model.
            dataset: `datasets.Dataset` with a ``prompt`` text column.
            Remaining keyword arguments are forwarded to `SFTConfig`.
        """
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name_or_path,
            quantization_config=bnb_config,
            trust_remote_code=True,
        )
        model.config.use_cache = False  # cache is useless (and noisy) during training

        tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)
        tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default

        lora_alpha = 16
        lora_dropout = 0.1
        lora_r = 64
        peft_config = LoraConfig(
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            r=lora_r,
            bias="none",
            task_type="CAUSAL_LM",  # was misspelled "CASUAL_LM", which peft rejects
        )

        training_arguments = SFTConfig(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            save_steps=save_steps,
            logging_steps=logging_steps,
            learning_rate=learning_rate,
            fp16=True,
            max_grad_norm=max_grad_norm,
            max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=True,
            lr_scheduler_type=lr_scheduler_type,
            report_to="none",
        )

        max_seq_len = 512
        trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            peft_config=peft_config,
            dataset_text_field="prompt",
            max_seq_length=max_seq_len,
            tokenizer=tokenizer,
            args=training_arguments,
        )
        trainer.train()

        # Save the adapter + tokenizer locally before freeing GPU memory.
        trainer.model.save_pretrained("final_ckpt")
        tokenizer.save_pretrained("final_ckpt")

        # Flush memory so the fresh base model below fits on the device.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name_or_path,
            return_dict=True,
            quantization_config=bnb_config,
            torch_dtype=torch.float16,
            device_map=self.device,
        )
        tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)
        model = PeftModel.from_pretrained(base_model, "final_ckpt")
        model.push_to_hub(self.model_path)
        tokenizer.push_to_hub(self.model_path)

        # Flush memory again before the caller reloads the pushed model.
        del model, base_model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def load_data(self):
        """Build the SFT dataset of (previous line -> Naruto response) prompts.

        Keeps Naruto lines longer than five words, pairs each with the line
        that precedes it, and prefixes the system prompt.
        """
        naruto_transcript_df = pd.read_csv(self.data_path)
        # reset_index after dropna so the positional iloc lookups below line
        # up with the (filtered) index labels — without it, any dropped row
        # makes label-based `index > 0` and position-based `iloc` disagree.
        naruto_transcript_df = naruto_transcript_df.dropna().reset_index(drop=True)
        naruto_transcript_df['line'] = naruto_transcript_df['line'].apply(remove_paranthesis)
        naruto_transcript_df['number_of_words'] = naruto_transcript_df['line'].str.strip().str.split(" ")
        naruto_transcript_df['number_of_words'] = naruto_transcript_df['number_of_words'].apply(lambda x: len(x))

        naruto_transcript_df['naruto_response_flag'] = 0
        naruto_transcript_df.loc[
            (naruto_transcript_df['name'] == "Naruto") & (naruto_transcript_df['number_of_words'] > 5),
            'naruto_response_flag'
        ] = 1

        # Skip row 0: it has no preceding line to use as context.
        indexes_to_take = list(
            naruto_transcript_df[
                (naruto_transcript_df['naruto_response_flag'] == 1) & (naruto_transcript_df.index > 0)
            ].index
        )

        prompts = []
        for ind in indexes_to_take:
            prompt = self.SYSTEM_PROMPT
            prompt += naruto_transcript_df.iloc[ind - 1]['line']
            prompt += '\n'
            prompt += naruto_transcript_df.iloc[ind]['line']
            prompts.append(prompt)

        df = pd.DataFrame({"prompt": prompts})
        dataset = Dataset.from_pandas(df)
        return dataset