import os
import json

import torch
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

# Authenticate with the Hugging Face Hub (placeholder token; prefer exporting HF_TOKEN
# in your shell environment rather than hard-coding it in the script).
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

# Load the base model and tokenizer (CodeGen-350M-multi, roughly 350M parameters),
# caching the download locally.
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)

# CodeGen's tokenizer ships without a pad token, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Train on CPU; float32 weights keep the run numerically safe without a GPU.
device = torch.device("cpu")
model.to(device)

# Load the fine-tuning data from a JSON Lines file, one record per line.
dataset_path = "./custom_dataset.jsonl"
data = []
with open(dataset_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.strip()))
dataset = Dataset.from_list(data)
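
# The tokenization step below assumes each JSONL record has "prompt" and "code" fields,
# e.g. (hypothetical example record):
#   {"prompt": "Write a function that adds two numbers", "code": "def add(a, b):\n    return a + b"}
# Optional sanity check so a schema mismatch fails early rather than inside .map():
assert {"prompt", "code"} <= set(dataset.column_names), "dataset must have 'prompt' and 'code' columns"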
|
|
# Concatenate prompt and completion into a single training sequence, then tokenize
# with fixed-length padding so examples can be batched.
def tokenize_function(examples):
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])

# Causal-LM collator: with mlm=False it copies input_ids into labels for next-token prediction.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
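
# Note: the collator masks padding positions in the labels with -100 so they are ignored by the
# loss. Because pad_token was set to eos_token above, genuine EOS tokens are masked as well;
# acceptable for a small demo, but worth knowing if generations never seem to terminate.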
|
|
# CPU-only training configuration: tiny per-device batch with gradient accumulation,
# periodic checkpointing, and full-precision arithmetic (fp16 disabled).
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    fp16=False,
    no_cuda=True,
    dataloader_pin_memory=False,
)
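
# The effective batch size is per_device_train_batch_size * gradient_accumulation_steps = 4.
# Newer transformers releases deprecate `no_cuda` in favor of `use_cpu=True`; if you see a
# deprecation warning on your version, swapping that flag should be the only change needed.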
|
|
# Minimal callback that records the training loss every time the Trainer logs.
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])


loss_callback = LossCallback()
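
# on_log fires once per logging event (every logging_steps = 100 optimizer steps here), so each
# point in the loss plot below represents 100 training steps rather than a single step.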
|
|
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)
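
# No eval_dataset is passed, so only the training loss is tracked. If you have held-out examples,
# passing eval_dataset=... here (and enabling periodic evaluation in TrainingArguments) would add
# a validation curve alongside the training loss.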
|
|
| print("Starting fine-tuning...") |
| trainer.train() |
|
|
| |
# Persist the fine-tuned weights and tokenizer alongside the Trainer checkpoints.
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")
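
# The saved directory can later be reloaded directly, e.g. (minimal sketch):
#   tokenizer = AutoTokenizer.from_pretrained("./finetuned_codegen")
#   model = AutoModelForCausalLM.from_pretrained("./finetuned_codegen")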
|
|
# Plot the recorded losses (one point per logging event) and save the figure.
plt.plot(loss_callback.losses, label="Training Loss")
plt.xlabel("Logging steps")
plt.ylabel("Loss")
plt.title("Fine-Tuning Loss Curve")
plt.legend()
plt.savefig("./finetuned_codegen/loss_plot.png")
plt.show()
|
|
| print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png") |
|
|
| |
| print("\nTesting fine-tuned model...") |
| prompts = [ |
| "Write a Python program to print 'Hello, World!'" |
| ] |
|
|
model.eval()  # switch off dropout before sampling
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")