Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer | |
| # Config | |
| model_name = "google/flan-t5-small" | |
| data_path = "data/final_coding_dataset.jsonl" | |
| # Load dataset | |
| dataset = load_dataset("json", data_files=data_path, split="train") | |
| # Format data for T5 | |
| def format_example(example): | |
| return { | |
| "input_text": f"Question: {example['prompt']}", | |
| "target_text": example["completion"] | |
| } | |
| dataset = dataset.map(format_example) | |
| # Tokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| def tokenize(batch): | |
| input_enc = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512) | |
| target_enc = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128) | |
| input_enc["labels"] = target_enc["input_ids"] | |
| return input_enc | |
| dataset = dataset.map(tokenize, batched=True) | |
| dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"]) | |
| # Load model | |
| model = AutoModelForSeq2SeqLM.from_pretrained(model_name) | |
| # Training args | |
| training_args = TrainingArguments( | |
| output_dir="model/codementor-flan", | |
| num_train_epochs=6, # use epochs here | |
| per_device_train_batch_size=2, | |
| gradient_accumulation_steps=2, | |
| save_steps=100, | |
| save_total_limit=2, | |
| logging_steps=100, | |
| report_to="none", | |
| fp16=False | |
| ) | |
| # Trainer | |
| trainer = Trainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=dataset, | |
| tokenizer=tokenizer | |
| ) | |
| # Train | |
| trainer.train() | |
| # Save final model | |
| model.save_pretrained("model/codementor-flan") | |
| tokenizer.save_pretrained("model/codementor-flan") | |