Spaces:
No application file
No application file
"""Fine-tune Phi-3 Mini on a custom instruction dataset with LoRA adapters.

The script loads a JSON Lines dataset whose records carry ``instruction`` and
``response`` fields, tokenizes each record into a fixed-length prompt, wraps
the base model with LoRA adapters and trains it with the Hugging Face
``Trainer``.
"""

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from peft import get_peft_model, LoraConfig, TaskType
import torch

model_id = "microsoft/phi-3-mini-4k-instruct"
dataset_path = "../0_data_gen/instruct_dataset.jsonl"

# Load the custom dataset; a single unnamed file ends up in the "train" split.
data = load_dataset("json", data_files=dataset_path)

# Tokenization
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    # padding="max_length" below needs a pad token; fall back to EOS when the
    # checkpoint does not define one.
    tokenizer.pad_token = tokenizer.eos_token


def tokenize(example):
    """Turn one dataset record into fixed-length causal-LM training features.

    Args:
        example: A record with ``instruction`` and ``response`` string fields.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) padded or
        truncated to 512 tokens, plus a ``labels`` list that mirrors
        ``input_ids`` with padding positions replaced by -100.
    """
    # NOTE(review): the prompt omits the `<|end|>` turn terminators that the
    # Phi-3 chat format uses -- confirm whether that is intentional.
    encoded = tokenizer(
        f"<|user|>{example['instruction']}<|assistant|>{example['response']}",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Trainer's default collator does not create labels, and without them the
    # model returns no loss. Mask by attention_mask (not by pad id) so a real
    # EOS token is still learned when pad_token == eos_token; -100 is the
    # index ignored by the cross-entropy loss.
    encoded["labels"] = [
        token_id if attended else -100
        for token_id, attended in zip(
            encoded["input_ids"], encoded["attention_mask"]
        )
    ]
    return encoded


tokenized = data["train"].map(tokenize)

# Load the base model and attach the PEFT (LoRA) adapters.
model = AutoModelForCausalLM.from_pretrained(model_id)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    # Phi-3 fuses its attention projections into `qkv_proj` / `o_proj`. They
    # are named explicitly so the run does not depend on PEFT shipping default
    # target modules for this architecture.
    target_modules=["qkv_proj", "o_proj"],
)
model = get_peft_model(model, peft_config)

# Training
training_args = TrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_total_limit=1,
    logging_steps=10,
    learning_rate=2e-4,
    # Mixed precision is only enabled when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized)
trainer.train()