# Supervised fine-tuning (SFT) of a causal LM with LoRA adapters.
import json

import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the SFT dataset: one JSON object per line. Each record is expected to
# carry "prompt" and "output" keys (consumed by tokenize() further down —
# TODO confirm against the actual data file).
# Use a context manager so the file handle is closed deterministically, and
# an explicit encoding so the read does not depend on the platform default.
with open("data/sft_data.jsonl", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]
dataset = Dataset.from_list(data)
# Load base model & tokenizer
base_model = "meta-llama/Llama-2-7b-hf"  # Or use Mistral, Falcon, etc.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

# BUGFIX: Llama tokenizers ship without a pad token, so the later
# tokenizer(..., padding="max_length") call would raise a ValueError.
# Reusing EOS as the pad token is the standard workaround for causal-LM
# fine-tuning; the guard keeps this safe for models that already define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16)
# Keep the model config consistent with the tokenizer's padding choice.
model.config.pad_token_id = tokenizer.pad_token_id

# Attach LoRA adapters (parameter-efficient fine-tuning): only low-rank
# updates on the attention q/v projections are trained; the base weights
# stay frozen.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(model, lora_config)
# Preprocessing
def tokenize(example):
    """Format one SFT record as an instruction/response prompt and tokenize it.

    The tokenizer's EOS token is appended after the response — without it the
    model never sees an end-of-answer signal during training, and generations
    at inference time tend not to stop.
    """
    prompt = (
        f"### Instruction:\n{example['prompt']}\n\n"
        f"### Response:\n{example['output']}{tokenizer.eos_token}"
    )
    return tokenizer(prompt, truncation=True, max_length=512, padding="max_length")


# Replace the raw text columns with the tokenized model inputs.
dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
# Training setup: 3 epochs of mixed-precision training, checkpointing once
# per epoch, loss logged every 20 steps to TensorBoard. No eval loop is run
# (no validation split is provided).
training_args = TrainingArguments(
    output_dir="./sft-model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=20,
    evaluation_strategy="no",
    save_strategy="epoch",
    report_to="tensorboard",
)

# mlm=False -> causal-LM collation: labels are the input ids (shifted
# internally by the model), not masked-token targets.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collator,
)
trainer.train()