from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch

# Check for GPU and set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load dataset
dataset = load_dataset("mrohith29/high-school-physics", split="train")

# Load model and tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)  # Move model to GPU/CPU

# Add a padding token if the tokenizer doesn't define one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

# Format each example as an instruction-style prompt
def format_example(question, choices, answer, explanation):
    return (
        f"### Instruction: {question}\n"
        f"### Choices: {choices}\n"
        f"### Answer: {answer}\n"
        f"### Explanation: {explanation}"
    )

# Tokenize the formatted prompts
def tokenize(examples):
    formatted_texts = [
        format_example(q, ch, a, exp)
        for q, ch, a, exp in zip(
            examples["question"],
            examples["choices"],
            examples["answer"],
            examples["explanation"],
        )
    ]
    return tokenizer(
        formatted_texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

# Collator with mlm=False copies input_ids into labels so the Trainer can compute a causal-LM loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments (tuned to the available hardware)
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4 if device == "cuda" else 2,  # Larger batches on GPU
    num_train_epochs=1,
    save_strategy="epoch",
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Mixed precision only if a GPU exists
    push_to_hub=False,
    dataloader_pin_memory=torch.cuda.is_available(),  # Pin memory only for GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

model.save_pretrained("./output")
tokenizer.save_pretrained("./output")
print(f"✅ Training complete on {device.upper()}! Model saved in ./output")
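
# --- Optional sanity check after training ---
# A minimal sketch: prompt the fine-tuned model in the same "### Instruction / ### Choices"
# format used above and print the generated continuation. The physics question below is a
# made-up example (not taken from the dataset), and generation settings are illustrative.
model.eval()
prompt = (
    "### Instruction: A ball is dropped from rest. What is its speed after 2 s (g = 9.8 m/s^2)?\n"
    "### Choices: A) 4.9 m/s B) 9.8 m/s C) 19.6 m/s D) 39.2 m/s\n"
    "### Answer:"
)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))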