from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch

# Check for GPU and set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load dataset
dataset = load_dataset("mrohith29/high-school-physics", split="train")
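# Expected columns (consumed by format_example below): "question", "choices",
# "answer", "explanation". Adjust the field names if the dataset schema differs.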

# Load model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)  # Move model to GPU/CPU

# Add padding token if missing
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Formatting function
def format_example(question, choices, answer, explanation):
    return f"""### Instruction: {question}\n### Choices: {choices}\n### Answer: {answer}\n### Explanation: {explanation}"""

# Tokenize the formatted prompts (pad/truncate to a fixed length)
def tokenize(examples):
    formatted_texts = [
        format_example(q, ch, a, exp)
        for q, ch, a, exp in zip(
            examples["question"],
            examples["choices"],
            examples["answer"],
            examples["explanation"]
        )
    ]
    return tokenizer(formatted_texts, truncation=True, padding="max_length", max_length=256)

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

# Training arguments (optimized for current hardware)
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4 if device == "cuda" else 2,  # Larger batches on GPU
    num_train_epochs=1,
    save_strategy="epoch",
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Enable only if GPU exists
    push_to_hub=False,
    dataloader_pin_memory=torch.cuda.is_available(),  # Pin memory only for GPU
)

# The collator copies input_ids into labels (mlm=False), which the Trainer
# needs to compute a causal-LM loss; without it, training fails for lack of labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()
model.save_pretrained("./output")
tokenizer.save_pretrained("./output")

print(f"βœ… Training complete on {device.upper()}! Model saved in ./output")