from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch
# Check for GPU and set device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load dataset
dataset = load_dataset("mrohith29/high-school-physics", split="train")
# Load model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device) # Move model to GPU/CPU
# Add padding token if missing
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
# Formatting function
def format_example(question, choices, answer, explanation):
    return f"""### Instruction: {question}\n### Choices: {choices}\n### Answer: {answer}\n### Explanation: {explanation}"""
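# Resulting prompt shape (illustrative):
# "### Instruction: <question>\n### Choices: <choices>\n### Answer: <answer>\n### Explanation: <explanation>"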
# Tokenize the formatted prompts (pad/truncate to a fixed length)
def tokenize(examples):
    formatted_texts = [
        format_example(q, ch, a, exp)
        for q, ch, a, exp in zip(
            examples["question"],
            examples["choices"],
            examples["answer"],
            examples["explanation"],
        )
    ]
    return tokenizer(formatted_texts, truncation=True, padding="max_length", max_length=256)
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
# Training arguments (optimized for current hardware)
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4 if device == "cuda" else 2,  # Larger batches on GPU
    num_train_epochs=1,
    save_strategy="epoch",
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Enable mixed precision only if a GPU exists
    push_to_hub=False,
    dataloader_pin_memory=torch.cuda.is_available(),  # Pin memory only for GPU
)
# Causal-LM collator builds labels from input_ids; without it the Trainer has no loss to optimize
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()
model.save_pretrained("./output")
tokenizer.save_pretrained("./output")
print(f"✅ Training complete on {device.upper()}! Model saved in ./output")
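# Optional sanity check (a minimal sketch added for illustration; the prompt
# below is a hypothetical example question, not taken from the dataset).
prompt = "### Instruction: A ball is dropped from rest. What is its speed after 2 s?\n### Choices: [4.9 m/s, 9.8 m/s, 19.6 m/s, 2 m/s]\n### Answer:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))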