Spaces:

AryanRathod3097
/

high-school-physics

Runtime error

App Files Files Community

AryanRathod3097 committed on Jul 18, 2025

Commit

5c902c4

verified ·

1 Parent(s): b47081f

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -12

app.py CHANGED Viewed

@@ -2,24 +2,27 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
 from datasets import load_dataset
 import torch
 # Load dataset
 dataset = load_dataset("mrohith29/high-school-physics", split="train")
-# Load model (TinyLlama for lightweight training)
 model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)
 # Add padding token if missing
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
     model.resize_token_embeddings(len(tokenizer))
-# Format a single example (modified to handle batch correctly)
 def format_example(question, choices, answer, explanation):
     return f"""### Instruction: {question}\n### Choices: {choices}\n### Answer: {answer}\n### Explanation: {explanation}"""
-# Tokenize the entire dataset
 def tokenize(examples):
     formatted_texts = [
         format_example(q, ch, a, exp)
@@ -32,30 +35,28 @@ def tokenize(examples):
     ]
     return tokenizer(formatted_texts, truncation=True, padding="max_length", max_length=256)
-# Apply tokenization (removes original columns)
 tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
-# Training arguments (optimized for Spaces GPU)
 training_args = TrainingArguments(
     output_dir="./output",
-    per_device_train_batch_size=2,  # Reduce if OOM errors occur
     num_train_epochs=1,
     save_strategy="epoch",
     logging_steps=10,
-    fp16=True,
-    push_to_hub=False,  # Set to True to upload to your HF Hub
 )
-# Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_dataset,
 )
-# Train and save
 trainer.train()
 model.save_pretrained("./output")
 tokenizer.save_pretrained("./output")
-print("✅ Training complete! Model saved in ./output")

 from datasets import load_dataset
 import torch
+# Check for GPU and set device
+device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load dataset
 dataset = load_dataset("mrohith29/high-school-physics", split="train")
+# Load model
 model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name).to(device)  # Move model to GPU/CPU
 # Add padding token if missing
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
     model.resize_token_embeddings(len(tokenizer))
+# Formatting function
 def format_example(question, choices, answer, explanation):
     return f"""### Instruction: {question}\n### Choices: {choices}\n### Answer: {answer}\n### Explanation: {explanation}"""
+# Tokenize the formatted examples, truncating/padding each to 256 tokens
 def tokenize(examples):
     formatted_texts = [
         format_example(q, ch, a, exp)
     ]
     return tokenizer(formatted_texts, truncation=True, padding="max_length", max_length=256)
 tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
+# Training arguments (optimized for current hardware)
 training_args = TrainingArguments(
     output_dir="./output",
+    per_device_train_batch_size=4 if device == "cuda" else 2,  # Larger batches on GPU
     num_train_epochs=1,
     save_strategy="epoch",
     logging_steps=10,
+    fp16=torch.cuda.is_available(),  # Enable only if GPU exists
+    push_to_hub=False,
+    dataloader_pin_memory=torch.cuda.is_available(),  # Pin memory only for GPU
 )
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_dataset,
 )
 trainer.train()
 model.save_pretrained("./output")
 tokenizer.save_pretrained("./output")
+print(f"✅ Training complete on {device.upper()}! Model saved in ./output")