AryanRathod3097 committed on
Commit
5c902c4
·
verified ·
1 Parent(s): b47081f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -12
app.py CHANGED
@@ -2,24 +2,27 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
2
  from datasets import load_dataset
3
  import torch
4
 
 
 
 
5
  # Load dataset
6
  dataset = load_dataset("mrohith29/high-school-physics", split="train")
7
 
8
- # Load model (TinyLlama for lightweight training)
9
  model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
10
  tokenizer = AutoTokenizer.from_pretrained(model_name)
11
- model = AutoModelForCausalLM.from_pretrained(model_name)
12
 
13
  # Add padding token if missing
14
  if tokenizer.pad_token is None:
15
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
16
  model.resize_token_embeddings(len(tokenizer))
17
 
18
- # Format a single example (modified to handle batch correctly)
19
  def format_example(question, choices, answer, explanation):
20
  return f"""### Instruction: {question}\n### Choices: {choices}\n### Answer: {answer}\n### Explanation: {explanation}"""
21
 
22
- # Tokenize the entire dataset
23
  def tokenize(examples):
24
  formatted_texts = [
25
  format_example(q, ch, a, exp)
@@ -32,30 +35,28 @@ def tokenize(examples):
32
  ]
33
  return tokenizer(formatted_texts, truncation=True, padding="max_length", max_length=256)
34
 
35
- # Apply tokenization (removes original columns)
36
  tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
37
 
38
- # Training arguments (optimized for Spaces GPU)
39
  training_args = TrainingArguments(
40
  output_dir="./output",
41
- per_device_train_batch_size=2, # Reduce if OOM errors occur
42
  num_train_epochs=1,
43
  save_strategy="epoch",
44
  logging_steps=10,
45
- fp16=True,
46
- push_to_hub=False, # Set to True to upload to your HF Hub
 
47
  )
48
 
49
- # Trainer
50
  trainer = Trainer(
51
  model=model,
52
  args=training_args,
53
  train_dataset=tokenized_dataset,
54
  )
55
 
56
- # Train and save
57
  trainer.train()
58
  model.save_pretrained("./output")
59
  tokenizer.save_pretrained("./output")
60
 
61
- print("✅ Training complete! Model saved in ./output")
 
2
  from datasets import load_dataset
3
  import torch
4
 
5
+ # Check for GPU and set device
6
+ device = "cuda" if torch.cuda.is_available() else "cpu"
7
+
8
  # Load dataset
9
  dataset = load_dataset("mrohith29/high-school-physics", split="train")
10
 
11
+ # Load model
12
  model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
13
  tokenizer = AutoTokenizer.from_pretrained(model_name)
14
+ model = AutoModelForCausalLM.from_pretrained(model_name).to(device) # Move model to GPU/CPU
15
 
16
  # Add padding token if missing
17
  if tokenizer.pad_token is None:
18
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
19
  model.resize_token_embeddings(len(tokenizer))
20
 
21
+ # Formatting function
22
  def format_example(question, choices, answer, explanation):
23
  return f"""### Instruction: {question}\n### Choices: {choices}\n### Answer: {answer}\n### Explanation: {explanation}"""
24
 
25
+ # Tokenization with automatic device handling
26
  def tokenize(examples):
27
  formatted_texts = [
28
  format_example(q, ch, a, exp)
 
35
  ]
36
  return tokenizer(formatted_texts, truncation=True, padding="max_length", max_length=256)
37
 
 
38
  tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
39
 
40
+ # Training arguments (optimized for current hardware)
41
  training_args = TrainingArguments(
42
  output_dir="./output",
43
+ per_device_train_batch_size=4 if device == "cuda" else 2, # Larger batches on GPU
44
  num_train_epochs=1,
45
  save_strategy="epoch",
46
  logging_steps=10,
47
+ fp16=torch.cuda.is_available(), # Enable only if GPU exists
48
+ push_to_hub=False,
49
+ dataloader_pin_memory=torch.cuda.is_available(), # Pin memory only for GPU
50
  )
51
 
 
52
  trainer = Trainer(
53
  model=model,
54
  args=training_args,
55
  train_dataset=tokenized_dataset,
56
  )
57
 
 
58
  trainer.train()
59
  model.save_pretrained("./output")
60
  tokenizer.save_pretrained("./output")
61
 
62
+ print(f"✅ Training complete on {device.upper()}! Model saved in ./output")