DetectiveShadow committed on
Commit
e669806
·
verified ·
1 Parent(s): bdf2090

Update train_model.py

Browse files
Files changed (1) hide show
  1. train_model.py +19 -14
train_model.py CHANGED
@@ -1,42 +1,47 @@
1
  from datasets import load_dataset
2
  from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
3
 
4
- # Load your dataset
5
- dataset = load_dataset("json", data_files="dataset.jsonl")["train"]
6
 
7
- # Load tokenizer and model
 
 
 
8
  model_name = "t5-small"
9
  tokenizer = T5Tokenizer.from_pretrained(model_name)
10
  model = T5ForConditionalGeneration.from_pretrained(model_name)
11
 
12
- # Preprocessing
13
  def tokenize(example):
14
  input_enc = tokenizer(example["input"], truncation=True, padding="max_length", max_length=64)
15
  target_enc = tokenizer(example["output"], truncation=True, padding="max_length", max_length=64)
16
  input_enc["labels"] = target_enc["input_ids"]
17
  return input_enc
18
 
19
- tokenized_data = dataset.map(tokenize)
20
 
21
- # Training arguments
22
  training_args = TrainingArguments(
23
- output_dir="./trivia-genie-t5",
24
  per_device_train_batch_size=8,
25
  num_train_epochs=3,
26
  logging_steps=10,
27
- save_total_limit=2,
28
- save_strategy="epoch"
 
 
29
  )
30
 
 
31
  trainer = Trainer(
32
  model=model,
33
  args=training_args,
34
- train_dataset=tokenized_data,
35
  )
36
 
37
- # Train
38
  trainer.train()
 
 
39
 
40
- # Optional: Push to Hugging Face Hub
41
- # model.push_to_hub("your-username/trivia-genie-t5")
42
- # tokenizer.push_to_hub("your-username/trivia-genie-t5")
 
1
  from datasets import load_dataset
2
  from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
3
 
4
# Step 1: Fetch the training split of the question dataset from the Hub.
dataset = load_dataset("DetectiveShadow/MVPQuestion")["train"]

# Optional: if the dataset's columns are not named "input"/"output",
# rename them here so the tokenization step below can find them.
# dataset = dataset.rename_columns({"your_input_column": "input", "your_output_column": "output"})

# Step 2: Pull down the pretrained T5 checkpoint and its tokenizer.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
14
 
15
# Step 3: Tokenization function
def tokenize(example):
    """Convert one dataset example into model-ready features.

    Encodes ``example["input"]`` as the encoder input and
    ``example["output"]`` as the decoder target, each truncated and
    padded to 64 tokens.

    Returns the input encoding with a ``labels`` field added.
    """
    input_enc = tokenizer(example["input"], truncation=True, padding="max_length", max_length=64)
    target_enc = tokenizer(example["output"], truncation=True, padding="max_length", max_length=64)
    # Mask padding positions in the labels with -100 so the loss ignores
    # them; otherwise the model is trained to predict <pad> tokens after
    # the real target ends, which degrades generation quality.
    pad_id = tokenizer.pad_token_id
    input_enc["labels"] = [
        (token_id if token_id != pad_id else -100)
        for token_id in target_enc["input_ids"]
    ]
    return input_enc
21
 
22
# Step 3b: Run the tokenizer over every example in the dataset.
tokenized = dataset.map(tokenize)

# Step 4: Training configuration — checkpoints once per epoch and mirrors
# every save to the Hub repository named below.
training_args = TrainingArguments(
    output_dir="./MVPTrivia",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id="DetectiveShadow/MVPTrivia",  # destination repo on the Hub
    hub_strategy="every_save",
)

# Step 5: Wire model, arguments, and data into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
)

# Step 6: Fine-tune, then upload the final model and the tokenizer so the
# Hub repo is fully self-contained for inference.
trainer.train()
trainer.push_to_hub()
tokenizer.push_to_hub("DetectiveShadow/MVPTrivia")
47