Saad381 committed on
Commit
dade59d
·
verified ·
1 Parent(s): 1abae7c

Upload train.py

Browse files
Files changed (1) hide show
  1. train.py +40 -0
train.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
2
+ from datasets import load_dataset
3
+
4
# Load the model and its tokenizer from the same checkpoint.
MODEL_ID = "Saad381/SpectraGen"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# The tokenization step below pads every example, which raises if the tokenizer
# has no padding token. Fall back to the end-of-sequence token in that case;
# a tokenizer that already defines a pad token is left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
7
+
8
# Load your dataset (CSV file assumed here).
# A bare `data_files` path is loaded entirely into a single "train" split, so
# the "test" split used for evaluation further down has to be created
# explicitly. Hold out 10% of the rows, with a fixed seed so the split is
# reproducible between runs.
raw_dataset = load_dataset('csv', data_files='dataset.csv')
dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=42)
10
+
11
# Tokenize your dataset
def tokenize_function(examples):
    """Tokenize a batch of rows and attach causal-LM training labels.

    Args:
        examples: A batch of dataset rows; its "text" entry is passed to the
            tokenizer.

    Returns:
        The tokenizer output with an extra "labels" entry. Labels mirror
        "input_ids" (the causal-LM model shifts them internally), except that
        padded positions are set to -100 so they are ignored by the loss.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True)

    # Without a "labels" column the model produces no loss and Trainer.train()
    # fails, so derive the labels here rather than relying on a data collator.
    if "attention_mask" in encoded:
        encoded["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
        ]
    else:
        # No mask available to tell padding apart; train on every position.
        encoded["labels"] = [list(ids) for ids in encoded["input_ids"]]
    return encoded
14
+
15
# Apply the tokenizer to the loaded dataset, handing it batches of rows
# instead of one example at a time.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
16
+
17
# Training configuration, gathered in one mapping and expanded into
# TrainingArguments below.
training_config = {
    "output_dir": './results',        # where checkpoints and outputs are written
    "num_train_epochs": 3,            # passes over the training split
    "per_device_train_batch_size": 8, # examples per device per step
    # NOTE(review): newer transformers releases rename this option to
    # `eval_strategy` -- confirm against the installed version.
    "evaluation_strategy": "epoch",   # run evaluation after every epoch
    "save_steps": 10_000,             # step interval between checkpoints
    "save_total_limit": 2,            # keep at most this many checkpoints
}
training_args = TrainingArguments(**training_config)
26
+
27
# Build the Trainer from the model, the training arguments and the two splits.
# Both a "train" and a "test" split must be present in tokenized_datasets.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)
34
+
35
# Run the training loop.
trainer.train()

# Write the model weights and the tokenizer files to the same directory.
trained_model_dir = './trained_model'
model.save_pretrained(trained_model_dir)
tokenizer.save_pretrained(trained_model_dir)