drzeeIslam committed on
Commit 2a320fa · verified · 1 Parent(s): e13bea5

Upload train.py

Files changed (1): train.py (+40 -0)
train.py ADDED
@@ -0,0 +1,40 @@
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForMaskedLM
+
+ # Load dataset from local CSV
+ dataset = load_dataset("text", data_files="chunks.csv")
+
+ # Load tokenizer and model
+ model_checkpoint = "distilbert-base-uncased"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+ model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+
+ # Tokenize the texts
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir="./results",
+     per_device_train_batch_size=8,
+     num_train_epochs=3,
+     save_steps=500,
+     save_total_limit=2,
+     logging_steps=50,
+     push_to_hub=False
+ )
+
+ # Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets["train"],
+     tokenizer=tokenizer,
+     data_collator=data_collator
+ )
+
+ # Train the model
+ trainer.train()