prelington committed on
Commit
dc44a31
·
verified ·
1 Parent(s): 0332dbf

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +104 -0
train.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datasets import Dataset
3
+ from transformers import (
4
+ AutoTokenizer,
5
+ AutoModelForSequenceClassification,
6
+ Trainer,
7
+ TrainingArguments
8
+ )
9
+ import evaluate
10
+ import numpy as np
11
+ import os
12
+
13
# =============== CONFIG =================
MODEL_NAME = "bert-base-uncased"     # base checkpoint to fine-tune
HUB_MODEL_ID = "your-username/acoli" # Hub repo the trained model is pushed to
DATA_PATH = "../dataset/test.json"   # JSON list of examples (presumably {"text", "label"} — verify)
OUTPUT_DIR = "./acoli_model"         # local checkpoint / output directory
NUM_EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
SEED = 42                            # fixed seed for reproducible split + training
# ========================================
23
+
24
# 1️⃣ Load dataset
print("🔹 Loading dataset...")
with open(DATA_PATH, encoding="utf-8") as f:  # "r" is the default mode
    raw_data = json.load(f)                   # expects a JSON array of example records
dataset = Dataset.from_list(raw_data)

# Hold out 20% of the examples as a validation split (seeded for reproducibility).
dataset = dataset.train_test_split(test_size=0.2, seed=SEED)
32
+
33
# 2️⃣ Tokenizer and model
print("🔹 Loading tokenizer and model...")
# Classification head with 3 output labels on top of the pretrained encoder;
# the two loads are independent, order does not matter.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
37
+
38
# 3️⃣ Tokenization
def tokenize_function(batch):
    """Tokenize a batch of texts, truncating and padding to the model max length."""
    return tokenizer(batch["text"], truncation=True, padding="max_length")

print("🔹 Tokenizing data...")
# Trainer expects the target column to be named "labels", hence the rename.
tokenized_datasets = dataset.map(tokenize_function, batched=True).rename_column(
    "label", "labels"
)
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
46
+
47
# 4️⃣ Metrics
print("🔹 Preparing metrics...")
accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for a (logits, labels) eval pair."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)  # class index with the highest logit
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1_metric.compute(
            predictions=preds, references=labels, average="macro"
        )["f1"],
    }
58
+
59
# 5️⃣ Training arguments
print("🔹 Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    seed=SEED,
    # Optimization schedule.
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; keep only the 2 newest checkpoints.
    # NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in
    # transformers >= 4.46 — confirm the installed version accepts this name.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_strategy="steps",
    logging_steps=20,
    # Restore the checkpoint with the best validation accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Publish checkpoints to the Hugging Face Hub; no experiment trackers.
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    report_to="none",
)
81
+
82
# 6️⃣ Trainer setup
print("🔹 Initializing Trainer...")
# Wire the model, the tokenized train/validation splits, and the metric
# function into a Trainer. Passing the tokenizer lets it be saved/pushed
# alongside the model.
# NOTE(review): `tokenizer=` is deprecated in favor of `processing_class=`
# in recent transformers releases — confirm against the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
92
+
93
# 7️⃣ Start training
print("🚀 Training started...")
trainer.train()

# 8️⃣ Save and push to Hugging Face Hub
print("🔹 Saving model locally...")
trainer.save_model(OUTPUT_DIR)  # writes model weights + config to OUTPUT_DIR

print("🔹 Pushing model to Hugging Face Hub...")
trainer.push_to_hub()  # uploads to HUB_MODEL_ID (push_to_hub=True in the args)

print("✅ Training complete! Model pushed to Hugging Face successfully.")