prelington committed on
Commit
5089144
·
verified ·
1 Parent(s): 7308396

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +107 -111
train.py CHANGED
@@ -1,115 +1,111 @@
1
- import json
2
- from datasets import Dataset
3
  from transformers import (
4
- AutoTokenizer,
5
  AutoModelForSequenceClassification,
 
6
  Trainer,
7
- TrainingArguments,
8
- )
9
- import evaluate
10
- import numpy as np
11
- import os
12
- from datetime import datetime
13
-
14
- # === CONFIG ===
15
- MODEL_NAME = "bert-base-uncased"
16
- MODEL_ID = "prelington/acoli"
17
- DATASET_PATH = "../dataset/test.json"
18
- OUTPUT_DIR = "./acoli_model"
19
- LABELS = ["negative", "neutral", "positive"]
20
-
21
- # === LOAD DATASET ===
22
- print("[INFO] Loading dataset from:", DATASET_PATH)
23
- if not os.path.exists(DATASET_PATH):
24
- raise FileNotFoundError(f"Dataset not found at {DATASET_PATH}")
25
-
26
- with open(DATASET_PATH, "r", encoding="utf-8") as f:
27
- data = json.load(f)
28
-
29
- if not isinstance(data, list):
30
- raise ValueError("Dataset must be a list of samples!")
31
-
32
- dataset = Dataset.from_list(data)
33
- dataset = dataset.train_test_split(test_size=0.25, seed=42)
34
- print("[INFO] Dataset loaded successfully!")
35
- print(dataset)
36
-
37
- # === TOKENIZER AND MODEL ===
38
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
39
- model = AutoModelForSequenceClassification.from_pretrained(
40
- MODEL_NAME, num_labels=len(LABELS)
41
  )
42
-
43
- # === PREPROCESSING ===
44
- def preprocess(batch):
45
- enc = tokenizer(
46
- batch["text"],
47
- truncation=True,
48
- padding="max_length",
49
- max_length=128,
50
- )
51
- enc["labels"] = [LABELS.index(label) for label in batch["label"]]
52
- return enc
53
-
54
- print("[INFO] Tokenizing dataset...")
55
- tokenized = dataset.map(preprocess, batched=True)
56
- tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
57
-
58
- # === METRICS ===
59
- accuracy = evaluate.load("accuracy")
60
- f1 = evaluate.load("f1")
61
-
62
- def compute_metrics(eval_pred):
63
- logits, labels = eval_pred
64
- predictions = np.argmax(logits, axis=-1)
65
- acc = accuracy.compute(predictions=predictions, references=labels)
66
- f1_score = f1.compute(predictions=predictions, references=labels, average="macro")
67
- return {"accuracy": acc["accuracy"], "f1": f1_score["f1"]}
68
-
69
- # === TRAINING ARGUMENTS ===
70
- time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
71
-
72
- training_args = TrainingArguments(
73
- output_dir=f"{OUTPUT_DIR}-{time_stamp}",
74
- evaluation_strategy="epoch",
75
- save_strategy="epoch",
76
- logging_dir=f"./logs_{time_stamp}",
77
- logging_strategy="steps",
78
- logging_steps=10,
79
- num_train_epochs=3,
80
- per_device_train_batch_size=4,
81
- per_device_eval_batch_size=4,
82
- warmup_ratio=0.1,
83
- learning_rate=2e-5,
84
- weight_decay=0.01,
85
- load_best_model_at_end=True,
86
- push_to_hub=True,
87
- hub_model_id=MODEL_ID,
88
- report_to="none",
89
- )
90
-
91
- # === TRAINER ===
92
- trainer = Trainer(
93
- model=model,
94
- args=training_args,
95
- train_dataset=tokenized["train"],
96
- eval_dataset=tokenized["test"],
97
- tokenizer=tokenizer,
98
- compute_metrics=compute_metrics,
99
- )
100
-
101
- # === TRAIN ===
102
- print("[INFO] Starting training...")
103
- trainer.train()
104
- print("[INFO] Training complete.")
105
-
106
- # === SAVE MODEL LOCALLY ===
107
- os.makedirs(OUTPUT_DIR, exist_ok=True)
108
- trainer.save_model(OUTPUT_DIR)
109
- tokenizer.save_pretrained(OUTPUT_DIR)
110
- print(f"[INFO] Model saved locally to {OUTPUT_DIR}")
111
-
112
- # === PUSH TO HUGGING FACE HUB ===
113
- print("[INFO] Uploading to Hugging Face Hub...")
114
- trainer.push_to_hub(commit_message="Initial upload of Acoli sentiment model.")
115
- print(f"[SUCCESS] Model pushed to https://huggingface.co/{MODEL_ID}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import torch
3
  from transformers import (
4
+ AutoTokenizer,
5
  AutoModelForSequenceClassification,
6
+ TrainingArguments,
7
  Trainer,
8
+ DataCollatorWithPadding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  )
10
+ from datasets import Dataset
11
+ import json
12
+ import logging
13
+
14
+ # Set up logging
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
class AcoliTrainer:
    """Fine-tune a sequence-classification model on JSONL text/label data.

    The constructor loads the tokenizer and the classification model for
    ``model_name``. ``train`` tokenizes a JSONL file, holds out 20% of it
    for evaluation, runs the Hugging Face ``Trainer`` and saves the model
    and tokenizer to ``output_dir``.
    """

    def __init__(self, model_name="xlm-roberta-base", num_labels=3):
        """Load the tokenizer and model.

        Args:
            model_name: Identifier passed to both ``from_pretrained`` calls.
            num_labels: Number of output classes for the classification head.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )

    def load_data(self, jsonl_path):
        """Load a JSONL file into a ``Dataset`` with 'text' and 'label' columns.

        Every non-blank line must be a JSON object holding a ``text`` and a
        ``label`` key. Blank lines (for example a trailing newline at the end
        of the file) are skipped instead of aborting the load.

        NOTE(review): labels are passed through unchanged; presumably they
        must already be integer class ids in ``[0, num_labels)`` - confirm
        against the data file.

        Args:
            jsonl_path: Path of the UTF-8 encoded JSONL file.

        Returns:
            A ``Dataset`` built from the collected texts and labels.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks the ``text`` or ``label`` key.
        """
        texts = []
        labels = []

        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                # json.loads("") raises, so ignore whitespace-only lines.
                if not line.strip():
                    continue
                record = json.loads(line)
                texts.append(record['text'])
                labels.append(record['label'])

        return Dataset.from_dict({
            'text': texts,
            'label': labels,
        })

    def preprocess_function(self, examples):
        """Tokenize a batch of examples.

        No padding is applied here: ``train`` hands the ``Trainer`` a
        ``DataCollatorWithPadding``, which pads each training batch to its
        own longest sequence. Padding inside ``map`` would instead pad every
        tokenization batch to its longest member and store the padding.

        Args:
            examples: Batch mapping with a ``text`` entry (list of strings).

        Returns:
            Whatever the tokenizer returns for the batch, truncated to at
            most 512 tokens per sequence.
        """
        return self.tokenizer(
            examples['text'],
            truncation=True,
            max_length=512,
        )

    def train(self, train_path, output_dir="./acoli-model", seed=42):
        """Fine-tune the model on ``train_path`` and save the result.

        Args:
            train_path: JSONL file understood by ``load_data``.
            output_dir: Directory for checkpoints and the final model.
            seed: Seed for the train/validation split and for the training
                run, so repeated runs use the same validation set.

        Returns:
            The ``Trainer`` instance after training has finished.
        """
        # Load and preprocess data
        logger.info("Loading training data...")
        dataset = self.load_data(train_path)
        tokenized_dataset = dataset.map(self.preprocess_function, batched=True)

        # Split dataset (80% train, 20% validation); seeded for reproducibility
        splits = tokenized_dataset.train_test_split(test_size=0.2, seed=seed)
        train_dataset = splits['train']
        eval_dataset = splits['test']

        # NOTE(review): recent transformers releases spell the evaluation
        # option `eval_strategy`; confirm against the installed version.
        training_args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            push_to_hub=False,  # Set to True if you want to push to HF Hub
            seed=seed,
        )

        # Pads each batch to its longest sequence at collation time
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )

        logger.info("Starting training...")
        trainer.train()

        # Persist both the model and the tokenizer
        logger.info("Saving model to %s", output_dir)
        trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        return trainer
103
+
104
if __name__ == "__main__":
    # Demo entry point: build a trainer with the default model settings
    # and fine-tune it on a placeholder JSONL path (replace before running).
    acoli = AcoliTrainer()
    finished = acoli.train("path/to/your/data.jsonl")

    print("Training completed successfully!")