nozomuteruyo14 committed on
Commit
5b16bbc
·
verified ·
1 Parent(s): 9b5d8a8

Create examples/run_glue_experiment.py

Browse files
Files changed (1) hide show
  1. examples/run_glue_experiment.py +359 -0
examples/run_glue_experiment.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import time
5
+ import sys
6
+ import io
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from huggingface_hub import login
11
# Authenticate with the Hugging Face Hub only when a token is supplied through
# the HF_TOKEN environment variable, so that no credential (or placeholder
# string) is hard-coded in the script. Without a token the script proceeds
# unauthenticated.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(_hf_token)
12
+
13
+ # Modified Logger class to write output both to terminal and file
14
class Logger(io.TextIOBase):
    """Writable text stream that mirrors every write into a log file.

    Each message is forwarded to ``stream`` (``sys.stdout`` by default) and
    appended to ``filename``, which is opened in write mode (truncating any
    existing file) with UTF-8 encoding.
    """

    def __init__(self, filename="experiment_log_GLUE.txt", stream=sys.stdout):
        super().__init__()
        self.terminal = stream
        self.log = open(filename, "w", encoding="utf8")

    def writable(self):
        """Report that the stream accepts writes."""
        return True

    def write(self, message):
        """Write ``message`` to both targets and return the number of characters."""
        self.terminal.write(message)
        self.log.write(message)
        # Flush after each write so the log file stays current even if the
        # process is interrupted.
        self.log.flush()
        return len(message)

    def flush(self):
        """Flush the wrapped stream and the log file."""
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Flush both targets and close the log file.

        The wrapped stream is left open because this object does not own it.
        Calling ``close()`` more than once is a no-op.
        """
        if self.closed:
            return
        try:
            # IOBase.close() calls self.flush() and marks this stream closed.
            super().close()
        finally:
            log_file = getattr(self, "log", None)
            if log_file is not None:
                log_file.close()

    @property
    def encoding(self):
        """Encoding of the underlying log file."""
        return self.log.encoding
29
+
30
# From here on, everything printed to standard output is also captured in the
# experiment log file.
sys.stdout = Logger(filename="experiment_log_GLUE.txt")
32
+
33
+ from transformers import (
34
+ AutoTokenizer,
35
+ AutoModelForSequenceClassification,
36
+ TrainingArguments,
37
+ Trainer,
38
+ DataCollatorWithPadding,
39
+ )
40
+ from datasets import load_dataset, DownloadConfig
41
+ import evaluate
42
+ from sklearn.metrics import f1_score
43
+
44
+ # Import DiffLoRA module from the diff_lora package
45
+ from diff_lora.model import replace_linear_with_diff_lora
46
+
47
+ ###############################################
48
+ # Mappings for GLUE Tasks
49
+ ###############################################
50
+
51
# Input text field(s) for every GLUE task handled by this script: a single
# column name for single-sentence tasks, a (first, second) tuple for
# sentence-pair tasks.
text_column_mapping = dict(
    mnli=("premise", "hypothesis"),
    sst2="sentence",
    cola="sentence",
    qqp=("question1", "question2"),
    qnli=("question", "sentence"),
    rte=("sentence1", "sentence2"),
    mrpc=("sentence1", "sentence2"),
    stsb=("sentence1", "sentence2"),
)
62
+
63
# Size of the model's output layer per task: 3 classes for mnli, 2 for the
# binary tasks, and a single output for stsb (a regression task).
num_labels_mapping = {
    "mnli": 3,
    **dict.fromkeys(("sst2", "cola", "qqp", "qnli", "rte", "mrpc"), 2),
    "stsb": 1,
}
74
+
75
+ ###############################################
76
+ # Experiment Function for a Single GLUE Task
77
+ ###############################################
78
+
79
def run_glue_experiment(method: str, model_name: str, task: str,
                        num_train_epochs: int = 3, batch_size: int = 32,
                        lr: float = 2e-5, seed: int = 42, diff_r_ratio: float = 1.0):
    """Fine-tune ``model_name`` on one GLUE task and report its validation score.

    Args:
        method: Adaptation strategy; one of "lora", "diff_lora", "adalora",
            "vb_lora", "olora" or "full_finetuning".
        model_name: Checkpoint identifier passed to ``AutoTokenizer`` and
            ``AutoModelForSequenceClassification``.
        task: GLUE task name; must be a key of ``text_column_mapping`` and
            ``num_labels_mapping``.
        num_train_epochs: Number of training epochs.
        batch_size: Per-device batch size used for training and evaluation.
        lr: Learning rate.
        seed: Seed passed to ``torch.manual_seed`` and to ``TrainingArguments``.
        diff_r_ratio: Multiplier applied to the baseline rank (8) to obtain the
            DiffLoRA rank; only used when ``method == "diff_lora"``.

    Returns:
        A dict with the keys "task", "model_name", "method", "metric_str",
        "metric_num", "training_time" and "trainable_params".

    Raises:
        ValueError: If ``method`` is not one of the supported strategies. The
            check runs before any dataset or model is loaded.
    """
    # Fail fast on a bad method name instead of after the dataset, tokenizer
    # and model have already been loaded.
    supported_methods = ("lora", "diff_lora", "adalora", "vb_lora", "olora", "full_finetuning")
    if method not in supported_methods:
        raise ValueError("Unknown method. Choose from 'lora', 'diff_lora', 'adalora', 'vb_lora', 'olora', or 'full_finetuning'.")

    print("\n==============================")
    print(f"Task: {task} | Model: {model_name} | Method: {method}")
    print("==============================\n")
    torch.manual_seed(seed)

    # Load dataset. For MNLI, use the "validation_matched" split during training.
    download_config = DownloadConfig(max_retries=10)
    dataset = load_dataset("glue", task, download_config=download_config)
    eval_split = "validation_matched" if task == "mnli" else "validation"

    # Load evaluation metric.
    metric = evaluate.load("glue", task)

    # Load tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    text_cols = text_column_mapping[task]

    # Preprocessing: if there are multiple text columns, concatenate them with a space.
    # NOTE(review): sentence pairs are joined into one string rather than being
    # passed to the tokenizer as a (text, text_pair) pair -- confirm this is intended.
    def preprocess_function(examples):
        if isinstance(text_cols, tuple):
            first_col, second_col = text_cols[0], text_cols[1]
            texts = [a + " " + b for a, b in zip(examples[first_col], examples[second_col])]
        else:
            texts = examples[text_cols]
        return tokenizer(texts, truncation=True)

    encoded_dataset = dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Determine if this is a regression task (stsb) or a classification task.
    is_regression = (task == "stsb")
    num_labels = num_labels_mapping[task]

    # Load model.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # For full fine-tuning, do not freeze parameters.
    if method == "full_finetuning":
        print("Performing full fine-tuning: All parameters are trainable.")
    else:
        # Freeze base model parameters.
        for param in model.parameters():
            param.requires_grad = False

    baseline_r = 8
    adapter_r = max(1, int(baseline_r * diff_r_ratio))

    # Inject adapters based on the chosen method.
    if method == "lora":
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=baseline_r,
            lora_alpha=16,
            target_modules=["query", "value", "dense"],
            lora_dropout=0.1,
            bias="none",
            task_type="SEQ_CLS",
        )
        model = get_peft_model(model, lora_config)
        print("Injected standard LoRA adapters via PEFT.")
    elif method == "diff_lora":
        # NOTE(review): every parameter was frozen above; whether the
        # classification head ends up trainable here depends on
        # replace_linear_with_diff_lora -- confirm against that helper.
        target_pattern = r"(query|value|dense)"
        replace_linear_with_diff_lora(model, target_pattern, adapter_r)
        print(f"Injected fused DiffLoRA adapters with rank {adapter_r} (ratio={diff_r_ratio}).")
    elif method == "adalora":
        from peft import AdaLoraConfig, get_peft_model
        adalora_config = AdaLoraConfig(
            peft_type="ADALORA",
            r=baseline_r,
            lora_alpha=16,
            target_modules=["query", "value", "dense"],
            lora_dropout=0.1,
            bias="none",
            task_type="SEQ_CLS",
        )
        model = get_peft_model(model, adalora_config)
        print("Injected AdaLoRA adapters via PEFT.")
    elif method == "vb_lora":
        from peft import VBLoRAConfig, get_peft_model
        vb_lora_config = VBLoRAConfig(
            r=baseline_r,
            task_type="SEQ_CLS",
            target_modules=["query", "value", "dense"],
            num_vectors=256,
            vector_length=256,
            topk=2,
            vblora_dropout=0.1,
            bias="none",
        )
        model = get_peft_model(model, vb_lora_config)
        print("Injected VB-LoRA adapters via PEFT.")
    elif method == "olora":
        from peft import LoraConfig, get_peft_model
        olora_config = LoraConfig(
            r=baseline_r,
            lora_alpha=16,
            target_modules=["query", "value", "dense"],
            lora_dropout=0.1,
            bias="none",
            task_type="SEQ_CLS",
            init_lora_weights="olora",
        )
        model = get_peft_model(model, olora_config)
        print("Injected OLoRA adapters via PEFT.")
    else:
        # Only "full_finetuning" can reach this branch: the method name was
        # validated at the top of the function.
        print("Proceeding with full fine-tuning (no adapter injection).")

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable params: {trainable_params} / {total_params} ({100 * trainable_params / total_params:.2f}%)")

    # Set training arguments.
    training_args = TrainingArguments(
        output_dir=f"./outputs/results_{model_name}_{task}_{method}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        logging_steps=10000,
        load_best_model_at_end=True,
        report_to="none",
        disable_tqdm=True,
        # Forward the caller's seed; otherwise the Trainer would fall back to
        # its own default and ignore the ``seed`` argument.
        seed=seed,
    )

    # Define compute_metrics based on the task.
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        if is_regression:
            # stsb: single regression output, scored by the mean of both correlations.
            predictions = logits.squeeze()
            result = metric.compute(predictions=predictions, references=labels)
            result["combined_score"] = (result["pearson"] + result["spearmanr"]) / 2
            return result
        predictions = logits.argmax(axis=-1)
        if task == "qqp":
            acc = (predictions == labels).mean()
            f1 = f1_score(labels, predictions)
            return {"eval_accuracy": acc, "eval_f1": f1}
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset[eval_split],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("Starting training...")
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time
    print(f"Training completed in {training_time:.2f} seconds.")

    # Evaluate and extract the final metric.
    if task == "mnli":
        # MNLI is reported on both validation sets as "matched/mismatched".
        eval_result_matched = trainer.evaluate(eval_dataset=encoded_dataset["validation_matched"])
        eval_result_mismatched = trainer.evaluate(eval_dataset=encoded_dataset["validation_mismatched"])
        acc_matched = eval_result_matched.get("eval_accuracy", 0.0)
        acc_mismatched = eval_result_mismatched.get("eval_accuracy", 0.0)
        final_metric_str = f"{acc_matched:.4f}/{acc_mismatched:.4f}"
        final_metric_num = (acc_matched + acc_mismatched) / 2
    elif task == "qqp":
        eval_result = trainer.evaluate()
        acc = eval_result.get("eval_accuracy", 0.0)
        f1 = eval_result.get("eval_f1", 0.0)
        final_metric_str = f"{acc:.4f}/{f1:.4f}"
        final_metric_num = (acc + f1) / 2
    else:
        # Single-number tasks differ only in which evaluation key is reported.
        if task == "stsb":
            metric_key = "eval_combined_score"
        elif task == "cola":
            metric_key = "eval_matthews_correlation"
        else:
            metric_key = "eval_accuracy"
        val = trainer.evaluate().get(metric_key, 0.0)
        final_metric_str = f"{val:.4f}"
        final_metric_num = val

    print(f"\n=== FINAL RESULTS for {task} | {model_name} | {method} ===")
    print(f"Metric: {final_metric_str}")
    print(f"Training Time: {training_time:.2f} seconds\n")

    return {
        "task": task,
        "model_name": model_name,
        "method": method,
        "metric_str": final_metric_str,
        "metric_num": final_metric_num,
        "training_time": training_time,
        "trainable_params": trainable_params,
    }
289
+
290
+ ###############################################
291
+ # Main: Run Experiments over GLUE Tasks for Multiple Methods
292
+ ###############################################
293
+
294
if __name__ == "__main__":
    # Run every (model, method, task) combination, then print one summary row
    # per model/method pair.
    # Desired order and corresponding indicators:
    # [mnli (m/mm), sst2 (Acc), cola (Mcc), qqp (Acc/F1), qnli (Acc), rte (Acc), mrpc (Acc), stsb (Corr)]
    tasks = ["mnli", "sst2", "cola", "qqp", "qnli", "rte", "mrpc", "stsb"]
    methods = ["lora", "diff_lora", "adalora", "vb_lora", "olora", "full_finetuning"]
    model_names = ["bert-base-uncased"]

    all_results = []
    for model_name in model_names:
        for method in methods:
            for task in tasks:
                result = run_glue_experiment(
                    method=method,
                    model_name=model_name,
                    task=task,
                    num_train_epochs=3,
                    batch_size=32,
                    lr=2e-5,
                    seed=42,
                    diff_r_ratio=1.0,
                )
                all_results.append(result)

    # Organize results: group the per-task result dicts by model-method combination.
    from collections import defaultdict
    summary = defaultdict(dict)
    for res in all_results:
        key = f"{res['model_name']} | {res['method']}"
        summary[key][res["task"]] = res

    # Print summary table with column indicators.
    indicator_names = {
        "mnli": "m/mm",
        "sst2": "Acc",
        "cola": "Mcc",
        "qqp": "Acc/F1",
        "qnli": "Acc",
        "rte": "Acc",
        "mrpc": "Acc",
        "stsb": "Corr",
    }

    print("\n===== Summary of GLUE Results =====")
    header = "Model | Method || " + " | ".join([f"{task} ({indicator_names[task]})" for task in tasks]) + " || Average"
    print(header)
    print("-" * len(header))
    for key, per_task in summary.items():
        display_values = []
        scores = []
        for task in tasks:
            res = per_task.get(task)
            if res is None:
                # Task missing for this combination: show a placeholder and
                # leave it out of the average.
                display_values.append("N/A")
                continue
            display_values.append(res["metric_str"])
            # Average the numeric score returned by run_glue_experiment rather
            # than re-parsing the rounded display string.
            scores.append(res["metric_num"])
        overall_avg = sum(scores) / len(scores) if scores else 0.0
        print(f"{key} || " + " | ".join(display_values) + f" || {overall_avg:.4f}")