natmin322 committed on
Commit
d1be546
·
1 Parent(s): b9eaa7b
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +6 -1
  2. experiment_diagnosis_report.md +301 -0
  3. improve_gainlora/T5_small/gen_script_long_order3_t5_small_specroute_v2.sh +893 -0
  4. improve_gainlora/generate_specroute_scripts_v2.py +2 -2
  5. improve_gainlora/src/cl_trainer_specroute.py +88 -3
  6. improve_gainlora/src/run_t5.py +13 -7
  7. results/comparison_results.md +37 -35
  8. results/experiment_versions.md +167 -0
  9. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/all_results.json +0 -9
  10. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/eval_eval_predictions.jsonl +0 -3
  11. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/predict_eval_predictions.jsonl +0 -3
  12. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_0.pt +0 -3
  13. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_1.pt +0 -3
  14. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_10.pt +0 -3
  15. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_11.pt +0 -3
  16. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_12.pt +0 -3
  17. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_13.pt +0 -3
  18. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_14.pt +0 -3
  19. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_15.pt +0 -3
  20. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_16.pt +0 -3
  21. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_17.pt +0 -3
  22. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_18.pt +0 -3
  23. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_19.pt +0 -3
  24. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_2.pt +0 -3
  25. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_20.pt +0 -3
  26. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_21.pt +0 -3
  27. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_22.pt +0 -3
  28. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_23.pt +0 -3
  29. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_3.pt +0 -3
  30. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_4.pt +0 -3
  31. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_5.pt +0 -3
  32. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_6.pt +0 -3
  33. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_7.pt +0 -3
  34. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_8.pt +0 -3
  35. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_9.pt +0 -3
  36. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/runs/Mar13_11-48-44_a802a1875a6b/events.out.tfevents.1773402573.a802a1875a6b.120.0 +0 -0
  37. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/attention_weights.pkl +0 -0
  38. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/lora_weights_A.pt +0 -3
  39. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/lora_weights_B.pt +0 -3
  40. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/prompts_keys_till_now.pt +0 -3
  41. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/special_tokens_map.json +0 -125
  42. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/spiece.model +0 -3
  43. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/tokenizer.json +0 -0
  44. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/tokenizer_config.json +0 -938
  45. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/trans_input.pt +0 -3
  46. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/train_results.json +0 -9
  47. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trainer_state.json +0 -105
  48. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_0.pt +0 -3
  49. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_1.pt +0 -3
  50. root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_2.pt +0 -3
.gitignore CHANGED
@@ -9,4 +9,9 @@ __pycache__/
9
  *.pyo
10
  *.pyd
11
  */logs/*
12
- */logs
 
 
 
 
 
 
9
  *.pyo
10
  *.pyd
11
  */logs/*
12
+ */logs
13
+ logs/*
14
+ logs
15
+ *.log
16
+
17
+
experiment_diagnosis_report.md ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Báo cáo chẩn đoán thí nghiệm: SpecRoute vs GainLoRA trên T5-Small
2
+
3
+ > **Benchmark**: Long Sequence Order 3 (15 classification tasks)
4
+ > **Model**: T5-Small (d_model=512, 6 encoder + 6 decoder layers, lora_r=8)
5
+ > **Thí nghiệm**: SpecRoute (improve) vs GainLoRA-InfLoRA (root)
6
+
7
+ ---
8
+
9
+ ## 1. Xác minh kết quả: Bảng so sánh có chính xác không?
10
+
11
+ ### ✅ ROOT GainLoRA: AP = 59.70 — CHÍNH XÁC
12
+
13
+ Nguồn dữ liệu: `logs/root_t5_small/.../15-wic/all_results.json`
14
+ - Task 15 (wic) có `--do_predict` → evaluation trên ALL 15 tasks (70,861 samples)
15
+ - Metrics `predict_exact_match_for_{task}` cho tất cả tasks → **đây là R_{15,j} (final row)**
16
+ - AP = mean(R_{15,j}) = 59.70 ✓ (tính đúng theo paper)
17
+
18
+ ### ⚠️ SpecRoute: "AP" = 39.74 — **KHÔNG PHẢI AP THẬT**
19
+
20
+ Nguồn dữ liệu: `predict_eval_predictions.jsonl` tại MỖI task directory
21
+ - SpecRoute THIẾU `--do_predict` cho tasks 2-15 (bug trong script generator)
22
+ - File `predict_eval_predictions.jsonl` ở mỗi task chỉ chứa **current task evaluation**
23
+ - Các con số (yelp=54.36, imdb=0.21, etc.) là **R_{j,j} (diagonal = peak performance)**, KHÔNG phải R_{15,j}
24
+ - 39.74 = mean(diagonal), **KHÔNG phải AP** theo công thức paper
25
+
26
+ **Hệ quả**: AP thật của SpecRoute sẽ THẤP HƠN 39.74 vì forgetting sẽ giảm performance của các tasks đầu. Khoảng cách thực tế với ROOT có thể lớn hơn 19.96 điểm.
27
+
28
+ ### Bảng so sánh đã hiệu chỉnh
29
+
30
+ | # | Task | ROOT R_{15,j} (Final) | SpecRoute R_{j,j} (Peak) | Ghi chú |
31
+ |---|------|-----------------------|--------------------------|---------|
32
+ | 1 | yelp | 56.01 | 54.36 | Tương đương |
33
+ | 2 | amazon | 52.05 | 50.01 | Tương đương |
34
+ | 3 | mnli | 34.07 | 35.50 | SpecRoute tốt hơn |
35
+ | 4 | cb | 3.57 | 0.00 | Cả hai đều thấp |
36
+ | 5 | copa | 42.00 | 44.00 | Tương đương |
37
+ | 6 | qqp | 76.96 | 76.72 | Tương đương |
38
+ | 7 | rte | 45.85 | 50.90 | SpecRoute tốt hơn |
39
+ | 8 | imdb | 89.51 | **0.21** ⚠️ | **Không thể học** |
40
+ | 9 | sst2 | 85.21 | **0.00** ⚠️ | **Không thể học** |
41
+ | 10 | dbpedia | 98.16 | 92.22 | Chấp nhận được |
42
+ | 11 | agnews | 88.37 | 68.76 | Giảm đáng kể |
43
+ | 12 | yahoo | 57.28 | **8.12** ⚠️ | **Không thể học** |
44
+ | 13 | multirc | 50.52 | 54.23 | Tương đương |
45
+ | 14 | boolq | 60.43 | 61.13 | Tương đương |
46
+ | 15 | wic | 55.49 | **0.00** ⚠️ | **Không thể học** |
47
+
48
+ **Nhận xét quan trọng**: SpecRoute scoring ở đây là PEAK (ngay sau khi train task đó), trong khi ROOT scoring là FINAL (sau khi train xong tất cả 15 tasks). Với ROOT, imdb PEAK có thể > 89.51 rồi chỉ giảm nhẹ về 89.51. Nhưng với SpecRoute, imdb PEAK đã là 0.21 — model KHÔNG THỂ HỌC task này ngay từ đầu, đây **không phải catastrophic forgetting**.
49
+
50
+ ---
51
+
52
+ ## 2. Tại sao FT (Forgetting) không tính được?
53
+
54
+ ### Nguyên nhân trực tiếp: `--do_predict` bị thiếu
55
+
56
+ Công thức FT cần:
57
+ - R_{j,j} = performance trên task j ngay sau khi train task j (diagonal)
58
+ - R_{T,j} = performance trên task j sau khi train xong tất cả T tasks (final row)
59
+
60
+ | Method | R_{j,j} (diagonal) | R_{T,j} (final row) | FT computable? |
61
+ |--------|--------------------|--------------------|----------------|
62
+ | ROOT | ❌ Không có (tasks 1-14 thiếu cross-task eval) | ✅ Task 15 có | ❌ Thiếu diagonal |
63
+ | SpecRoute | ⚠️ Có nhưng chỉ single-task eval | ❌ Task 15 không eval cross-task | ❌ Thiếu final row |
64
+
65
+ ### Nguyên nhân gốc: Bug trong script generator
66
+
67
+ File `improve_gainlora/generate_specroute_scripts_v2.py`:
68
+
69
+ ```python
70
+ "long_order3": {
71
+ ...
72
+ "do_predict": False, # ← BUG: nên là True
73
+ ...
74
+ },
75
+ ```
76
+
77
+ **Fix**: Đổi thành `True` cho cả `long_order3` và `long_order4`. Khi `do_predict=True`, script sẽ generate `--do_predict --predict_with_generate` cho mỗi task → `run_t5.py` sẽ evaluate trên ALL task cumulative test sets → `score.py` sẽ build được full matrix R → FT tính được.
78
+
79
+ ROOT cũng cần fix: hiện tại chỉ task 15 có `--do_predict`. Cần thêm cho tasks 1-14 để có full R matrix.
80
+
81
+ ---
82
+
83
+ ## 3. Phân tích nguyên nhân gốc: Tại sao SpecRoute kém?
84
+
85
+ ### 3.1 KHÔNG phải do SVD/routing bugs
86
+
87
+ Sau khi đọc toàn bộ source code:
88
+ - `compute_spectral_signatures()`: SVD đúng, lưu Vt[:r] và S[:r] đúng
89
+ - `compute_spectral_routing()`: Weighted Rayleigh quotient đúng, softmax đúng
90
+ - Không có hardcoded dimensions cho T5-large
91
+ - d_model=512, lora_r=8 → SVD rank=8 capture toàn bộ non-zero singular values
92
+ - Gradient checkpointing fix đã áp dụng đúng
93
+
94
+ ### 3.2 KHÔNG phải hoàn toàn do config SVD (giả thuyết ban đầu)
95
+
96
+ User hypothesis: "configs ban đầu được thiết kế cho T5_large, T5_small nên config SVD không phù hợp"
97
+
98
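+ **Sự thật**: Không có config SVD-specific nào cần thay đổi cho T5-small. Các hyperparameters của SVD/routing (lora_r=8, lora_alpha=32, temperature=1.0) là model-agnostic. Ngoại lệ duy nhất là GPM threshold=0.995: giá trị này không gắn với model nào, nhưng trên T5-small (d_model=512) nó làm null-space bão hòa sớm (xem mục 3.4 và 5.2). Nguyên nhân chính nằm ở chỗ khác (mục 3.3).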
99
+
100
+ ### 3.3 NGUYÊN NHÂN CHÍNH: Thiếu cơ chế chống forgetting
101
+
102
+ Đây là bảng so sánh **cơ chế bảo vệ** giữa 2 phương pháp:
103
+
104
+ | Cơ chế | ROOT GainLoRA | SpecRoute | Tác động |
105
+ |--------|:---:|:---:|----------|
106
+ | GPM gradient projection (LoRA A) | ✅ | ✅ | Chặn gradient phá LoRA cũ |
107
+ | KL distillation (`kl_ratio=0.1`) | ✅ | ❌ | Duy trì routing distribution cũ |
108
+ | Data replay (`gen_data_dir`) | ✅ | ❌ | Reinforce kiến thức cũ |
109
+ | Per-step GPM on routing params | ✅ | ❌ | Bảo vệ trans_input + prompt_key |
110
+ | Trans_input (learned routing) | ✅ | ❌ | Routing có gradient, học continuous |
111
+
112
+ **ROOT có 5 lớp bảo vệ, SpecRoute chỉ có 1 lớp (GPM trên LoRA A)**
113
+
114
+ Khi loại bỏ learned routing (trans_input + prompt_key), SpecRoute đồng thời loại bỏ luôn:
115
+ 1. KL distillation (vì không có routing params để distill)
116
+ 2. Data replay (vì không có routing MLP cần reinforce)
117
+ 3. Per-step GPM trên routing params (vì không có routing params)
118
+
119
+ Đây **không phải là design intention** — SpecRoute muốn replace routing mechanism, nhưng vô tình loại bỏ luôn CÁC CƠ CHẾ BẢO VỆ đi kèm routing.
120
+
121
+ ### 3.4 NGUYÊN NHÂN PHỤ: GPM null-space bão hòa sớm ở T5-small
122
+
123
+ Training loss so sánh (bằng chứng GPM over-constraining):
124
+
125
+ | Task (thứ tự) | ROOT loss | SpecRoute loss | Tỉ lệ | SpecRoute score |
126
+ |---|---|---|---|---|
127
+ | 1 yelp | 0.586 | 0.581 | 1.0x | 54.36 |
128
+ | 2 amazon | 0.540 | 0.588 | 1.1x | 50.01 |
129
+ | 5 copa | 0.455 | 0.459 | 1.0x | 44.00 |
130
+ | 6 qqp | 0.288 | 0.304 | 1.1x | 76.72 |
131
+ | 8 imdb | 1.410 | **4.149** | **2.9x** | **0.21** |
132
+ | 9 sst2 | 1.762 | **4.449** | **2.5x** | **0.00** |
133
+ | 12 yahoo | 1.189 | **3.077** | **2.6x** | **8.12** |
134
+ | 15 wic | 0.961 | **3.654** | **3.8x** | **0.00** |
135
+
136
+ **Pattern rõ ràng**: Tasks ban đầu (1-6) loss tương đương → model học OK. Tasks sau (8, 9, 12, 15) loss cao gấp 2.5-3.8x → model KHÔNG THỂ HỌC.
137
+
138
+ Nhưng thú vị: tasks 10 (dbpedia), 13 (multirc), 14 (boolq) vẫn học được tốt (loss < 1.3). Điều này cho thấy vấn đề không chỉ đơn thuần "hết null-space":
139
+
140
+ **Các tasks THẤT BẠI (imdb, sst2, yahoo, wic)** có đặc điểm chung: overlap lớn với tasks TRƯỚC ĐÓ trong feature space:
141
+ - imdb/sst2 = sentiment binary → overlap với yelp (task 1), amazon (task 2)
142
+ - yahoo = topic QA → overlap với nhiều domain trước
143
+ - wic = word sense → cần representations already claimed bởi tasks trước
144
+
145
+ **Giải thích**: GPM từ tasks 1-2 (yelp/amazon sentiment) đã "claim" sentiment-relevant directions. Khi imdb (cũng sentiment) đến, GPM ép LoRA A vào null-space orthogonal với sentiment directions → model bị ép vào directions KHÔNG LIÊN QUAN đến sentiment → không thể phân loại sentiment → loss cao, accuracy 0.
146
+
147
+ Trong ROOT GainLoRA, vấn đề này được giải quyết bởi:
148
+ - Trans_input cho phép MAP input mới vào representation space REUSE kiến thức sentiment cũ
149
+ - KL distillation cho phép routing CHUYỂN imdb sang LoRA branch sentiment đã có
150
+ - Data replay DUY TRÌ sentiment knowledge
151
+
152
+ ### 3.5 Training loss cao = model không thể học, KHÔNG PHẢI catastrophic forgetting
153
+
154
+ Đây là phát hiện quan trọng nhất: comparison_results.md ghi "imdb/sst2/wic về 0 do Catastrophic Forgetting" — **NHẬN ĐỊNH NÀY SAI**.
155
+
156
+ Bằng chứng:
157
+ - imdb train_loss = 4.149 (rất cao) → model CHƯA BAO GIỜ học được imdb
158
+ - imdb prediction: "Rififi" (copy từ review text), "Negative" (sai format, label đúng là "Good"/"Bad")
159
+ - sst2 train_loss = 4.449 → tương tự
160
+
161
+ **Đây là "inability to learn" (GPM over-constraining), KHÔNG phải "learned then forgot" (catastrophic forgetting).**
162
+
163
+ ---
164
+
165
+ ## 4. Lỗi so sánh không công bằng
166
+
167
+ | Aspect | ROOT | SpecRoute | Vấn đề |
168
+ |--------|------|-----------|--------|
169
+ | Score type | R_{15,j} (FINAL) | R_{j,j} (PEAK/DIAGONAL) | So sánh khác loại |
170
+ | Evaluation | Cross-task (all 15) | Single-task (chỉ current) | Scope khác nhau |
171
+ | `--do_predict` | Task 15 only | Task 1 only | Cả hai đều thiếu |
172
+
173
+ - **ROOT**: Đánh giá SAU KHI train xong 15 tasks → bao gồm cả forgetting
174
+ - **SpecRoute**: Đánh giá NGAY SAU KHI train từng task → peak performance, chưa bao gồm forgetting
175
+
176
+ Để so sánh công bằng, cần chạy lại SpecRoute với `--do_predict` ở task 15 để có R_{15,j} cho tất cả tasks.
177
+
178
+ ---
179
+
180
+ ## 5. Định hướng cải tiến
181
+
182
+ ### 5.1 Fix NGAY (không đổi methodology)
183
+
184
+ **A. Thêm `--do_predict` cho tất cả tasks**
185
+ ```python
186
+ # generate_specroute_scripts_v2.py
187
+ "long_order3": { "do_predict": True }, # was False
188
+ "long_order4": { "do_predict": True }, # was False
189
+ ```
190
+ → Cho phép build full R matrix, tính AP/FT đúng, so sánh công bằng.
191
+
192
+ **B. Khôi phục KL distillation**
193
+
194
+ Đây là fix quan trọng nhất. SpecRoute loại bỏ learned routing nhưng KL distillation hoàn toàn có thể adapt cho spectral routing:
195
+
196
+ ```python
197
+ # Concept: KL trên routing output thay vì routing params
198
+ def spectral_kl_regularization(model, old_signatures, input_embeds):
199
+ """Duy trì routing distribution gần với snapshot sau task trước"""
200
+ current_routing = model.compute_spectral_routing(input_embeds)
201
+ old_routing = compute_old_routing(old_signatures, input_embeds)
202
+ return kl_div(current_routing.log(), old_routing)
203
+ ```
204
+
205
+ Tuy nhiên, vì spectral routing là deterministic (không có learnable params), KL trên routing output không tạo gradient hữu ích. Thay vào đó:
206
+
207
+ **Option tốt hơn: KL distillation trên model OUTPUT (logits)**
208
+ ```python
209
+ # Sau mỗi task, lưu model logits trên replay data
210
+ # Trong training step tiếp theo:
211
+ kl_loss = kl_div(log_softmax(current_logits), softmax(saved_old_logits))  # kl_div nhận log-prob (input) và prob (target)
212
+ ```
213
+
214
+ **C. Khôi phục Data Replay**
215
+
216
+ Replay không phụ thuộc vào routing mechanism. Có thể dùng generated data hoặc coreset luôn:
217
+ ```bash
218
+ --gen_data_dir generated_data/lora_gen_long_t5 # Tái sử dụng tập replay của ROOT
219
+ --data_replay_freq 5 # Replay mỗi 5 steps
220
+ --kl_ratio 0.1 # Weight cho KL loss trên replay
221
+ ```
222
+
223
+ ### 5.2 Giảm GPM threshold cho T5-small
224
+
225
+ Với threshold=0.995, sau 15 tasks, threshold tăng lên 0.99967 → GPM giữ 99.97% variance → null-space cực nhỏ.
226
+
227
+ | threshold | Task 1 | Task 7 | Task 14 | Nhận xét |
228
+ |-----------|--------|--------|---------|----------|
229
+ | 0.995 (hiện tại) | 0.9950 | 0.9973 | 0.9997 | Quá chặt cho T5-small |
230
+ | 0.990 | 0.9900 | 0.9947 | 0.9993 | Vẫn khá chặt |
231
+ | 0.980 | 0.9800 | 0.9893 | 0.9987 | Thử nghiệm đầu tiên |
232
+ | 0.970 | 0.9700 | 0.9840 | 0.9980 | Aggressive nhưng đáng thử |
233
+
234
+ **Đề xuất**: Thử threshold=0.980 trước, nếu forgetting tăng thì kết hợp KL distillation để bù.
235
+
236
+ ### 5.3 Cải tiến methodology (dài hạn)
237
+
238
+ **A. Cho phép subspace sharing**
239
+
240
+ Vấn đề gốc: GPM ép tasks tương tự (imdb/sst2 vs yelp/amazon) vào subspaces orthogonal. Cần mechanism cho phép knowledge reuse:
241
+
242
+ ```python
243
+ # Ý tưởng: Nếu spectral routing gợi ý task mới SIMILAR với task cũ,
244
+ # giảm GPM protection cho directions tương tự → cho phép reuse
245
+ similarity = compute_spectral_routing(avg_input) # routing weights
246
+ for old_task, weight in enumerate(similarity):
247
+ if weight > threshold_reuse:
248
+ # Giảm GPM projection cho old_task's directions
249
+ # → cho phép refinement thay vì full orthogonality
250
+ ```
251
+
252
+ **B. Hybrid routing: spectral + lightweight learned component**
253
+
254
+ Thay vì hoàn toàn parameter-free, thêm adapter nhẹ:
255
+ ```python
256
+ routing = alpha * spectral_fit + (1-alpha) * learned_gate
257
+ ```
258
+ - `spectral_fit`: parameter-free, ổn định, không cần GPM protection
259
+ - `learned_gate`: lightweight (MLP nhỏ), cho phép gradient flow
260
+ - `alpha`: có thể learnable hoặc fixed (e.g., 0.7)
261
+
262
+ **C. Tách biệt protection vs routing**
263
+
264
+ Thiết kế SpecRoute hiện tại **couple** routing mechanism với protection mechanisms. Cần tách:
265
+ - **Routing**: Spectral (parameter-free) — OK giữ nguyên
266
+ - **Protection**: Cần ÍT NHẤT 2 trong 3: GPM, KL distillation, data replay
267
+
268
+ ---
269
+
270
+ ## 6. Kế hoạch thí nghiệm tiếp theo
271
+
272
+ ### Phase 1: Fix bugs + fair comparison (ưu tiên CAO)
273
+ 1. Fix `generate_specroute_scripts_v2.py`: `do_predict=True` cho long benchmarks
274
+ 2. Regenerate scripts
275
+ 3. Chạy lại SpecRoute Long Order 3 trên T5-small
276
+ 4. So sánh AP/FT chính xác giữa 2 methods
277
+
278
+ ### Phase 2: Thêm protection mechanisms (ưu tiên CAO)
279
+ 1. Thêm KL distillation trên model output logits (replay + KL loss)
280
+ 2. Thêm data replay
281
+ 3. Grid search: threshold ∈ {0.995, 0.990, 0.980}, kl_ratio ∈ {0.05, 0.1, 0.2}
282
+
283
+ ### Phase 3: Validate methodology (sau phase 2)
284
+ 1. Nếu Phase 2 cho kết quả tốt → methodology đúng, chỉ thiếu protection
285
+ 2. Nếu Phase 2 vẫn kém → spectral routing có vấn đề ở T5-small, cần hybrid approach
286
+ 3. Scale lên T5-large để so sánh ở đúng scale ROOT paper dùng
287
+
288
+ ---
289
+
290
+ ## 7. Tổng kết
291
+
292
+ | Câu hỏi | Trả lời |
293
+ |----------|---------|
294
+ | Kết quả tổng hợp có chính xác? | ⚠️ ROOT đúng (AP=59.70), SpecRoute SAI loại metric (diagonal vs final) |
295
+ | Tại sao kết quả tệ? | SpecRoute loại bỏ routing → vô tình loại bỏ luôn KL + replay + per-step GPM |
296
+ | Do methodology hay config? | **Cả hai**: methodology thiếu protection layers + GPM threshold quá chặt cho T5-small |
297
+ | SVD có phải nguyên nhân? | **Không trực tiếp**. SVD routing code đúng, không có bugs |
298
+ | FT tại sao chưa tính? | Bug trong script generator: `do_predict=False` cho long benchmarks |
299
+ | Hướng cải tiến? | Khôi phục KL distillation + data replay, giảm GPM threshold, fix scripts |
300
+
301
+ **Kết luận cốt lõi**: Ý tưởng spectral routing thay thế learned routing KHÔNG SAI về mặt lý thuyết. Vấn đề là khi implement, các cơ chế protection (KL, replay) bị loại bỏ theo vì chúng gắn chặt với learned routing trong code ROOT. Cần decouple routing mechanism khỏi protection mechanisms.
improve_gainlora/T5_small/gen_script_long_order3_t5_small_specroute_v2.sh ADDED
@@ -0,0 +1,893 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH -J cl
3
+ #SBATCH -o cl-%j.out
4
+ #SBATCH -p compute
5
+ #SBATCH -N 1
6
+ #SBATCH -t 20:00:00
7
+ #SBATCH --mem 128G
8
+ #SBATCH --gres=gpu:2
9
+
10
+ export CUDA_DEVICE_ORDER="PCI_BUS_ID"
11
+
12
+ port=$(shuf -i25000-30000 -n1)
13
+
14
+ # ============================================================
15
+ # Auto-detect GPU count and type for optimal parallelism
16
+ # ============================================================
17
+ NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
18
+ GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1)
19
+
20
+ if [ -z "$GPU_MEM" ]; then
21
+ echo "ERROR: No GPU detected!"
22
+ exit 1
23
+ fi
24
+
25
+ # Determine GPU type
26
+ if [ "$GPU_MEM" -lt 20000 ]; then
27
+ IS_T4=1
28
+ echo "[GPU] Detected T4 GPUs (${GPU_MEM}MB VRAM each)"
29
+ else
30
+ IS_T4=0
31
+ echo "[GPU] Detected high-memory GPUs (${GPU_MEM}MB VRAM each)"
32
+ fi
33
+
34
+ # Determine parallelism strategy
35
+ if [ "$IS_T4" -eq 1 ] && [ "$NUM_GPUS" -ge 2 ]; then
36
+ GPU_MODE="t4_2gpu"
37
+ GPU_IDS="0,1"
38
+ FP16_FLAG=""
39
+ echo "[GPU] Strategy: 2x T4 DataParallel + fp32 + gradient_checkpointing"
40
+ elif [ "$IS_T4" -eq 1 ]; then
41
+ GPU_MODE="t4_1gpu"
42
+ GPU_IDS="${1:-0}"
43
+ FP16_FLAG=""
44
+ echo "[GPU] Strategy: 1x T4 + fp32 + gradient_checkpointing"
45
+ else
46
+ GPU_MODE="a100"
47
+ GPU_IDS="${1:-0}"
48
+ FP16_FLAG=""
49
+ echo "[GPU] Strategy: A100 (single GPU, fp32)"
50
+ fi
51
+
52
+ echo "[GPU] Using CUDA_VISIBLE_DEVICES=$GPU_IDS"
53
+ echo "============================================================"
54
+ echo ""
55
+
56
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
57
+ BSZ=16; GA=1; EVAL_BSZ=256
58
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
59
+ BSZ=32; GA=1; EVAL_BSZ=256
60
+ else
61
+ BSZ=64; GA=1; EVAL_BSZ=512
62
+ fi
63
+
64
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
65
+ --do_train \
66
+ --do_predict \
67
+ --predict_with_generate \
68
+ --model_name_or_path $2 \
69
+ --data_dir CL_Benchmark \
70
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
71
+ --task_config_dir configs/gen_script_long_order3_t5_configs/yelp \
72
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp \
73
+ --per_device_train_batch_size $BSZ \
74
+ --per_device_eval_batch_size $EVAL_BSZ \
75
+ --gradient_accumulation_steps $GA \
76
+ --learning_rate 0.0003 \
77
+ --num_train_epochs 10 \
78
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
79
+ --max_source_length 512 \
80
+ --max_target_length 50 \
81
+ --generation_max_length 50 \
82
+ --add_task_name False \
83
+ --add_dataset_name False \
84
+ --overwrite_output_dir \
85
+ --overwrite_cache \
86
+ --lr_scheduler_type constant \
87
+ --warmup_steps 0 \
88
+ --logging_strategy steps \
89
+ --logging_steps 10 \
90
+ --metric_for_best_model eval_exact_match \
91
+ --evaluation_strategy steps \
92
+ --save_strategy steps \
93
+ --save_total_limit 1 \
94
+ --load_best_model_at_end \
95
+ --lora_r 8 \
96
+ --lora_alpha 32 \
97
+ --lora_dropout 0.0 \
98
+ --data_replay_freq 5 \
99
+ --mlp_hidden_dim 100 \
100
+ --model_name specroute \
101
+ --kl_ratio 0.1 \
102
+ --gen_data_dir CL_Benchmark \
103
+ --threshold 0.980 \
104
+ --transthreshold 0.980 \
105
+ $FP16_FLAG
106
+
107
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/checkpoint*
108
+
109
+ sleep 5
110
+
111
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
112
+ BSZ=16; GA=1; EVAL_BSZ=256
113
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
114
+ BSZ=32; GA=1; EVAL_BSZ=256
115
+ else
116
+ BSZ=64; GA=1; EVAL_BSZ=512
117
+ fi
118
+
119
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
120
+ --do_train \
121
+ --do_predict \
122
+ --predict_with_generate \
123
+ --model_name_or_path $2 \
124
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights \
125
+ --data_dir CL_Benchmark \
126
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
127
+ --task_config_dir configs/gen_script_long_order3_t5_configs/amazon \
128
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon \
129
+ --per_device_train_batch_size $BSZ \
130
+ --per_device_eval_batch_size $EVAL_BSZ \
131
+ --gradient_accumulation_steps $GA \
132
+ --learning_rate 0.0003 \
133
+ --num_train_epochs 10 \
134
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
135
+ --max_source_length 512 \
136
+ --max_target_length 50 \
137
+ --generation_max_length 50 \
138
+ --add_task_name False \
139
+ --add_dataset_name False \
140
+ --overwrite_output_dir \
141
+ --overwrite_cache \
142
+ --lr_scheduler_type constant \
143
+ --warmup_steps 0 \
144
+ --logging_strategy steps \
145
+ --logging_steps 10 \
146
+ --metric_for_best_model eval_exact_match_for_amazon \
147
+ --evaluation_strategy steps \
148
+ --save_strategy steps \
149
+ --save_total_limit 1 \
150
+ --load_best_model_at_end \
151
+ --lora_r 8 \
152
+ --lora_alpha 32 \
153
+ --lora_dropout 0.0 \
154
+ --data_replay_freq 5 \
155
+ --mlp_hidden_dim 100 \
156
+ --model_name specroute \
157
+ --kl_ratio 0.1 \
158
+ --gen_data_dir CL_Benchmark \
159
+ --threshold 0.980 \
160
+ --transthreshold 0.980 \
161
+ $FP16_FLAG
162
+
163
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/checkpoint*
164
+
165
+ sleep 5
166
+
167
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
168
+ BSZ=16; GA=1; EVAL_BSZ=256
169
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
170
+ BSZ=32; GA=1; EVAL_BSZ=256
171
+ else
172
+ BSZ=64; GA=1; EVAL_BSZ=512
173
+ fi
174
+
175
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
176
+ --do_train \
177
+ --do_predict \
178
+ --predict_with_generate \
179
+ --model_name_or_path $2 \
180
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights \
181
+ --data_dir CL_Benchmark \
182
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
183
+ --task_config_dir configs/gen_script_long_order3_t5_configs/mnli \
184
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli \
185
+ --per_device_train_batch_size $BSZ \
186
+ --per_device_eval_batch_size $EVAL_BSZ \
187
+ --gradient_accumulation_steps $GA \
188
+ --learning_rate 0.0003 \
189
+ --num_train_epochs 10 \
190
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
191
+ --max_source_length 512 \
192
+ --max_target_length 50 \
193
+ --generation_max_length 50 \
194
+ --add_task_name False \
195
+ --add_dataset_name False \
196
+ --overwrite_output_dir \
197
+ --overwrite_cache \
198
+ --lr_scheduler_type constant \
199
+ --warmup_steps 0 \
200
+ --logging_strategy steps \
201
+ --logging_steps 10 \
202
+ --metric_for_best_model eval_exact_match_for_mnli \
203
+ --evaluation_strategy steps \
204
+ --save_strategy steps \
205
+ --save_total_limit 1 \
206
+ --load_best_model_at_end \
207
+ --lora_r 8 \
208
+ --lora_alpha 32 \
209
+ --lora_dropout 0.0 \
210
+ --data_replay_freq 5 \
211
+ --mlp_hidden_dim 100 \
212
+ --model_name specroute \
213
+ --kl_ratio 0.1 \
214
+ --gen_data_dir CL_Benchmark \
215
+ --threshold 0.980 \
216
+ --transthreshold 0.980 \
217
+ $FP16_FLAG
218
+
219
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/checkpoint*
220
+
221
+ sleep 5
222
+
223
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
224
+ BSZ=16; GA=1; EVAL_BSZ=256
225
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
226
+ BSZ=32; GA=1; EVAL_BSZ=256
227
+ else
228
+ BSZ=64; GA=1; EVAL_BSZ=512
229
+ fi
230
+
231
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
232
+ --do_train \
233
+ --do_predict \
234
+ --predict_with_generate \
235
+ --model_name_or_path $2 \
236
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights \
237
+ --data_dir CL_Benchmark \
238
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
239
+ --task_config_dir configs/gen_script_long_order3_t5_configs/cb \
240
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb \
241
+ --per_device_train_batch_size $BSZ \
242
+ --per_device_eval_batch_size $EVAL_BSZ \
243
+ --gradient_accumulation_steps $GA \
244
+ --learning_rate 0.0003 \
245
+ --num_train_epochs 10 \
246
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
247
+ --max_source_length 512 \
248
+ --max_target_length 50 \
249
+ --generation_max_length 50 \
250
+ --add_task_name False \
251
+ --add_dataset_name False \
252
+ --overwrite_output_dir \
253
+ --overwrite_cache \
254
+ --lr_scheduler_type constant \
255
+ --warmup_steps 0 \
256
+ --logging_strategy steps \
257
+ --logging_steps 10 \
258
+ --metric_for_best_model eval_exact_match_for_cb \
259
+ --evaluation_strategy steps \
260
+ --save_strategy steps \
261
+ --save_total_limit 1 \
262
+ --load_best_model_at_end \
263
+ --lora_r 8 \
264
+ --lora_alpha 32 \
265
+ --lora_dropout 0.0 \
266
+ --data_replay_freq 5 \
267
+ --mlp_hidden_dim 100 \
268
+ --model_name specroute \
269
+ --kl_ratio 0.1 \
270
+ --gen_data_dir CL_Benchmark \
271
+ --threshold 0.980 \
272
+ --transthreshold 0.980 \
273
+ $FP16_FLAG
274
+
275
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/checkpoint*
276
+
277
+ sleep 5
278
+
279
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
280
+ BSZ=16; GA=1; EVAL_BSZ=256
281
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
282
+ BSZ=32; GA=1; EVAL_BSZ=256
283
+ else
284
+ BSZ=64; GA=1; EVAL_BSZ=512
285
+ fi
286
+
287
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
288
+ --do_train \
289
+ --do_predict \
290
+ --predict_with_generate \
291
+ --model_name_or_path $2 \
292
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights \
293
+ --data_dir CL_Benchmark \
294
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
295
+ --task_config_dir configs/gen_script_long_order3_t5_configs/copa \
296
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa \
297
+ --per_device_train_batch_size $BSZ \
298
+ --per_device_eval_batch_size $EVAL_BSZ \
299
+ --gradient_accumulation_steps $GA \
300
+ --learning_rate 0.0003 \
301
+ --num_train_epochs 10 \
302
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
303
+ --max_source_length 512 \
304
+ --max_target_length 50 \
305
+ --generation_max_length 50 \
306
+ --add_task_name False \
307
+ --add_dataset_name False \
308
+ --overwrite_output_dir \
309
+ --overwrite_cache \
310
+ --lr_scheduler_type constant \
311
+ --warmup_steps 0 \
312
+ --logging_strategy steps \
313
+ --logging_steps 10 \
314
+ --metric_for_best_model eval_exact_match_for_copa \
315
+ --evaluation_strategy steps \
316
+ --save_strategy steps \
317
+ --save_total_limit 1 \
318
+ --load_best_model_at_end \
319
+ --lora_r 8 \
320
+ --lora_alpha 32 \
321
+ --lora_dropout 0.0 \
322
+ --data_replay_freq 5 \
323
+ --mlp_hidden_dim 100 \
324
+ --model_name specroute \
325
+ --kl_ratio 0.1 \
326
+ --gen_data_dir CL_Benchmark \
327
+ --threshold 0.980 \
328
+ --transthreshold 0.980 \
329
+ $FP16_FLAG
330
+
331
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/checkpoint*
332
+
333
+ sleep 5
334
+
335
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
336
+ BSZ=16; GA=1; EVAL_BSZ=256
337
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
338
+ BSZ=32; GA=1; EVAL_BSZ=256
339
+ else
340
+ BSZ=64; GA=1; EVAL_BSZ=512
341
+ fi
342
+
343
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
344
+ --do_train \
345
+ --do_predict \
346
+ --predict_with_generate \
347
+ --model_name_or_path $2 \
348
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights \
349
+ --data_dir CL_Benchmark \
350
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
351
+ --task_config_dir configs/gen_script_long_order3_t5_configs/qqp \
352
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp \
353
+ --per_device_train_batch_size $BSZ \
354
+ --per_device_eval_batch_size $EVAL_BSZ \
355
+ --gradient_accumulation_steps $GA \
356
+ --learning_rate 0.0003 \
357
+ --num_train_epochs 10 \
358
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
359
+ --max_source_length 512 \
360
+ --max_target_length 50 \
361
+ --generation_max_length 50 \
362
+ --add_task_name False \
363
+ --add_dataset_name False \
364
+ --overwrite_output_dir \
365
+ --overwrite_cache \
366
+ --lr_scheduler_type constant \
367
+ --warmup_steps 0 \
368
+ --logging_strategy steps \
369
+ --logging_steps 10 \
370
+ --metric_for_best_model eval_exact_match_for_qqp \
371
+ --evaluation_strategy steps \
372
+ --save_strategy steps \
373
+ --save_total_limit 1 \
374
+ --load_best_model_at_end \
375
+ --lora_r 8 \
376
+ --lora_alpha 32 \
377
+ --lora_dropout 0.0 \
378
+ --data_replay_freq 5 \
379
+ --mlp_hidden_dim 100 \
380
+ --model_name specroute \
381
+ --kl_ratio 0.1 \
382
+ --gen_data_dir CL_Benchmark \
383
+ --threshold 0.980 \
384
+ --transthreshold 0.980 \
385
+ $FP16_FLAG
386
+
387
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/checkpoint*
388
+
389
+ sleep 5
390
+
391
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
392
+ BSZ=16; GA=1; EVAL_BSZ=256
393
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
394
+ BSZ=32; GA=1; EVAL_BSZ=256
395
+ else
396
+ BSZ=64; GA=1; EVAL_BSZ=512
397
+ fi
398
+
399
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
400
+ --do_train \
401
+ --do_predict \
402
+ --predict_with_generate \
403
+ --model_name_or_path $2 \
404
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights \
405
+ --data_dir CL_Benchmark \
406
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
407
+ --task_config_dir configs/gen_script_long_order3_t5_configs/rte \
408
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte \
409
+ --per_device_train_batch_size $BSZ \
410
+ --per_device_eval_batch_size $EVAL_BSZ \
411
+ --gradient_accumulation_steps $GA \
412
+ --learning_rate 0.0003 \
413
+ --num_train_epochs 10 \
414
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
415
+ --max_source_length 512 \
416
+ --max_target_length 50 \
417
+ --generation_max_length 50 \
418
+ --add_task_name False \
419
+ --add_dataset_name False \
420
+ --overwrite_output_dir \
421
+ --overwrite_cache \
422
+ --lr_scheduler_type constant \
423
+ --warmup_steps 0 \
424
+ --logging_strategy steps \
425
+ --logging_steps 10 \
426
+ --metric_for_best_model eval_exact_match_for_rte \
427
+ --evaluation_strategy steps \
428
+ --save_strategy steps \
429
+ --save_total_limit 1 \
430
+ --load_best_model_at_end \
431
+ --lora_r 8 \
432
+ --lora_alpha 32 \
433
+ --lora_dropout 0.0 \
434
+ --data_replay_freq 5 \
435
+ --mlp_hidden_dim 100 \
436
+ --model_name specroute \
437
+ --kl_ratio 0.1 \
438
+ --gen_data_dir CL_Benchmark \
439
+ --threshold 0.980 \
440
+ --transthreshold 0.980 \
441
+ $FP16_FLAG
442
+
443
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/checkpoint*
444
+
445
+ sleep 5
446
+
447
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
448
+ BSZ=16; GA=1; EVAL_BSZ=256
449
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
450
+ BSZ=32; GA=1; EVAL_BSZ=256
451
+ else
452
+ BSZ=64; GA=1; EVAL_BSZ=512
453
+ fi
454
+
455
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
456
+ --do_train \
457
+ --do_predict \
458
+ --predict_with_generate \
459
+ --model_name_or_path $2 \
460
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights \
461
+ --data_dir CL_Benchmark \
462
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
463
+ --task_config_dir configs/gen_script_long_order3_t5_configs/imdb \
464
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb \
465
+ --per_device_train_batch_size $BSZ \
466
+ --per_device_eval_batch_size $EVAL_BSZ \
467
+ --gradient_accumulation_steps $GA \
468
+ --learning_rate 0.0003 \
469
+ --num_train_epochs 10 \
470
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
471
+ --max_source_length 512 \
472
+ --max_target_length 50 \
473
+ --generation_max_length 50 \
474
+ --add_task_name False \
475
+ --add_dataset_name False \
476
+ --overwrite_output_dir \
477
+ --overwrite_cache \
478
+ --lr_scheduler_type constant \
479
+ --warmup_steps 0 \
480
+ --logging_strategy steps \
481
+ --logging_steps 10 \
482
+ --metric_for_best_model eval_exact_match_for_imdb \
483
+ --evaluation_strategy steps \
484
+ --save_strategy steps \
485
+ --save_total_limit 1 \
486
+ --load_best_model_at_end \
487
+ --lora_r 8 \
488
+ --lora_alpha 32 \
489
+ --lora_dropout 0.0 \
490
+ --data_replay_freq 5 \
491
+ --mlp_hidden_dim 100 \
492
+ --model_name specroute \
493
+ --kl_ratio 0.1 \
494
+ --gen_data_dir CL_Benchmark \
495
+ --threshold 0.980 \
496
+ --transthreshold 0.980 \
497
+ $FP16_FLAG
498
+
499
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/checkpoint*
500
+
501
+ sleep 5
502
+
503
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
504
+ BSZ=16; GA=1; EVAL_BSZ=256
505
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
506
+ BSZ=32; GA=1; EVAL_BSZ=256
507
+ else
508
+ BSZ=64; GA=1; EVAL_BSZ=512
509
+ fi
510
+
511
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
512
+ --do_train \
513
+ --do_predict \
514
+ --predict_with_generate \
515
+ --model_name_or_path $2 \
516
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights \
517
+ --data_dir CL_Benchmark \
518
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
519
+ --task_config_dir configs/gen_script_long_order3_t5_configs/sst2 \
520
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2 \
521
+ --per_device_train_batch_size $BSZ \
522
+ --per_device_eval_batch_size $EVAL_BSZ \
523
+ --gradient_accumulation_steps $GA \
524
+ --learning_rate 0.0003 \
525
+ --num_train_epochs 10 \
526
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
527
+ --max_source_length 512 \
528
+ --max_target_length 50 \
529
+ --generation_max_length 50 \
530
+ --add_task_name False \
531
+ --add_dataset_name False \
532
+ --overwrite_output_dir \
533
+ --overwrite_cache \
534
+ --lr_scheduler_type constant \
535
+ --warmup_steps 0 \
536
+ --logging_strategy steps \
537
+ --logging_steps 10 \
538
+ --metric_for_best_model eval_exact_match_for_sst2 \
539
+ --evaluation_strategy steps \
540
+ --save_strategy steps \
541
+ --save_total_limit 1 \
542
+ --load_best_model_at_end \
543
+ --lora_r 8 \
544
+ --lora_alpha 32 \
545
+ --lora_dropout 0.0 \
546
+ --data_replay_freq 5 \
547
+ --mlp_hidden_dim 100 \
548
+ --model_name specroute \
549
+ --kl_ratio 0.1 \
550
+ --gen_data_dir CL_Benchmark \
551
+ --threshold 0.980 \
552
+ --transthreshold 0.980 \
553
+ $FP16_FLAG
554
+
555
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/checkpoint*
556
+
557
+ sleep 5
558
+
559
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
560
+ BSZ=16; GA=1; EVAL_BSZ=256
561
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
562
+ BSZ=32; GA=1; EVAL_BSZ=256
563
+ else
564
+ BSZ=64; GA=1; EVAL_BSZ=512
565
+ fi
566
+
567
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
568
+ --do_train \
569
+ --do_predict \
570
+ --predict_with_generate \
571
+ --model_name_or_path $2 \
572
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights \
573
+ --data_dir CL_Benchmark \
574
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
575
+ --task_config_dir configs/gen_script_long_order3_t5_configs/dbpedia \
576
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia \
577
+ --per_device_train_batch_size $BSZ \
578
+ --per_device_eval_batch_size $EVAL_BSZ \
579
+ --gradient_accumulation_steps $GA \
580
+ --learning_rate 0.0003 \
581
+ --num_train_epochs 10 \
582
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
583
+ --max_source_length 512 \
584
+ --max_target_length 50 \
585
+ --generation_max_length 50 \
586
+ --add_task_name False \
587
+ --add_dataset_name False \
588
+ --overwrite_output_dir \
589
+ --overwrite_cache \
590
+ --lr_scheduler_type constant \
591
+ --warmup_steps 0 \
592
+ --logging_strategy steps \
593
+ --logging_steps 10 \
594
+ --metric_for_best_model eval_exact_match_for_dbpedia \
595
+ --evaluation_strategy steps \
596
+ --save_strategy steps \
597
+ --save_total_limit 1 \
598
+ --load_best_model_at_end \
599
+ --lora_r 8 \
600
+ --lora_alpha 32 \
601
+ --lora_dropout 0.0 \
602
+ --data_replay_freq 5 \
603
+ --mlp_hidden_dim 100 \
604
+ --model_name specroute \
605
+ --kl_ratio 0.1 \
606
+ --gen_data_dir CL_Benchmark \
607
+ --threshold 0.980 \
608
+ --transthreshold 0.980 \
609
+ $FP16_FLAG
610
+
611
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/checkpoint*
612
+
613
+ sleep 5
614
+
615
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
616
+ BSZ=16; GA=1; EVAL_BSZ=256
617
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
618
+ BSZ=32; GA=1; EVAL_BSZ=256
619
+ else
620
+ BSZ=64; GA=1; EVAL_BSZ=512
621
+ fi
622
+
623
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
624
+ --do_train \
625
+ --do_predict \
626
+ --predict_with_generate \
627
+ --model_name_or_path $2 \
628
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights \
629
+ --data_dir CL_Benchmark \
630
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
631
+ --task_config_dir configs/gen_script_long_order3_t5_configs/agnews \
632
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews \
633
+ --per_device_train_batch_size $BSZ \
634
+ --per_device_eval_batch_size $EVAL_BSZ \
635
+ --gradient_accumulation_steps $GA \
636
+ --learning_rate 0.0003 \
637
+ --num_train_epochs 10 \
638
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
639
+ --max_source_length 512 \
640
+ --max_target_length 50 \
641
+ --generation_max_length 50 \
642
+ --add_task_name False \
643
+ --add_dataset_name False \
644
+ --overwrite_output_dir \
645
+ --overwrite_cache \
646
+ --lr_scheduler_type constant \
647
+ --warmup_steps 0 \
648
+ --logging_strategy steps \
649
+ --logging_steps 10 \
650
+ --metric_for_best_model eval_exact_match_for_agnews \
651
+ --evaluation_strategy steps \
652
+ --save_strategy steps \
653
+ --save_total_limit 1 \
654
+ --load_best_model_at_end \
655
+ --lora_r 8 \
656
+ --lora_alpha 32 \
657
+ --lora_dropout 0.0 \
658
+ --data_replay_freq 5 \
659
+ --mlp_hidden_dim 100 \
660
+ --model_name specroute \
661
+ --kl_ratio 0.1 \
662
+ --gen_data_dir CL_Benchmark \
663
+ --threshold 0.980 \
664
+ --transthreshold 0.980 \
665
+ $FP16_FLAG
666
+
667
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/checkpoint*
668
+
669
+ sleep 5
670
+
671
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
672
+ BSZ=16; GA=1; EVAL_BSZ=256
673
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
674
+ BSZ=32; GA=1; EVAL_BSZ=256
675
+ else
676
+ BSZ=64; GA=1; EVAL_BSZ=512
677
+ fi
678
+
679
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
680
+ --do_train \
681
+ --do_predict \
682
+ --predict_with_generate \
683
+ --model_name_or_path $2 \
684
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/saved_weights \
685
+ --data_dir CL_Benchmark \
686
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
687
+ --task_config_dir configs/gen_script_long_order3_t5_configs/yahoo \
688
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo \
689
+ --per_device_train_batch_size $BSZ \
690
+ --per_device_eval_batch_size $EVAL_BSZ \
691
+ --gradient_accumulation_steps $GA \
692
+ --learning_rate 0.0003 \
693
+ --num_train_epochs 10 \
694
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
695
+ --max_source_length 512 \
696
+ --max_target_length 50 \
697
+ --generation_max_length 50 \
698
+ --add_task_name False \
699
+ --add_dataset_name False \
700
+ --overwrite_output_dir \
701
+ --overwrite_cache \
702
+ --lr_scheduler_type constant \
703
+ --warmup_steps 0 \
704
+ --logging_strategy steps \
705
+ --logging_steps 10 \
706
+ --metric_for_best_model eval_exact_match_for_yahoo \
707
+ --evaluation_strategy steps \
708
+ --save_strategy steps \
709
+ --save_total_limit 1 \
710
+ --load_best_model_at_end \
711
+ --lora_r 8 \
712
+ --lora_alpha 32 \
713
+ --lora_dropout 0.0 \
714
+ --data_replay_freq 5 \
715
+ --mlp_hidden_dim 100 \
716
+ --model_name specroute \
717
+ --kl_ratio 0.1 \
718
+ --gen_data_dir CL_Benchmark \
719
+ --threshold 0.980 \
720
+ --transthreshold 0.980 \
721
+ $FP16_FLAG
722
+
723
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo/checkpoint*
724
+
725
+ sleep 5
726
+
727
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
728
+ BSZ=16; GA=1; EVAL_BSZ=256
729
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
730
+ BSZ=32; GA=1; EVAL_BSZ=256
731
+ else
732
+ BSZ=64; GA=1; EVAL_BSZ=512
733
+ fi
734
+
735
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
736
+ --do_train \
737
+ --do_predict \
738
+ --predict_with_generate \
739
+ --model_name_or_path $2 \
740
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo/saved_weights \
741
+ --data_dir CL_Benchmark \
742
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
743
+ --task_config_dir configs/gen_script_long_order3_t5_configs/multirc \
744
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/13-multirc \
745
+ --per_device_train_batch_size $BSZ \
746
+ --per_device_eval_batch_size $EVAL_BSZ \
747
+ --gradient_accumulation_steps $GA \
748
+ --learning_rate 0.0003 \
749
+ --num_train_epochs 10 \
750
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
751
+ --max_source_length 512 \
752
+ --max_target_length 50 \
753
+ --generation_max_length 50 \
754
+ --add_task_name False \
755
+ --add_dataset_name False \
756
+ --overwrite_output_dir \
757
+ --overwrite_cache \
758
+ --lr_scheduler_type constant \
759
+ --warmup_steps 0 \
760
+ --logging_strategy steps \
761
+ --logging_steps 10 \
762
+ --metric_for_best_model eval_exact_match_for_multirc \
763
+ --evaluation_strategy steps \
764
+ --save_strategy steps \
765
+ --save_total_limit 1 \
766
+ --load_best_model_at_end \
767
+ --lora_r 8 \
768
+ --lora_alpha 32 \
769
+ --lora_dropout 0.0 \
770
+ --data_replay_freq 5 \
771
+ --mlp_hidden_dim 100 \
772
+ --model_name specroute \
773
+ --kl_ratio 0.1 \
774
+ --gen_data_dir CL_Benchmark \
775
+ --threshold 0.980 \
776
+ --transthreshold 0.980 \
777
+ $FP16_FLAG
778
+
779
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/13-multirc/checkpoint*
780
+
781
+ sleep 5
782
+
783
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
784
+ BSZ=16; GA=1; EVAL_BSZ=256
785
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
786
+ BSZ=32; GA=1; EVAL_BSZ=256
787
+ else
788
+ BSZ=64; GA=1; EVAL_BSZ=512
789
+ fi
790
+
791
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
792
+ --do_train \
793
+ --do_predict \
794
+ --predict_with_generate \
795
+ --model_name_or_path $2 \
796
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/13-multirc/saved_weights \
797
+ --data_dir CL_Benchmark \
798
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
799
+ --task_config_dir configs/gen_script_long_order3_t5_configs/boolq \
800
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/14-boolq \
801
+ --per_device_train_batch_size $BSZ \
802
+ --per_device_eval_batch_size $EVAL_BSZ \
803
+ --gradient_accumulation_steps $GA \
804
+ --learning_rate 0.0003 \
805
+ --num_train_epochs 10 \
806
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
807
+ --max_source_length 512 \
808
+ --max_target_length 50 \
809
+ --generation_max_length 50 \
810
+ --add_task_name False \
811
+ --add_dataset_name False \
812
+ --overwrite_output_dir \
813
+ --overwrite_cache \
814
+ --lr_scheduler_type constant \
815
+ --warmup_steps 0 \
816
+ --logging_strategy steps \
817
+ --logging_steps 10 \
818
+ --metric_for_best_model eval_exact_match_for_boolq \
819
+ --evaluation_strategy steps \
820
+ --save_strategy steps \
821
+ --save_total_limit 1 \
822
+ --load_best_model_at_end \
823
+ --lora_r 8 \
824
+ --lora_alpha 32 \
825
+ --lora_dropout 0.0 \
826
+ --data_replay_freq 5 \
827
+ --mlp_hidden_dim 100 \
828
+ --model_name specroute \
829
+ --kl_ratio 0.1 \
830
+ --gen_data_dir CL_Benchmark \
831
+ --threshold 0.980 \
832
+ --transthreshold 0.980 \
833
+ $FP16_FLAG
834
+
835
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/14-boolq/checkpoint*
836
+
837
+ sleep 5
838
+
839
+ if [ "$GPU_MODE" = "t4_2gpu" ]; then
840
+ BSZ=16; GA=1; EVAL_BSZ=256
841
+ elif [ "$GPU_MODE" = "t4_1gpu" ]; then
842
+ BSZ=32; GA=1; EVAL_BSZ=256
843
+ else
844
+ BSZ=64; GA=1; EVAL_BSZ=512
845
+ fi
846
+
847
+ CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
848
+ --do_train \
849
+ --do_predict \
850
+ --predict_with_generate \
851
+ --model_name_or_path $2 \
852
+ --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/13-multirc/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/14-boolq/saved_weights \
853
+ --data_dir CL_Benchmark \
854
+ --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
855
+ --task_config_dir configs/gen_script_long_order3_t5_configs/wic \
856
+ --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/15-wic \
857
+ --per_device_train_batch_size $BSZ \
858
+ --per_device_eval_batch_size $EVAL_BSZ \
859
+ --gradient_accumulation_steps $GA \
860
+ --learning_rate 0.0003 \
861
+ --num_train_epochs 10 \
862
+ --run_name gen_script_long_order3_t5_small_specroute_v2 \
863
+ --max_source_length 512 \
864
+ --max_target_length 50 \
865
+ --generation_max_length 50 \
866
+ --add_task_name False \
867
+ --add_dataset_name False \
868
+ --overwrite_output_dir \
869
+ --overwrite_cache \
870
+ --lr_scheduler_type constant \
871
+ --warmup_steps 0 \
872
+ --logging_strategy steps \
873
+ --logging_steps 10 \
874
+ --metric_for_best_model eval_exact_match_for_wic \
875
+ --evaluation_strategy steps \
876
+ --save_strategy steps \
877
+ --save_total_limit 1 \
878
+ --load_best_model_at_end \
879
+ --lora_r 8 \
880
+ --lora_alpha 32 \
881
+ --lora_dropout 0.0 \
882
+ --data_replay_freq 5 \
883
+ --mlp_hidden_dim 100 \
884
+ --model_name specroute \
885
+ --kl_ratio 0.1 \
886
+ --gen_data_dir CL_Benchmark \
887
+ --threshold 0.980 \
888
+ --transthreshold 0.980 \
889
+ $FP16_FLAG
890
+
891
+ rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/15-wic/checkpoint*
892
+
893
+ sleep 5
improve_gainlora/generate_specroute_scripts_v2.py CHANGED
@@ -107,7 +107,7 @@ SCRIPT_CONFIGS = {
107
  "lora_r": 8,
108
  "epochs": 10,
109
  "metric_base": "eval_exact_match",
110
- "do_predict": False,
111
  "cleanup_checkpoints": True,
112
  "batch_a100_task1": (8, 4),
113
  "batch_a100_rest": (16, 2),
@@ -125,7 +125,7 @@ SCRIPT_CONFIGS = {
125
  "lora_r": 8,
126
  "epochs": 10,
127
  "metric_base": "eval_exact_match",
128
- "do_predict": False,
129
  "cleanup_checkpoints": True,
130
  "batch_a100_task1": (8, 4),
131
  "batch_a100_rest": (16, 2),
 
107
  "lora_r": 8,
108
  "epochs": 10,
109
  "metric_base": "eval_exact_match",
110
+ "do_predict": True,
111
  "cleanup_checkpoints": True,
112
  "batch_a100_task1": (8, 4),
113
  "batch_a100_rest": (16, 2),
 
125
  "lora_r": 8,
126
  "epochs": 10,
127
  "metric_base": "eval_exact_match",
128
+ "do_predict": True,
129
  "cleanup_checkpoints": True,
130
  "batch_a100_task1": (8, 4),
131
  "batch_a100_rest": (16, 2),
improve_gainlora/src/cl_trainer_specroute.py CHANGED
@@ -68,9 +68,21 @@ class DenserEvalCallback(TrainerCallback):
68
  return control
69
 
70
 
 
 
 
 
 
 
 
 
 
 
 
71
  class SpecRoute_Trainer(Seq2SeqTrainer):
72
 
73
  def __init__(self, model, args, train_dataset, cur_task_id, task_order,
 
74
  eval_dataset=None, tokenizer=None, data_collator=None,
75
  compute_metrics=None, callbacks=None):
76
  super().__init__(
@@ -82,7 +94,32 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
82
  self.task_order = task_order
83
  self.cur_task_id = cur_task_id
84
  self._grad_check_done = False
85
- # No replay data needed for SpecRoute
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  def _save(self, output_dir=None, state_dict=None):
88
  # T5 shared embeddings are incompatible with safetensors; force pytorch format
@@ -94,8 +131,56 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
94
  self.args.save_safetensors = old
95
 
96
  def training_step(self, model, inputs, **kwargs):
97
- """Override to add one-time gradient diagnostic."""
98
- loss = super().training_step(model, inputs, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  # One-time gradient check after first backward
101
  if not self._grad_check_done:
 
68
  return control
69
 
70
 
71
+ def create_memory_replay_generators(task, task_list, replay_data_dict):
72
+ """Create cycling iterators for previous tasks' replay data."""
73
+ print('Creating generators for previous tasks (SpecRoute replay) ...')
74
+ tasks_to_generators = {}
75
+ curr_task_num = task_list.index(task)
76
+ for idx in np.arange(curr_task_num):
77
+ prev_task = task_list[idx]
78
+ tasks_to_generators[prev_task] = iter(replay_data_dict[prev_task])
79
+ return tasks_to_generators
80
+
81
+
82
  class SpecRoute_Trainer(Seq2SeqTrainer):
83
 
84
  def __init__(self, model, args, train_dataset, cur_task_id, task_order,
85
+ data_collator_replay=None, replay_dataset_dict=None,
86
  eval_dataset=None, tokenizer=None, data_collator=None,
87
  compute_metrics=None, callbacks=None):
88
  super().__init__(
 
94
  self.task_order = task_order
95
  self.cur_task_id = cur_task_id
96
  self._grad_check_done = False
97
+
98
+ # Experience replay setup
99
+ self.data_collator_replay = data_collator_replay
100
+ self.replay_dataset_dict = replay_dataset_dict
101
+ if self.args.data_replay_freq != -1 and replay_dataset_dict is not None:
102
+ from torch.utils.data import RandomSampler
103
+ from transformers.trainer_utils import seed_worker
104
+ seed = self.args.data_seed if self.args.data_seed is not None else self.args.seed
105
+ generator = torch.Generator()
106
+ generator.manual_seed(seed)
107
+ self.replay_dataloader_dict = {}
108
+ for dataset_name, dataset in self.replay_dataset_dict.items():
109
+ train_sampler = RandomSampler(dataset, generator=generator)
110
+ self.replay_dataloader_dict[dataset_name] = DataLoader(
111
+ dataset,
112
+ batch_size=self._train_batch_size,
113
+ sampler=train_sampler,
114
+ collate_fn=self.data_collator_replay,
115
+ drop_last=self.args.dataloader_drop_last,
116
+ num_workers=self.args.dataloader_num_workers,
117
+ pin_memory=False,
118
+ worker_init_fn=seed_worker)
119
+ self.replay_iterator_dict = create_memory_replay_generators(
120
+ task_order[cur_task_id], task_order, self.replay_dataloader_dict)
121
+ print(f"[SpecRoute Replay] Enabled: {len(self.replay_dataloader_dict)} tasks, "
122
+ f"freq={self.args.data_replay_freq}, ratio={self.args.kl_ratio}")
123
 
124
  def _save(self, output_dir=None, state_dict=None):
125
  # T5 shared embeddings are incompatible with safetensors; force pytorch format
 
131
  self.args.save_safetensors = old
132
 
133
  def training_step(self, model, inputs, **kwargs):
134
+ """Override to add experience replay and one-time gradient diagnostic."""
135
+ model.train()
136
+ inputs = self._prepare_inputs(inputs)
137
+
138
+ with self.compute_loss_context_manager():
139
+ loss = self.compute_loss(model, inputs)
140
+
141
+ if self.args.n_gpu > 1:
142
+ loss = loss.mean()
143
+
144
+ if self.args.gradient_accumulation_steps > 1 and not self.is_deepspeed_enabled:
145
+ loss = loss / self.args.gradient_accumulation_steps
146
+
147
+ if self.is_deepspeed_enabled:
148
+ self.accelerator.backward(loss)
149
+ else:
150
+ loss.backward()
151
+
152
+ # === Experience Replay: CE loss on old task data ===
153
+ replay_freq = getattr(self.args, 'data_replay_freq', -1)
154
+ if (replay_freq != -1
155
+ and hasattr(self, 'replay_iterator_dict')
156
+ and self.state.global_step > getattr(self.args, 'replay_after_n_epoch', 0) * getattr(self.args, 'step_per_epoch', 0)
157
+ and self.state.global_step % replay_freq == 0):
158
+ for item in list(self.replay_iterator_dict.keys()):
159
+ generator_mem = self.replay_iterator_dict[item]
160
+ try:
161
+ b = next(generator_mem)
162
+ except StopIteration:
163
+ generator_mem = iter(self.replay_dataloader_dict[item])
164
+ self.replay_iterator_dict[item] = generator_mem
165
+ b = next(generator_mem)
166
+
167
+ # Remove replay_labels if present (not needed for CE replay)
168
+ b.pop("replay_labels", None)
169
+ replay_inputs = self._prepare_inputs(b)
170
+
171
+ with self.compute_loss_context_manager():
172
+ replay_loss = self.compute_loss(model, replay_inputs)
173
+
174
+ kl_ratio = getattr(self.args, 'kl_ratio', 0.1)
175
+ replay_loss = kl_ratio * replay_loss
176
+
177
+ if self.args.n_gpu > 1:
178
+ replay_loss = replay_loss.mean()
179
+
180
+ if self.is_deepspeed_enabled:
181
+ self.accelerator.backward(replay_loss)
182
+ else:
183
+ replay_loss.backward()
184
 
185
  # One-time gradient check after first backward
186
  if not self._grad_check_done:
improve_gainlora/src/run_t5.py CHANGED
@@ -707,7 +707,9 @@ def main():
707
  input_record_file=data_args.input_record_file)
708
 
709
  replay_dataset_dict, replay_label_dict = None, None
710
- if model_args.load_checkpoint_from:
 
 
711
  replay_dataset_dict = {}
712
  abs_data_dir_replay = os.path.abspath(data_dir) if data_dir else None
713
  for idx in range(cur_task_id):
@@ -725,12 +727,14 @@ def main():
725
  replay_dataset_dict[task_order[idx]] = raw_datasets_gen["train"]
726
  print(raw_datasets_gen)
727
 
728
- replay_label_dict = {}
729
- for idx in range(0,cur_task_id):
730
- with open(os.path.join("../logs_and_outputs/" + training_args.run_name + "/outputs/", str(idx+1)+"-"+task_order[idx], "saved_weights", "attention_weights.pkl"), 'rb') as f:
731
- attn_weights = pickle.load(f)
732
- replay_label_dict[task_order[idx]] = torch.cat([torch.tensor([0.] * (cur_task_id - idx)), torch.tensor(attn_weights)], dim=0).to(dtype=torch.bfloat16, device='cuda')
733
- print(replay_label_dict)
 
 
734
  print('-'*50)
735
 
736
  # Metric
@@ -866,6 +870,8 @@ def main():
866
  train_dataset=train_dataset if training_args.do_train else None,
867
  cur_task_id=cur_task_id,
868
  task_order=task_order,
 
 
869
  eval_dataset=eval_dataset if training_args.do_eval else None,
870
  tokenizer=tokenizer,
871
  data_collator=data_collator,
 
707
  input_record_file=data_args.input_record_file)
708
 
709
  replay_dataset_dict, replay_label_dict = None, None
710
+ # Load replay datasets for methods that need it
711
+ _need_replay_data = model_args.load_checkpoint_from or (training_args.model_name == 'specroute' and cur_task_id > 0)
712
+ if _need_replay_data:
713
  replay_dataset_dict = {}
714
  abs_data_dir_replay = os.path.abspath(data_dir) if data_dir else None
715
  for idx in range(cur_task_id):
 
727
  replay_dataset_dict[task_order[idx]] = raw_datasets_gen["train"]
728
  print(raw_datasets_gen)
729
 
730
+ # Load attention weights for KL replay (GainLoRA only, not SpecRoute)
731
+ if model_args.load_checkpoint_from:
732
+ replay_label_dict = {}
733
+ for idx in range(0,cur_task_id):
734
+ with open(os.path.join("../logs_and_outputs/" + training_args.run_name + "/outputs/", str(idx+1)+"-"+task_order[idx], "saved_weights", "attention_weights.pkl"), 'rb') as f:
735
+ attn_weights = pickle.load(f)
736
+ replay_label_dict[task_order[idx]] = torch.cat([torch.tensor([0.] * (cur_task_id - idx)), torch.tensor(attn_weights)], dim=0).to(dtype=torch.bfloat16, device='cuda')
737
+ print(replay_label_dict)
738
  print('-'*50)
739
 
740
  # Metric
 
870
  train_dataset=train_dataset if training_args.do_train else None,
871
  cur_task_id=cur_task_id,
872
  task_order=task_order,
873
+ data_collator_replay=data_collator_replay,
874
+ replay_dataset_dict=replay_dataset_dict,
875
  eval_dataset=eval_dataset if training_args.do_eval else None,
876
  tokenizer=tokenizer,
877
  data_collator=data_collator,
results/comparison_results.md CHANGED
@@ -152,7 +152,18 @@ python src/compute_ap_ft.py \
152
  | 13 | agnews | | | | |
153
  | 14 | multirc | | | | |
154
  | 15 | yahoo | | | | |
155
- | | **AP / FT** | **77.54 / 1.20** | | | |
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  ---
158
 
@@ -160,43 +171,34 @@ python src/compute_ap_ft.py \
160
 
161
  | Method | Order 3 AP↑ | Order 3 FT↓ |
162
  |--------|-------------|-------------|
163
- | **GainLoRA (Root)** | 59.70 | N/A* |
164
- | **SpecRoute (Improve)** | *(chờ chạy)* | *(chờ chạy)* |
165
 
166
- > *\*FT = N/A: Log chỉ chứa Evaluation task cuối (15-wic). Các task trước không có bước `--do_predict` để xuất cross-task matrix. Lần chạy sau dùng script trong `T5_small/` đã được sửa để có FT.*
 
167
 
168
- ## Per-Task Breakdown Order 3 (T5-Small)
169
 
170
- | # | Task | GainLoRA (Root) Final | SpecRoute (Improve) Final |
171
- |---|------|-----------------------|---------------------------|
172
- | 1 | yelp | 56.01 | |
173
- | 2 | amazon | 52.05 | |
174
- | 3 | mnli | 34.07 | |
175
- | 4 | cb | 3.57 | |
176
- | 5 | copa | 42.00 | |
177
- | 6 | qqp | 76.96 | |
178
- | 7 | rte | 45.85 | |
179
- | 8 | imdb | 89.51 | |
180
- | 9 | sst2 | 85.21 | |
181
- | 10 | dbpedia | 98.16 | |
182
- | 11 | agnews | 88.37 | |
183
- | 12 | yahoo | 57.28 | |
184
- | 13 | multirc | 50.52 | |
185
- | 14 | boolq | 60.43 | |
186
- | 15 | wic | 55.49 | |
187
- | | **AP / FT** | **59.70 / N/A** | |
188
 
189
- ---
190
-
191
- ## Quick Harvest (chạy sau khi xong cả 4 orders)
192
-
193
- ```bash
194
- # Chạy 4 lệnh này để lấy đủ số cho cả 2 bảng:
195
- python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order1_t5large/outputs --task_order "task1572_samsum_summary,task363_sst2_polarity_classification,task1290_xsum_summarization,task181_outcome_extraction,task002_quoref_answer_generation,task1510_evalution_relation_extraction,task639_multi_woz_user_utterance_generation,task1729_personachat_generate_next,task073_commonsenseqa_answer_generation,task1590_diplomacy_text_generation,task748_glucose_reverse_cause_event_detection,task511_reddit_tifu_long_text_summarization,task591_sciq_answer_generation,task1687_sentiment140_classification,task875_emotion_classification" --save
196
-
197
- python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order2_t5large/outputs --task_order "task748_glucose_reverse_cause_event_detection,task073_commonsenseqa_answer_generation,task1590_diplomacy_text_generation,task639_multi_woz_user_utterance_generation,task1572_samsum_summary,task1687_sentiment140_classification,task591_sciq_answer_generation,task363_sst2_polarity_classification,task1510_evalution_relation_extraction,task1729_personachat_generate_next,task181_outcome_extraction,task511_reddit_tifu_long_text_summarization,task002_quoref_answer_generation,task1290_xsum_summarization,task875_emotion_classification" --save
198
 
199
- python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order3_t5large/outputs --task_order "yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic" --save
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order4_t5large/outputs --task_order "mnli,cb,wic,copa,qqp,boolq,rte,imdb,yelp,amazon,sst2,dbpedia,agnews,multirc,yahoo" --save
202
- ```
 
152
  | 13 | agnews | | | | |
153
  | 14 | multirc | | | | |
154
  | 15 | yahoo | | | | |
155
+
156
+
157
+ ```bash
158
+ # Chạy 4 lệnh này để lấy đủ số cho cả 2 bảng:
159
+ python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order1_t5large/outputs --task_order "task1572_samsum_summary,task363_sst2_polarity_classification,task1290_xsum_summarization,task181_outcome_extraction,task002_quoref_answer_generation,task1510_evalution_relation_extraction,task639_multi_woz_user_utterance_generation,task1729_personachat_generate_next,task073_commonsenseqa_answer_generation,task1590_diplomacy_text_generation,task748_glucose_reverse_cause_event_detection,task511_reddit_tifu_long_text_summarization,task591_sciq_answer_generation,task1687_sentiment140_classification,task875_emotion_classification" --save
160
+
161
+ python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order2_t5large/outputs --task_order "task748_glucose_reverse_cause_event_detection,task073_commonsenseqa_answer_generation,task1590_diplomacy_text_generation,task639_multi_woz_user_utterance_generation,task1572_samsum_summary,task1687_sentiment140_classification,task591_sciq_answer_generation,task363_sst2_polarity_classification,task1510_evalution_relation_extraction,task1729_personachat_generate_next,task181_outcome_extraction,task511_reddit_tifu_long_text_summarization,task002_quoref_answer_generation,task1290_xsum_summarization,task875_emotion_classification" --save
162
+
163
+ python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order3_t5large/outputs --task_order "yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic" --save
164
+
165
+ python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order4_t5large/outputs --task_order "mnli,cb,wic,copa,qqp,boolq,rte,imdb,yelp,amazon,sst2,dbpedia,agnews,multirc,yahoo" --save
166
+ ```
167
 
168
  ---
169
 
 
171
 
172
  | Method | Order 3 AP↑ | Order 3 FT↓ |
173
  |--------|-------------|-------------|
174
+ | **GainLoRA (Root)** | **59.70** | N/A* |
175
+ | **SpecRoute (Improve)** | 39.74† | N/A* |
176
 
177
+ > *\*FT = N/A: cả 2 log chạy thiếu `--do_predict`. Lần tiếp theo dùng script `T5_small/` đã sửa sẽ đủ FT.*
178
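+ > *†Điểm Improve tính từ `predict_eval_predictions.jsonl` của từng task (hàng chéo score matrix). imdb/sst2/wic về 0; vì đây là điểm hàng chéo (đo ngay sau khi train từng task) nên nhiều khả năng model không học được các task này, chứ chưa thể kết luận là do Catastrophic Forgetting.*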
179
 
180
+ ### ⚠️ Root GainLoRA tốt hơn SpecRoute trên T5-Small (−19.96 AP)
181
 
182
+ SpecRoute bị Catastrophic Forgetting nghiêm trọng các task phân loại sentiment (imdb=0.21, sst2=0.00, yahoo=8.12, wic=0.00). Nguyên nhân có thể do SVD rank không đủ lớn ở T5-Small, làm routing mechanism không phân tách được subspace của các task.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
+ ## Per-Task Breakdown — Order 3 (T5-Small)
 
 
 
 
 
 
 
 
185
 
186
+ | # | Task | GainLoRA (Root) | SpecRoute (Improve) | Δ (Improve−Root) |
187
+ |---|------|-----------------|--------------------|-----------------|
188
+ | 1 | yelp | 56.01 | 54.36 | −1.65 |
189
+ | 2 | amazon | 52.05 | 50.01 | −2.04 |
190
+ | 3 | mnli | 34.07 | 35.50 | +1.43 |
191
+ | 4 | cb | 3.57 | 0.00 | −3.57 |
192
+ | 5 | copa | 42.00 | 44.00 | +2.00 |
193
+ | 6 | qqp | 76.96 | 76.72 | −0.24 |
194
+ | 7 | rte | 45.85 | 50.90 | +5.05 |
195
+ | 8 | imdb | 89.51 | 0.21 | **−89.30 ⚠️** |
196
+ | 9 | sst2 | 85.21 | 0.00 | **−85.21 ⚠️** |
197
+ | 10 | dbpedia | 98.16 | 92.22 | −5.94 |
198
+ | 11 | agnews | 88.37 | 68.76 | −19.61 |
199
+ | 12 | yahoo | 57.28 | 8.12 | **−49.16 ⚠️** |
200
+ | 13 | multirc | 50.52 | 54.23 | +3.71 |
201
+ | 14 | boolq | 60.43 | 61.13 | +0.70 |
202
+ | 15 | wic | 55.49 | 0.00 | **−55.49 ⚠️** |
203
+ | | **AP / FT** | **59.70 / N/A** | **39.74 / N/A** | **−19.96** |
204
 
 
 
results/experiment_versions.md ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SpecRoute — Báo cáo Thử nghiệm theo Version
2
+
3
+ > Tracking tất cả versions thử nghiệm, kết quả, phân tích, và cải tiến.
4
+ > Benchmark: Long Sequence Order 3, 15 classification tasks, model T5-Small.
5
+
6
+ ---
7
+
8
+ ## Version 1.0 — Baseline SpecRoute (Kết quả đầu tiên)
9
+
10
+ ### Kịch bản thử nghiệm
11
+ - **Model**: T5-Small (d_model=512, 6 encoder + 6 decoder layers)
12
+ - **Method**: SpecRoute — spectral routing (SVD of LoRA B@A) thay thế learned routing (trans_input + prompt_key) của GainLoRA
13
+ - **So sánh**: ROOT GainLoRA-InfLoRA (original codebase)
14
+ - **Hyperparameters**: lora_r=8, lora_alpha=32, lr=3e-4, 10 epochs, threshold=0.995
15
+ - **Platform**: Kaggle T4 GPU
16
+
17
+ ### Kết quả
18
+
19
+ | # | Task | ROOT (Final R_{15,j}) | SpecRoute (Peak R_{j,j}) | Δ |
20
+ |---|------|-----------------------|--------------------------|---|
21
+ | 1 | yelp | 56.01 | 54.36 | -1.65 |
22
+ | 2 | amazon | 52.05 | 50.01 | -2.04 |
23
+ | 3 | mnli | 34.07 | 35.50 | +1.43 |
24
+ | 4 | cb | 3.57 | 0.00 | -3.57 |
25
+ | 5 | copa | 42.00 | 44.00 | +2.00 |
26
+ | 6 | qqp | 76.96 | 76.72 | -0.24 |
27
+ | 7 | rte | 45.85 | 50.90 | +5.05 |
28
+ | 8 | imdb | 89.51 | **0.21** ⚠️ | -89.30 |
29
+ | 9 | sst2 | 85.21 | **0.00** ⚠️ | -85.21 |
30
+ | 10 | dbpedia | 98.16 | 92.22 | -5.94 |
31
+ | 11 | agnews | 88.37 | 68.76 | -19.61 |
32
+ | 12 | yahoo | 57.28 | **8.12** ⚠️ | -49.16 |
33
+ | 13 | multirc | 50.52 | 54.23 | +3.71 |
34
+ | 14 | boolq | 60.43 | 61.13 | +0.70 |
35
+ | 15 | wic | 55.49 | **0.00** ⚠️ | -55.49 |
36
+ | | **Mean** | **59.70** | **39.74** | **-19.96** |
37
+
38
+ > ⚠️ **LƯU Ý QUAN TRỌNG**: So sánh KHÔNG công bằng — ROOT dùng R_{15,j} (final, sau tất cả 15 tasks), SpecRoute dùng R_{j,j} (peak, ngay sau train từng task). AP thực của SpecRoute sẽ thấp hơn 39.74.
39
+
40
+ ### Phân tích
41
+
42
+ **1. Prediction metrics không được lưu**
43
+ - SpecRoute `all_results.json` chỉ chứa training metrics, KHÔNG có `predict_exact_match_for_{task}`
44
+ - `task_order.txt` không tồn tại → `score.py` không thể tính AP/FT
45
+ - Nguyên nhân: Có thể do experiment được chạy bằng script khác (không phải T5_small/ scripts đã fix `--do_predict`)
46
+ - T5-large script generator (`generate_specroute_scripts_v2.py`) vẫn có bug `do_predict=False` cho long benchmarks
47
+
48
+ **2. Các tasks THẤT BẠI KHÔNG PHẢI do catastrophic forgetting**
49
+
50
+ | Task | Train Loss (Root) | Train Loss (SpecRoute) | Ratio | Verdict |
51
+ |------|:-:|:-:|:-:|---|
52
+ | imdb | 1.41 | **4.15** | 2.9x | Không thể học |
53
+ | sst2 | 1.76 | **4.45** | 2.5x | Không thể học |
54
+ | yahoo | 1.19 | **3.08** | 2.6x | Không thể học |
55
+ | wic | 0.96 | **3.65** | 3.8x | Không thể học |
56
+
57
+ Training loss cao gấp 2.5-3.8x → model KHÔNG THỂ HỌC ngay từ đầu (inability to learn, NOT catastrophic forgetting).
58
+
59
+ **3. Nguyên nhân gốc: GPM null-space saturation + thiếu protection mechanisms**
60
+
61
+ SpecRoute loại bỏ learned routing → đồng thời mất 4/5 cơ chế protection của ROOT:
62
+
63
+ | Protection Mechanism | ROOT | SpecRoute V1 |
64
+ |---------------------|:---:|:---:|
65
+ | GPM on LoRA A | ✅ | ✅ |
66
+ | KL distillation on routing | ✅ | ❌ |
67
+ | Data replay | ✅ | ❌ |
68
+ | Per-step GPM on routing params | ✅ | ❌ (no routing params) |
69
+ | Learned routing adaptation | ✅ | ❌ (by design) |
70
+
71
+ Khi tasks tương tự (imdb/sst2 vs yelp/amazon — cùng sentiment domain) đến, GPM đã "claim" sentiment-relevant directions → model bị ép vào orthogonal null-space không liên quan → KHÔNG thể học sentiment tasks mới.
72
+
73
+ ROOT GainLoRA giải quyết vấn đề này nhờ trans_input MLP map input mới vào representation space REUSE kiến thức cũ, kết hợp KL distillation + data replay.
74
+
75
+ **4. FT (Forgetting) = N/A**
76
+ - Không tính được vì thiếu cross-task prediction metrics
77
+
78
+ ### Cải tiến cho V2
79
+
80
+ | # | Loại | Nội dung | Tác động |
81
+ |---|------|---------|----------|
82
+ | 1 | Bug fix | Fix `do_predict=False` → `True` trong generator | Cho phép tính AP/FT đúng |
83
+ | 2 | Config | Giảm GPM threshold: 0.995 → 0.980 | Mở rộng null-space cho tasks sau |
84
+ | 3 | **Idea change** | Thêm Experience Replay (CE loss trên old task data) | Chống forgetting + hỗ trợ knowledge reuse |
85
+
86
+ ---
87
+
88
+ ## Version 2.0 — SpecRoute + Experience Replay (Planned)
89
+
90
+ ### Thay đổi về Idea
91
+
92
+ > **⚠️ IDEA CHANGE**: Version 2 thêm **Experience Replay (CE loss)** vào SpecRoute.
93
+ >
94
+ > SpecRoute V1 claim rằng spectral routing parameter-free đủ để thay thế learned routing. V2 bổ sung rằng:
95
+ > - Spectral routing thay thế **routing mechanism** (đúng, giữ nguyên)
96
+ > - Nhưng **protection mechanisms** (data replay) là ORTHOGONAL với routing mechanism và cần được giữ lại
97
+ > - V2 sử dụng **CE replay trực tiếp** trên old task training data (không cần teacher model hay saved logits)
98
+ > - Khác ROOT (KL on routing scores): SpecRoute replay chỉ cần CE loss vì routing là parameter-free
99
+ >
100
+ > Đây là sự thay đổi từ "spectral routing is sufficient" sang "spectral routing + replay protection is the complete solution".
101
+ > Bản chất: **decouple routing mechanism khỏi protection mechanisms**.
102
+
103
+ ### Kịch bản thử nghiệm
104
+ - **Model**: T5-Small (d_model=512, 6 encoder + 6 decoder layers)
105
+ - **Method**: SpecRoute V2 — spectral routing + experience replay (CE loss trên original training data)
106
+ - **Hyperparameters**:
107
+ - lora_r=8, lora_alpha=32, lr=3e-4, 10 epochs
108
+ - **threshold=0.980** (giảm từ 0.995)
109
+ - **data_replay_freq=5** (replay mỗi 5 steps)
110
+ - **kl_ratio=0.1** (weight cho replay CE loss)
111
+ - **gen_data_dir=CL_Benchmark** (replay từ original training data)
112
+ - **Script**: `T5_small/gen_script_long_order3_t5_small_specroute_v2.sh`
113
+ - **Platform**: Kaggle T4 GPU
114
+
115
+ ### Code Changes (Actual)
116
+
117
+ **1. Bug Fix: `generate_specroute_scripts_v2.py`**
118
+ - `do_predict=False` → `True` cho `long_order3` và `long_order4`
119
+
120
+ **2. Trainer: `cl_trainer_specroute.py`**
121
+ - Thêm `create_memory_replay_generators()` — tạo DataLoader cycling iterators
122
+ - `__init__()`: nhận `data_collator_replay`, `replay_dataset_dict`, tạo `replay_dataloader_dict` và `replay_iterator_dict`
123
+ - `training_step()`: Sau main CE loss backward, replay CE loss trên old task data:
124
+ ```
125
+ Mỗi replay_freq steps:
126
+ For each old task:
127
+ sample batch from replay iterator
128
+ replay_loss = kl_ratio * CE_loss(model, replay_batch)
129
+ replay_loss.backward()
130
+ ```
131
+
132
+ **3. Run entry: `run_t5.py`**
133
+ - Mở rộng replay dataset loading condition: `load_checkpoint_from OR (specroute AND cur_task_id > 0)`
134
+ - Skip `attention_weights.pkl` loading cho SpecRoute (không cần KL on routing)
135
+ - Pass `data_collator_replay`, `replay_dataset_dict` vào SpecRoute_Trainer
136
+
137
+ **4. Shell Script: `T5_small/gen_script_long_order3_t5_small_specroute_v2.sh`** (NEW)
138
+ - threshold: 0.995 → 0.980
139
+ - data_replay_freq: -1 → 5
140
+ - Thêm: `--kl_ratio 0.1`, `--gen_data_dir CL_Benchmark`
141
+ - Output dir: `specroute_v2` (tách biệt V1)
142
+ - V1 script giữ nguyên để so sánh
143
+
144
+ ### Kết quả
145
+ > *Chưa chạy — cần thực nghiệm*
146
+
147
+ ### Phân tích
148
+ > *Pending*
149
+
150
+ ### Kỳ vọng
151
+ - Tasks 8 (imdb), 9 (sst2), 12 (yahoo), 15 (wic): kỳ vọng cải thiện đáng kể nhờ threshold thấp hơn (mở rộng null-space)
152
+ - Overall AP: kỳ vọng tăng từ ~39.74 lên >50 (threshold fix), replay CE giúp chống forgetting
153
+ - FT: kỳ vọng tính được (do_predict fix) và forgetting thấp hơn nhờ replay
154
+
155
+ ### Nếu kết quả không đạt → V3 Plan
156
+ - **V3a**: Thêm output-level KL distillation (so sánh logits hiện tại vs teacher model snapshot) — yêu cầu lưu teacher model
157
+ - **V3b**: Thêm adaptive threshold per-layer (thay vì cùng threshold cho tất cả layers)
158
+ - **V3c**: SpecRoute + InfLoRA-style direction expansion khi null-space quá nhỏ
159
+
160
+ ---
161
+
162
+ ## Changelog
163
+
164
+ | Date | Version | Change Type | Description |
165
+ |------|---------|-------------|-------------|
166
+ | 2025-XX-XX | V1.0 | Initial | First experiment — baseline SpecRoute vs ROOT GainLoRA |
167
+ | 2025-XX-XX | V2.0 | Idea + Code | Thêm experience replay (CE), giảm threshold 0.995→0.980, fix do_predict |
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/all_results.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "epoch": 10.0,
3
- "total_flos": 9122411270725632.0,
4
- "train_loss": 0.5864024265556579,
5
- "train_runtime": 1102.9262,
6
- "train_samples": 5000,
7
- "train_samples_per_second": 45.334,
8
- "train_steps_per_second": 1.423
9
- }
 
 
 
 
 
 
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/eval_eval_predictions.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0a7a162013858ce8c643d5f66d3b834d8a573de497cec931a216f99e1f0178c
3
- size 8615403
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/predict_eval_predictions.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0a7a162013858ce8c643d5f66d3b834d8a573de497cec931a216f99e1f0178c
3
- size 8615403
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_0.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6347c1939a53ee2509742a6aa5db72d302a9f33b13fbe90ef95a10f9e9221bb8
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_1.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:843fd6f47f02250c381b8489f82b7adb7cce5852b8dd463066395623cd42ca7c
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_10.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:222a5768a659ab676766654ca8e142f24f7c0761be3b55d63f9a6071d64207ac
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_11.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b98abe23b3517987bda49db3b1501a0de290ccfe1293476c85ef9fb06ae1af3
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_12.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4d217ec73e80746735689b52de8985bd9b8b37a3f804d87e8ea8399e5a7ce13
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_13.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b94c96b1e217047809307086ec824bac8630ccbd16d8431e1d4d0bd8cd9d9513
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_14.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ddbf830599893c4a9f15ae1abf9f120a95063e6a53fde5581a64244edfccec8
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_15.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:063e96e4e954283653178868f826503b2f5d8003f4317b6396df23f47ba1ef6a
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_16.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e145f0f32e63de296a637276dfb148e216c770504333ea90f87f429486cd242
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_17.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c388d5777061b3d716fa2d9f33b58fd6b88a00880991654d7d74ae7c2f8393f
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_18.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:29cca1a96c16d78b45dec35a9fcd4f69fd78012d801f9ed7169d42b5d17b1818
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_19.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:78e8649cda3b950cd839967a2be7347d627f1ca1fbe32e3fa2f0ef09852c531b
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_2.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:741af2fdc11424d8a3b16fbee464fb95abe96251b5b96e2a6e9a48b3f91b0023
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_20.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd3cbcd2c082eef395e674b4f6aff25636050063f66d7df24136a283756a85cb
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_21.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0a27c57b783efd371db7bb7e49a6b9d088511a9a79b79a9cee12c6584fbf2ec
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_22.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbcb435a6636e909e222a288c45038662e44bb1546fa20e602c6dd6bacc7a0d9
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_23.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:86ed74a9a8c5fd74143b95b2d88ad0bfd2e5acb73a55a6de084b2007cd09ba2b
3
- size 1049687
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_3.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:102c178c81bf0eca52a54fd02f8748209231bce6bcf038daa330e8dedb6cb4e4
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_4.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:764cf3f60cf57542a85cd7a24226830a47d0feda84fc32e6a8a1d9d9c498ebeb
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_5.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0249fa240b48cf18d6bc3638b6977ccbb993a5e4e3d924a7788a662c5668ffed
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_6.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdd5d0c75a9037ff92d44e21c8ffbec5d32586e40cbdf5ba08b35f0d7ca3ec29
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_7.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:82169d8abd9475d638fc237287bc2f5273f0a0e8b1e8bcccc0e4ad53c9c74958
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_8.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dff7efc08c970c36173502656d3837456b6910d5e20efcf2a8a70cfcbcf744ef
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_9.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:80e3d101a726351d264a702ff4dd91d81898206ab423198df379119272c1b3be
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/runs/Mar13_11-48-44_a802a1875a6b/events.out.tfevents.1773402573.a802a1875a6b.120.0 DELETED
Binary file (9.73 kB)
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/attention_weights.pkl DELETED
Binary file (151 Bytes)
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/lora_weights_A.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4b19d3392d606e6a2f01514dd365f84474ee8008946d2d8bcb59f543159160b
3
- size 803442
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/lora_weights_B.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2771ba4fefea1b21526d4a4c06bf7b00075a15ddfcb60fb1ccff48d5551c3b6f
3
- size 606770
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/prompts_keys_till_now.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f52669525937aba46986ba60df07fb81bedf3745928a864ae4c08d88fae6a069
3
- size 3298
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/special_tokens_map.json DELETED
@@ -1,125 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<extra_id_0>",
4
- "<extra_id_1>",
5
- "<extra_id_2>",
6
- "<extra_id_3>",
7
- "<extra_id_4>",
8
- "<extra_id_5>",
9
- "<extra_id_6>",
10
- "<extra_id_7>",
11
- "<extra_id_8>",
12
- "<extra_id_9>",
13
- "<extra_id_10>",
14
- "<extra_id_11>",
15
- "<extra_id_12>",
16
- "<extra_id_13>",
17
- "<extra_id_14>",
18
- "<extra_id_15>",
19
- "<extra_id_16>",
20
- "<extra_id_17>",
21
- "<extra_id_18>",
22
- "<extra_id_19>",
23
- "<extra_id_20>",
24
- "<extra_id_21>",
25
- "<extra_id_22>",
26
- "<extra_id_23>",
27
- "<extra_id_24>",
28
- "<extra_id_25>",
29
- "<extra_id_26>",
30
- "<extra_id_27>",
31
- "<extra_id_28>",
32
- "<extra_id_29>",
33
- "<extra_id_30>",
34
- "<extra_id_31>",
35
- "<extra_id_32>",
36
- "<extra_id_33>",
37
- "<extra_id_34>",
38
- "<extra_id_35>",
39
- "<extra_id_36>",
40
- "<extra_id_37>",
41
- "<extra_id_38>",
42
- "<extra_id_39>",
43
- "<extra_id_40>",
44
- "<extra_id_41>",
45
- "<extra_id_42>",
46
- "<extra_id_43>",
47
- "<extra_id_44>",
48
- "<extra_id_45>",
49
- "<extra_id_46>",
50
- "<extra_id_47>",
51
- "<extra_id_48>",
52
- "<extra_id_49>",
53
- "<extra_id_50>",
54
- "<extra_id_51>",
55
- "<extra_id_52>",
56
- "<extra_id_53>",
57
- "<extra_id_54>",
58
- "<extra_id_55>",
59
- "<extra_id_56>",
60
- "<extra_id_57>",
61
- "<extra_id_58>",
62
- "<extra_id_59>",
63
- "<extra_id_60>",
64
- "<extra_id_61>",
65
- "<extra_id_62>",
66
- "<extra_id_63>",
67
- "<extra_id_64>",
68
- "<extra_id_65>",
69
- "<extra_id_66>",
70
- "<extra_id_67>",
71
- "<extra_id_68>",
72
- "<extra_id_69>",
73
- "<extra_id_70>",
74
- "<extra_id_71>",
75
- "<extra_id_72>",
76
- "<extra_id_73>",
77
- "<extra_id_74>",
78
- "<extra_id_75>",
79
- "<extra_id_76>",
80
- "<extra_id_77>",
81
- "<extra_id_78>",
82
- "<extra_id_79>",
83
- "<extra_id_80>",
84
- "<extra_id_81>",
85
- "<extra_id_82>",
86
- "<extra_id_83>",
87
- "<extra_id_84>",
88
- "<extra_id_85>",
89
- "<extra_id_86>",
90
- "<extra_id_87>",
91
- "<extra_id_88>",
92
- "<extra_id_89>",
93
- "<extra_id_90>",
94
- "<extra_id_91>",
95
- "<extra_id_92>",
96
- "<extra_id_93>",
97
- "<extra_id_94>",
98
- "<extra_id_95>",
99
- "<extra_id_96>",
100
- "<extra_id_97>",
101
- "<extra_id_98>",
102
- "<extra_id_99>"
103
- ],
104
- "eos_token": {
105
- "content": "</s>",
106
- "lstrip": false,
107
- "normalized": false,
108
- "rstrip": false,
109
- "single_word": false
110
- },
111
- "pad_token": {
112
- "content": "<pad>",
113
- "lstrip": false,
114
- "normalized": false,
115
- "rstrip": false,
116
- "single_word": false
117
- },
118
- "unk_token": {
119
- "content": "<unk>",
120
- "lstrip": false,
121
- "normalized": false,
122
- "rstrip": false,
123
- "single_word": false
124
- }
125
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/spiece.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
- size 791656
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/tokenizer_config.json DELETED
@@ -1,938 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<pad>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "</s>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "<unk>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "32000": {
28
- "content": "<extra_id_99>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "32001": {
36
- "content": "<extra_id_98>",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- },
43
- "32002": {
44
- "content": "<extra_id_97>",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": true
50
- },
51
- "32003": {
52
- "content": "<extra_id_96>",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- },
59
- "32004": {
60
- "content": "<extra_id_95>",
61
- "lstrip": false,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false,
65
- "special": true
66
- },
67
- "32005": {
68
- "content": "<extra_id_94>",
69
- "lstrip": false,
70
- "normalized": false,
71
- "rstrip": false,
72
- "single_word": false,
73
- "special": true
74
- },
75
- "32006": {
76
- "content": "<extra_id_93>",
77
- "lstrip": false,
78
- "normalized": false,
79
- "rstrip": false,
80
- "single_word": false,
81
- "special": true
82
- },
83
- "32007": {
84
- "content": "<extra_id_92>",
85
- "lstrip": false,
86
- "normalized": false,
87
- "rstrip": false,
88
- "single_word": false,
89
- "special": true
90
- },
91
- "32008": {
92
- "content": "<extra_id_91>",
93
- "lstrip": false,
94
- "normalized": false,
95
- "rstrip": false,
96
- "single_word": false,
97
- "special": true
98
- },
99
- "32009": {
100
- "content": "<extra_id_90>",
101
- "lstrip": false,
102
- "normalized": false,
103
- "rstrip": false,
104
- "single_word": false,
105
- "special": true
106
- },
107
- "32010": {
108
- "content": "<extra_id_89>",
109
- "lstrip": false,
110
- "normalized": false,
111
- "rstrip": false,
112
- "single_word": false,
113
- "special": true
114
- },
115
- "32011": {
116
- "content": "<extra_id_88>",
117
- "lstrip": false,
118
- "normalized": false,
119
- "rstrip": false,
120
- "single_word": false,
121
- "special": true
122
- },
123
- "32012": {
124
- "content": "<extra_id_87>",
125
- "lstrip": false,
126
- "normalized": false,
127
- "rstrip": false,
128
- "single_word": false,
129
- "special": true
130
- },
131
- "32013": {
132
- "content": "<extra_id_86>",
133
- "lstrip": false,
134
- "normalized": false,
135
- "rstrip": false,
136
- "single_word": false,
137
- "special": true
138
- },
139
- "32014": {
140
- "content": "<extra_id_85>",
141
- "lstrip": false,
142
- "normalized": false,
143
- "rstrip": false,
144
- "single_word": false,
145
- "special": true
146
- },
147
- "32015": {
148
- "content": "<extra_id_84>",
149
- "lstrip": false,
150
- "normalized": false,
151
- "rstrip": false,
152
- "single_word": false,
153
- "special": true
154
- },
155
- "32016": {
156
- "content": "<extra_id_83>",
157
- "lstrip": false,
158
- "normalized": false,
159
- "rstrip": false,
160
- "single_word": false,
161
- "special": true
162
- },
163
- "32017": {
164
- "content": "<extra_id_82>",
165
- "lstrip": false,
166
- "normalized": false,
167
- "rstrip": false,
168
- "single_word": false,
169
- "special": true
170
- },
171
- "32018": {
172
- "content": "<extra_id_81>",
173
- "lstrip": false,
174
- "normalized": false,
175
- "rstrip": false,
176
- "single_word": false,
177
- "special": true
178
- },
179
- "32019": {
180
- "content": "<extra_id_80>",
181
- "lstrip": false,
182
- "normalized": false,
183
- "rstrip": false,
184
- "single_word": false,
185
- "special": true
186
- },
187
- "32020": {
188
- "content": "<extra_id_79>",
189
- "lstrip": false,
190
- "normalized": false,
191
- "rstrip": false,
192
- "single_word": false,
193
- "special": true
194
- },
195
- "32021": {
196
- "content": "<extra_id_78>",
197
- "lstrip": false,
198
- "normalized": false,
199
- "rstrip": false,
200
- "single_word": false,
201
- "special": true
202
- },
203
- "32022": {
204
- "content": "<extra_id_77>",
205
- "lstrip": false,
206
- "normalized": false,
207
- "rstrip": false,
208
- "single_word": false,
209
- "special": true
210
- },
211
- "32023": {
212
- "content": "<extra_id_76>",
213
- "lstrip": false,
214
- "normalized": false,
215
- "rstrip": false,
216
- "single_word": false,
217
- "special": true
218
- },
219
- "32024": {
220
- "content": "<extra_id_75>",
221
- "lstrip": false,
222
- "normalized": false,
223
- "rstrip": false,
224
- "single_word": false,
225
- "special": true
226
- },
227
- "32025": {
228
- "content": "<extra_id_74>",
229
- "lstrip": false,
230
- "normalized": false,
231
- "rstrip": false,
232
- "single_word": false,
233
- "special": true
234
- },
235
- "32026": {
236
- "content": "<extra_id_73>",
237
- "lstrip": false,
238
- "normalized": false,
239
- "rstrip": false,
240
- "single_word": false,
241
- "special": true
242
- },
243
- "32027": {
244
- "content": "<extra_id_72>",
245
- "lstrip": false,
246
- "normalized": false,
247
- "rstrip": false,
248
- "single_word": false,
249
- "special": true
250
- },
251
- "32028": {
252
- "content": "<extra_id_71>",
253
- "lstrip": false,
254
- "normalized": false,
255
- "rstrip": false,
256
- "single_word": false,
257
- "special": true
258
- },
259
- "32029": {
260
- "content": "<extra_id_70>",
261
- "lstrip": false,
262
- "normalized": false,
263
- "rstrip": false,
264
- "single_word": false,
265
- "special": true
266
- },
267
- "32030": {
268
- "content": "<extra_id_69>",
269
- "lstrip": false,
270
- "normalized": false,
271
- "rstrip": false,
272
- "single_word": false,
273
- "special": true
274
- },
275
- "32031": {
276
- "content": "<extra_id_68>",
277
- "lstrip": false,
278
- "normalized": false,
279
- "rstrip": false,
280
- "single_word": false,
281
- "special": true
282
- },
283
- "32032": {
284
- "content": "<extra_id_67>",
285
- "lstrip": false,
286
- "normalized": false,
287
- "rstrip": false,
288
- "single_word": false,
289
- "special": true
290
- },
291
- "32033": {
292
- "content": "<extra_id_66>",
293
- "lstrip": false,
294
- "normalized": false,
295
- "rstrip": false,
296
- "single_word": false,
297
- "special": true
298
- },
299
- "32034": {
300
- "content": "<extra_id_65>",
301
- "lstrip": false,
302
- "normalized": false,
303
- "rstrip": false,
304
- "single_word": false,
305
- "special": true
306
- },
307
- "32035": {
308
- "content": "<extra_id_64>",
309
- "lstrip": false,
310
- "normalized": false,
311
- "rstrip": false,
312
- "single_word": false,
313
- "special": true
314
- },
315
- "32036": {
316
- "content": "<extra_id_63>",
317
- "lstrip": false,
318
- "normalized": false,
319
- "rstrip": false,
320
- "single_word": false,
321
- "special": true
322
- },
323
- "32037": {
324
- "content": "<extra_id_62>",
325
- "lstrip": false,
326
- "normalized": false,
327
- "rstrip": false,
328
- "single_word": false,
329
- "special": true
330
- },
331
- "32038": {
332
- "content": "<extra_id_61>",
333
- "lstrip": false,
334
- "normalized": false,
335
- "rstrip": false,
336
- "single_word": false,
337
- "special": true
338
- },
339
- "32039": {
340
- "content": "<extra_id_60>",
341
- "lstrip": false,
342
- "normalized": false,
343
- "rstrip": false,
344
- "single_word": false,
345
- "special": true
346
- },
347
- "32040": {
348
- "content": "<extra_id_59>",
349
- "lstrip": false,
350
- "normalized": false,
351
- "rstrip": false,
352
- "single_word": false,
353
- "special": true
354
- },
355
- "32041": {
356
- "content": "<extra_id_58>",
357
- "lstrip": false,
358
- "normalized": false,
359
- "rstrip": false,
360
- "single_word": false,
361
- "special": true
362
- },
363
- "32042": {
364
- "content": "<extra_id_57>",
365
- "lstrip": false,
366
- "normalized": false,
367
- "rstrip": false,
368
- "single_word": false,
369
- "special": true
370
- },
371
- "32043": {
372
- "content": "<extra_id_56>",
373
- "lstrip": false,
374
- "normalized": false,
375
- "rstrip": false,
376
- "single_word": false,
377
- "special": true
378
- },
379
- "32044": {
380
- "content": "<extra_id_55>",
381
- "lstrip": false,
382
- "normalized": false,
383
- "rstrip": false,
384
- "single_word": false,
385
- "special": true
386
- },
387
- "32045": {
388
- "content": "<extra_id_54>",
389
- "lstrip": false,
390
- "normalized": false,
391
- "rstrip": false,
392
- "single_word": false,
393
- "special": true
394
- },
395
- "32046": {
396
- "content": "<extra_id_53>",
397
- "lstrip": false,
398
- "normalized": false,
399
- "rstrip": false,
400
- "single_word": false,
401
- "special": true
402
- },
403
- "32047": {
404
- "content": "<extra_id_52>",
405
- "lstrip": false,
406
- "normalized": false,
407
- "rstrip": false,
408
- "single_word": false,
409
- "special": true
410
- },
411
- "32048": {
412
- "content": "<extra_id_51>",
413
- "lstrip": false,
414
- "normalized": false,
415
- "rstrip": false,
416
- "single_word": false,
417
- "special": true
418
- },
419
- "32049": {
420
- "content": "<extra_id_50>",
421
- "lstrip": false,
422
- "normalized": false,
423
- "rstrip": false,
424
- "single_word": false,
425
- "special": true
426
- },
427
- "32050": {
428
- "content": "<extra_id_49>",
429
- "lstrip": false,
430
- "normalized": false,
431
- "rstrip": false,
432
- "single_word": false,
433
- "special": true
434
- },
435
- "32051": {
436
- "content": "<extra_id_48>",
437
- "lstrip": false,
438
- "normalized": false,
439
- "rstrip": false,
440
- "single_word": false,
441
- "special": true
442
- },
443
- "32052": {
444
- "content": "<extra_id_47>",
445
- "lstrip": false,
446
- "normalized": false,
447
- "rstrip": false,
448
- "single_word": false,
449
- "special": true
450
- },
451
- "32053": {
452
- "content": "<extra_id_46>",
453
- "lstrip": false,
454
- "normalized": false,
455
- "rstrip": false,
456
- "single_word": false,
457
- "special": true
458
- },
459
- "32054": {
460
- "content": "<extra_id_45>",
461
- "lstrip": false,
462
- "normalized": false,
463
- "rstrip": false,
464
- "single_word": false,
465
- "special": true
466
- },
467
- "32055": {
468
- "content": "<extra_id_44>",
469
- "lstrip": false,
470
- "normalized": false,
471
- "rstrip": false,
472
- "single_word": false,
473
- "special": true
474
- },
475
- "32056": {
476
- "content": "<extra_id_43>",
477
- "lstrip": false,
478
- "normalized": false,
479
- "rstrip": false,
480
- "single_word": false,
481
- "special": true
482
- },
483
- "32057": {
484
- "content": "<extra_id_42>",
485
- "lstrip": false,
486
- "normalized": false,
487
- "rstrip": false,
488
- "single_word": false,
489
- "special": true
490
- },
491
- "32058": {
492
- "content": "<extra_id_41>",
493
- "lstrip": false,
494
- "normalized": false,
495
- "rstrip": false,
496
- "single_word": false,
497
- "special": true
498
- },
499
- "32059": {
500
- "content": "<extra_id_40>",
501
- "lstrip": false,
502
- "normalized": false,
503
- "rstrip": false,
504
- "single_word": false,
505
- "special": true
506
- },
507
- "32060": {
508
- "content": "<extra_id_39>",
509
- "lstrip": false,
510
- "normalized": false,
511
- "rstrip": false,
512
- "single_word": false,
513
- "special": true
514
- },
515
- "32061": {
516
- "content": "<extra_id_38>",
517
- "lstrip": false,
518
- "normalized": false,
519
- "rstrip": false,
520
- "single_word": false,
521
- "special": true
522
- },
523
- "32062": {
524
- "content": "<extra_id_37>",
525
- "lstrip": false,
526
- "normalized": false,
527
- "rstrip": false,
528
- "single_word": false,
529
- "special": true
530
- },
531
- "32063": {
532
- "content": "<extra_id_36>",
533
- "lstrip": false,
534
- "normalized": false,
535
- "rstrip": false,
536
- "single_word": false,
537
- "special": true
538
- },
539
- "32064": {
540
- "content": "<extra_id_35>",
541
- "lstrip": false,
542
- "normalized": false,
543
- "rstrip": false,
544
- "single_word": false,
545
- "special": true
546
- },
547
- "32065": {
548
- "content": "<extra_id_34>",
549
- "lstrip": false,
550
- "normalized": false,
551
- "rstrip": false,
552
- "single_word": false,
553
- "special": true
554
- },
555
- "32066": {
556
- "content": "<extra_id_33>",
557
- "lstrip": false,
558
- "normalized": false,
559
- "rstrip": false,
560
- "single_word": false,
561
- "special": true
562
- },
563
- "32067": {
564
- "content": "<extra_id_32>",
565
- "lstrip": false,
566
- "normalized": false,
567
- "rstrip": false,
568
- "single_word": false,
569
- "special": true
570
- },
571
- "32068": {
572
- "content": "<extra_id_31>",
573
- "lstrip": false,
574
- "normalized": false,
575
- "rstrip": false,
576
- "single_word": false,
577
- "special": true
578
- },
579
- "32069": {
580
- "content": "<extra_id_30>",
581
- "lstrip": false,
582
- "normalized": false,
583
- "rstrip": false,
584
- "single_word": false,
585
- "special": true
586
- },
587
- "32070": {
588
- "content": "<extra_id_29>",
589
- "lstrip": false,
590
- "normalized": false,
591
- "rstrip": false,
592
- "single_word": false,
593
- "special": true
594
- },
595
- "32071": {
596
- "content": "<extra_id_28>",
597
- "lstrip": false,
598
- "normalized": false,
599
- "rstrip": false,
600
- "single_word": false,
601
- "special": true
602
- },
603
- "32072": {
604
- "content": "<extra_id_27>",
605
- "lstrip": false,
606
- "normalized": false,
607
- "rstrip": false,
608
- "single_word": false,
609
- "special": true
610
- },
611
- "32073": {
612
- "content": "<extra_id_26>",
613
- "lstrip": false,
614
- "normalized": false,
615
- "rstrip": false,
616
- "single_word": false,
617
- "special": true
618
- },
619
- "32074": {
620
- "content": "<extra_id_25>",
621
- "lstrip": false,
622
- "normalized": false,
623
- "rstrip": false,
624
- "single_word": false,
625
- "special": true
626
- },
627
- "32075": {
628
- "content": "<extra_id_24>",
629
- "lstrip": false,
630
- "normalized": false,
631
- "rstrip": false,
632
- "single_word": false,
633
- "special": true
634
- },
635
- "32076": {
636
- "content": "<extra_id_23>",
637
- "lstrip": false,
638
- "normalized": false,
639
- "rstrip": false,
640
- "single_word": false,
641
- "special": true
642
- },
643
- "32077": {
644
- "content": "<extra_id_22>",
645
- "lstrip": false,
646
- "normalized": false,
647
- "rstrip": false,
648
- "single_word": false,
649
- "special": true
650
- },
651
- "32078": {
652
- "content": "<extra_id_21>",
653
- "lstrip": false,
654
- "normalized": false,
655
- "rstrip": false,
656
- "single_word": false,
657
- "special": true
658
- },
659
- "32079": {
660
- "content": "<extra_id_20>",
661
- "lstrip": false,
662
- "normalized": false,
663
- "rstrip": false,
664
- "single_word": false,
665
- "special": true
666
- },
667
- "32080": {
668
- "content": "<extra_id_19>",
669
- "lstrip": false,
670
- "normalized": false,
671
- "rstrip": false,
672
- "single_word": false,
673
- "special": true
674
- },
675
- "32081": {
676
- "content": "<extra_id_18>",
677
- "lstrip": false,
678
- "normalized": false,
679
- "rstrip": false,
680
- "single_word": false,
681
- "special": true
682
- },
683
- "32082": {
684
- "content": "<extra_id_17>",
685
- "lstrip": false,
686
- "normalized": false,
687
- "rstrip": false,
688
- "single_word": false,
689
- "special": true
690
- },
691
- "32083": {
692
- "content": "<extra_id_16>",
693
- "lstrip": false,
694
- "normalized": false,
695
- "rstrip": false,
696
- "single_word": false,
697
- "special": true
698
- },
699
- "32084": {
700
- "content": "<extra_id_15>",
701
- "lstrip": false,
702
- "normalized": false,
703
- "rstrip": false,
704
- "single_word": false,
705
- "special": true
706
- },
707
- "32085": {
708
- "content": "<extra_id_14>",
709
- "lstrip": false,
710
- "normalized": false,
711
- "rstrip": false,
712
- "single_word": false,
713
- "special": true
714
- },
715
- "32086": {
716
- "content": "<extra_id_13>",
717
- "lstrip": false,
718
- "normalized": false,
719
- "rstrip": false,
720
- "single_word": false,
721
- "special": true
722
- },
723
- "32087": {
724
- "content": "<extra_id_12>",
725
- "lstrip": false,
726
- "normalized": false,
727
- "rstrip": false,
728
- "single_word": false,
729
- "special": true
730
- },
731
- "32088": {
732
- "content": "<extra_id_11>",
733
- "lstrip": false,
734
- "normalized": false,
735
- "rstrip": false,
736
- "single_word": false,
737
- "special": true
738
- },
739
- "32089": {
740
- "content": "<extra_id_10>",
741
- "lstrip": false,
742
- "normalized": false,
743
- "rstrip": false,
744
- "single_word": false,
745
- "special": true
746
- },
747
- "32090": {
748
- "content": "<extra_id_9>",
749
- "lstrip": false,
750
- "normalized": false,
751
- "rstrip": false,
752
- "single_word": false,
753
- "special": true
754
- },
755
- "32091": {
756
- "content": "<extra_id_8>",
757
- "lstrip": false,
758
- "normalized": false,
759
- "rstrip": false,
760
- "single_word": false,
761
- "special": true
762
- },
763
- "32092": {
764
- "content": "<extra_id_7>",
765
- "lstrip": false,
766
- "normalized": false,
767
- "rstrip": false,
768
- "single_word": false,
769
- "special": true
770
- },
771
- "32093": {
772
- "content": "<extra_id_6>",
773
- "lstrip": false,
774
- "normalized": false,
775
- "rstrip": false,
776
- "single_word": false,
777
- "special": true
778
- },
779
- "32094": {
780
- "content": "<extra_id_5>",
781
- "lstrip": false,
782
- "normalized": false,
783
- "rstrip": false,
784
- "single_word": false,
785
- "special": true
786
- },
787
- "32095": {
788
- "content": "<extra_id_4>",
789
- "lstrip": false,
790
- "normalized": false,
791
- "rstrip": false,
792
- "single_word": false,
793
- "special": true
794
- },
795
- "32096": {
796
- "content": "<extra_id_3>",
797
- "lstrip": false,
798
- "normalized": false,
799
- "rstrip": false,
800
- "single_word": false,
801
- "special": true
802
- },
803
- "32097": {
804
- "content": "<extra_id_2>",
805
- "lstrip": false,
806
- "normalized": false,
807
- "rstrip": false,
808
- "single_word": false,
809
- "special": true
810
- },
811
- "32098": {
812
- "content": "<extra_id_1>",
813
- "lstrip": false,
814
- "normalized": false,
815
- "rstrip": false,
816
- "single_word": false,
817
- "special": true
818
- },
819
- "32099": {
820
- "content": "<extra_id_0>",
821
- "lstrip": false,
822
- "normalized": false,
823
- "rstrip": false,
824
- "single_word": false,
825
- "special": true
826
- }
827
- },
828
- "additional_special_tokens": [
829
- "<extra_id_0>",
830
- "<extra_id_1>",
831
- "<extra_id_2>",
832
- "<extra_id_3>",
833
- "<extra_id_4>",
834
- "<extra_id_5>",
835
- "<extra_id_6>",
836
- "<extra_id_7>",
837
- "<extra_id_8>",
838
- "<extra_id_9>",
839
- "<extra_id_10>",
840
- "<extra_id_11>",
841
- "<extra_id_12>",
842
- "<extra_id_13>",
843
- "<extra_id_14>",
844
- "<extra_id_15>",
845
- "<extra_id_16>",
846
- "<extra_id_17>",
847
- "<extra_id_18>",
848
- "<extra_id_19>",
849
- "<extra_id_20>",
850
- "<extra_id_21>",
851
- "<extra_id_22>",
852
- "<extra_id_23>",
853
- "<extra_id_24>",
854
- "<extra_id_25>",
855
- "<extra_id_26>",
856
- "<extra_id_27>",
857
- "<extra_id_28>",
858
- "<extra_id_29>",
859
- "<extra_id_30>",
860
- "<extra_id_31>",
861
- "<extra_id_32>",
862
- "<extra_id_33>",
863
- "<extra_id_34>",
864
- "<extra_id_35>",
865
- "<extra_id_36>",
866
- "<extra_id_37>",
867
- "<extra_id_38>",
868
- "<extra_id_39>",
869
- "<extra_id_40>",
870
- "<extra_id_41>",
871
- "<extra_id_42>",
872
- "<extra_id_43>",
873
- "<extra_id_44>",
874
- "<extra_id_45>",
875
- "<extra_id_46>",
876
- "<extra_id_47>",
877
- "<extra_id_48>",
878
- "<extra_id_49>",
879
- "<extra_id_50>",
880
- "<extra_id_51>",
881
- "<extra_id_52>",
882
- "<extra_id_53>",
883
- "<extra_id_54>",
884
- "<extra_id_55>",
885
- "<extra_id_56>",
886
- "<extra_id_57>",
887
- "<extra_id_58>",
888
- "<extra_id_59>",
889
- "<extra_id_60>",
890
- "<extra_id_61>",
891
- "<extra_id_62>",
892
- "<extra_id_63>",
893
- "<extra_id_64>",
894
- "<extra_id_65>",
895
- "<extra_id_66>",
896
- "<extra_id_67>",
897
- "<extra_id_68>",
898
- "<extra_id_69>",
899
- "<extra_id_70>",
900
- "<extra_id_71>",
901
- "<extra_id_72>",
902
- "<extra_id_73>",
903
- "<extra_id_74>",
904
- "<extra_id_75>",
905
- "<extra_id_76>",
906
- "<extra_id_77>",
907
- "<extra_id_78>",
908
- "<extra_id_79>",
909
- "<extra_id_80>",
910
- "<extra_id_81>",
911
- "<extra_id_82>",
912
- "<extra_id_83>",
913
- "<extra_id_84>",
914
- "<extra_id_85>",
915
- "<extra_id_86>",
916
- "<extra_id_87>",
917
- "<extra_id_88>",
918
- "<extra_id_89>",
919
- "<extra_id_90>",
920
- "<extra_id_91>",
921
- "<extra_id_92>",
922
- "<extra_id_93>",
923
- "<extra_id_94>",
924
- "<extra_id_95>",
925
- "<extra_id_96>",
926
- "<extra_id_97>",
927
- "<extra_id_98>",
928
- "<extra_id_99>"
929
- ],
930
- "clean_up_tokenization_spaces": true,
931
- "eos_token": "</s>",
932
- "extra_ids": 100,
933
- "model_max_length": 512,
934
- "pad_token": "<pad>",
935
- "sp_model_kwargs": {},
936
- "tokenizer_class": "T5Tokenizer",
937
- "unk_token": "<unk>"
938
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/trans_input.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f98adc49cd4c2f647b32016a990363f67b33d273b860e75efdfda9545a44b439
3
- size 411248
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/train_results.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "epoch": 10.0,
3
- "total_flos": 9122411270725632.0,
4
- "train_loss": 0.5864024265556579,
5
- "train_runtime": 1102.9262,
6
- "train_samples": 5000,
7
- "train_samples_per_second": 45.334,
8
- "train_steps_per_second": 1.423
9
- }
 
 
 
 
 
 
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trainer_state.json DELETED
@@ -1,105 +0,0 @@
1
- {
2
- "best_metric": 55.9605,
3
- "best_model_checkpoint": "logs_and_outputs/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/checkpoint-1500",
4
- "epoch": 10.0,
5
- "eval_steps": 500,
6
- "global_step": 1570,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 3.1847133757961785,
13
- "learning_rate": 0.0003,
14
- "loss": 0.6612,
15
- "step": 500
16
- },
17
- {
18
- "epoch": 3.1847133757961785,
19
- "eval_exact_match": 43.9737,
20
- "eval_exact_match_for_CL": 43.9737,
21
- "eval_exact_match_for_yelp": 43.9737,
22
- "eval_gen_len": 2.5726,
23
- "eval_global_step": 500,
24
- "eval_loss": 0.45953086018562317,
25
- "eval_rouge1": 66.2544,
26
- "eval_rouge1_for_CL": 66.2544,
27
- "eval_rouge1_for_yelp": 66.2544,
28
- "eval_rougeL": 66.2544,
29
- "eval_rougeL_for_CL": 66.2544,
30
- "eval_rougeL_for_yelp": 66.2544,
31
- "eval_runtime": 105.0444,
32
- "eval_samples_per_second": 72.35,
33
- "eval_steps_per_second": 0.286,
34
- "step": 500
35
- },
36
- {
37
- "epoch": 6.369426751592357,
38
- "learning_rate": 0.0003,
39
- "loss": 0.5753,
40
- "step": 1000
41
- },
42
- {
43
- "epoch": 6.369426751592357,
44
- "eval_exact_match": 51.9474,
45
- "eval_exact_match_for_CL": 51.9474,
46
- "eval_exact_match_for_yelp": 51.9474,
47
- "eval_gen_len": 2.5862,
48
- "eval_global_step": 1000,
49
- "eval_loss": 0.4304201900959015,
50
- "eval_rouge1": 69.4868,
51
- "eval_rouge1_for_CL": 69.4868,
52
- "eval_rouge1_for_yelp": 69.4868,
53
- "eval_rougeL": 69.4868,
54
- "eval_rougeL_for_CL": 69.4868,
55
- "eval_rougeL_for_yelp": 69.4868,
56
- "eval_runtime": 104.5891,
57
- "eval_samples_per_second": 72.665,
58
- "eval_steps_per_second": 0.287,
59
- "step": 1000
60
- },
61
- {
62
- "epoch": 9.554140127388536,
63
- "learning_rate": 0.0003,
64
- "loss": 0.5329,
65
- "step": 1500
66
- },
67
- {
68
- "epoch": 9.554140127388536,
69
- "eval_exact_match": 55.9605,
70
- "eval_exact_match_for_CL": 55.9605,
71
- "eval_exact_match_for_yelp": 55.9605,
72
- "eval_gen_len": 2.5192,
73
- "eval_global_step": 1500,
74
- "eval_loss": 0.4083245098590851,
75
- "eval_rouge1": 70.8684,
76
- "eval_rouge1_for_CL": 70.8684,
77
- "eval_rouge1_for_yelp": 70.8684,
78
- "eval_rougeL": 70.8684,
79
- "eval_rougeL_for_CL": 70.8684,
80
- "eval_rougeL_for_yelp": 70.8684,
81
- "eval_runtime": 104.4585,
82
- "eval_samples_per_second": 72.756,
83
- "eval_steps_per_second": 0.287,
84
- "step": 1500
85
- },
86
- {
87
- "epoch": 10.0,
88
- "step": 1570,
89
- "total_flos": 9122411270725632.0,
90
- "train_loss": 0.5864024265556579,
91
- "train_runtime": 1102.9262,
92
- "train_samples_per_second": 45.334,
93
- "train_steps_per_second": 1.423
94
- }
95
- ],
96
- "logging_steps": 500,
97
- "max_steps": 1570,
98
- "num_input_tokens_seen": 0,
99
- "num_train_epochs": 10,
100
- "save_steps": 500,
101
- "total_flos": 9122411270725632.0,
102
- "train_batch_size": null,
103
- "trial_name": null,
104
- "trial_params": null
105
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_0.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4beab9d4dcbba908b7def8ef43eacb08dd6f3941fdf70c924622ec431085ae3a
3
- size 1049682
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_1.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:009305b9def4b16f2d3260b9e1a9493d5e3d9145014d8335bd9a2b4e2c6b8b45
3
- size 41106
 
 
 
 
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_2.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:251823168eaa8fde1fbf577a463ca6d34f3b49abd13868096f45d9dd5544bee1
3
- size 1049682