v2
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +6 -1
- experiment_diagnosis_report.md +301 -0
- improve_gainlora/T5_small/gen_script_long_order3_t5_small_specroute_v2.sh +893 -0
- improve_gainlora/generate_specroute_scripts_v2.py +2 -2
- improve_gainlora/src/cl_trainer_specroute.py +88 -3
- improve_gainlora/src/run_t5.py +13 -7
- results/comparison_results.md +37 -35
- results/experiment_versions.md +167 -0
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/all_results.json +0 -9
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/eval_eval_predictions.jsonl +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/predict_eval_predictions.jsonl +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_0.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_1.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_10.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_11.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_12.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_13.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_14.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_15.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_16.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_17.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_18.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_19.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_2.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_20.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_21.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_22.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_23.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_3.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_4.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_5.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_6.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_7.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_8.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_9.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/runs/Mar13_11-48-44_a802a1875a6b/events.out.tfevents.1773402573.a802a1875a6b.120.0 +0 -0
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/attention_weights.pkl +0 -0
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/lora_weights_A.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/lora_weights_B.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/prompts_keys_till_now.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/special_tokens_map.json +0 -125
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/spiece.model +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/tokenizer.json +0 -0
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/tokenizer_config.json +0 -938
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/trans_input.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/train_results.json +0 -9
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trainer_state.json +0 -105
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_0.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_1.pt +0 -3
- root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_2.pt +0 -3
.gitignore
CHANGED
|
@@ -9,4 +9,9 @@ __pycache__/
|
|
| 9 |
*.pyo
|
| 10 |
*.pyd
|
| 11 |
*/logs/*
|
| 12 |
-
*/logs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
*.pyo
|
| 10 |
*.pyd
|
| 11 |
*/logs/*
|
| 12 |
+
*/logs
|
| 13 |
+
logs/*
|
| 14 |
+
logs
|
| 15 |
+
*.log
|
| 16 |
+
|
| 17 |
+
|
experiment_diagnosis_report.md
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Báo cáo chẩn đoán thí nghiệm: SpecRoute vs GainLoRA trên T5-Small
|
| 2 |
+
|
| 3 |
+
> **Benchmark**: Long Sequence Order 3 (15 classification tasks)
|
| 4 |
+
> **Model**: T5-Small (d_model=512, 6 encoder + 6 decoder layers, lora_r=8)
|
| 5 |
+
> **Thí nghiệm**: SpecRoute (improve) vs GainLoRA-InfLoRA (root)
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## 1. Xác minh kết quả: Bảng so sánh có chính xác không?
|
| 10 |
+
|
| 11 |
+
### ✅ ROOT GainLoRA: AP = 59.70 — CHÍNH XÁC
|
| 12 |
+
|
| 13 |
+
Nguồn dữ liệu: `logs/root_t5_small/.../15-wic/all_results.json`
|
| 14 |
+
- Task 15 (wic) có `--do_predict` → evaluation trên ALL 15 tasks (70,861 samples)
|
| 15 |
+
- Metrics `predict_exact_match_for_{task}` cho tất cả tasks → **đây là R_{15,j} (final row)**
|
| 16 |
+
- AP = mean(R_{15,j}) = 59.70 ✓ (tính đúng theo paper)
|
| 17 |
+
|
| 18 |
+
### ⚠️ SpecRoute: "AP" = 39.74 — **KHÔNG PHẢI AP THẬT**
|
| 19 |
+
|
| 20 |
+
Nguồn dữ liệu: `predict_eval_predictions.jsonl` tại MỖI task directory
|
| 21 |
+
- SpecRoute THIẾU `--do_predict` cho tasks 2-15 (bug trong script generator)
|
| 22 |
+
- File `predict_eval_predictions.jsonl` ở mỗi task chỉ chứa **current task evaluation**
|
| 23 |
+
- Các con số (yelp=54.36, imdb=0.21, etc.) là **R_{j,j} (diagonal = peak performance)**, KHÔNG phải R_{15,j}
|
| 24 |
+
- 39.74 = mean(diagonal), **KHÔNG phải AP** theo công thức paper
|
| 25 |
+
|
| 26 |
+
**Hệ quả**: AP thật của SpecRoute sẽ THẤP HƠN 39.74 vì forgetting sẽ giảm performance của các tasks đầu. Khoảng cách thực tế với ROOT có thể lớn hơn 19.96 điểm.
|
| 27 |
+
|
| 28 |
+
### Bảng so sánh đã hiệu chỉnh
|
| 29 |
+
|
| 30 |
+
| # | Task | ROOT R_{15,j} (Final) | SpecRoute R_{j,j} (Peak) | Ghi chú |
|
| 31 |
+
|---|------|-----------------------|--------------------------|---------|
|
| 32 |
+
| 1 | yelp | 56.01 | 54.36 | Tương đương |
|
| 33 |
+
| 2 | amazon | 52.05 | 50.01 | Tương đương |
|
| 34 |
+
| 3 | mnli | 34.07 | 35.50 | SpecRoute tốt hơn |
|
| 35 |
+
| 4 | cb | 3.57 | 0.00 | Cả hai đều thấp |
|
| 36 |
+
| 5 | copa | 42.00 | 44.00 | Tương đương |
|
| 37 |
+
| 6 | qqp | 76.96 | 76.72 | Tương đương |
|
| 38 |
+
| 7 | rte | 45.85 | 50.90 | SpecRoute tốt hơn |
|
| 39 |
+
| 8 | imdb | 89.51 | **0.21** ⚠️ | **Không thể học** |
|
| 40 |
+
| 9 | sst2 | 85.21 | **0.00** ⚠️ | **Không thể học** |
|
| 41 |
+
| 10 | dbpedia | 98.16 | 92.22 | Chấp nhận được |
|
| 42 |
+
| 11 | agnews | 88.37 | 68.76 | Giảm đáng kể |
|
| 43 |
+
| 12 | yahoo | 57.28 | **8.12** ⚠️ | **Không thể học** |
|
| 44 |
+
| 13 | multirc | 50.52 | 54.23 | Tương đương |
|
| 45 |
+
| 14 | boolq | 60.43 | 61.13 | Tương đương |
|
| 46 |
+
| 15 | wic | 55.49 | **0.00** ⚠️ | **Không thể học** |
|
| 47 |
+
|
| 48 |
+
**Nhận xét quan trọng**: SpecRoute scoring ở đây là PEAK (ngay sau khi train task đó), trong khi ROOT scoring là FINAL (sau khi train xong tất cả 15 tasks). Với ROOT, imdb PEAK có thể > 89.51 rồi chỉ giảm nhẹ về 89.51. Nhưng với SpecRoute, imdb PEAK đã là 0.21 — model KHÔNG THỂ HỌC task này ngay từ đầu, đây **không phải catastrophic forgetting**.
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## 2. Tại sao FT (Forgetting) không tính được?
|
| 53 |
+
|
| 54 |
+
### Nguyên nhân trực tiếp: `--do_predict` bị thiếu
|
| 55 |
+
|
| 56 |
+
Công thức FT cần:
|
| 57 |
+
- R_{j,j} = performance trên task j ngay sau khi train task j (diagonal)
|
| 58 |
+
- R_{T,j} = performance trên task j sau khi train xong tất cả T tasks (final row)
|
| 59 |
+
|
| 60 |
+
| Method | R_{j,j} (diagonal) | R_{T,j} (final row) | FT computable? |
|
| 61 |
+
|--------|--------------------|--------------------|----------------|
|
| 62 |
+
| ROOT | ❌ Không có (tasks 1-14 thiếu cross-task eval) | ✅ Task 15 có | ❌ Thiếu diagonal |
|
| 63 |
+
| SpecRoute | ⚠️ Có nhưng chỉ single-task eval | ❌ Task 15 không eval cross-task | ❌ Thiếu final row |
|
| 64 |
+
|
| 65 |
+
### Nguyên nhân gốc: Bug trong script generator
|
| 66 |
+
|
| 67 |
+
File `improve_gainlora/generate_specroute_scripts_v2.py`:
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
"long_order3": {
|
| 71 |
+
...
|
| 72 |
+
"do_predict": False, # ← BUG: nên là True
|
| 73 |
+
...
|
| 74 |
+
},
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
**Fix**: Đổi thành `True` cho cả `long_order3` và `long_order4`. Khi `do_predict=True`, script sẽ generate `--do_predict --predict_with_generate` cho mỗi task → `run_t5.py` sẽ evaluate trên ALL task cumulative test sets → `score.py` sẽ build được full matrix R → FT tính được.
|
| 78 |
+
|
| 79 |
+
ROOT cũng cần fix: hiện tại chỉ task 15 có `--do_predict`. Cần thêm cho tasks 1-14 để có full R matrix.
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
+
|
| 83 |
+
## 3. Phân tích nguyên nhân gốc: Tại sao SpecRoute kém?
|
| 84 |
+
|
| 85 |
+
### 3.1 KHÔNG phải do SVD/routing bugs
|
| 86 |
+
|
| 87 |
+
Sau khi đọc toàn bộ source code:
|
| 88 |
+
- `compute_spectral_signatures()`: SVD đúng, lưu Vt[:r] và S[:r] đúng
|
| 89 |
+
- `compute_spectral_routing()`: Weighted Rayleigh quotient đúng, softmax đúng
|
| 90 |
+
- Không có hardcoded dimensions cho T5-large
|
| 91 |
+
- d_model=512, lora_r=8 → SVD rank=8 capture toàn bộ non-zero singular values
|
| 92 |
+
- Gradient checkpointing fix đã áp dụng đúng
|
| 93 |
+
|
| 94 |
+
### 3.2 KHÔNG phải hoàn toàn do config SVD (giả thuyết ban đầu)
|
| 95 |
+
|
| 96 |
+
User hypothesis: "configs ban đầu được thiết kế cho T5_large, T5_small nên config SVD không phù hợp"
|
| 97 |
+
|
| 98 |
+
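**Sự thật**: Không có config SVD-specific nào cần thay đổi cho T5-small. Các hyperparameters của LoRA/routing (lora_r=8, lora_alpha=32, temperature=1.0) là model-agnostic. Riêng GPM threshold=0.995 không phải config SVD, nhưng lại quá chặt cho T5-small (xem mục 3.4 và 5.2). Nguyên nhân chính nằm ở chỗ khác.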
|
| 99 |
+
|
| 100 |
+
### 3.3 NGUYÊN NHÂN CHÍNH: Thiếu cơ chế chống forgetting
|
| 101 |
+
|
| 102 |
+
Đây là bảng so sánh **cơ chế bảo vệ** giữa 2 phương pháp:
|
| 103 |
+
|
| 104 |
+
| Cơ chế | ROOT GainLoRA | SpecRoute | Tác động |
|
| 105 |
+
|--------|:---:|:---:|----------|
|
| 106 |
+
| GPM gradient projection (LoRA A) | ✅ | ✅ | Chặn gradient phá LoRA cũ |
|
| 107 |
+
| KL distillation (`kl_ratio=0.1`) | ✅ | ❌ | Duy trì routing distribution cũ |
|
| 108 |
+
| Data replay (`gen_data_dir`) | ✅ | ❌ | Reinforce kiến thức cũ |
|
| 109 |
+
| Per-step GPM on routing params | ✅ | ❌ | Bảo vệ trans_input + prompt_key |
|
| 110 |
+
| Trans_input (learned routing) | ✅ | ❌ | Routing có gradient, học continuous |
|
| 111 |
+
|
| 112 |
+
**ROOT có 5 lớp bảo vệ, SpecRoute chỉ có 1 lớp (GPM trên LoRA A)**
|
| 113 |
+
|
| 114 |
+
Khi loại bỏ learned routing (trans_input + prompt_key), SpecRoute đồng thời loại bỏ luôn:
|
| 115 |
+
1. KL distillation (vì không có routing params để distill)
|
| 116 |
+
2. Data replay (vì không có routing MLP cần reinforce)
|
| 117 |
+
3. Per-step GPM trên routing params (vì không có routing params)
|
| 118 |
+
|
| 119 |
+
Đây **không phải là design intention** — SpecRoute muốn replace routing mechanism, nhưng vô tình loại bỏ luôn CÁC CƠ CHẾ BẢO VỆ đi kèm routing.
|
| 120 |
+
|
| 121 |
+
### 3.4 NGUYÊN NHÂN PHỤ: GPM null-space bão hòa sớm ở T5-small
|
| 122 |
+
|
| 123 |
+
Training loss so sánh (bằng chứng GPM over-constraining):
|
| 124 |
+
|
| 125 |
+
| Task (thứ tự) | ROOT loss | SpecRoute loss | Tỉ lệ | SpecRoute score |
|
| 126 |
+
|---|---|---|---|---|
|
| 127 |
+
| 1 yelp | 0.586 | 0.581 | 1.0x | 54.36 |
|
| 128 |
+
| 2 amazon | 0.540 | 0.588 | 1.1x | 50.01 |
|
| 129 |
+
| 5 copa | 0.455 | 0.459 | 1.0x | 44.00 |
|
| 130 |
+
| 6 qqp | 0.288 | 0.304 | 1.1x | 76.72 |
|
| 131 |
+
| 8 imdb | 1.410 | **4.149** | **2.9x** | **0.21** |
|
| 132 |
+
| 9 sst2 | 1.762 | **4.449** | **2.5x** | **0.00** |
|
| 133 |
+
| 12 yahoo | 1.189 | **3.077** | **2.6x** | **8.12** |
|
| 134 |
+
| 15 wic | 0.961 | **3.654** | **3.8x** | **0.00** |
|
| 135 |
+
|
| 136 |
+
**Pattern rõ ràng**: Tasks ban đầu (1-6) loss tương đương → model học OK. Tasks sau (8, 9, 12, 15) loss cao gấp 2.5-3.8x → model KHÔNG THỂ HỌC.
|
| 137 |
+
|
| 138 |
+
Nhưng thú vị: tasks 10 (dbpedia), 13 (multirc), 14 (boolq) vẫn học được tốt (loss < 1.3). Điều này cho thấy vấn đề không chỉ đơn thuần "hết null-space":
|
| 139 |
+
|
| 140 |
+
**Các tasks THẤT BẠI (imdb, sst2, yahoo, wic)** có đặc điểm chung: overlap lớn với tasks TRƯỚC ĐÓ trong feature space:
|
| 141 |
+
- imdb/sst2 = sentiment binary → overlap với yelp (task 1), amazon (task 2)
|
| 142 |
+
- yahoo = topic QA → overlap với nhiều domain trước
|
| 143 |
+
- wic = word sense → cần representations already claimed bởi tasks trước
|
| 144 |
+
|
| 145 |
+
**Giải thích**: GPM từ tasks 1-2 (yelp/amazon sentiment) đã "claim" sentiment-relevant directions. Khi imdb (cũng sentiment) đến, GPM ép LoRA A vào null-space orthogonal với sentiment directions → model bị ép vào directions KHÔNG LIÊN QUAN đến sentiment → không thể phân loại sentiment → loss cao, accuracy 0.
|
| 146 |
+
|
| 147 |
+
Trong ROOT GainLoRA, vấn đề này được giải quyết bởi:
|
| 148 |
+
- Trans_input cho phép MAP input mới vào representation space REUSE kiến thức sentiment cũ
|
| 149 |
+
- KL distillation cho phép routing CHUYỂN imdb sang LoRA branch sentiment đã có
|
| 150 |
+
- Data replay DUY TRÌ sentiment knowledge
|
| 151 |
+
|
| 152 |
+
### 3.5 Training loss cao = model không thể học, KHÔNG PHẢI catastrophic forgetting
|
| 153 |
+
|
| 154 |
+
Đây là phát hiện quan trọng nhất: comparison_results.md ghi "imdb/sst2/wic về 0 do Catastrophic Forgetting" — **NHẬN ĐỊNH NÀY SAI**.
|
| 155 |
+
|
| 156 |
+
Bằng chứng:
|
| 157 |
+
- imdb train_loss = 4.149 (rất cao) → model CHƯA BAO GIỜ học được imdb
|
| 158 |
+
- imdb prediction: "Rififi" (copy từ review text), "Negative" (sai format, label đúng là "Good"/"Bad")
|
| 159 |
+
- sst2 train_loss = 4.449 → tương tự
|
| 160 |
+
|
| 161 |
+
**Đây là "inability to learn" (GPM over-constraining), KHÔNG phải "learned then forgot" (catastrophic forgetting).**
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
## 4. Lỗi so sánh không công bằng
|
| 166 |
+
|
| 167 |
+
| Aspect | ROOT | SpecRoute | Vấn đề |
|
| 168 |
+
|--------|------|-----------|--------|
|
| 169 |
+
| Score type | R_{15,j} (FINAL) | R_{j,j} (PEAK/DIAGONAL) | So sánh khác loại |
|
| 170 |
+
| Evaluation | Cross-task (all 15) | Single-task (chỉ current) | Scope khác nhau |
|
| 171 |
+
| `--do_predict` | Task 15 only | Task 1 only | Cả hai đều thiếu |
|
| 172 |
+
|
| 173 |
+
**ROOT**: Đánh giá SAU KHI train xong 15 tasks → bao gồm cả forgetting
|
| 174 |
+
**SpecRoute**: Đánh giá NGAY SAU KHI train từng task → peak performance, chưa bao gồm forgetting
|
| 175 |
+
|
| 176 |
+
Để so sánh công bằng, cần chạy lại SpecRoute với `--do_predict` ở task 15 để có R_{15,j} cho tất cả tasks.
|
| 177 |
+
|
| 178 |
+
---
|
| 179 |
+
|
| 180 |
+
## 5. Định hướng cải tiến
|
| 181 |
+
|
| 182 |
+
### 5.1 Fix NGAY (không đổi methodology)
|
| 183 |
+
|
| 184 |
+
**A. Thêm `--do_predict` cho tất cả tasks**
|
| 185 |
+
```python
|
| 186 |
+
# generate_specroute_scripts_v2.py
|
| 187 |
+
"long_order3": { "do_predict": True }, # was False
|
| 188 |
+
"long_order4": { "do_predict": True }, # was False
|
| 189 |
+
```
|
| 190 |
+
→ Cho phép build full R matrix, tính AP/FT đúng, so sánh công bằng.
|
| 191 |
+
|
| 192 |
+
**B. Khôi phục KL distillation**
|
| 193 |
+
|
| 194 |
+
Đây là fix quan trọng nhất. SpecRoute loại bỏ learned routing, nhưng KL distillation không bắt buộc phải gắn với routing params. Ý tưởng đầu tiên là áp KL lên routing output của spectral routing:
|
| 195 |
+
|
| 196 |
+
```python
|
| 197 |
+
# Concept: KL trên routing output thay vì routing params
|
| 198 |
+
def spectral_kl_regularization(model, old_signatures, input_embeds):
|
| 199 |
+
"""Duy trì routing distribution gần với snapshot sau task trước"""
|
| 200 |
+
current_routing = model.compute_spectral_routing(input_embeds)
|
| 201 |
+
old_routing = compute_old_routing(old_signatures, input_embeds)
|
| 202 |
+
return kl_div(current_routing.log(), old_routing)
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
Tuy nhiên, vì spectral routing là deterministic (không có learnable params), KL trên routing output không tạo gradient hữu ích. Thay vào đó:
|
| 206 |
+
|
| 207 |
+
**Option tốt hơn: KL distillation trên model OUTPUT (logits)**
|
| 208 |
+
```python
|
| 209 |
+
# Sau mỗi task, lưu model logits trên replay data
|
| 210 |
+
# Trong training step tiếp theo:
|
| 211 |
+
kl_loss = kl_div(log_softmax(current_logits, dim=-1), softmax(saved_old_logits, dim=-1), reduction="batchmean")
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
**C. Khôi phục Data Replay**
|
| 215 |
+
|
| 216 |
+
Replay không phụ thuộc vào routing mechanism. Có thể dùng generated data hoặc coreset luôn:
|
| 217 |
+
```bash
|
| 218 |
+
--gen_data_dir generated_data/lora_gen_long_t5 # Tái sử dụng tập replay của ROOT
|
| 219 |
+
--data_replay_freq 5 # Replay mỗi 5 steps
|
| 220 |
+
--kl_ratio 0.1 # Weight cho KL loss trên replay
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
### 5.2 Giảm GPM threshold cho T5-small
|
| 224 |
+
|
| 225 |
+
Với threshold=0.995, sau 15 tasks, threshold tăng lên 0.99967 → GPM giữ 99.97% variance → null-space cực nhỏ.
|
| 226 |
+
|
| 227 |
+
| threshold | Task 1 | Task 8 | Task 15 | Nhận xét |
|
| 228 |
+
|-----------|--------|--------|---------|----------|
|
| 229 |
+
| 0.995 (hiện tại) | 0.9950 | 0.9973 | 0.9997 | Quá chặt cho T5-small |
|
| 230 |
+
| 0.990 | 0.9900 | 0.9947 | 0.9993 | Vẫn khá chặt |
|
| 231 |
+
| 0.980 | 0.9800 | 0.9893 | 0.9987 | Thử nghiệm đầu tiên |
|
| 232 |
+
| 0.970 | 0.9700 | 0.9840 | 0.9980 | Aggressive nhưng đáng thử |
|
| 233 |
+
|
| 234 |
+
**Đề xuất**: Thử threshold=0.980 trước, nếu forgetting tăng thì kết hợp KL distillation để bù.
|
| 235 |
+
|
| 236 |
+
### 5.3 Cải tiến methodology (dài hạn)
|
| 237 |
+
|
| 238 |
+
**A. Cho phép subspace sharing**
|
| 239 |
+
|
| 240 |
+
Vấn đề gốc: GPM ép tasks tương tự (imdb/sst2 vs yelp/amazon) vào subspaces orthogonal. Cần mechanism cho phép knowledge reuse:
|
| 241 |
+
|
| 242 |
+
```python
|
| 243 |
+
# Ý tưởng: Nếu spectral routing gợi ý task mới SIMILAR với task cũ,
|
| 244 |
+
# giảm GPM protection cho directions tương tự → cho phép reuse
|
| 245 |
+
similarity = compute_spectral_routing(avg_input) # routing weights
|
| 246 |
+
for old_task, weight in enumerate(similarity):
|
| 247 |
+
if weight > threshold_reuse:
|
| 248 |
+
# Giảm GPM projection cho old_task's directions
|
| 249 |
+
# → cho phép refinement thay vì full orthogonality
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
**B. Hybrid routing: spectral + lightweight learned component**
|
| 253 |
+
|
| 254 |
+
Thay vì hoàn toàn parameter-free, thêm adapter nhẹ:
|
| 255 |
+
```python
|
| 256 |
+
routing = alpha * spectral_fit + (1-alpha) * learned_gate
|
| 257 |
+
```
|
| 258 |
+
- `spectral_fit`: parameter-free, ổn định, không cần GPM protection
|
| 259 |
+
- `learned_gate`: lightweight (MLP nhỏ), cho phép gradient flow
|
| 260 |
+
- `alpha`: có thể learnable hoặc fixed (e.g., 0.7)
|
| 261 |
+
|
| 262 |
+
**C. Tách biệt protection vs routing**
|
| 263 |
+
|
| 264 |
+
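Trong implementation hiện tại, các cơ chế protection (KL distillation, data replay, per-step GPM) bị **couple** với learned routing kế thừa từ code ROOT, nên khi SpecRoute thay routing thì protection cũng mất theo. Cần tách: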
|
| 265 |
+
- **Routing**: Spectral (parameter-free) — OK giữ nguyên
|
| 266 |
+
- **Protection**: Cần ÍT NHẤT 2 trong 3: GPM, KL distillation, data replay
|
| 267 |
+
|
| 268 |
+
---
|
| 269 |
+
|
| 270 |
+
## 6. Kế hoạch thí nghiệm tiếp theo
|
| 271 |
+
|
| 272 |
+
### Phase 1: Fix bugs + fair comparison (ưu tiên CAO)
|
| 273 |
+
1. Fix `generate_specroute_scripts_v2.py`: `do_predict=True` cho long benchmarks
|
| 274 |
+
2. Regenerate scripts
|
| 275 |
+
3. Chạy lại SpecRoute Long Order 3 trên T5-small
|
| 276 |
+
4. So sánh AP/FT chính xác giữa 2 methods
|
| 277 |
+
|
| 278 |
+
### Phase 2: Thêm protection mechanisms (ưu tiên CAO)
|
| 279 |
+
1. Thêm KL distillation trên model output logits (replay + KL loss)
|
| 280 |
+
2. Thêm data replay
|
| 281 |
+
3. Grid search: threshold ∈ {0.995, 0.990, 0.980}, kl_ratio ∈ {0.05, 0.1, 0.2}
|
| 282 |
+
|
| 283 |
+
### Phase 3: Validate methodology (sau phase 2)
|
| 284 |
+
1. Nếu Phase 2 cho kết quả tốt → methodology đúng, chỉ thiếu protection
|
| 285 |
+
2. Nếu Phase 2 vẫn kém → spectral routing có vấn đề ở T5-small, cần hybrid approach
|
| 286 |
+
3. Scale lên T5-large để so sánh ở đúng scale ROOT paper dùng
|
| 287 |
+
|
| 288 |
+
---
|
| 289 |
+
|
| 290 |
+
## 7. Tổng kết
|
| 291 |
+
|
| 292 |
+
| Câu hỏi | Trả lời |
|
| 293 |
+
|----------|---------|
|
| 294 |
+
| Kết quả tổng hợp có chính xác? | ⚠️ ROOT đúng (AP=59.70), SpecRoute SAI loại metric (diagonal vs final) |
|
| 295 |
+
| Tại sao kết quả tệ? | SpecRoute loại bỏ routing → vô tình loại bỏ luôn KL + replay + per-step GPM |
|
| 296 |
+
| Do methodology hay config? | **Cả hai**: methodology thiếu protection layers + GPM threshold quá chặt cho T5-small |
|
| 297 |
+
| SVD có phải nguyên nhân? | **Không trực tiếp**. SVD routing code đúng, không có bugs |
|
| 298 |
+
| FT tại sao chưa tính? | Bug trong script generator: `do_predict=False` cho long benchmarks |
|
| 299 |
+
| Hướng cải tiến? | Khôi phục KL distillation + data replay, giảm GPM threshold, fix scripts |
|
| 300 |
+
|
| 301 |
+
**Kết luận cốt lõi**: Ý tưởng spectral routing thay thế learned routing KHÔNG SAI về mặt lý thuyết. Vấn đề là khi implement, các cơ chế protection (KL, replay) bị loại bỏ theo vì chúng gắn chặt với learned routing trong code ROOT. Cần decouple routing mechanism khỏi protection mechanisms.
|
improve_gainlora/T5_small/gen_script_long_order3_t5_small_specroute_v2.sh
ADDED
|
@@ -0,0 +1,893 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
#SBATCH -J cl
|
| 3 |
+
#SBATCH -o cl-%j.out
|
| 4 |
+
#SBATCH -p compute
|
| 5 |
+
#SBATCH -N 1
|
| 6 |
+
#SBATCH -t 20:00:00
|
| 7 |
+
#SBATCH --mem 128G
|
| 8 |
+
#SBATCH --gres=gpu:2
|
| 9 |
+
|
| 10 |
+
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
| 11 |
+
|
| 12 |
+
port=$(shuf -i25000-30000 -n1)
|
| 13 |
+
|
| 14 |
+
# ============================================================
|
| 15 |
+
# Auto-detect GPU count and type for optimal parallelism
|
| 16 |
+
# ============================================================
|
| 17 |
+
# Number of GPUs visible to nvidia-smi (0 when nvidia-smi is missing or fails).
NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
|
| 18 |
+
# Total memory of the first GPU as reported by nvidia-smi; whitespace is
# stripped so the value can be used directly in integer comparisons below.
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '[:space:]')
|
| 19 |
+
|
| 20 |
+
# Abort unless GPU_MEM is a plain non-negative integer. This covers both the
# empty case (no GPU / no nvidia-smi) and non-numeric answers such as "[N/A]",
# which would otherwise make the later `[ "$GPU_MEM" -lt 20000 ]` test error
# out and silently select the high-memory configuration.
if ! [[ "$GPU_MEM" =~ ^[0-9]+$ ]]; then
|
| 21 |
+
echo "ERROR: No GPU detected!"
|
| 22 |
+
exit 1
|
| 23 |
+
fi
|
| 24 |
+
|
| 25 |
+
# Determine GPU type
|
| 26 |
+
if [ "$GPU_MEM" -lt 20000 ]; then
|
| 27 |
+
IS_T4=1
|
| 28 |
+
echo "[GPU] Detected T4 GPUs (${GPU_MEM}MB VRAM each)"
|
| 29 |
+
else
|
| 30 |
+
IS_T4=0
|
| 31 |
+
echo "[GPU] Detected high-memory GPUs (${GPU_MEM}MB VRAM each)"
|
| 32 |
+
fi
|
| 33 |
+
|
| 34 |
+
# Determine parallelism strategy
|
| 35 |
+
if [ "$IS_T4" -eq 1 ] && [ "$NUM_GPUS" -ge 2 ]; then
|
| 36 |
+
GPU_MODE="t4_2gpu"
|
| 37 |
+
GPU_IDS="0,1"
|
| 38 |
+
FP16_FLAG=""
|
| 39 |
+
echo "[GPU] Strategy: 2x T4 DataParallel + fp32 + gradient_checkpointing"
|
| 40 |
+
elif [ "$IS_T4" -eq 1 ]; then
|
| 41 |
+
GPU_MODE="t4_1gpu"
|
| 42 |
+
GPU_IDS="${1:-0}"
|
| 43 |
+
FP16_FLAG=""
|
| 44 |
+
echo "[GPU] Strategy: 1x T4 + fp32 + gradient_checkpointing"
|
| 45 |
+
else
|
| 46 |
+
GPU_MODE="a100"
|
| 47 |
+
GPU_IDS="${1:-0}"
|
| 48 |
+
FP16_FLAG=""
|
| 49 |
+
echo "[GPU] Strategy: A100 (single GPU, fp32)"
|
| 50 |
+
fi
|
| 51 |
+
|
| 52 |
+
echo "[GPU] Using CUDA_VISIBLE_DEVICES=$GPU_IDS"
|
| 53 |
+
echo "============================================================"
|
| 54 |
+
echo ""
|
| 55 |
+
|
| 56 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 57 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 58 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 59 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 60 |
+
else
|
| 61 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 62 |
+
fi
|
| 63 |
+
|
| 64 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 65 |
+
--do_train \
|
| 66 |
+
--do_predict \
|
| 67 |
+
--predict_with_generate \
|
| 68 |
+
--model_name_or_path "$2" \
|
| 69 |
+
--data_dir CL_Benchmark \
|
| 70 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 71 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/yelp \
|
| 72 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp \
|
| 73 |
+
--per_device_train_batch_size $BSZ \
|
| 74 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 75 |
+
--gradient_accumulation_steps $GA \
|
| 76 |
+
--learning_rate 0.0003 \
|
| 77 |
+
--num_train_epochs 10 \
|
| 78 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 79 |
+
--max_source_length 512 \
|
| 80 |
+
--max_target_length 50 \
|
| 81 |
+
--generation_max_length 50 \
|
| 82 |
+
--add_task_name False \
|
| 83 |
+
--add_dataset_name False \
|
| 84 |
+
--overwrite_output_dir \
|
| 85 |
+
--overwrite_cache \
|
| 86 |
+
--lr_scheduler_type constant \
|
| 87 |
+
--warmup_steps 0 \
|
| 88 |
+
--logging_strategy steps \
|
| 89 |
+
--logging_steps 10 \
|
| 90 |
+
--metric_for_best_model eval_exact_match \
|
| 91 |
+
--evaluation_strategy steps \
|
| 92 |
+
--save_strategy steps \
|
| 93 |
+
--save_total_limit 1 \
|
| 94 |
+
--load_best_model_at_end \
|
| 95 |
+
--lora_r 8 \
|
| 96 |
+
--lora_alpha 32 \
|
| 97 |
+
--lora_dropout 0.0 \
|
| 98 |
+
--data_replay_freq 5 \
|
| 99 |
+
--mlp_hidden_dim 100 \
|
| 100 |
+
--model_name specroute \
|
| 101 |
+
--kl_ratio 0.1 \
|
| 102 |
+
--gen_data_dir CL_Benchmark \
|
| 103 |
+
--threshold 0.980 \
|
| 104 |
+
--transthreshold 0.980 \
|
| 105 |
+
$FP16_FLAG
|
| 106 |
+
|
| 107 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/checkpoint*
|
| 108 |
+
|
| 109 |
+
sleep 5
|
| 110 |
+
|
| 111 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 112 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 113 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 114 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 115 |
+
else
|
| 116 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 117 |
+
fi
|
| 118 |
+
|
| 119 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 120 |
+
--do_train \
|
| 121 |
+
--do_predict \
|
| 122 |
+
--predict_with_generate \
|
| 123 |
+
--model_name_or_path "$2" \
|
| 124 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights \
|
| 125 |
+
--data_dir CL_Benchmark \
|
| 126 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 127 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/amazon \
|
| 128 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon \
|
| 129 |
+
--per_device_train_batch_size $BSZ \
|
| 130 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 131 |
+
--gradient_accumulation_steps $GA \
|
| 132 |
+
--learning_rate 0.0003 \
|
| 133 |
+
--num_train_epochs 10 \
|
| 134 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 135 |
+
--max_source_length 512 \
|
| 136 |
+
--max_target_length 50 \
|
| 137 |
+
--generation_max_length 50 \
|
| 138 |
+
--add_task_name False \
|
| 139 |
+
--add_dataset_name False \
|
| 140 |
+
--overwrite_output_dir \
|
| 141 |
+
--overwrite_cache \
|
| 142 |
+
--lr_scheduler_type constant \
|
| 143 |
+
--warmup_steps 0 \
|
| 144 |
+
--logging_strategy steps \
|
| 145 |
+
--logging_steps 10 \
|
| 146 |
+
--metric_for_best_model eval_exact_match_for_amazon \
|
| 147 |
+
--evaluation_strategy steps \
|
| 148 |
+
--save_strategy steps \
|
| 149 |
+
--save_total_limit 1 \
|
| 150 |
+
--load_best_model_at_end \
|
| 151 |
+
--lora_r 8 \
|
| 152 |
+
--lora_alpha 32 \
|
| 153 |
+
--lora_dropout 0.0 \
|
| 154 |
+
--data_replay_freq 5 \
|
| 155 |
+
--mlp_hidden_dim 100 \
|
| 156 |
+
--model_name specroute \
|
| 157 |
+
--kl_ratio 0.1 \
|
| 158 |
+
--gen_data_dir CL_Benchmark \
|
| 159 |
+
--threshold 0.980 \
|
| 160 |
+
--transthreshold 0.980 \
|
| 161 |
+
$FP16_FLAG
|
| 162 |
+
|
| 163 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/checkpoint*
|
| 164 |
+
|
| 165 |
+
sleep 5
|
| 166 |
+
|
| 167 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 168 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 169 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 170 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 171 |
+
else
|
| 172 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 173 |
+
fi
|
| 174 |
+
|
| 175 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 176 |
+
--do_train \
|
| 177 |
+
--do_predict \
|
| 178 |
+
--predict_with_generate \
|
| 179 |
+
--model_name_or_path "$2" \
|
| 180 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights \
|
| 181 |
+
--data_dir CL_Benchmark \
|
| 182 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 183 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/mnli \
|
| 184 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli \
|
| 185 |
+
--per_device_train_batch_size $BSZ \
|
| 186 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 187 |
+
--gradient_accumulation_steps $GA \
|
| 188 |
+
--learning_rate 0.0003 \
|
| 189 |
+
--num_train_epochs 10 \
|
| 190 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 191 |
+
--max_source_length 512 \
|
| 192 |
+
--max_target_length 50 \
|
| 193 |
+
--generation_max_length 50 \
|
| 194 |
+
--add_task_name False \
|
| 195 |
+
--add_dataset_name False \
|
| 196 |
+
--overwrite_output_dir \
|
| 197 |
+
--overwrite_cache \
|
| 198 |
+
--lr_scheduler_type constant \
|
| 199 |
+
--warmup_steps 0 \
|
| 200 |
+
--logging_strategy steps \
|
| 201 |
+
--logging_steps 10 \
|
| 202 |
+
--metric_for_best_model eval_exact_match_for_mnli \
|
| 203 |
+
--evaluation_strategy steps \
|
| 204 |
+
--save_strategy steps \
|
| 205 |
+
--save_total_limit 1 \
|
| 206 |
+
--load_best_model_at_end \
|
| 207 |
+
--lora_r 8 \
|
| 208 |
+
--lora_alpha 32 \
|
| 209 |
+
--lora_dropout 0.0 \
|
| 210 |
+
--data_replay_freq 5 \
|
| 211 |
+
--mlp_hidden_dim 100 \
|
| 212 |
+
--model_name specroute \
|
| 213 |
+
--kl_ratio 0.1 \
|
| 214 |
+
--gen_data_dir CL_Benchmark \
|
| 215 |
+
--threshold 0.980 \
|
| 216 |
+
--transthreshold 0.980 \
|
| 217 |
+
$FP16_FLAG
|
| 218 |
+
|
| 219 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/checkpoint*
|
| 220 |
+
|
| 221 |
+
sleep 5
|
| 222 |
+
|
| 223 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 224 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 225 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 226 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 227 |
+
else
|
| 228 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 229 |
+
fi
|
| 230 |
+
|
| 231 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 232 |
+
--do_train \
|
| 233 |
+
--do_predict \
|
| 234 |
+
--predict_with_generate \
|
| 235 |
+
--model_name_or_path "$2" \
|
| 236 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights \
|
| 237 |
+
--data_dir CL_Benchmark \
|
| 238 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 239 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/cb \
|
| 240 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb \
|
| 241 |
+
--per_device_train_batch_size $BSZ \
|
| 242 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 243 |
+
--gradient_accumulation_steps $GA \
|
| 244 |
+
--learning_rate 0.0003 \
|
| 245 |
+
--num_train_epochs 10 \
|
| 246 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 247 |
+
--max_source_length 512 \
|
| 248 |
+
--max_target_length 50 \
|
| 249 |
+
--generation_max_length 50 \
|
| 250 |
+
--add_task_name False \
|
| 251 |
+
--add_dataset_name False \
|
| 252 |
+
--overwrite_output_dir \
|
| 253 |
+
--overwrite_cache \
|
| 254 |
+
--lr_scheduler_type constant \
|
| 255 |
+
--warmup_steps 0 \
|
| 256 |
+
--logging_strategy steps \
|
| 257 |
+
--logging_steps 10 \
|
| 258 |
+
--metric_for_best_model eval_exact_match_for_cb \
|
| 259 |
+
--evaluation_strategy steps \
|
| 260 |
+
--save_strategy steps \
|
| 261 |
+
--save_total_limit 1 \
|
| 262 |
+
--load_best_model_at_end \
|
| 263 |
+
--lora_r 8 \
|
| 264 |
+
--lora_alpha 32 \
|
| 265 |
+
--lora_dropout 0.0 \
|
| 266 |
+
--data_replay_freq 5 \
|
| 267 |
+
--mlp_hidden_dim 100 \
|
| 268 |
+
--model_name specroute \
|
| 269 |
+
--kl_ratio 0.1 \
|
| 270 |
+
--gen_data_dir CL_Benchmark \
|
| 271 |
+
--threshold 0.980 \
|
| 272 |
+
--transthreshold 0.980 \
|
| 273 |
+
$FP16_FLAG
|
| 274 |
+
|
| 275 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/checkpoint*
|
| 276 |
+
|
| 277 |
+
sleep 5
|
| 278 |
+
|
| 279 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 280 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 281 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 282 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 283 |
+
else
|
| 284 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 285 |
+
fi
|
| 286 |
+
|
| 287 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 288 |
+
--do_train \
|
| 289 |
+
--do_predict \
|
| 290 |
+
--predict_with_generate \
|
| 291 |
+
--model_name_or_path "$2" \
|
| 292 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights \
|
| 293 |
+
--data_dir CL_Benchmark \
|
| 294 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 295 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/copa \
|
| 296 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa \
|
| 297 |
+
--per_device_train_batch_size $BSZ \
|
| 298 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 299 |
+
--gradient_accumulation_steps $GA \
|
| 300 |
+
--learning_rate 0.0003 \
|
| 301 |
+
--num_train_epochs 10 \
|
| 302 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 303 |
+
--max_source_length 512 \
|
| 304 |
+
--max_target_length 50 \
|
| 305 |
+
--generation_max_length 50 \
|
| 306 |
+
--add_task_name False \
|
| 307 |
+
--add_dataset_name False \
|
| 308 |
+
--overwrite_output_dir \
|
| 309 |
+
--overwrite_cache \
|
| 310 |
+
--lr_scheduler_type constant \
|
| 311 |
+
--warmup_steps 0 \
|
| 312 |
+
--logging_strategy steps \
|
| 313 |
+
--logging_steps 10 \
|
| 314 |
+
--metric_for_best_model eval_exact_match_for_copa \
|
| 315 |
+
--evaluation_strategy steps \
|
| 316 |
+
--save_strategy steps \
|
| 317 |
+
--save_total_limit 1 \
|
| 318 |
+
--load_best_model_at_end \
|
| 319 |
+
--lora_r 8 \
|
| 320 |
+
--lora_alpha 32 \
|
| 321 |
+
--lora_dropout 0.0 \
|
| 322 |
+
--data_replay_freq 5 \
|
| 323 |
+
--mlp_hidden_dim 100 \
|
| 324 |
+
--model_name specroute \
|
| 325 |
+
--kl_ratio 0.1 \
|
| 326 |
+
--gen_data_dir CL_Benchmark \
|
| 327 |
+
--threshold 0.980 \
|
| 328 |
+
--transthreshold 0.980 \
|
| 329 |
+
$FP16_FLAG
|
| 330 |
+
|
| 331 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/checkpoint*
|
| 332 |
+
|
| 333 |
+
sleep 5
|
| 334 |
+
|
| 335 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 336 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 337 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 338 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 339 |
+
else
|
| 340 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 341 |
+
fi
|
| 342 |
+
|
| 343 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 344 |
+
--do_train \
|
| 345 |
+
--do_predict \
|
| 346 |
+
--predict_with_generate \
|
| 347 |
+
--model_name_or_path "$2" \
|
| 348 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights \
|
| 349 |
+
--data_dir CL_Benchmark \
|
| 350 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 351 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/qqp \
|
| 352 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp \
|
| 353 |
+
--per_device_train_batch_size $BSZ \
|
| 354 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 355 |
+
--gradient_accumulation_steps $GA \
|
| 356 |
+
--learning_rate 0.0003 \
|
| 357 |
+
--num_train_epochs 10 \
|
| 358 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 359 |
+
--max_source_length 512 \
|
| 360 |
+
--max_target_length 50 \
|
| 361 |
+
--generation_max_length 50 \
|
| 362 |
+
--add_task_name False \
|
| 363 |
+
--add_dataset_name False \
|
| 364 |
+
--overwrite_output_dir \
|
| 365 |
+
--overwrite_cache \
|
| 366 |
+
--lr_scheduler_type constant \
|
| 367 |
+
--warmup_steps 0 \
|
| 368 |
+
--logging_strategy steps \
|
| 369 |
+
--logging_steps 10 \
|
| 370 |
+
--metric_for_best_model eval_exact_match_for_qqp \
|
| 371 |
+
--evaluation_strategy steps \
|
| 372 |
+
--save_strategy steps \
|
| 373 |
+
--save_total_limit 1 \
|
| 374 |
+
--load_best_model_at_end \
|
| 375 |
+
--lora_r 8 \
|
| 376 |
+
--lora_alpha 32 \
|
| 377 |
+
--lora_dropout 0.0 \
|
| 378 |
+
--data_replay_freq 5 \
|
| 379 |
+
--mlp_hidden_dim 100 \
|
| 380 |
+
--model_name specroute \
|
| 381 |
+
--kl_ratio 0.1 \
|
| 382 |
+
--gen_data_dir CL_Benchmark \
|
| 383 |
+
--threshold 0.980 \
|
| 384 |
+
--transthreshold 0.980 \
|
| 385 |
+
$FP16_FLAG
|
| 386 |
+
|
| 387 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/checkpoint*
|
| 388 |
+
|
| 389 |
+
sleep 5
|
| 390 |
+
|
| 391 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 392 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 393 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 394 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 395 |
+
else
|
| 396 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 397 |
+
fi
|
| 398 |
+
|
| 399 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 400 |
+
--do_train \
|
| 401 |
+
--do_predict \
|
| 402 |
+
--predict_with_generate \
|
| 403 |
+
--model_name_or_path "$2" \
|
| 404 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights \
|
| 405 |
+
--data_dir CL_Benchmark \
|
| 406 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 407 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/rte \
|
| 408 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte \
|
| 409 |
+
--per_device_train_batch_size $BSZ \
|
| 410 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 411 |
+
--gradient_accumulation_steps $GA \
|
| 412 |
+
--learning_rate 0.0003 \
|
| 413 |
+
--num_train_epochs 10 \
|
| 414 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 415 |
+
--max_source_length 512 \
|
| 416 |
+
--max_target_length 50 \
|
| 417 |
+
--generation_max_length 50 \
|
| 418 |
+
--add_task_name False \
|
| 419 |
+
--add_dataset_name False \
|
| 420 |
+
--overwrite_output_dir \
|
| 421 |
+
--overwrite_cache \
|
| 422 |
+
--lr_scheduler_type constant \
|
| 423 |
+
--warmup_steps 0 \
|
| 424 |
+
--logging_strategy steps \
|
| 425 |
+
--logging_steps 10 \
|
| 426 |
+
--metric_for_best_model eval_exact_match_for_rte \
|
| 427 |
+
--evaluation_strategy steps \
|
| 428 |
+
--save_strategy steps \
|
| 429 |
+
--save_total_limit 1 \
|
| 430 |
+
--load_best_model_at_end \
|
| 431 |
+
--lora_r 8 \
|
| 432 |
+
--lora_alpha 32 \
|
| 433 |
+
--lora_dropout 0.0 \
|
| 434 |
+
--data_replay_freq 5 \
|
| 435 |
+
--mlp_hidden_dim 100 \
|
| 436 |
+
--model_name specroute \
|
| 437 |
+
--kl_ratio 0.1 \
|
| 438 |
+
--gen_data_dir CL_Benchmark \
|
| 439 |
+
--threshold 0.980 \
|
| 440 |
+
--transthreshold 0.980 \
|
| 441 |
+
$FP16_FLAG
|
| 442 |
+
|
| 443 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/checkpoint*
|
| 444 |
+
|
| 445 |
+
sleep 5
|
| 446 |
+
|
| 447 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 448 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 449 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 450 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 451 |
+
else
|
| 452 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 453 |
+
fi
|
| 454 |
+
|
| 455 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 456 |
+
--do_train \
|
| 457 |
+
--do_predict \
|
| 458 |
+
--predict_with_generate \
|
| 459 |
+
--model_name_or_path "$2" \
|
| 460 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights \
|
| 461 |
+
--data_dir CL_Benchmark \
|
| 462 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 463 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/imdb \
|
| 464 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb \
|
| 465 |
+
--per_device_train_batch_size $BSZ \
|
| 466 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 467 |
+
--gradient_accumulation_steps $GA \
|
| 468 |
+
--learning_rate 0.0003 \
|
| 469 |
+
--num_train_epochs 10 \
|
| 470 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 471 |
+
--max_source_length 512 \
|
| 472 |
+
--max_target_length 50 \
|
| 473 |
+
--generation_max_length 50 \
|
| 474 |
+
--add_task_name False \
|
| 475 |
+
--add_dataset_name False \
|
| 476 |
+
--overwrite_output_dir \
|
| 477 |
+
--overwrite_cache \
|
| 478 |
+
--lr_scheduler_type constant \
|
| 479 |
+
--warmup_steps 0 \
|
| 480 |
+
--logging_strategy steps \
|
| 481 |
+
--logging_steps 10 \
|
| 482 |
+
--metric_for_best_model eval_exact_match_for_imdb \
|
| 483 |
+
--evaluation_strategy steps \
|
| 484 |
+
--save_strategy steps \
|
| 485 |
+
--save_total_limit 1 \
|
| 486 |
+
--load_best_model_at_end \
|
| 487 |
+
--lora_r 8 \
|
| 488 |
+
--lora_alpha 32 \
|
| 489 |
+
--lora_dropout 0.0 \
|
| 490 |
+
--data_replay_freq 5 \
|
| 491 |
+
--mlp_hidden_dim 100 \
|
| 492 |
+
--model_name specroute \
|
| 493 |
+
--kl_ratio 0.1 \
|
| 494 |
+
--gen_data_dir CL_Benchmark \
|
| 495 |
+
--threshold 0.980 \
|
| 496 |
+
--transthreshold 0.980 \
|
| 497 |
+
$FP16_FLAG
|
| 498 |
+
|
| 499 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/checkpoint*
|
| 500 |
+
|
| 501 |
+
sleep 5
|
| 502 |
+
|
| 503 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 504 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 505 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 506 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 507 |
+
else
|
| 508 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 509 |
+
fi
|
| 510 |
+
|
| 511 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 512 |
+
--do_train \
|
| 513 |
+
--do_predict \
|
| 514 |
+
--predict_with_generate \
|
| 515 |
+
--model_name_or_path "$2" \
|
| 516 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights \
|
| 517 |
+
--data_dir CL_Benchmark \
|
| 518 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 519 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/sst2 \
|
| 520 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2 \
|
| 521 |
+
--per_device_train_batch_size $BSZ \
|
| 522 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 523 |
+
--gradient_accumulation_steps $GA \
|
| 524 |
+
--learning_rate 0.0003 \
|
| 525 |
+
--num_train_epochs 10 \
|
| 526 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 527 |
+
--max_source_length 512 \
|
| 528 |
+
--max_target_length 50 \
|
| 529 |
+
--generation_max_length 50 \
|
| 530 |
+
--add_task_name False \
|
| 531 |
+
--add_dataset_name False \
|
| 532 |
+
--overwrite_output_dir \
|
| 533 |
+
--overwrite_cache \
|
| 534 |
+
--lr_scheduler_type constant \
|
| 535 |
+
--warmup_steps 0 \
|
| 536 |
+
--logging_strategy steps \
|
| 537 |
+
--logging_steps 10 \
|
| 538 |
+
--metric_for_best_model eval_exact_match_for_sst2 \
|
| 539 |
+
--evaluation_strategy steps \
|
| 540 |
+
--save_strategy steps \
|
| 541 |
+
--save_total_limit 1 \
|
| 542 |
+
--load_best_model_at_end \
|
| 543 |
+
--lora_r 8 \
|
| 544 |
+
--lora_alpha 32 \
|
| 545 |
+
--lora_dropout 0.0 \
|
| 546 |
+
--data_replay_freq 5 \
|
| 547 |
+
--mlp_hidden_dim 100 \
|
| 548 |
+
--model_name specroute \
|
| 549 |
+
--kl_ratio 0.1 \
|
| 550 |
+
--gen_data_dir CL_Benchmark \
|
| 551 |
+
--threshold 0.980 \
|
| 552 |
+
--transthreshold 0.980 \
|
| 553 |
+
$FP16_FLAG
|
| 554 |
+
|
| 555 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/checkpoint*
|
| 556 |
+
|
| 557 |
+
sleep 5
|
| 558 |
+
|
| 559 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 560 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 561 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 562 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 563 |
+
else
|
| 564 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 565 |
+
fi
|
| 566 |
+
|
| 567 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 568 |
+
--do_train \
|
| 569 |
+
--do_predict \
|
| 570 |
+
--predict_with_generate \
|
| 571 |
+
--model_name_or_path "$2" \
|
| 572 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights \
|
| 573 |
+
--data_dir CL_Benchmark \
|
| 574 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 575 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/dbpedia \
|
| 576 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia \
|
| 577 |
+
--per_device_train_batch_size $BSZ \
|
| 578 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 579 |
+
--gradient_accumulation_steps $GA \
|
| 580 |
+
--learning_rate 0.0003 \
|
| 581 |
+
--num_train_epochs 10 \
|
| 582 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 583 |
+
--max_source_length 512 \
|
| 584 |
+
--max_target_length 50 \
|
| 585 |
+
--generation_max_length 50 \
|
| 586 |
+
--add_task_name False \
|
| 587 |
+
--add_dataset_name False \
|
| 588 |
+
--overwrite_output_dir \
|
| 589 |
+
--overwrite_cache \
|
| 590 |
+
--lr_scheduler_type constant \
|
| 591 |
+
--warmup_steps 0 \
|
| 592 |
+
--logging_strategy steps \
|
| 593 |
+
--logging_steps 10 \
|
| 594 |
+
--metric_for_best_model eval_exact_match_for_dbpedia \
|
| 595 |
+
--evaluation_strategy steps \
|
| 596 |
+
--save_strategy steps \
|
| 597 |
+
--save_total_limit 1 \
|
| 598 |
+
--load_best_model_at_end \
|
| 599 |
+
--lora_r 8 \
|
| 600 |
+
--lora_alpha 32 \
|
| 601 |
+
--lora_dropout 0.0 \
|
| 602 |
+
--data_replay_freq 5 \
|
| 603 |
+
--mlp_hidden_dim 100 \
|
| 604 |
+
--model_name specroute \
|
| 605 |
+
--kl_ratio 0.1 \
|
| 606 |
+
--gen_data_dir CL_Benchmark \
|
| 607 |
+
--threshold 0.980 \
|
| 608 |
+
--transthreshold 0.980 \
|
| 609 |
+
$FP16_FLAG
|
| 610 |
+
|
| 611 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/checkpoint*
|
| 612 |
+
|
| 613 |
+
sleep 5
|
| 614 |
+
|
| 615 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 616 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 617 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 618 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 619 |
+
else
|
| 620 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 621 |
+
fi
|
| 622 |
+
|
| 623 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 624 |
+
--do_train \
|
| 625 |
+
--do_predict \
|
| 626 |
+
--predict_with_generate \
|
| 627 |
+
--model_name_or_path "$2" \
|
| 628 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights \
|
| 629 |
+
--data_dir CL_Benchmark \
|
| 630 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 631 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/agnews \
|
| 632 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews \
|
| 633 |
+
--per_device_train_batch_size $BSZ \
|
| 634 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 635 |
+
--gradient_accumulation_steps $GA \
|
| 636 |
+
--learning_rate 0.0003 \
|
| 637 |
+
--num_train_epochs 10 \
|
| 638 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 639 |
+
--max_source_length 512 \
|
| 640 |
+
--max_target_length 50 \
|
| 641 |
+
--generation_max_length 50 \
|
| 642 |
+
--add_task_name False \
|
| 643 |
+
--add_dataset_name False \
|
| 644 |
+
--overwrite_output_dir \
|
| 645 |
+
--overwrite_cache \
|
| 646 |
+
--lr_scheduler_type constant \
|
| 647 |
+
--warmup_steps 0 \
|
| 648 |
+
--logging_strategy steps \
|
| 649 |
+
--logging_steps 10 \
|
| 650 |
+
--metric_for_best_model eval_exact_match_for_agnews \
|
| 651 |
+
--evaluation_strategy steps \
|
| 652 |
+
--save_strategy steps \
|
| 653 |
+
--save_total_limit 1 \
|
| 654 |
+
--load_best_model_at_end \
|
| 655 |
+
--lora_r 8 \
|
| 656 |
+
--lora_alpha 32 \
|
| 657 |
+
--lora_dropout 0.0 \
|
| 658 |
+
--data_replay_freq 5 \
|
| 659 |
+
--mlp_hidden_dim 100 \
|
| 660 |
+
--model_name specroute \
|
| 661 |
+
--kl_ratio 0.1 \
|
| 662 |
+
--gen_data_dir CL_Benchmark \
|
| 663 |
+
--threshold 0.980 \
|
| 664 |
+
--transthreshold 0.980 \
|
| 665 |
+
$FP16_FLAG
|
| 666 |
+
|
| 667 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/checkpoint*
|
| 668 |
+
|
| 669 |
+
sleep 5
|
| 670 |
+
|
| 671 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 672 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 673 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 674 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 675 |
+
else
|
| 676 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 677 |
+
fi
|
| 678 |
+
|
| 679 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 680 |
+
--do_train \
|
| 681 |
+
--do_predict \
|
| 682 |
+
--predict_with_generate \
|
| 683 |
+
--model_name_or_path "$2" \
|
| 684 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/saved_weights \
|
| 685 |
+
--data_dir CL_Benchmark \
|
| 686 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 687 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/yahoo \
|
| 688 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo \
|
| 689 |
+
--per_device_train_batch_size $BSZ \
|
| 690 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 691 |
+
--gradient_accumulation_steps $GA \
|
| 692 |
+
--learning_rate 0.0003 \
|
| 693 |
+
--num_train_epochs 10 \
|
| 694 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 695 |
+
--max_source_length 512 \
|
| 696 |
+
--max_target_length 50 \
|
| 697 |
+
--generation_max_length 50 \
|
| 698 |
+
--add_task_name False \
|
| 699 |
+
--add_dataset_name False \
|
| 700 |
+
--overwrite_output_dir \
|
| 701 |
+
--overwrite_cache \
|
| 702 |
+
--lr_scheduler_type constant \
|
| 703 |
+
--warmup_steps 0 \
|
| 704 |
+
--logging_strategy steps \
|
| 705 |
+
--logging_steps 10 \
|
| 706 |
+
--metric_for_best_model eval_exact_match_for_yahoo \
|
| 707 |
+
--evaluation_strategy steps \
|
| 708 |
+
--save_strategy steps \
|
| 709 |
+
--save_total_limit 1 \
|
| 710 |
+
--load_best_model_at_end \
|
| 711 |
+
--lora_r 8 \
|
| 712 |
+
--lora_alpha 32 \
|
| 713 |
+
--lora_dropout 0.0 \
|
| 714 |
+
--data_replay_freq 5 \
|
| 715 |
+
--mlp_hidden_dim 100 \
|
| 716 |
+
--model_name specroute \
|
| 717 |
+
--kl_ratio 0.1 \
|
| 718 |
+
--gen_data_dir CL_Benchmark \
|
| 719 |
+
--threshold 0.980 \
|
| 720 |
+
--transthreshold 0.980 \
|
| 721 |
+
$FP16_FLAG
|
| 722 |
+
|
| 723 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo/checkpoint*
|
| 724 |
+
|
| 725 |
+
sleep 5
|
| 726 |
+
|
| 727 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 728 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 729 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 730 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 731 |
+
else
|
| 732 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 733 |
+
fi
|
| 734 |
+
|
| 735 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 736 |
+
--do_train \
|
| 737 |
+
--do_predict \
|
| 738 |
+
--predict_with_generate \
|
| 739 |
+
--model_name_or_path $2 \
|
| 740 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo/saved_weights \
|
| 741 |
+
--data_dir CL_Benchmark \
|
| 742 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 743 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/multirc \
|
| 744 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/13-multirc \
|
| 745 |
+
--per_device_train_batch_size $BSZ \
|
| 746 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 747 |
+
--gradient_accumulation_steps $GA \
|
| 748 |
+
--learning_rate 0.0003 \
|
| 749 |
+
--num_train_epochs 10 \
|
| 750 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 751 |
+
--max_source_length 512 \
|
| 752 |
+
--max_target_length 50 \
|
| 753 |
+
--generation_max_length 50 \
|
| 754 |
+
--add_task_name False \
|
| 755 |
+
--add_dataset_name False \
|
| 756 |
+
--overwrite_output_dir \
|
| 757 |
+
--overwrite_cache \
|
| 758 |
+
--lr_scheduler_type constant \
|
| 759 |
+
--warmup_steps 0 \
|
| 760 |
+
--logging_strategy steps \
|
| 761 |
+
--logging_steps 10 \
|
| 762 |
+
--metric_for_best_model eval_exact_match_for_multirc \
|
| 763 |
+
--evaluation_strategy steps \
|
| 764 |
+
--save_strategy steps \
|
| 765 |
+
--save_total_limit 1 \
|
| 766 |
+
--load_best_model_at_end \
|
| 767 |
+
--lora_r 8 \
|
| 768 |
+
--lora_alpha 32 \
|
| 769 |
+
--lora_dropout 0.0 \
|
| 770 |
+
--data_replay_freq 5 \
|
| 771 |
+
--mlp_hidden_dim 100 \
|
| 772 |
+
--model_name specroute \
|
| 773 |
+
--kl_ratio 0.1 \
|
| 774 |
+
--gen_data_dir CL_Benchmark \
|
| 775 |
+
--threshold 0.980 \
|
| 776 |
+
--transthreshold 0.980 \
|
| 777 |
+
$FP16_FLAG
|
| 778 |
+
|
| 779 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/13-multirc/checkpoint*
|
| 780 |
+
|
| 781 |
+
sleep 5
|
| 782 |
+
|
| 783 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 784 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 785 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 786 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 787 |
+
else
|
| 788 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 789 |
+
fi
|
| 790 |
+
|
| 791 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 792 |
+
--do_train \
|
| 793 |
+
--do_predict \
|
| 794 |
+
--predict_with_generate \
|
| 795 |
+
--model_name_or_path $2 \
|
| 796 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/13-multirc/saved_weights \
|
| 797 |
+
--data_dir CL_Benchmark \
|
| 798 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 799 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/boolq \
|
| 800 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/14-boolq \
|
| 801 |
+
--per_device_train_batch_size $BSZ \
|
| 802 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 803 |
+
--gradient_accumulation_steps $GA \
|
| 804 |
+
--learning_rate 0.0003 \
|
| 805 |
+
--num_train_epochs 10 \
|
| 806 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 807 |
+
--max_source_length 512 \
|
| 808 |
+
--max_target_length 50 \
|
| 809 |
+
--generation_max_length 50 \
|
| 810 |
+
--add_task_name False \
|
| 811 |
+
--add_dataset_name False \
|
| 812 |
+
--overwrite_output_dir \
|
| 813 |
+
--overwrite_cache \
|
| 814 |
+
--lr_scheduler_type constant \
|
| 815 |
+
--warmup_steps 0 \
|
| 816 |
+
--logging_strategy steps \
|
| 817 |
+
--logging_steps 10 \
|
| 818 |
+
--metric_for_best_model eval_exact_match_for_boolq \
|
| 819 |
+
--evaluation_strategy steps \
|
| 820 |
+
--save_strategy steps \
|
| 821 |
+
--save_total_limit 1 \
|
| 822 |
+
--load_best_model_at_end \
|
| 823 |
+
--lora_r 8 \
|
| 824 |
+
--lora_alpha 32 \
|
| 825 |
+
--lora_dropout 0.0 \
|
| 826 |
+
--data_replay_freq 5 \
|
| 827 |
+
--mlp_hidden_dim 100 \
|
| 828 |
+
--model_name specroute \
|
| 829 |
+
--kl_ratio 0.1 \
|
| 830 |
+
--gen_data_dir CL_Benchmark \
|
| 831 |
+
--threshold 0.980 \
|
| 832 |
+
--transthreshold 0.980 \
|
| 833 |
+
$FP16_FLAG
|
| 834 |
+
|
| 835 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/14-boolq/checkpoint*
|
| 836 |
+
|
| 837 |
+
sleep 5
|
| 838 |
+
|
| 839 |
+
if [ "$GPU_MODE" = "t4_2gpu" ]; then
|
| 840 |
+
BSZ=16; GA=1; EVAL_BSZ=256
|
| 841 |
+
elif [ "$GPU_MODE" = "t4_1gpu" ]; then
|
| 842 |
+
BSZ=32; GA=1; EVAL_BSZ=256
|
| 843 |
+
else
|
| 844 |
+
BSZ=64; GA=1; EVAL_BSZ=512
|
| 845 |
+
fi
|
| 846 |
+
|
| 847 |
+
CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
|
| 848 |
+
--do_train \
|
| 849 |
+
--do_predict \
|
| 850 |
+
--predict_with_generate \
|
| 851 |
+
--model_name_or_path $2 \
|
| 852 |
+
--previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/12-yahoo/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/13-multirc/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/14-boolq/saved_weights \
|
| 853 |
+
--data_dir CL_Benchmark \
|
| 854 |
+
--task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
|
| 855 |
+
--task_config_dir configs/gen_script_long_order3_t5_configs/wic \
|
| 856 |
+
--output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/15-wic \
|
| 857 |
+
--per_device_train_batch_size $BSZ \
|
| 858 |
+
--per_device_eval_batch_size $EVAL_BSZ \
|
| 859 |
+
--gradient_accumulation_steps $GA \
|
| 860 |
+
--learning_rate 0.0003 \
|
| 861 |
+
--num_train_epochs 10 \
|
| 862 |
+
--run_name gen_script_long_order3_t5_small_specroute_v2 \
|
| 863 |
+
--max_source_length 512 \
|
| 864 |
+
--max_target_length 50 \
|
| 865 |
+
--generation_max_length 50 \
|
| 866 |
+
--add_task_name False \
|
| 867 |
+
--add_dataset_name False \
|
| 868 |
+
--overwrite_output_dir \
|
| 869 |
+
--overwrite_cache \
|
| 870 |
+
--lr_scheduler_type constant \
|
| 871 |
+
--warmup_steps 0 \
|
| 872 |
+
--logging_strategy steps \
|
| 873 |
+
--logging_steps 10 \
|
| 874 |
+
--metric_for_best_model eval_exact_match_for_wic \
|
| 875 |
+
--evaluation_strategy steps \
|
| 876 |
+
--save_strategy steps \
|
| 877 |
+
--save_total_limit 1 \
|
| 878 |
+
--load_best_model_at_end \
|
| 879 |
+
--lora_r 8 \
|
| 880 |
+
--lora_alpha 32 \
|
| 881 |
+
--lora_dropout 0.0 \
|
| 882 |
+
--data_replay_freq 5 \
|
| 883 |
+
--mlp_hidden_dim 100 \
|
| 884 |
+
--model_name specroute \
|
| 885 |
+
--kl_ratio 0.1 \
|
| 886 |
+
--gen_data_dir CL_Benchmark \
|
| 887 |
+
--threshold 0.980 \
|
| 888 |
+
--transthreshold 0.980 \
|
| 889 |
+
$FP16_FLAG
|
| 890 |
+
|
| 891 |
+
rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v2/outputs/15-wic/checkpoint*
|
| 892 |
+
|
| 893 |
+
sleep 5
|
improve_gainlora/generate_specroute_scripts_v2.py
CHANGED
|
@@ -107,7 +107,7 @@ SCRIPT_CONFIGS = {
|
|
| 107 |
"lora_r": 8,
|
| 108 |
"epochs": 10,
|
| 109 |
"metric_base": "eval_exact_match",
|
| 110 |
-
"do_predict":
|
| 111 |
"cleanup_checkpoints": True,
|
| 112 |
"batch_a100_task1": (8, 4),
|
| 113 |
"batch_a100_rest": (16, 2),
|
|
@@ -125,7 +125,7 @@ SCRIPT_CONFIGS = {
|
|
| 125 |
"lora_r": 8,
|
| 126 |
"epochs": 10,
|
| 127 |
"metric_base": "eval_exact_match",
|
| 128 |
-
"do_predict":
|
| 129 |
"cleanup_checkpoints": True,
|
| 130 |
"batch_a100_task1": (8, 4),
|
| 131 |
"batch_a100_rest": (16, 2),
|
|
|
|
| 107 |
"lora_r": 8,
|
| 108 |
"epochs": 10,
|
| 109 |
"metric_base": "eval_exact_match",
|
| 110 |
+
"do_predict": True,
|
| 111 |
"cleanup_checkpoints": True,
|
| 112 |
"batch_a100_task1": (8, 4),
|
| 113 |
"batch_a100_rest": (16, 2),
|
|
|
|
| 125 |
"lora_r": 8,
|
| 126 |
"epochs": 10,
|
| 127 |
"metric_base": "eval_exact_match",
|
| 128 |
+
"do_predict": True,
|
| 129 |
"cleanup_checkpoints": True,
|
| 130 |
"batch_a100_task1": (8, 4),
|
| 131 |
"batch_a100_rest": (16, 2),
|
improve_gainlora/src/cl_trainer_specroute.py
CHANGED
|
@@ -68,9 +68,21 @@ class DenserEvalCallback(TrainerCallback):
|
|
| 68 |
return control
|
| 69 |
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
class SpecRoute_Trainer(Seq2SeqTrainer):
|
| 72 |
|
| 73 |
def __init__(self, model, args, train_dataset, cur_task_id, task_order,
|
|
|
|
| 74 |
eval_dataset=None, tokenizer=None, data_collator=None,
|
| 75 |
compute_metrics=None, callbacks=None):
|
| 76 |
super().__init__(
|
|
@@ -82,7 +94,32 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
|
|
| 82 |
self.task_order = task_order
|
| 83 |
self.cur_task_id = cur_task_id
|
| 84 |
self._grad_check_done = False
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
def _save(self, output_dir=None, state_dict=None):
|
| 88 |
# T5 shared embeddings are incompatible with safetensors; force pytorch format
|
|
@@ -94,8 +131,56 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
|
|
| 94 |
self.args.save_safetensors = old
|
| 95 |
|
| 96 |
def training_step(self, model, inputs, **kwargs):
|
| 97 |
-
"""Override to add one-time gradient diagnostic."""
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# One-time gradient check after first backward
|
| 101 |
if not self._grad_check_done:
|
|
|
|
| 68 |
return control
|
| 69 |
|
| 70 |
|
| 71 |
+
def create_memory_replay_generators(task, task_list, replay_data_dict):
|
| 72 |
+
"""Create cycling iterators for previous tasks' replay data."""
|
| 73 |
+
print('Creating generators for previous tasks (SpecRoute replay) ...')
|
| 74 |
+
tasks_to_generators = {}
|
| 75 |
+
curr_task_num = task_list.index(task)
|
| 76 |
+
for idx in np.arange(curr_task_num):
|
| 77 |
+
prev_task = task_list[idx]
|
| 78 |
+
tasks_to_generators[prev_task] = iter(replay_data_dict[prev_task])
|
| 79 |
+
return tasks_to_generators
|
| 80 |
+
|
| 81 |
+
|
| 82 |
class SpecRoute_Trainer(Seq2SeqTrainer):
|
| 83 |
|
| 84 |
def __init__(self, model, args, train_dataset, cur_task_id, task_order,
|
| 85 |
+
data_collator_replay=None, replay_dataset_dict=None,
|
| 86 |
eval_dataset=None, tokenizer=None, data_collator=None,
|
| 87 |
compute_metrics=None, callbacks=None):
|
| 88 |
super().__init__(
|
|
|
|
| 94 |
self.task_order = task_order
|
| 95 |
self.cur_task_id = cur_task_id
|
| 96 |
self._grad_check_done = False
|
| 97 |
+
|
| 98 |
+
# Experience replay setup
|
| 99 |
+
self.data_collator_replay = data_collator_replay
|
| 100 |
+
self.replay_dataset_dict = replay_dataset_dict
|
| 101 |
+
if self.args.data_replay_freq != -1 and replay_dataset_dict is not None:
|
| 102 |
+
from torch.utils.data import RandomSampler
|
| 103 |
+
from transformers.trainer_utils import seed_worker
|
| 104 |
+
seed = self.args.data_seed if self.args.data_seed is not None else self.args.seed
|
| 105 |
+
generator = torch.Generator()
|
| 106 |
+
generator.manual_seed(seed)
|
| 107 |
+
self.replay_dataloader_dict = {}
|
| 108 |
+
for dataset_name, dataset in self.replay_dataset_dict.items():
|
| 109 |
+
train_sampler = RandomSampler(dataset, generator=generator)
|
| 110 |
+
self.replay_dataloader_dict[dataset_name] = DataLoader(
|
| 111 |
+
dataset,
|
| 112 |
+
batch_size=self._train_batch_size,
|
| 113 |
+
sampler=train_sampler,
|
| 114 |
+
collate_fn=self.data_collator_replay,
|
| 115 |
+
drop_last=self.args.dataloader_drop_last,
|
| 116 |
+
num_workers=self.args.dataloader_num_workers,
|
| 117 |
+
pin_memory=False,
|
| 118 |
+
worker_init_fn=seed_worker)
|
| 119 |
+
self.replay_iterator_dict = create_memory_replay_generators(
|
| 120 |
+
task_order[cur_task_id], task_order, self.replay_dataloader_dict)
|
| 121 |
+
print(f"[SpecRoute Replay] Enabled: {len(self.replay_dataloader_dict)} tasks, "
|
| 122 |
+
f"freq={self.args.data_replay_freq}, ratio={self.args.kl_ratio}")
|
| 123 |
|
| 124 |
def _save(self, output_dir=None, state_dict=None):
|
| 125 |
# T5 shared embeddings are incompatible with safetensors; force pytorch format
|
|
|
|
| 131 |
self.args.save_safetensors = old
|
| 132 |
|
| 133 |
def training_step(self, model, inputs, **kwargs):
|
| 134 |
+
"""Override to add experience replay and one-time gradient diagnostic."""
|
| 135 |
+
model.train()
|
| 136 |
+
inputs = self._prepare_inputs(inputs)
|
| 137 |
+
|
| 138 |
+
with self.compute_loss_context_manager():
|
| 139 |
+
loss = self.compute_loss(model, inputs)
|
| 140 |
+
|
| 141 |
+
if self.args.n_gpu > 1:
|
| 142 |
+
loss = loss.mean()
|
| 143 |
+
|
| 144 |
+
if self.args.gradient_accumulation_steps > 1 and not self.is_deepspeed_enabled:
|
| 145 |
+
loss = loss / self.args.gradient_accumulation_steps
|
| 146 |
+
|
| 147 |
+
if self.is_deepspeed_enabled:
|
| 148 |
+
self.accelerator.backward(loss)
|
| 149 |
+
else:
|
| 150 |
+
loss.backward()
|
| 151 |
+
|
| 152 |
+
# === Experience Replay: CE loss on old task data ===
|
| 153 |
+
replay_freq = getattr(self.args, 'data_replay_freq', -1)
|
| 154 |
+
if (replay_freq != -1
|
| 155 |
+
and hasattr(self, 'replay_iterator_dict')
|
| 156 |
+
and self.state.global_step > getattr(self.args, 'replay_after_n_epoch', 0) * getattr(self.args, 'step_per_epoch', 0)
|
| 157 |
+
and self.state.global_step % replay_freq == 0):
|
| 158 |
+
for item in list(self.replay_iterator_dict.keys()):
|
| 159 |
+
generator_mem = self.replay_iterator_dict[item]
|
| 160 |
+
try:
|
| 161 |
+
b = next(generator_mem)
|
| 162 |
+
except StopIteration:
|
| 163 |
+
generator_mem = iter(self.replay_dataloader_dict[item])
|
| 164 |
+
self.replay_iterator_dict[item] = generator_mem
|
| 165 |
+
b = next(generator_mem)
|
| 166 |
+
|
| 167 |
+
# Remove replay_labels if present (not needed for CE replay)
|
| 168 |
+
b.pop("replay_labels", None)
|
| 169 |
+
replay_inputs = self._prepare_inputs(b)
|
| 170 |
+
|
| 171 |
+
with self.compute_loss_context_manager():
|
| 172 |
+
replay_loss = self.compute_loss(model, replay_inputs)
|
| 173 |
+
|
| 174 |
+
kl_ratio = getattr(self.args, 'kl_ratio', 0.1)
|
| 175 |
+
replay_loss = kl_ratio * replay_loss
|
| 176 |
+
|
| 177 |
+
if self.args.n_gpu > 1:
|
| 178 |
+
replay_loss = replay_loss.mean()
|
| 179 |
+
|
| 180 |
+
if self.is_deepspeed_enabled:
|
| 181 |
+
self.accelerator.backward(replay_loss)
|
| 182 |
+
else:
|
| 183 |
+
replay_loss.backward()
|
| 184 |
|
| 185 |
# One-time gradient check after first backward
|
| 186 |
if not self._grad_check_done:
|
improve_gainlora/src/run_t5.py
CHANGED
|
@@ -707,7 +707,9 @@ def main():
|
|
| 707 |
input_record_file=data_args.input_record_file)
|
| 708 |
|
| 709 |
replay_dataset_dict, replay_label_dict = None, None
|
| 710 |
-
|
|
|
|
|
|
|
| 711 |
replay_dataset_dict = {}
|
| 712 |
abs_data_dir_replay = os.path.abspath(data_dir) if data_dir else None
|
| 713 |
for idx in range(cur_task_id):
|
|
@@ -725,12 +727,14 @@ def main():
|
|
| 725 |
replay_dataset_dict[task_order[idx]] = raw_datasets_gen["train"]
|
| 726 |
print(raw_datasets_gen)
|
| 727 |
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
|
|
|
|
|
|
| 734 |
print('-'*50)
|
| 735 |
|
| 736 |
# Metric
|
|
@@ -866,6 +870,8 @@ def main():
|
|
| 866 |
train_dataset=train_dataset if training_args.do_train else None,
|
| 867 |
cur_task_id=cur_task_id,
|
| 868 |
task_order=task_order,
|
|
|
|
|
|
|
| 869 |
eval_dataset=eval_dataset if training_args.do_eval else None,
|
| 870 |
tokenizer=tokenizer,
|
| 871 |
data_collator=data_collator,
|
|
|
|
| 707 |
input_record_file=data_args.input_record_file)
|
| 708 |
|
| 709 |
replay_dataset_dict, replay_label_dict = None, None
|
| 710 |
+
# Load replay datasets for methods that need it
|
| 711 |
+
_need_replay_data = model_args.load_checkpoint_from or (training_args.model_name == 'specroute' and cur_task_id > 0)
|
| 712 |
+
if _need_replay_data:
|
| 713 |
replay_dataset_dict = {}
|
| 714 |
abs_data_dir_replay = os.path.abspath(data_dir) if data_dir else None
|
| 715 |
for idx in range(cur_task_id):
|
|
|
|
| 727 |
replay_dataset_dict[task_order[idx]] = raw_datasets_gen["train"]
|
| 728 |
print(raw_datasets_gen)
|
| 729 |
|
| 730 |
+
# Load attention weights for KL replay (GainLoRA only, not SpecRoute)
|
| 731 |
+
if model_args.load_checkpoint_from:
|
| 732 |
+
replay_label_dict = {}
|
| 733 |
+
for idx in range(0,cur_task_id):
|
| 734 |
+
with open(os.path.join("../logs_and_outputs/" + training_args.run_name + "/outputs/", str(idx+1)+"-"+task_order[idx], "saved_weights", "attention_weights.pkl"), 'rb') as f:
|
| 735 |
+
attn_weights = pickle.load(f)
|
| 736 |
+
replay_label_dict[task_order[idx]] = torch.cat([torch.tensor([0.] * (cur_task_id - idx)), torch.tensor(attn_weights)], dim=0).to(dtype=torch.bfloat16, device='cuda')
|
| 737 |
+
print(replay_label_dict)
|
| 738 |
print('-'*50)
|
| 739 |
|
| 740 |
# Metric
|
|
|
|
| 870 |
train_dataset=train_dataset if training_args.do_train else None,
|
| 871 |
cur_task_id=cur_task_id,
|
| 872 |
task_order=task_order,
|
| 873 |
+
data_collator_replay=data_collator_replay,
|
| 874 |
+
replay_dataset_dict=replay_dataset_dict,
|
| 875 |
eval_dataset=eval_dataset if training_args.do_eval else None,
|
| 876 |
tokenizer=tokenizer,
|
| 877 |
data_collator=data_collator,
|
results/comparison_results.md
CHANGED
|
@@ -152,7 +152,18 @@ python src/compute_ap_ft.py \
|
|
| 152 |
| 13 | agnews | | | | |
|
| 153 |
| 14 | multirc | | | | |
|
| 154 |
| 15 | yahoo | | | | |
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
---
|
| 158 |
|
|
@@ -160,43 +171,34 @@ python src/compute_ap_ft.py \
|
|
| 160 |
|
| 161 |
| Method | Order 3 AP↑ | Order 3 FT↓ |
|
| 162 |
|--------|-------------|-------------|
|
| 163 |
-
| **GainLoRA (Root)** | 59.70 | N/A* |
|
| 164 |
-
| **SpecRoute (Improve)** |
|
| 165 |
|
| 166 |
-
> *\*FT = N/A:
|
|
|
|
| 167 |
|
| 168 |
-
##
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|---|------|-----------------------|---------------------------|
|
| 172 |
-
| 1 | yelp | 56.01 | |
|
| 173 |
-
| 2 | amazon | 52.05 | |
|
| 174 |
-
| 3 | mnli | 34.07 | |
|
| 175 |
-
| 4 | cb | 3.57 | |
|
| 176 |
-
| 5 | copa | 42.00 | |
|
| 177 |
-
| 6 | qqp | 76.96 | |
|
| 178 |
-
| 7 | rte | 45.85 | |
|
| 179 |
-
| 8 | imdb | 89.51 | |
|
| 180 |
-
| 9 | sst2 | 85.21 | |
|
| 181 |
-
| 10 | dbpedia | 98.16 | |
|
| 182 |
-
| 11 | agnews | 88.37 | |
|
| 183 |
-
| 12 | yahoo | 57.28 | |
|
| 184 |
-
| 13 | multirc | 50.52 | |
|
| 185 |
-
| 14 | boolq | 60.43 | |
|
| 186 |
-
| 15 | wic | 55.49 | |
|
| 187 |
-
| | **AP / FT** | **59.70 / N/A** | |
|
| 188 |
|
| 189 |
-
--
|
| 190 |
-
|
| 191 |
-
## Quick Harvest (chạy sau khi xong cả 4 orders)
|
| 192 |
-
|
| 193 |
-
```bash
|
| 194 |
-
# Chạy 4 lệnh này để lấy đủ số cho cả 2 bảng:
|
| 195 |
-
python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order1_t5large/outputs --task_order "task1572_samsum_summary,task363_sst2_polarity_classification,task1290_xsum_summarization,task181_outcome_extraction,task002_quoref_answer_generation,task1510_evalution_relation_extraction,task639_multi_woz_user_utterance_generation,task1729_personachat_generate_next,task073_commonsenseqa_answer_generation,task1590_diplomacy_text_generation,task748_glucose_reverse_cause_event_detection,task511_reddit_tifu_long_text_summarization,task591_sciq_answer_generation,task1687_sentiment140_classification,task875_emotion_classification" --save
|
| 196 |
-
|
| 197 |
-
python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order2_t5large/outputs --task_order "task748_glucose_reverse_cause_event_detection,task073_commonsenseqa_answer_generation,task1590_diplomacy_text_generation,task639_multi_woz_user_utterance_generation,task1572_samsum_summary,task1687_sentiment140_classification,task591_sciq_answer_generation,task363_sst2_polarity_classification,task1510_evalution_relation_extraction,task1729_personachat_generate_next,task181_outcome_extraction,task511_reddit_tifu_long_text_summarization,task002_quoref_answer_generation,task1290_xsum_summarization,task875_emotion_classification" --save
|
| 198 |
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
-
python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order4_t5large/outputs --task_order "mnli,cb,wic,copa,qqp,boolq,rte,imdb,yelp,amazon,sst2,dbpedia,agnews,multirc,yahoo" --save
|
| 202 |
-
```
|
|
|
|
| 152 |
| 13 | agnews | | | | |
|
| 153 |
| 14 | multirc | | | | |
|
| 154 |
| 15 | yahoo | | | | |
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
```bash
|
| 158 |
+
# Chạy 4 lệnh này để lấy đủ số cho cả 2 bảng:
|
| 159 |
+
python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order1_t5large/outputs --task_order "task1572_samsum_summary,task363_sst2_polarity_classification,task1290_xsum_summarization,task181_outcome_extraction,task002_quoref_answer_generation,task1510_evalution_relation_extraction,task639_multi_woz_user_utterance_generation,task1729_personachat_generate_next,task073_commonsenseqa_answer_generation,task1590_diplomacy_text_generation,task748_glucose_reverse_cause_event_detection,task511_reddit_tifu_long_text_summarization,task591_sciq_answer_generation,task1687_sentiment140_classification,task875_emotion_classification" --save
|
| 160 |
+
|
| 161 |
+
python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order2_t5large/outputs --task_order "task748_glucose_reverse_cause_event_detection,task073_commonsenseqa_answer_generation,task1590_diplomacy_text_generation,task639_multi_woz_user_utterance_generation,task1572_samsum_summary,task1687_sentiment140_classification,task591_sciq_answer_generation,task363_sst2_polarity_classification,task1510_evalution_relation_extraction,task1729_personachat_generate_next,task181_outcome_extraction,task511_reddit_tifu_long_text_summarization,task002_quoref_answer_generation,task1290_xsum_summarization,task875_emotion_classification" --save
|
| 162 |
+
|
| 163 |
+
python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order3_t5large/outputs --task_order "yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic" --save
|
| 164 |
+
|
| 165 |
+
python src/compute_ap_ft.py --output_base logs_and_outputs/ot_sign_order4_t5large/outputs --task_order "mnli,cb,wic,copa,qqp,boolq,rte,imdb,yelp,amazon,sst2,dbpedia,agnews,multirc,yahoo" --save
|
| 166 |
+
```
|
| 167 |
|
| 168 |
---
|
| 169 |
|
|
|
|
| 171 |
|
| 172 |
| Method | Order 3 AP↑ | Order 3 FT↓ |
|
| 173 |
|--------|-------------|-------------|
|
| 174 |
+
| **GainLoRA (Root)** | **59.70** | N/A* |
|
| 175 |
+
| **SpecRoute (Improve)** | 39.74† | N/A* |
|
| 176 |
|
| 177 |
+
> *\*FT = N/A: cả 2 log chạy thiếu `--do_predict`. Lần tiếp theo dùng script `T5_small/` đã sửa sẽ có đủ FT.*
|
| 178 |
+
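> *†Điểm Improve tính từ `predict_eval_predictions.jsonl` của từng task (hàng chéo score matrix, tức R_{j,j} đo ngay sau khi train task j). imdb/sst2/wic gần bằng 0 ngay trên hàng chéo, tức là model không học được các task này chứ không phải do Catastrophic Forgetting.*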
|
| 179 |
|
| 180 |
+
### ⚠️ Root GainLoRA tốt hơn SpecRoute trên T5-Small (−19.96 AP)
|
| 181 |
|
| 182 |
+
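SpecRoute gần như không học được 4 task: imdb=0.21 và sst2=0.00 (sentiment), yahoo=8.12 (phân loại chủ đề), wic=0.00 (word-in-context). Vì các điểm này đo ngay sau khi train từng task (hàng chéo score matrix) nên đây là lỗi không học được, chưa phải Catastrophic Forgetting. Nguyên nhân có thể do SVD rank không đủ lớn ở T5-Small, làm routing mechanism không phân tách được subspace của các task.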
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
+
## Per-Task Breakdown — Order 3 (T5-Small)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
+
| # | Task | GainLoRA (Root) | SpecRoute (Improve) | Δ (Improve−Root) |
|
| 187 |
+
|---|------|-----------------|--------------------|-----------------|
|
| 188 |
+
| 1 | yelp | 56.01 | 54.36 | −1.65 |
|
| 189 |
+
| 2 | amazon | 52.05 | 50.01 | −2.04 |
|
| 190 |
+
| 3 | mnli | 34.07 | 35.50 | +1.43 |
|
| 191 |
+
| 4 | cb | 3.57 | 0.00 | −3.57 |
|
| 192 |
+
| 5 | copa | 42.00 | 44.00 | +2.00 |
|
| 193 |
+
| 6 | qqp | 76.96 | 76.72 | −0.24 |
|
| 194 |
+
| 7 | rte | 45.85 | 50.90 | +5.05 |
|
| 195 |
+
| 8 | imdb | 89.51 | 0.21 | **−89.30 ⚠️** |
|
| 196 |
+
| 9 | sst2 | 85.21 | 0.00 | **−85.21 ⚠️** |
|
| 197 |
+
| 10 | dbpedia | 98.16 | 92.22 | −5.94 |
|
| 198 |
+
| 11 | agnews | 88.37 | 68.76 | −19.61 |
|
| 199 |
+
| 12 | yahoo | 57.28 | 8.12 | **−49.16 ⚠️** |
|
| 200 |
+
| 13 | multirc | 50.52 | 54.23 | +3.71 |
|
| 201 |
+
| 14 | boolq | 60.43 | 61.13 | +0.70 |
|
| 202 |
+
| 15 | wic | 55.49 | 0.00 | **−55.49 ⚠️** |
|
| 203 |
+
| | **AP / FT** | **59.70 / N/A** | **39.74 / N/A** | **−19.96** |
|
| 204 |
|
|
|
|
|
|
results/experiment_versions.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SpecRoute — Báo cáo Thử nghiệm theo Version
|
| 2 |
+
|
| 3 |
+
> Tracking tất cả versions thử nghiệm, kết quả, phân tích, và cải tiến.
|
| 4 |
+
> Benchmark: Long Sequence Order 3, 15 classification tasks, model T5-Small.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Version 1.0 — Baseline SpecRoute (Kết quả đầu tiên)
|
| 9 |
+
|
| 10 |
+
### Kịch bản thử nghiệm
|
| 11 |
+
- **Model**: T5-Small (d_model=512, 6 encoder + 6 decoder layers)
|
| 12 |
+
- **Method**: SpecRoute — spectral routing (SVD of LoRA B@A) thay thế learned routing (trans_input + prompt_key) của GainLoRA
|
| 13 |
+
- **So sánh**: ROOT GainLoRA-InfLoRA (original codebase)
|
| 14 |
+
- **Hyperparameters**: lora_r=8, lora_alpha=32, lr=3e-4, 10 epochs, threshold=0.995
|
| 15 |
+
- **Platform**: Kaggle T4 GPU
|
| 16 |
+
|
| 17 |
+
### Kết quả
|
| 18 |
+
|
| 19 |
+
| # | Task | ROOT (Final R_{15,j}) | SpecRoute (Peak R_{j,j}) | Δ |
|
| 20 |
+
|---|------|-----------------------|--------------------------|---|
|
| 21 |
+
| 1 | yelp | 56.01 | 54.36 | -1.65 |
|
| 22 |
+
| 2 | amazon | 52.05 | 50.01 | -2.04 |
|
| 23 |
+
| 3 | mnli | 34.07 | 35.50 | +1.43 |
|
| 24 |
+
| 4 | cb | 3.57 | 0.00 | -3.57 |
|
| 25 |
+
| 5 | copa | 42.00 | 44.00 | +2.00 |
|
| 26 |
+
| 6 | qqp | 76.96 | 76.72 | -0.24 |
|
| 27 |
+
| 7 | rte | 45.85 | 50.90 | +5.05 |
|
| 28 |
+
| 8 | imdb | 89.51 | **0.21** ⚠️ | -89.30 |
|
| 29 |
+
| 9 | sst2 | 85.21 | **0.00** ⚠️ | -85.21 |
|
| 30 |
+
| 10 | dbpedia | 98.16 | 92.22 | -5.94 |
|
| 31 |
+
| 11 | agnews | 88.37 | 68.76 | -19.61 |
|
| 32 |
+
| 12 | yahoo | 57.28 | **8.12** ⚠️ | -49.16 |
|
| 33 |
+
| 13 | multirc | 50.52 | 54.23 | +3.71 |
|
| 34 |
+
| 14 | boolq | 60.43 | 61.13 | +0.70 |
|
| 35 |
+
| 15 | wic | 55.49 | **0.00** ⚠️ | -55.49 |
|
| 36 |
+
| | **Mean** | **59.70** | **39.74** | **-19.96** |
|
| 37 |
+
|
| 38 |
+
> ⚠️ **LƯU Ý QUAN TRỌNG**: So sánh KHÔNG công bằng — ROOT dùng R_{15,j} (final, sau tất cả 15 tasks), SpecRoute dùng R_{j,j} (peak, ngay sau train từng task). AP thực của SpecRoute sẽ thấp hơn 39.74.
|
| 39 |
+
|
| 40 |
+
### Phân tích
|
| 41 |
+
|
| 42 |
+
**1. Prediction metrics không được lưu**
|
| 43 |
+
- SpecRoute `all_results.json` chỉ chứa training metrics, KHÔNG có `predict_exact_match_for_{task}`
|
| 44 |
+
- `task_order.txt` không tồn tại → `score.py` không thể tính AP/FT
|
| 45 |
+
- Nguyên nhân: Có thể do experiment được chạy bằng script khác (không phải T5_small/ scripts đã fix `--do_predict`)
|
| 46 |
+
- T5-large script generator (`generate_specroute_scripts_v2.py`) vẫn có bug `do_predict=False` cho long benchmarks
|
| 47 |
+
|
| 48 |
+
**2. Các tasks THẤT BẠI KHÔNG PHẢI do catastrophic forgetting**
|
| 49 |
+
|
| 50 |
+
| Task | Train Loss (Root) | Train Loss (SpecRoute) | Ratio | Verdict |
|
| 51 |
+
|------|:-:|:-:|:-:|---|
|
| 52 |
+
| imdb | 1.41 | **4.15** | 2.9x | Không thể học |
|
| 53 |
+
| sst2 | 1.76 | **4.45** | 2.5x | Không thể học |
|
| 54 |
+
| yahoo | 1.19 | **3.08** | 2.6x | Không thể học |
|
| 55 |
+
| wic | 0.96 | **3.65** | 3.8x | Không thể học |
|
| 56 |
+
|
| 57 |
+
Training loss của SpecRoute cao gấp 2.5–3.8 lần so với ROOT trên cả 4 tasks này → model KHÔNG THỂ HỌC ngay từ đầu (inability to learn, NOT catastrophic forgetting).
|
| 58 |
+
|
| 59 |
+
**3. Nguyên nhân gốc: GPM null-space saturation + thiếu protection mechanisms**
|
| 60 |
+
|
| 61 |
+
SpecRoute loại bỏ learned routing → đồng thời mất 4/5 cơ chế protection của ROOT:
|
| 62 |
+
|
| 63 |
+
| Protection Mechanism | ROOT | SpecRoute V1 |
|
| 64 |
+
|---------------------|:---:|:---:|
|
| 65 |
+
| GPM on LoRA A | ✅ | ✅ |
|
| 66 |
+
| KL distillation on routing | ✅ | ❌ |
|
| 67 |
+
| Data replay | ✅ | ❌ |
|
| 68 |
+
| Per-step GPM on routing params | ✅ | ❌ (no routing params) |
|
| 69 |
+
| Learned routing adaptation | ✅ | ❌ (by design) |
|
| 70 |
+
|
| 71 |
+
Khi tasks tương tự (imdb/sst2 vs yelp/amazon — cùng sentiment domain) đến, GPM đã "claim" sentiment-relevant directions → model bị ép vào orthogonal null-space không liên quan → KHÔNG thể học sentiment tasks mới.
|
| 72 |
+
|
| 73 |
+
ROOT GainLoRA giải quyết vấn đề này nhờ trans_input MLP map input mới vào representation space REUSE kiến thức cũ, kết hợp KL distillation + data replay.
|
| 74 |
+
|
| 75 |
+
**4. FT (Forgetting) = N/A**
|
| 76 |
+
- Không tính được vì thiếu cross-task prediction metrics
|
| 77 |
+
|
| 78 |
+
### Cải tiến cho V2
|
| 79 |
+
|
| 80 |
+
| # | Loại | Nội dung | Tác động |
|
| 81 |
+
|---|------|---------|----------|
|
| 82 |
+
| 1 | Bug fix | Fix `do_predict=False` → `True` trong generator | Cho phép tính AP/FT đúng |
|
| 83 |
+
| 2 | Config | Giảm GPM threshold: 0.995 → 0.980 | Mở rộng null-space cho tasks sau |
|
| 84 |
+
| 3 | **Idea change** | Thêm Experience Replay (CE loss trên old task data) | Chống forgetting + hỗ trợ knowledge reuse |
|
| 85 |
+
|
| 86 |
+
---
|
| 87 |
+
|
| 88 |
+
## Version 2.0 — SpecRoute + Experience Replay (Planned)
|
| 89 |
+
|
| 90 |
+
### Thay đổi về Idea
|
| 91 |
+
|
| 92 |
+
> **⚠️ IDEA CHANGE**: Version 2 thêm **Experience Replay (CE loss)** vào SpecRoute.
|
| 93 |
+
>
|
| 94 |
+
> SpecRoute V1 claim rằng spectral routing parameter-free đủ để thay thế learned routing. V2 bổ sung rằng:
|
| 95 |
+
> - Spectral routing thay thế **routing mechanism** (đúng, giữ nguyên)
|
| 96 |
+
> - Nhưng **protection mechanisms** (data replay) là ORTHOGONAL với routing mechanism và cần được giữ lại
|
| 97 |
+
> - V2 sử dụng **CE replay trực tiếp** trên old task training data (không cần teacher model hay saved logits)
|
| 98 |
+
> - Khác ROOT (KL on routing scores): SpecRoute replay chỉ cần CE loss vì routing là parameter-free
|
| 99 |
+
>
|
| 100 |
+
> Đây là sự thay đổi từ "spectral routing is sufficient" sang "spectral routing + replay protection is the complete solution".
|
| 101 |
+
> Bản chất: **decouple routing mechanism khỏi protection mechanisms**.
|
| 102 |
+
|
| 103 |
+
### Kịch bản thử nghiệm
|
| 104 |
+
- **Model**: T5-Small (d_model=512, 6 encoder + 6 decoder layers)
|
| 105 |
+
- **Method**: SpecRoute V2 — spectral routing + experience replay (CE loss trên original training data)
|
| 106 |
+
- **Hyperparameters**:
|
| 107 |
+
- lora_r=8, lora_alpha=32, lr=3e-4, 10 epochs
|
| 108 |
+
- **threshold=0.980** (giảm từ 0.995)
|
| 109 |
+
- **data_replay_freq=5** (replay mỗi 5 steps)
|
| 110 |
+
- **kl_ratio=0.1** (weight cho replay CE loss)
|
| 111 |
+
- **gen_data_dir=CL_Benchmark** (replay từ original training data)
|
| 112 |
+
- **Script**: `T5_small/gen_script_long_order3_t5_small_specroute_v2.sh`
|
| 113 |
+
- **Platform**: Kaggle T4 GPU
|
| 114 |
+
|
| 115 |
+
### Code Changes (Actual)
|
| 116 |
+
|
| 117 |
+
**1. Bug Fix: `generate_specroute_scripts_v2.py`**
|
| 118 |
+
- `do_predict=False` → `True` cho `long_order3` và `long_order4`
|
| 119 |
+
|
| 120 |
+
**2. Trainer: `cl_trainer_specroute.py`**
|
| 121 |
+
- Thêm `create_memory_replay_generators()` — tạo DataLoader cycling iterators
|
| 122 |
+
- `__init__()`: nhận `data_collator_replay`, `replay_dataset_dict`, tạo `replay_dataloader_dict` và `replay_iterator_dict`
|
| 123 |
+
- `training_step()`: Sau main CE loss backward, replay CE loss trên old task data:
|
| 124 |
+
```
|
| 125 |
+
Mỗi replay_freq steps:
|
| 126 |
+
For each old task:
|
| 127 |
+
sample batch from replay iterator
|
| 128 |
+
replay_loss = kl_ratio * CE_loss(model, replay_batch)
|
| 129 |
+
replay_loss.backward()
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
**3. Run entry: `run_t5.py`**
|
| 133 |
+
- Mở rộng replay dataset loading condition: `load_checkpoint_from OR (specroute AND cur_task_id > 0)`
|
| 134 |
+
- Skip `attention_weights.pkl` loading cho SpecRoute (không cần KL on routing)
|
| 135 |
+
- Pass `data_collator_replay`, `replay_dataset_dict` vào SpecRoute_Trainer
|
| 136 |
+
|
| 137 |
+
**4. Shell Script: `T5_small/gen_script_long_order3_t5_small_specroute_v2.sh`** (NEW)
|
| 138 |
+
- threshold: 0.995 → 0.980
|
| 139 |
+
- data_replay_freq: -1 → 5
|
| 140 |
+
- Thêm: `--kl_ratio 0.1`, `--gen_data_dir CL_Benchmark`
|
| 141 |
+
- Output dir: `specroute_v2` (tách biệt V1)
|
| 142 |
+
- V1 script giữ nguyên để so sánh
|
| 143 |
+
|
| 144 |
+
### Kết quả
|
| 145 |
+
> *Chưa chạy — cần thực nghiệm*
|
| 146 |
+
|
| 147 |
+
### Phân tích
|
| 148 |
+
> *Pending*
|
| 149 |
+
|
| 150 |
+
### Kỳ vọng
|
| 151 |
+
- Tasks 8 (imdb), 9 (sst2), 12 (yahoo), 15 (wic): kỳ vọng cải thiện đáng kể nhờ threshold thấp hơn (mở rộng null-space)
|
| 152 |
+
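- Overall AP: kỳ vọng đạt >50 nhờ threshold fix (mốc tham chiếu V1 là mean peak R_{j,j} ≈ 39.74 — đây chưa phải AP thực, AP thực của V1 thấp hơn và chưa đo được); replay CE giúp chống forgetting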
|
| 153 |
+
- FT: kỳ vọng tính được (do_predict fix) và forgetting thấp hơn nhờ replay
|
| 154 |
+
|
| 155 |
+
### Nếu kết quả không đạt → V3 Plan
|
| 156 |
+
- **V3a**: Thêm output-level KL distillation (so sánh logits hiện tại vs teacher model snapshot) — yêu cầu lưu teacher model
|
| 157 |
+
- **V3b**: Thêm adaptive threshold per-layer (thay vì cùng threshold cho tất cả layers)
|
| 158 |
+
- **V3c**: SpecRoute + InfLoRA-style direction expansion khi null-space quá nhỏ
|
| 159 |
+
|
| 160 |
+
---
|
| 161 |
+
|
| 162 |
+
## Changelog
|
| 163 |
+
|
| 164 |
+
| Date | Version | Change Type | Description |
|
| 165 |
+
|------|---------|-------------|-------------|
|
| 166 |
+
| 2025-XX-XX | V1.0 | Initial | First experiment — baseline SpecRoute vs ROOT GainLoRA |
|
| 167 |
+
| 2025-XX-XX | V2.0 | Idea + Code | Thêm experience replay (CE), giảm threshold 0.995→0.980, fix do_predict |
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/all_results.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"epoch": 10.0,
|
| 3 |
-
"total_flos": 9122411270725632.0,
|
| 4 |
-
"train_loss": 0.5864024265556579,
|
| 5 |
-
"train_runtime": 1102.9262,
|
| 6 |
-
"train_samples": 5000,
|
| 7 |
-
"train_samples_per_second": 45.334,
|
| 8 |
-
"train_steps_per_second": 1.423
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/eval_eval_predictions.jsonl
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f0a7a162013858ce8c643d5f66d3b834d8a573de497cec931a216f99e1f0178c
|
| 3 |
-
size 8615403
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/predict_eval_predictions.jsonl
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f0a7a162013858ce8c643d5f66d3b834d8a573de497cec931a216f99e1f0178c
|
| 3 |
-
size 8615403
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_0.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6347c1939a53ee2509742a6aa5db72d302a9f33b13fbe90ef95a10f9e9221bb8
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_1.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:843fd6f47f02250c381b8489f82b7adb7cce5852b8dd463066395623cd42ca7c
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_10.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:222a5768a659ab676766654ca8e142f24f7c0761be3b55d63f9a6071d64207ac
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_11.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2b98abe23b3517987bda49db3b1501a0de290ccfe1293476c85ef9fb06ae1af3
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_12.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b4d217ec73e80746735689b52de8985bd9b8b37a3f804d87e8ea8399e5a7ce13
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_13.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b94c96b1e217047809307086ec824bac8630ccbd16d8431e1d4d0bd8cd9d9513
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_14.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:3ddbf830599893c4a9f15ae1abf9f120a95063e6a53fde5581a64244edfccec8
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_15.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:063e96e4e954283653178868f826503b2f5d8003f4317b6396df23f47ba1ef6a
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_16.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:3e145f0f32e63de296a637276dfb148e216c770504333ea90f87f429486cd242
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_17.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6c388d5777061b3d716fa2d9f33b58fd6b88a00880991654d7d74ae7c2f8393f
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_18.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:29cca1a96c16d78b45dec35a9fcd4f69fd78012d801f9ed7169d42b5d17b1818
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_19.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:78e8649cda3b950cd839967a2be7347d627f1ca1fbe32e3fa2f0ef09852c531b
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_2.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:741af2fdc11424d8a3b16fbee464fb95abe96251b5b96e2a6e9a48b3f91b0023
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_20.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:dd3cbcd2c082eef395e674b4f6aff25636050063f66d7df24136a283756a85cb
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_21.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a0a27c57b783efd371db7bb7e49a6b9d088511a9a79b79a9cee12c6584fbf2ec
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_22.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:bbcb435a6636e909e222a288c45038662e44bb1546fa20e602c6dd6bacc7a0d9
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_23.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:86ed74a9a8c5fd74143b95b2d88ad0bfd2e5acb73a55a6de084b2007cd09ba2b
|
| 3 |
-
size 1049687
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_3.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:102c178c81bf0eca52a54fd02f8748209231bce6bcf038daa330e8dedb6cb4e4
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_4.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:764cf3f60cf57542a85cd7a24226830a47d0feda84fc32e6a8a1d9d9c498ebeb
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_5.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0249fa240b48cf18d6bc3638b6977ccbb993a5e4e3d924a7788a662c5668ffed
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_6.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:bdd5d0c75a9037ff92d44e21c8ffbec5d32586e40cbdf5ba08b35f0d7ca3ec29
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_7.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:82169d8abd9475d638fc237287bc2f5273f0a0e8b1e8bcccc0e4ad53c9c74958
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_8.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:dff7efc08c970c36173502656d3837456b6910d5e20efcf2a8a70cfcbcf744ef
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/reg_9.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:80e3d101a726351d264a702ff4dd91d81898206ab423198df379119272c1b3be
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/runs/Mar13_11-48-44_a802a1875a6b/events.out.tfevents.1773402573.a802a1875a6b.120.0
DELETED
|
Binary file (9.73 kB)
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/attention_weights.pkl
DELETED
|
Binary file (151 Bytes)
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/lora_weights_A.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e4b19d3392d606e6a2f01514dd365f84474ee8008946d2d8bcb59f543159160b
|
| 3 |
-
size 803442
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/lora_weights_B.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2771ba4fefea1b21526d4a4c06bf7b00075a15ddfcb60fb1ccff48d5551c3b6f
|
| 3 |
-
size 606770
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/prompts_keys_till_now.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f52669525937aba46986ba60df07fb81bedf3745928a864ae4c08d88fae6a069
|
| 3 |
-
size 3298
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/special_tokens_map.json
DELETED
|
@@ -1,125 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"additional_special_tokens": [
|
| 3 |
-
"<extra_id_0>",
|
| 4 |
-
"<extra_id_1>",
|
| 5 |
-
"<extra_id_2>",
|
| 6 |
-
"<extra_id_3>",
|
| 7 |
-
"<extra_id_4>",
|
| 8 |
-
"<extra_id_5>",
|
| 9 |
-
"<extra_id_6>",
|
| 10 |
-
"<extra_id_7>",
|
| 11 |
-
"<extra_id_8>",
|
| 12 |
-
"<extra_id_9>",
|
| 13 |
-
"<extra_id_10>",
|
| 14 |
-
"<extra_id_11>",
|
| 15 |
-
"<extra_id_12>",
|
| 16 |
-
"<extra_id_13>",
|
| 17 |
-
"<extra_id_14>",
|
| 18 |
-
"<extra_id_15>",
|
| 19 |
-
"<extra_id_16>",
|
| 20 |
-
"<extra_id_17>",
|
| 21 |
-
"<extra_id_18>",
|
| 22 |
-
"<extra_id_19>",
|
| 23 |
-
"<extra_id_20>",
|
| 24 |
-
"<extra_id_21>",
|
| 25 |
-
"<extra_id_22>",
|
| 26 |
-
"<extra_id_23>",
|
| 27 |
-
"<extra_id_24>",
|
| 28 |
-
"<extra_id_25>",
|
| 29 |
-
"<extra_id_26>",
|
| 30 |
-
"<extra_id_27>",
|
| 31 |
-
"<extra_id_28>",
|
| 32 |
-
"<extra_id_29>",
|
| 33 |
-
"<extra_id_30>",
|
| 34 |
-
"<extra_id_31>",
|
| 35 |
-
"<extra_id_32>",
|
| 36 |
-
"<extra_id_33>",
|
| 37 |
-
"<extra_id_34>",
|
| 38 |
-
"<extra_id_35>",
|
| 39 |
-
"<extra_id_36>",
|
| 40 |
-
"<extra_id_37>",
|
| 41 |
-
"<extra_id_38>",
|
| 42 |
-
"<extra_id_39>",
|
| 43 |
-
"<extra_id_40>",
|
| 44 |
-
"<extra_id_41>",
|
| 45 |
-
"<extra_id_42>",
|
| 46 |
-
"<extra_id_43>",
|
| 47 |
-
"<extra_id_44>",
|
| 48 |
-
"<extra_id_45>",
|
| 49 |
-
"<extra_id_46>",
|
| 50 |
-
"<extra_id_47>",
|
| 51 |
-
"<extra_id_48>",
|
| 52 |
-
"<extra_id_49>",
|
| 53 |
-
"<extra_id_50>",
|
| 54 |
-
"<extra_id_51>",
|
| 55 |
-
"<extra_id_52>",
|
| 56 |
-
"<extra_id_53>",
|
| 57 |
-
"<extra_id_54>",
|
| 58 |
-
"<extra_id_55>",
|
| 59 |
-
"<extra_id_56>",
|
| 60 |
-
"<extra_id_57>",
|
| 61 |
-
"<extra_id_58>",
|
| 62 |
-
"<extra_id_59>",
|
| 63 |
-
"<extra_id_60>",
|
| 64 |
-
"<extra_id_61>",
|
| 65 |
-
"<extra_id_62>",
|
| 66 |
-
"<extra_id_63>",
|
| 67 |
-
"<extra_id_64>",
|
| 68 |
-
"<extra_id_65>",
|
| 69 |
-
"<extra_id_66>",
|
| 70 |
-
"<extra_id_67>",
|
| 71 |
-
"<extra_id_68>",
|
| 72 |
-
"<extra_id_69>",
|
| 73 |
-
"<extra_id_70>",
|
| 74 |
-
"<extra_id_71>",
|
| 75 |
-
"<extra_id_72>",
|
| 76 |
-
"<extra_id_73>",
|
| 77 |
-
"<extra_id_74>",
|
| 78 |
-
"<extra_id_75>",
|
| 79 |
-
"<extra_id_76>",
|
| 80 |
-
"<extra_id_77>",
|
| 81 |
-
"<extra_id_78>",
|
| 82 |
-
"<extra_id_79>",
|
| 83 |
-
"<extra_id_80>",
|
| 84 |
-
"<extra_id_81>",
|
| 85 |
-
"<extra_id_82>",
|
| 86 |
-
"<extra_id_83>",
|
| 87 |
-
"<extra_id_84>",
|
| 88 |
-
"<extra_id_85>",
|
| 89 |
-
"<extra_id_86>",
|
| 90 |
-
"<extra_id_87>",
|
| 91 |
-
"<extra_id_88>",
|
| 92 |
-
"<extra_id_89>",
|
| 93 |
-
"<extra_id_90>",
|
| 94 |
-
"<extra_id_91>",
|
| 95 |
-
"<extra_id_92>",
|
| 96 |
-
"<extra_id_93>",
|
| 97 |
-
"<extra_id_94>",
|
| 98 |
-
"<extra_id_95>",
|
| 99 |
-
"<extra_id_96>",
|
| 100 |
-
"<extra_id_97>",
|
| 101 |
-
"<extra_id_98>",
|
| 102 |
-
"<extra_id_99>"
|
| 103 |
-
],
|
| 104 |
-
"eos_token": {
|
| 105 |
-
"content": "</s>",
|
| 106 |
-
"lstrip": false,
|
| 107 |
-
"normalized": false,
|
| 108 |
-
"rstrip": false,
|
| 109 |
-
"single_word": false
|
| 110 |
-
},
|
| 111 |
-
"pad_token": {
|
| 112 |
-
"content": "<pad>",
|
| 113 |
-
"lstrip": false,
|
| 114 |
-
"normalized": false,
|
| 115 |
-
"rstrip": false,
|
| 116 |
-
"single_word": false
|
| 117 |
-
},
|
| 118 |
-
"unk_token": {
|
| 119 |
-
"content": "<unk>",
|
| 120 |
-
"lstrip": false,
|
| 121 |
-
"normalized": false,
|
| 122 |
-
"rstrip": false,
|
| 123 |
-
"single_word": false
|
| 124 |
-
}
|
| 125 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/spiece.model
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
|
| 3 |
-
size 791656
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/tokenizer.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/tokenizer_config.json
DELETED
|
@@ -1,938 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"added_tokens_decoder": {
|
| 3 |
-
"0": {
|
| 4 |
-
"content": "<pad>",
|
| 5 |
-
"lstrip": false,
|
| 6 |
-
"normalized": false,
|
| 7 |
-
"rstrip": false,
|
| 8 |
-
"single_word": false,
|
| 9 |
-
"special": true
|
| 10 |
-
},
|
| 11 |
-
"1": {
|
| 12 |
-
"content": "</s>",
|
| 13 |
-
"lstrip": false,
|
| 14 |
-
"normalized": false,
|
| 15 |
-
"rstrip": false,
|
| 16 |
-
"single_word": false,
|
| 17 |
-
"special": true
|
| 18 |
-
},
|
| 19 |
-
"2": {
|
| 20 |
-
"content": "<unk>",
|
| 21 |
-
"lstrip": false,
|
| 22 |
-
"normalized": false,
|
| 23 |
-
"rstrip": false,
|
| 24 |
-
"single_word": false,
|
| 25 |
-
"special": true
|
| 26 |
-
},
|
| 27 |
-
"32000": {
|
| 28 |
-
"content": "<extra_id_99>",
|
| 29 |
-
"lstrip": false,
|
| 30 |
-
"normalized": false,
|
| 31 |
-
"rstrip": false,
|
| 32 |
-
"single_word": false,
|
| 33 |
-
"special": true
|
| 34 |
-
},
|
| 35 |
-
"32001": {
|
| 36 |
-
"content": "<extra_id_98>",
|
| 37 |
-
"lstrip": false,
|
| 38 |
-
"normalized": false,
|
| 39 |
-
"rstrip": false,
|
| 40 |
-
"single_word": false,
|
| 41 |
-
"special": true
|
| 42 |
-
},
|
| 43 |
-
"32002": {
|
| 44 |
-
"content": "<extra_id_97>",
|
| 45 |
-
"lstrip": false,
|
| 46 |
-
"normalized": false,
|
| 47 |
-
"rstrip": false,
|
| 48 |
-
"single_word": false,
|
| 49 |
-
"special": true
|
| 50 |
-
},
|
| 51 |
-
"32003": {
|
| 52 |
-
"content": "<extra_id_96>",
|
| 53 |
-
"lstrip": false,
|
| 54 |
-
"normalized": false,
|
| 55 |
-
"rstrip": false,
|
| 56 |
-
"single_word": false,
|
| 57 |
-
"special": true
|
| 58 |
-
},
|
| 59 |
-
"32004": {
|
| 60 |
-
"content": "<extra_id_95>",
|
| 61 |
-
"lstrip": false,
|
| 62 |
-
"normalized": false,
|
| 63 |
-
"rstrip": false,
|
| 64 |
-
"single_word": false,
|
| 65 |
-
"special": true
|
| 66 |
-
},
|
| 67 |
-
"32005": {
|
| 68 |
-
"content": "<extra_id_94>",
|
| 69 |
-
"lstrip": false,
|
| 70 |
-
"normalized": false,
|
| 71 |
-
"rstrip": false,
|
| 72 |
-
"single_word": false,
|
| 73 |
-
"special": true
|
| 74 |
-
},
|
| 75 |
-
"32006": {
|
| 76 |
-
"content": "<extra_id_93>",
|
| 77 |
-
"lstrip": false,
|
| 78 |
-
"normalized": false,
|
| 79 |
-
"rstrip": false,
|
| 80 |
-
"single_word": false,
|
| 81 |
-
"special": true
|
| 82 |
-
},
|
| 83 |
-
"32007": {
|
| 84 |
-
"content": "<extra_id_92>",
|
| 85 |
-
"lstrip": false,
|
| 86 |
-
"normalized": false,
|
| 87 |
-
"rstrip": false,
|
| 88 |
-
"single_word": false,
|
| 89 |
-
"special": true
|
| 90 |
-
},
|
| 91 |
-
"32008": {
|
| 92 |
-
"content": "<extra_id_91>",
|
| 93 |
-
"lstrip": false,
|
| 94 |
-
"normalized": false,
|
| 95 |
-
"rstrip": false,
|
| 96 |
-
"single_word": false,
|
| 97 |
-
"special": true
|
| 98 |
-
},
|
| 99 |
-
"32009": {
|
| 100 |
-
"content": "<extra_id_90>",
|
| 101 |
-
"lstrip": false,
|
| 102 |
-
"normalized": false,
|
| 103 |
-
"rstrip": false,
|
| 104 |
-
"single_word": false,
|
| 105 |
-
"special": true
|
| 106 |
-
},
|
| 107 |
-
"32010": {
|
| 108 |
-
"content": "<extra_id_89>",
|
| 109 |
-
"lstrip": false,
|
| 110 |
-
"normalized": false,
|
| 111 |
-
"rstrip": false,
|
| 112 |
-
"single_word": false,
|
| 113 |
-
"special": true
|
| 114 |
-
},
|
| 115 |
-
"32011": {
|
| 116 |
-
"content": "<extra_id_88>",
|
| 117 |
-
"lstrip": false,
|
| 118 |
-
"normalized": false,
|
| 119 |
-
"rstrip": false,
|
| 120 |
-
"single_word": false,
|
| 121 |
-
"special": true
|
| 122 |
-
},
|
| 123 |
-
"32012": {
|
| 124 |
-
"content": "<extra_id_87>",
|
| 125 |
-
"lstrip": false,
|
| 126 |
-
"normalized": false,
|
| 127 |
-
"rstrip": false,
|
| 128 |
-
"single_word": false,
|
| 129 |
-
"special": true
|
| 130 |
-
},
|
| 131 |
-
"32013": {
|
| 132 |
-
"content": "<extra_id_86>",
|
| 133 |
-
"lstrip": false,
|
| 134 |
-
"normalized": false,
|
| 135 |
-
"rstrip": false,
|
| 136 |
-
"single_word": false,
|
| 137 |
-
"special": true
|
| 138 |
-
},
|
| 139 |
-
"32014": {
|
| 140 |
-
"content": "<extra_id_85>",
|
| 141 |
-
"lstrip": false,
|
| 142 |
-
"normalized": false,
|
| 143 |
-
"rstrip": false,
|
| 144 |
-
"single_word": false,
|
| 145 |
-
"special": true
|
| 146 |
-
},
|
| 147 |
-
"32015": {
|
| 148 |
-
"content": "<extra_id_84>",
|
| 149 |
-
"lstrip": false,
|
| 150 |
-
"normalized": false,
|
| 151 |
-
"rstrip": false,
|
| 152 |
-
"single_word": false,
|
| 153 |
-
"special": true
|
| 154 |
-
},
|
| 155 |
-
"32016": {
|
| 156 |
-
"content": "<extra_id_83>",
|
| 157 |
-
"lstrip": false,
|
| 158 |
-
"normalized": false,
|
| 159 |
-
"rstrip": false,
|
| 160 |
-
"single_word": false,
|
| 161 |
-
"special": true
|
| 162 |
-
},
|
| 163 |
-
"32017": {
|
| 164 |
-
"content": "<extra_id_82>",
|
| 165 |
-
"lstrip": false,
|
| 166 |
-
"normalized": false,
|
| 167 |
-
"rstrip": false,
|
| 168 |
-
"single_word": false,
|
| 169 |
-
"special": true
|
| 170 |
-
},
|
| 171 |
-
"32018": {
|
| 172 |
-
"content": "<extra_id_81>",
|
| 173 |
-
"lstrip": false,
|
| 174 |
-
"normalized": false,
|
| 175 |
-
"rstrip": false,
|
| 176 |
-
"single_word": false,
|
| 177 |
-
"special": true
|
| 178 |
-
},
|
| 179 |
-
"32019": {
|
| 180 |
-
"content": "<extra_id_80>",
|
| 181 |
-
"lstrip": false,
|
| 182 |
-
"normalized": false,
|
| 183 |
-
"rstrip": false,
|
| 184 |
-
"single_word": false,
|
| 185 |
-
"special": true
|
| 186 |
-
},
|
| 187 |
-
"32020": {
|
| 188 |
-
"content": "<extra_id_79>",
|
| 189 |
-
"lstrip": false,
|
| 190 |
-
"normalized": false,
|
| 191 |
-
"rstrip": false,
|
| 192 |
-
"single_word": false,
|
| 193 |
-
"special": true
|
| 194 |
-
},
|
| 195 |
-
"32021": {
|
| 196 |
-
"content": "<extra_id_78>",
|
| 197 |
-
"lstrip": false,
|
| 198 |
-
"normalized": false,
|
| 199 |
-
"rstrip": false,
|
| 200 |
-
"single_word": false,
|
| 201 |
-
"special": true
|
| 202 |
-
},
|
| 203 |
-
"32022": {
|
| 204 |
-
"content": "<extra_id_77>",
|
| 205 |
-
"lstrip": false,
|
| 206 |
-
"normalized": false,
|
| 207 |
-
"rstrip": false,
|
| 208 |
-
"single_word": false,
|
| 209 |
-
"special": true
|
| 210 |
-
},
|
| 211 |
-
"32023": {
|
| 212 |
-
"content": "<extra_id_76>",
|
| 213 |
-
"lstrip": false,
|
| 214 |
-
"normalized": false,
|
| 215 |
-
"rstrip": false,
|
| 216 |
-
"single_word": false,
|
| 217 |
-
"special": true
|
| 218 |
-
},
|
| 219 |
-
"32024": {
|
| 220 |
-
"content": "<extra_id_75>",
|
| 221 |
-
"lstrip": false,
|
| 222 |
-
"normalized": false,
|
| 223 |
-
"rstrip": false,
|
| 224 |
-
"single_word": false,
|
| 225 |
-
"special": true
|
| 226 |
-
},
|
| 227 |
-
"32025": {
|
| 228 |
-
"content": "<extra_id_74>",
|
| 229 |
-
"lstrip": false,
|
| 230 |
-
"normalized": false,
|
| 231 |
-
"rstrip": false,
|
| 232 |
-
"single_word": false,
|
| 233 |
-
"special": true
|
| 234 |
-
},
|
| 235 |
-
"32026": {
|
| 236 |
-
"content": "<extra_id_73>",
|
| 237 |
-
"lstrip": false,
|
| 238 |
-
"normalized": false,
|
| 239 |
-
"rstrip": false,
|
| 240 |
-
"single_word": false,
|
| 241 |
-
"special": true
|
| 242 |
-
},
|
| 243 |
-
"32027": {
|
| 244 |
-
"content": "<extra_id_72>",
|
| 245 |
-
"lstrip": false,
|
| 246 |
-
"normalized": false,
|
| 247 |
-
"rstrip": false,
|
| 248 |
-
"single_word": false,
|
| 249 |
-
"special": true
|
| 250 |
-
},
|
| 251 |
-
"32028": {
|
| 252 |
-
"content": "<extra_id_71>",
|
| 253 |
-
"lstrip": false,
|
| 254 |
-
"normalized": false,
|
| 255 |
-
"rstrip": false,
|
| 256 |
-
"single_word": false,
|
| 257 |
-
"special": true
|
| 258 |
-
},
|
| 259 |
-
"32029": {
|
| 260 |
-
"content": "<extra_id_70>",
|
| 261 |
-
"lstrip": false,
|
| 262 |
-
"normalized": false,
|
| 263 |
-
"rstrip": false,
|
| 264 |
-
"single_word": false,
|
| 265 |
-
"special": true
|
| 266 |
-
},
|
| 267 |
-
"32030": {
|
| 268 |
-
"content": "<extra_id_69>",
|
| 269 |
-
"lstrip": false,
|
| 270 |
-
"normalized": false,
|
| 271 |
-
"rstrip": false,
|
| 272 |
-
"single_word": false,
|
| 273 |
-
"special": true
|
| 274 |
-
},
|
| 275 |
-
"32031": {
|
| 276 |
-
"content": "<extra_id_68>",
|
| 277 |
-
"lstrip": false,
|
| 278 |
-
"normalized": false,
|
| 279 |
-
"rstrip": false,
|
| 280 |
-
"single_word": false,
|
| 281 |
-
"special": true
|
| 282 |
-
},
|
| 283 |
-
"32032": {
|
| 284 |
-
"content": "<extra_id_67>",
|
| 285 |
-
"lstrip": false,
|
| 286 |
-
"normalized": false,
|
| 287 |
-
"rstrip": false,
|
| 288 |
-
"single_word": false,
|
| 289 |
-
"special": true
|
| 290 |
-
},
|
| 291 |
-
"32033": {
|
| 292 |
-
"content": "<extra_id_66>",
|
| 293 |
-
"lstrip": false,
|
| 294 |
-
"normalized": false,
|
| 295 |
-
"rstrip": false,
|
| 296 |
-
"single_word": false,
|
| 297 |
-
"special": true
|
| 298 |
-
},
|
| 299 |
-
"32034": {
|
| 300 |
-
"content": "<extra_id_65>",
|
| 301 |
-
"lstrip": false,
|
| 302 |
-
"normalized": false,
|
| 303 |
-
"rstrip": false,
|
| 304 |
-
"single_word": false,
|
| 305 |
-
"special": true
|
| 306 |
-
},
|
| 307 |
-
"32035": {
|
| 308 |
-
"content": "<extra_id_64>",
|
| 309 |
-
"lstrip": false,
|
| 310 |
-
"normalized": false,
|
| 311 |
-
"rstrip": false,
|
| 312 |
-
"single_word": false,
|
| 313 |
-
"special": true
|
| 314 |
-
},
|
| 315 |
-
"32036": {
|
| 316 |
-
"content": "<extra_id_63>",
|
| 317 |
-
"lstrip": false,
|
| 318 |
-
"normalized": false,
|
| 319 |
-
"rstrip": false,
|
| 320 |
-
"single_word": false,
|
| 321 |
-
"special": true
|
| 322 |
-
},
|
| 323 |
-
"32037": {
|
| 324 |
-
"content": "<extra_id_62>",
|
| 325 |
-
"lstrip": false,
|
| 326 |
-
"normalized": false,
|
| 327 |
-
"rstrip": false,
|
| 328 |
-
"single_word": false,
|
| 329 |
-
"special": true
|
| 330 |
-
},
|
| 331 |
-
"32038": {
|
| 332 |
-
"content": "<extra_id_61>",
|
| 333 |
-
"lstrip": false,
|
| 334 |
-
"normalized": false,
|
| 335 |
-
"rstrip": false,
|
| 336 |
-
"single_word": false,
|
| 337 |
-
"special": true
|
| 338 |
-
},
|
| 339 |
-
"32039": {
|
| 340 |
-
"content": "<extra_id_60>",
|
| 341 |
-
"lstrip": false,
|
| 342 |
-
"normalized": false,
|
| 343 |
-
"rstrip": false,
|
| 344 |
-
"single_word": false,
|
| 345 |
-
"special": true
|
| 346 |
-
},
|
| 347 |
-
"32040": {
|
| 348 |
-
"content": "<extra_id_59>",
|
| 349 |
-
"lstrip": false,
|
| 350 |
-
"normalized": false,
|
| 351 |
-
"rstrip": false,
|
| 352 |
-
"single_word": false,
|
| 353 |
-
"special": true
|
| 354 |
-
},
|
| 355 |
-
"32041": {
|
| 356 |
-
"content": "<extra_id_58>",
|
| 357 |
-
"lstrip": false,
|
| 358 |
-
"normalized": false,
|
| 359 |
-
"rstrip": false,
|
| 360 |
-
"single_word": false,
|
| 361 |
-
"special": true
|
| 362 |
-
},
|
| 363 |
-
"32042": {
|
| 364 |
-
"content": "<extra_id_57>",
|
| 365 |
-
"lstrip": false,
|
| 366 |
-
"normalized": false,
|
| 367 |
-
"rstrip": false,
|
| 368 |
-
"single_word": false,
|
| 369 |
-
"special": true
|
| 370 |
-
},
|
| 371 |
-
"32043": {
|
| 372 |
-
"content": "<extra_id_56>",
|
| 373 |
-
"lstrip": false,
|
| 374 |
-
"normalized": false,
|
| 375 |
-
"rstrip": false,
|
| 376 |
-
"single_word": false,
|
| 377 |
-
"special": true
|
| 378 |
-
},
|
| 379 |
-
"32044": {
|
| 380 |
-
"content": "<extra_id_55>",
|
| 381 |
-
"lstrip": false,
|
| 382 |
-
"normalized": false,
|
| 383 |
-
"rstrip": false,
|
| 384 |
-
"single_word": false,
|
| 385 |
-
"special": true
|
| 386 |
-
},
|
| 387 |
-
"32045": {
|
| 388 |
-
"content": "<extra_id_54>",
|
| 389 |
-
"lstrip": false,
|
| 390 |
-
"normalized": false,
|
| 391 |
-
"rstrip": false,
|
| 392 |
-
"single_word": false,
|
| 393 |
-
"special": true
|
| 394 |
-
},
|
| 395 |
-
"32046": {
|
| 396 |
-
"content": "<extra_id_53>",
|
| 397 |
-
"lstrip": false,
|
| 398 |
-
"normalized": false,
|
| 399 |
-
"rstrip": false,
|
| 400 |
-
"single_word": false,
|
| 401 |
-
"special": true
|
| 402 |
-
},
|
| 403 |
-
"32047": {
|
| 404 |
-
"content": "<extra_id_52>",
|
| 405 |
-
"lstrip": false,
|
| 406 |
-
"normalized": false,
|
| 407 |
-
"rstrip": false,
|
| 408 |
-
"single_word": false,
|
| 409 |
-
"special": true
|
| 410 |
-
},
|
| 411 |
-
"32048": {
|
| 412 |
-
"content": "<extra_id_51>",
|
| 413 |
-
"lstrip": false,
|
| 414 |
-
"normalized": false,
|
| 415 |
-
"rstrip": false,
|
| 416 |
-
"single_word": false,
|
| 417 |
-
"special": true
|
| 418 |
-
},
|
| 419 |
-
"32049": {
|
| 420 |
-
"content": "<extra_id_50>",
|
| 421 |
-
"lstrip": false,
|
| 422 |
-
"normalized": false,
|
| 423 |
-
"rstrip": false,
|
| 424 |
-
"single_word": false,
|
| 425 |
-
"special": true
|
| 426 |
-
},
|
| 427 |
-
"32050": {
|
| 428 |
-
"content": "<extra_id_49>",
|
| 429 |
-
"lstrip": false,
|
| 430 |
-
"normalized": false,
|
| 431 |
-
"rstrip": false,
|
| 432 |
-
"single_word": false,
|
| 433 |
-
"special": true
|
| 434 |
-
},
|
| 435 |
-
"32051": {
|
| 436 |
-
"content": "<extra_id_48>",
|
| 437 |
-
"lstrip": false,
|
| 438 |
-
"normalized": false,
|
| 439 |
-
"rstrip": false,
|
| 440 |
-
"single_word": false,
|
| 441 |
-
"special": true
|
| 442 |
-
},
|
| 443 |
-
"32052": {
|
| 444 |
-
"content": "<extra_id_47>",
|
| 445 |
-
"lstrip": false,
|
| 446 |
-
"normalized": false,
|
| 447 |
-
"rstrip": false,
|
| 448 |
-
"single_word": false,
|
| 449 |
-
"special": true
|
| 450 |
-
},
|
| 451 |
-
"32053": {
|
| 452 |
-
"content": "<extra_id_46>",
|
| 453 |
-
"lstrip": false,
|
| 454 |
-
"normalized": false,
|
| 455 |
-
"rstrip": false,
|
| 456 |
-
"single_word": false,
|
| 457 |
-
"special": true
|
| 458 |
-
},
|
| 459 |
-
"32054": {
|
| 460 |
-
"content": "<extra_id_45>",
|
| 461 |
-
"lstrip": false,
|
| 462 |
-
"normalized": false,
|
| 463 |
-
"rstrip": false,
|
| 464 |
-
"single_word": false,
|
| 465 |
-
"special": true
|
| 466 |
-
},
|
| 467 |
-
"32055": {
|
| 468 |
-
"content": "<extra_id_44>",
|
| 469 |
-
"lstrip": false,
|
| 470 |
-
"normalized": false,
|
| 471 |
-
"rstrip": false,
|
| 472 |
-
"single_word": false,
|
| 473 |
-
"special": true
|
| 474 |
-
},
|
| 475 |
-
"32056": {
|
| 476 |
-
"content": "<extra_id_43>",
|
| 477 |
-
"lstrip": false,
|
| 478 |
-
"normalized": false,
|
| 479 |
-
"rstrip": false,
|
| 480 |
-
"single_word": false,
|
| 481 |
-
"special": true
|
| 482 |
-
},
|
| 483 |
-
"32057": {
|
| 484 |
-
"content": "<extra_id_42>",
|
| 485 |
-
"lstrip": false,
|
| 486 |
-
"normalized": false,
|
| 487 |
-
"rstrip": false,
|
| 488 |
-
"single_word": false,
|
| 489 |
-
"special": true
|
| 490 |
-
},
|
| 491 |
-
"32058": {
|
| 492 |
-
"content": "<extra_id_41>",
|
| 493 |
-
"lstrip": false,
|
| 494 |
-
"normalized": false,
|
| 495 |
-
"rstrip": false,
|
| 496 |
-
"single_word": false,
|
| 497 |
-
"special": true
|
| 498 |
-
},
|
| 499 |
-
"32059": {
|
| 500 |
-
"content": "<extra_id_40>",
|
| 501 |
-
"lstrip": false,
|
| 502 |
-
"normalized": false,
|
| 503 |
-
"rstrip": false,
|
| 504 |
-
"single_word": false,
|
| 505 |
-
"special": true
|
| 506 |
-
},
|
| 507 |
-
"32060": {
|
| 508 |
-
"content": "<extra_id_39>",
|
| 509 |
-
"lstrip": false,
|
| 510 |
-
"normalized": false,
|
| 511 |
-
"rstrip": false,
|
| 512 |
-
"single_word": false,
|
| 513 |
-
"special": true
|
| 514 |
-
},
|
| 515 |
-
"32061": {
|
| 516 |
-
"content": "<extra_id_38>",
|
| 517 |
-
"lstrip": false,
|
| 518 |
-
"normalized": false,
|
| 519 |
-
"rstrip": false,
|
| 520 |
-
"single_word": false,
|
| 521 |
-
"special": true
|
| 522 |
-
},
|
| 523 |
-
"32062": {
|
| 524 |
-
"content": "<extra_id_37>",
|
| 525 |
-
"lstrip": false,
|
| 526 |
-
"normalized": false,
|
| 527 |
-
"rstrip": false,
|
| 528 |
-
"single_word": false,
|
| 529 |
-
"special": true
|
| 530 |
-
},
|
| 531 |
-
"32063": {
|
| 532 |
-
"content": "<extra_id_36>",
|
| 533 |
-
"lstrip": false,
|
| 534 |
-
"normalized": false,
|
| 535 |
-
"rstrip": false,
|
| 536 |
-
"single_word": false,
|
| 537 |
-
"special": true
|
| 538 |
-
},
|
| 539 |
-
"32064": {
|
| 540 |
-
"content": "<extra_id_35>",
|
| 541 |
-
"lstrip": false,
|
| 542 |
-
"normalized": false,
|
| 543 |
-
"rstrip": false,
|
| 544 |
-
"single_word": false,
|
| 545 |
-
"special": true
|
| 546 |
-
},
|
| 547 |
-
"32065": {
|
| 548 |
-
"content": "<extra_id_34>",
|
| 549 |
-
"lstrip": false,
|
| 550 |
-
"normalized": false,
|
| 551 |
-
"rstrip": false,
|
| 552 |
-
"single_word": false,
|
| 553 |
-
"special": true
|
| 554 |
-
},
|
| 555 |
-
"32066": {
|
| 556 |
-
"content": "<extra_id_33>",
|
| 557 |
-
"lstrip": false,
|
| 558 |
-
"normalized": false,
|
| 559 |
-
"rstrip": false,
|
| 560 |
-
"single_word": false,
|
| 561 |
-
"special": true
|
| 562 |
-
},
|
| 563 |
-
"32067": {
|
| 564 |
-
"content": "<extra_id_32>",
|
| 565 |
-
"lstrip": false,
|
| 566 |
-
"normalized": false,
|
| 567 |
-
"rstrip": false,
|
| 568 |
-
"single_word": false,
|
| 569 |
-
"special": true
|
| 570 |
-
},
|
| 571 |
-
"32068": {
|
| 572 |
-
"content": "<extra_id_31>",
|
| 573 |
-
"lstrip": false,
|
| 574 |
-
"normalized": false,
|
| 575 |
-
"rstrip": false,
|
| 576 |
-
"single_word": false,
|
| 577 |
-
"special": true
|
| 578 |
-
},
|
| 579 |
-
"32069": {
|
| 580 |
-
"content": "<extra_id_30>",
|
| 581 |
-
"lstrip": false,
|
| 582 |
-
"normalized": false,
|
| 583 |
-
"rstrip": false,
|
| 584 |
-
"single_word": false,
|
| 585 |
-
"special": true
|
| 586 |
-
},
|
| 587 |
-
"32070": {
|
| 588 |
-
"content": "<extra_id_29>",
|
| 589 |
-
"lstrip": false,
|
| 590 |
-
"normalized": false,
|
| 591 |
-
"rstrip": false,
|
| 592 |
-
"single_word": false,
|
| 593 |
-
"special": true
|
| 594 |
-
},
|
| 595 |
-
"32071": {
|
| 596 |
-
"content": "<extra_id_28>",
|
| 597 |
-
"lstrip": false,
|
| 598 |
-
"normalized": false,
|
| 599 |
-
"rstrip": false,
|
| 600 |
-
"single_word": false,
|
| 601 |
-
"special": true
|
| 602 |
-
},
|
| 603 |
-
"32072": {
|
| 604 |
-
"content": "<extra_id_27>",
|
| 605 |
-
"lstrip": false,
|
| 606 |
-
"normalized": false,
|
| 607 |
-
"rstrip": false,
|
| 608 |
-
"single_word": false,
|
| 609 |
-
"special": true
|
| 610 |
-
},
|
| 611 |
-
"32073": {
|
| 612 |
-
"content": "<extra_id_26>",
|
| 613 |
-
"lstrip": false,
|
| 614 |
-
"normalized": false,
|
| 615 |
-
"rstrip": false,
|
| 616 |
-
"single_word": false,
|
| 617 |
-
"special": true
|
| 618 |
-
},
|
| 619 |
-
"32074": {
|
| 620 |
-
"content": "<extra_id_25>",
|
| 621 |
-
"lstrip": false,
|
| 622 |
-
"normalized": false,
|
| 623 |
-
"rstrip": false,
|
| 624 |
-
"single_word": false,
|
| 625 |
-
"special": true
|
| 626 |
-
},
|
| 627 |
-
"32075": {
|
| 628 |
-
"content": "<extra_id_24>",
|
| 629 |
-
"lstrip": false,
|
| 630 |
-
"normalized": false,
|
| 631 |
-
"rstrip": false,
|
| 632 |
-
"single_word": false,
|
| 633 |
-
"special": true
|
| 634 |
-
},
|
| 635 |
-
"32076": {
|
| 636 |
-
"content": "<extra_id_23>",
|
| 637 |
-
"lstrip": false,
|
| 638 |
-
"normalized": false,
|
| 639 |
-
"rstrip": false,
|
| 640 |
-
"single_word": false,
|
| 641 |
-
"special": true
|
| 642 |
-
},
|
| 643 |
-
"32077": {
|
| 644 |
-
"content": "<extra_id_22>",
|
| 645 |
-
"lstrip": false,
|
| 646 |
-
"normalized": false,
|
| 647 |
-
"rstrip": false,
|
| 648 |
-
"single_word": false,
|
| 649 |
-
"special": true
|
| 650 |
-
},
|
| 651 |
-
"32078": {
|
| 652 |
-
"content": "<extra_id_21>",
|
| 653 |
-
"lstrip": false,
|
| 654 |
-
"normalized": false,
|
| 655 |
-
"rstrip": false,
|
| 656 |
-
"single_word": false,
|
| 657 |
-
"special": true
|
| 658 |
-
},
|
| 659 |
-
"32079": {
|
| 660 |
-
"content": "<extra_id_20>",
|
| 661 |
-
"lstrip": false,
|
| 662 |
-
"normalized": false,
|
| 663 |
-
"rstrip": false,
|
| 664 |
-
"single_word": false,
|
| 665 |
-
"special": true
|
| 666 |
-
},
|
| 667 |
-
"32080": {
|
| 668 |
-
"content": "<extra_id_19>",
|
| 669 |
-
"lstrip": false,
|
| 670 |
-
"normalized": false,
|
| 671 |
-
"rstrip": false,
|
| 672 |
-
"single_word": false,
|
| 673 |
-
"special": true
|
| 674 |
-
},
|
| 675 |
-
"32081": {
|
| 676 |
-
"content": "<extra_id_18>",
|
| 677 |
-
"lstrip": false,
|
| 678 |
-
"normalized": false,
|
| 679 |
-
"rstrip": false,
|
| 680 |
-
"single_word": false,
|
| 681 |
-
"special": true
|
| 682 |
-
},
|
| 683 |
-
"32082": {
|
| 684 |
-
"content": "<extra_id_17>",
|
| 685 |
-
"lstrip": false,
|
| 686 |
-
"normalized": false,
|
| 687 |
-
"rstrip": false,
|
| 688 |
-
"single_word": false,
|
| 689 |
-
"special": true
|
| 690 |
-
},
|
| 691 |
-
"32083": {
|
| 692 |
-
"content": "<extra_id_16>",
|
| 693 |
-
"lstrip": false,
|
| 694 |
-
"normalized": false,
|
| 695 |
-
"rstrip": false,
|
| 696 |
-
"single_word": false,
|
| 697 |
-
"special": true
|
| 698 |
-
},
|
| 699 |
-
"32084": {
|
| 700 |
-
"content": "<extra_id_15>",
|
| 701 |
-
"lstrip": false,
|
| 702 |
-
"normalized": false,
|
| 703 |
-
"rstrip": false,
|
| 704 |
-
"single_word": false,
|
| 705 |
-
"special": true
|
| 706 |
-
},
|
| 707 |
-
"32085": {
|
| 708 |
-
"content": "<extra_id_14>",
|
| 709 |
-
"lstrip": false,
|
| 710 |
-
"normalized": false,
|
| 711 |
-
"rstrip": false,
|
| 712 |
-
"single_word": false,
|
| 713 |
-
"special": true
|
| 714 |
-
},
|
| 715 |
-
"32086": {
|
| 716 |
-
"content": "<extra_id_13>",
|
| 717 |
-
"lstrip": false,
|
| 718 |
-
"normalized": false,
|
| 719 |
-
"rstrip": false,
|
| 720 |
-
"single_word": false,
|
| 721 |
-
"special": true
|
| 722 |
-
},
|
| 723 |
-
"32087": {
|
| 724 |
-
"content": "<extra_id_12>",
|
| 725 |
-
"lstrip": false,
|
| 726 |
-
"normalized": false,
|
| 727 |
-
"rstrip": false,
|
| 728 |
-
"single_word": false,
|
| 729 |
-
"special": true
|
| 730 |
-
},
|
| 731 |
-
"32088": {
|
| 732 |
-
"content": "<extra_id_11>",
|
| 733 |
-
"lstrip": false,
|
| 734 |
-
"normalized": false,
|
| 735 |
-
"rstrip": false,
|
| 736 |
-
"single_word": false,
|
| 737 |
-
"special": true
|
| 738 |
-
},
|
| 739 |
-
"32089": {
|
| 740 |
-
"content": "<extra_id_10>",
|
| 741 |
-
"lstrip": false,
|
| 742 |
-
"normalized": false,
|
| 743 |
-
"rstrip": false,
|
| 744 |
-
"single_word": false,
|
| 745 |
-
"special": true
|
| 746 |
-
},
|
| 747 |
-
"32090": {
|
| 748 |
-
"content": "<extra_id_9>",
|
| 749 |
-
"lstrip": false,
|
| 750 |
-
"normalized": false,
|
| 751 |
-
"rstrip": false,
|
| 752 |
-
"single_word": false,
|
| 753 |
-
"special": true
|
| 754 |
-
},
|
| 755 |
-
"32091": {
|
| 756 |
-
"content": "<extra_id_8>",
|
| 757 |
-
"lstrip": false,
|
| 758 |
-
"normalized": false,
|
| 759 |
-
"rstrip": false,
|
| 760 |
-
"single_word": false,
|
| 761 |
-
"special": true
|
| 762 |
-
},
|
| 763 |
-
"32092": {
|
| 764 |
-
"content": "<extra_id_7>",
|
| 765 |
-
"lstrip": false,
|
| 766 |
-
"normalized": false,
|
| 767 |
-
"rstrip": false,
|
| 768 |
-
"single_word": false,
|
| 769 |
-
"special": true
|
| 770 |
-
},
|
| 771 |
-
"32093": {
|
| 772 |
-
"content": "<extra_id_6>",
|
| 773 |
-
"lstrip": false,
|
| 774 |
-
"normalized": false,
|
| 775 |
-
"rstrip": false,
|
| 776 |
-
"single_word": false,
|
| 777 |
-
"special": true
|
| 778 |
-
},
|
| 779 |
-
"32094": {
|
| 780 |
-
"content": "<extra_id_5>",
|
| 781 |
-
"lstrip": false,
|
| 782 |
-
"normalized": false,
|
| 783 |
-
"rstrip": false,
|
| 784 |
-
"single_word": false,
|
| 785 |
-
"special": true
|
| 786 |
-
},
|
| 787 |
-
"32095": {
|
| 788 |
-
"content": "<extra_id_4>",
|
| 789 |
-
"lstrip": false,
|
| 790 |
-
"normalized": false,
|
| 791 |
-
"rstrip": false,
|
| 792 |
-
"single_word": false,
|
| 793 |
-
"special": true
|
| 794 |
-
},
|
| 795 |
-
"32096": {
|
| 796 |
-
"content": "<extra_id_3>",
|
| 797 |
-
"lstrip": false,
|
| 798 |
-
"normalized": false,
|
| 799 |
-
"rstrip": false,
|
| 800 |
-
"single_word": false,
|
| 801 |
-
"special": true
|
| 802 |
-
},
|
| 803 |
-
"32097": {
|
| 804 |
-
"content": "<extra_id_2>",
|
| 805 |
-
"lstrip": false,
|
| 806 |
-
"normalized": false,
|
| 807 |
-
"rstrip": false,
|
| 808 |
-
"single_word": false,
|
| 809 |
-
"special": true
|
| 810 |
-
},
|
| 811 |
-
"32098": {
|
| 812 |
-
"content": "<extra_id_1>",
|
| 813 |
-
"lstrip": false,
|
| 814 |
-
"normalized": false,
|
| 815 |
-
"rstrip": false,
|
| 816 |
-
"single_word": false,
|
| 817 |
-
"special": true
|
| 818 |
-
},
|
| 819 |
-
"32099": {
|
| 820 |
-
"content": "<extra_id_0>",
|
| 821 |
-
"lstrip": false,
|
| 822 |
-
"normalized": false,
|
| 823 |
-
"rstrip": false,
|
| 824 |
-
"single_word": false,
|
| 825 |
-
"special": true
|
| 826 |
-
}
|
| 827 |
-
},
|
| 828 |
-
"additional_special_tokens": [
|
| 829 |
-
"<extra_id_0>",
|
| 830 |
-
"<extra_id_1>",
|
| 831 |
-
"<extra_id_2>",
|
| 832 |
-
"<extra_id_3>",
|
| 833 |
-
"<extra_id_4>",
|
| 834 |
-
"<extra_id_5>",
|
| 835 |
-
"<extra_id_6>",
|
| 836 |
-
"<extra_id_7>",
|
| 837 |
-
"<extra_id_8>",
|
| 838 |
-
"<extra_id_9>",
|
| 839 |
-
"<extra_id_10>",
|
| 840 |
-
"<extra_id_11>",
|
| 841 |
-
"<extra_id_12>",
|
| 842 |
-
"<extra_id_13>",
|
| 843 |
-
"<extra_id_14>",
|
| 844 |
-
"<extra_id_15>",
|
| 845 |
-
"<extra_id_16>",
|
| 846 |
-
"<extra_id_17>",
|
| 847 |
-
"<extra_id_18>",
|
| 848 |
-
"<extra_id_19>",
|
| 849 |
-
"<extra_id_20>",
|
| 850 |
-
"<extra_id_21>",
|
| 851 |
-
"<extra_id_22>",
|
| 852 |
-
"<extra_id_23>",
|
| 853 |
-
"<extra_id_24>",
|
| 854 |
-
"<extra_id_25>",
|
| 855 |
-
"<extra_id_26>",
|
| 856 |
-
"<extra_id_27>",
|
| 857 |
-
"<extra_id_28>",
|
| 858 |
-
"<extra_id_29>",
|
| 859 |
-
"<extra_id_30>",
|
| 860 |
-
"<extra_id_31>",
|
| 861 |
-
"<extra_id_32>",
|
| 862 |
-
"<extra_id_33>",
|
| 863 |
-
"<extra_id_34>",
|
| 864 |
-
"<extra_id_35>",
|
| 865 |
-
"<extra_id_36>",
|
| 866 |
-
"<extra_id_37>",
|
| 867 |
-
"<extra_id_38>",
|
| 868 |
-
"<extra_id_39>",
|
| 869 |
-
"<extra_id_40>",
|
| 870 |
-
"<extra_id_41>",
|
| 871 |
-
"<extra_id_42>",
|
| 872 |
-
"<extra_id_43>",
|
| 873 |
-
"<extra_id_44>",
|
| 874 |
-
"<extra_id_45>",
|
| 875 |
-
"<extra_id_46>",
|
| 876 |
-
"<extra_id_47>",
|
| 877 |
-
"<extra_id_48>",
|
| 878 |
-
"<extra_id_49>",
|
| 879 |
-
"<extra_id_50>",
|
| 880 |
-
"<extra_id_51>",
|
| 881 |
-
"<extra_id_52>",
|
| 882 |
-
"<extra_id_53>",
|
| 883 |
-
"<extra_id_54>",
|
| 884 |
-
"<extra_id_55>",
|
| 885 |
-
"<extra_id_56>",
|
| 886 |
-
"<extra_id_57>",
|
| 887 |
-
"<extra_id_58>",
|
| 888 |
-
"<extra_id_59>",
|
| 889 |
-
"<extra_id_60>",
|
| 890 |
-
"<extra_id_61>",
|
| 891 |
-
"<extra_id_62>",
|
| 892 |
-
"<extra_id_63>",
|
| 893 |
-
"<extra_id_64>",
|
| 894 |
-
"<extra_id_65>",
|
| 895 |
-
"<extra_id_66>",
|
| 896 |
-
"<extra_id_67>",
|
| 897 |
-
"<extra_id_68>",
|
| 898 |
-
"<extra_id_69>",
|
| 899 |
-
"<extra_id_70>",
|
| 900 |
-
"<extra_id_71>",
|
| 901 |
-
"<extra_id_72>",
|
| 902 |
-
"<extra_id_73>",
|
| 903 |
-
"<extra_id_74>",
|
| 904 |
-
"<extra_id_75>",
|
| 905 |
-
"<extra_id_76>",
|
| 906 |
-
"<extra_id_77>",
|
| 907 |
-
"<extra_id_78>",
|
| 908 |
-
"<extra_id_79>",
|
| 909 |
-
"<extra_id_80>",
|
| 910 |
-
"<extra_id_81>",
|
| 911 |
-
"<extra_id_82>",
|
| 912 |
-
"<extra_id_83>",
|
| 913 |
-
"<extra_id_84>",
|
| 914 |
-
"<extra_id_85>",
|
| 915 |
-
"<extra_id_86>",
|
| 916 |
-
"<extra_id_87>",
|
| 917 |
-
"<extra_id_88>",
|
| 918 |
-
"<extra_id_89>",
|
| 919 |
-
"<extra_id_90>",
|
| 920 |
-
"<extra_id_91>",
|
| 921 |
-
"<extra_id_92>",
|
| 922 |
-
"<extra_id_93>",
|
| 923 |
-
"<extra_id_94>",
|
| 924 |
-
"<extra_id_95>",
|
| 925 |
-
"<extra_id_96>",
|
| 926 |
-
"<extra_id_97>",
|
| 927 |
-
"<extra_id_98>",
|
| 928 |
-
"<extra_id_99>"
|
| 929 |
-
],
|
| 930 |
-
"clean_up_tokenization_spaces": true,
|
| 931 |
-
"eos_token": "</s>",
|
| 932 |
-
"extra_ids": 100,
|
| 933 |
-
"model_max_length": 512,
|
| 934 |
-
"pad_token": "<pad>",
|
| 935 |
-
"sp_model_kwargs": {},
|
| 936 |
-
"tokenizer_class": "T5Tokenizer",
|
| 937 |
-
"unk_token": "<unk>"
|
| 938 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/saved_weights/trans_input.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f98adc49cd4c2f647b32016a990363f67b33d273b860e75efdfda9545a44b439
|
| 3 |
-
size 411248
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/train_results.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"epoch": 10.0,
|
| 3 |
-
"total_flos": 9122411270725632.0,
|
| 4 |
-
"train_loss": 0.5864024265556579,
|
| 5 |
-
"train_runtime": 1102.9262,
|
| 6 |
-
"train_samples": 5000,
|
| 7 |
-
"train_samples_per_second": 45.334,
|
| 8 |
-
"train_steps_per_second": 1.423
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trainer_state.json
DELETED
|
@@ -1,105 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": 55.9605,
|
| 3 |
-
"best_model_checkpoint": "logs_and_outputs/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/checkpoint-1500",
|
| 4 |
-
"epoch": 10.0,
|
| 5 |
-
"eval_steps": 500,
|
| 6 |
-
"global_step": 1570,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 3.1847133757961785,
|
| 13 |
-
"learning_rate": 0.0003,
|
| 14 |
-
"loss": 0.6612,
|
| 15 |
-
"step": 500
|
| 16 |
-
},
|
| 17 |
-
{
|
| 18 |
-
"epoch": 3.1847133757961785,
|
| 19 |
-
"eval_exact_match": 43.9737,
|
| 20 |
-
"eval_exact_match_for_CL": 43.9737,
|
| 21 |
-
"eval_exact_match_for_yelp": 43.9737,
|
| 22 |
-
"eval_gen_len": 2.5726,
|
| 23 |
-
"eval_global_step": 500,
|
| 24 |
-
"eval_loss": 0.45953086018562317,
|
| 25 |
-
"eval_rouge1": 66.2544,
|
| 26 |
-
"eval_rouge1_for_CL": 66.2544,
|
| 27 |
-
"eval_rouge1_for_yelp": 66.2544,
|
| 28 |
-
"eval_rougeL": 66.2544,
|
| 29 |
-
"eval_rougeL_for_CL": 66.2544,
|
| 30 |
-
"eval_rougeL_for_yelp": 66.2544,
|
| 31 |
-
"eval_runtime": 105.0444,
|
| 32 |
-
"eval_samples_per_second": 72.35,
|
| 33 |
-
"eval_steps_per_second": 0.286,
|
| 34 |
-
"step": 500
|
| 35 |
-
},
|
| 36 |
-
{
|
| 37 |
-
"epoch": 6.369426751592357,
|
| 38 |
-
"learning_rate": 0.0003,
|
| 39 |
-
"loss": 0.5753,
|
| 40 |
-
"step": 1000
|
| 41 |
-
},
|
| 42 |
-
{
|
| 43 |
-
"epoch": 6.369426751592357,
|
| 44 |
-
"eval_exact_match": 51.9474,
|
| 45 |
-
"eval_exact_match_for_CL": 51.9474,
|
| 46 |
-
"eval_exact_match_for_yelp": 51.9474,
|
| 47 |
-
"eval_gen_len": 2.5862,
|
| 48 |
-
"eval_global_step": 1000,
|
| 49 |
-
"eval_loss": 0.4304201900959015,
|
| 50 |
-
"eval_rouge1": 69.4868,
|
| 51 |
-
"eval_rouge1_for_CL": 69.4868,
|
| 52 |
-
"eval_rouge1_for_yelp": 69.4868,
|
| 53 |
-
"eval_rougeL": 69.4868,
|
| 54 |
-
"eval_rougeL_for_CL": 69.4868,
|
| 55 |
-
"eval_rougeL_for_yelp": 69.4868,
|
| 56 |
-
"eval_runtime": 104.5891,
|
| 57 |
-
"eval_samples_per_second": 72.665,
|
| 58 |
-
"eval_steps_per_second": 0.287,
|
| 59 |
-
"step": 1000
|
| 60 |
-
},
|
| 61 |
-
{
|
| 62 |
-
"epoch": 9.554140127388536,
|
| 63 |
-
"learning_rate": 0.0003,
|
| 64 |
-
"loss": 0.5329,
|
| 65 |
-
"step": 1500
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 9.554140127388536,
|
| 69 |
-
"eval_exact_match": 55.9605,
|
| 70 |
-
"eval_exact_match_for_CL": 55.9605,
|
| 71 |
-
"eval_exact_match_for_yelp": 55.9605,
|
| 72 |
-
"eval_gen_len": 2.5192,
|
| 73 |
-
"eval_global_step": 1500,
|
| 74 |
-
"eval_loss": 0.4083245098590851,
|
| 75 |
-
"eval_rouge1": 70.8684,
|
| 76 |
-
"eval_rouge1_for_CL": 70.8684,
|
| 77 |
-
"eval_rouge1_for_yelp": 70.8684,
|
| 78 |
-
"eval_rougeL": 70.8684,
|
| 79 |
-
"eval_rougeL_for_CL": 70.8684,
|
| 80 |
-
"eval_rougeL_for_yelp": 70.8684,
|
| 81 |
-
"eval_runtime": 104.4585,
|
| 82 |
-
"eval_samples_per_second": 72.756,
|
| 83 |
-
"eval_steps_per_second": 0.287,
|
| 84 |
-
"step": 1500
|
| 85 |
-
},
|
| 86 |
-
{
|
| 87 |
-
"epoch": 10.0,
|
| 88 |
-
"step": 1570,
|
| 89 |
-
"total_flos": 9122411270725632.0,
|
| 90 |
-
"train_loss": 0.5864024265556579,
|
| 91 |
-
"train_runtime": 1102.9262,
|
| 92 |
-
"train_samples_per_second": 45.334,
|
| 93 |
-
"train_steps_per_second": 1.423
|
| 94 |
-
}
|
| 95 |
-
],
|
| 96 |
-
"logging_steps": 500,
|
| 97 |
-
"max_steps": 1570,
|
| 98 |
-
"num_input_tokens_seen": 0,
|
| 99 |
-
"num_train_epochs": 10,
|
| 100 |
-
"save_steps": 500,
|
| 101 |
-
"total_flos": 9122411270725632.0,
|
| 102 |
-
"train_batch_size": null,
|
| 103 |
-
"trial_name": null,
|
| 104 |
-
"trial_params": null
|
| 105 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_0.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4beab9d4dcbba908b7def8ef43eacb08dd6f3941fdf70c924622ec431085ae3a
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_1.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:009305b9def4b16f2d3260b9e1a9493d5e3d9145014d8335bd9a2b4e2c6b8b45
|
| 3 |
-
size 41106
|
|
|
|
|
|
|
|
|
|
|
|
root_gainlora/logs/root_t5_small/gen_script_long_order3_t5_small_gainlora_inflora/outputs/1-yelp/trans_input/reg_2.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:251823168eaa8fde1fbf577a463ca6d34f3b49abd13868096f45d9dd5544bee1
|
| 3 |
-
size 1049682
|
|
|
|
|
|
|
|
|
|
|
|