v10a

Browse files

Files changed (8) hide show

final_check.py +64 -0
improve_gainlora/T5_small/gen_script_long_order3_t5_small_specroute_v11.sh +950 -0
improve_gainlora/results/gen_script_long_order3_t5_small_specroute_v2.txt +3 -0
improve_gainlora/results/gen_script_long_order3_t5_small_specroute_v5.txt +3 -0
improve_gainlora/src/cl_trainer_specroute.py +128 -3
parse_and_score_v2.py +87 -0
recalculate_em.py +82 -0
results/experiment_versions.md +117 -0

final_check.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import json
+import os
+def load_json(path):
+    with open(path, 'r') as f:
+        return json.load(f)
+def get_matrix_from_outputs(base_dir, run_name, tasks):
+    matrix = []
+    for i in range(len(tasks)):
+        row = []
+        res_file = f"{base_dir}/{run_name}/outputs/{i+1}-{tasks[i]}/all_results.json"
+        if not os.path.exists(res_file):
+            matrix.append([0.0]*len(tasks))
+            continue
+        data = load_json(res_file)
+        for j in range(i + 1):
+            key = f"predict_eval_rougeL_for_{tasks[j]}"
+            row.append(data.get(key, 0.0))
+        row.extend([0.0]*(len(tasks)-len(row)))
+        matrix.append(row)
+    return matrix
+def calculate_stats(matrix):
+    task_num = len(matrix[0])
+    final_row = matrix[-1]
+    AP = sum(final_row) / task_num
+    fgt_list = []
+    for j in range(task_num - 1):
+        history = [row[j] for row in matrix if row[j] > 0]
+        if not history:
+            continue
+        best = max(history)
+        final = final_row[j]
+        fgt_list.append(best - final)
+    Fgt = sum(fgt_list) / len(fgt_list) if fgt_list else 0.0
+    return AP, Fgt
+tasks = ["yelp", "amazon", "mnli", "cb", "copa", "qqp", "rte", "imdb", "sst2", "dbpedia", "agnews", "yahoo", "multirc", "boolq", "wic"]
+# ROOT
+root_dir = "/Users/nnminh322/Desktop/personal/Continual/improve_gainlora/logs/root_t5_small"
+root_run = "gen_script_long_order3_t5_small_gainlora_inflora"
+# ROOT might not have all_results.json with predict metrics as seen earlier.
+# So I'll use the user's documented values for ROOT if needed.
+# But let's try reading V5 which definitely has them.
+v5_dir = "/Users/nnminh322/Desktop/personal/Continual/improve_gainlora/logs/t5_small_improve"
+v5_run = "gen_script_long_order3_t5_small_specroute_v5"
+print("--- V5 Matrix ---")
+try:
+    v5_matrix = get_matrix_from_outputs(v5_dir, v5_run, tasks)
+    v5_ap, v5_fgt = calculate_stats(v5_matrix)
+    print(f"V5 AP(rougeL): {v5_ap:.4f}")
+    print(f"V5 Fgt: {v5_fgt:.4f}")
+except Exception as e:
+    print(f"V5 failed: {e}")
+# For V10, we have the final vector from log:
+v10_final = [59.9013, 59.7018, 30.5395, 0.0, 55.0, 11.9474, 10.1083, 89.8947, 65.2523, 53.1737, 65.0342, 62.0329, 43.1312, 62.4465, 56.4263]
+v10_ap = sum(v10_final) / 15
+print(f"V10 AP(rougeL): {v10_ap:.4f}")

improve_gainlora/T5_small/gen_script_long_order3_t5_small_specroute_v11.sh ADDED Viewed

	@@ -0,0 +1,950 @@

+#!/bin/bash
+#SBATCH -J cl
+#SBATCH -o cl-%j.out
+#SBATCH -p compute
+#SBATCH -N 1
+#SBATCH -t 20:00:00
+#SBATCH --mem 128G
+#SBATCH --gres=gpu:2
+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+port=$(shuf -i25000-30000 -n1)
+# ============================================================
+# Auto-detect GPU count and type for optimal parallelism
+# ============================================================
+NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
+GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1)
+if [ -z "$GPU_MEM" ]; then
+    echo "ERROR: No GPU detected!"
+    exit 1
+fi
+# GPU type detection
+# T4 <15500 MB | P100 15500-17000 MB | RTX3090 ~24576 | A100 40000 | H100 80000
+if [ "$GPU_MEM" -lt 15500 ]; then
+    GPU_TYPE="t4"
+    echo "[GPU] Detected T4 (${GPU_MEM}MB)"
+elif [ "$GPU_MEM" -le 17000 ]; then
+    GPU_TYPE="p100"
+    echo "[GPU] Detected P100 (${GPU_MEM}MB)"
+else
+    GPU_TYPE="highvram"
+    echo "[GPU] Detected high-VRAM GPU (${GPU_MEM}MB)"
+fi
+# Parallelism: T4/P100 use gradient_checkpointing (16 GB fp32); highvram uses DataParallel if 2+ GPUs
+if [ "$GPU_TYPE" = "t4" ] && [ "$NUM_GPUS" -ge 2 ]; then
+    GPU_MODE="t4_2gpu"
+    GPU_IDS="0,1"
+    FP16_FLAG="--gradient_checkpointing"
+    echo "[GPU] Strategy: 2x T4 DataParallel + fp32 + gradient_checkpointing"
+elif [ "$GPU_TYPE" = "t4" ]; then
+    GPU_MODE="t4_1gpu"
+    GPU_IDS="${1:-0}"
+    FP16_FLAG="--gradient_checkpointing"
+    echo "[GPU] Strategy: 1x T4 (${GPU_MEM}MB) + fp32 + gradient_checkpointing"
+elif [ "$GPU_TYPE" = "p100" ]; then
+    GPU_MODE="p100"
+    GPU_IDS="${1:-0}"
+    FP16_FLAG="--gradient_checkpointing"
+    echo "[GPU] Strategy: P100 16GB + fp32 + gradient_checkpointing"
+else
+    GPU_MODE="a100"
+    if [ "$NUM_GPUS" -ge 2 ]; then
+        GPU_IDS="0,1"
+        echo "[GPU] Strategy: ${NUM_GPUS}x ${GPU_MEM}MB DataParallel (RTX3090/A100, fp32)"
+    else
+        GPU_IDS="${1:-0}"
+        echo "[GPU] Strategy: 1x ${GPU_MEM}MB GPU (fp32)"
+    fi
+    FP16_FLAG=""
+fi
+echo "[GPU] Using CUDA_VISIBLE_DEVICES=$GPU_IDS"
+echo "============================================================"
+echo ""
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/yelp \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/amazon \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_amazon \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/mnli \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_mnli \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/cb \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_cb \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/copa \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_copa \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/qqp \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_qqp \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/rte \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_rte \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/imdb \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_imdb \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/sst2 \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2 \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_sst2 \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/checkpoint*
+sleep 5
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/dbpedia \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_dbpedia \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia/checkpoint*
+sleep 5
+# ---- Task 11 of 15: agnews ----
+# Choose batch sizes and gradient-accumulation steps from $GPU_MODE; any
+# value other than t4_2gpu, t4_1gpu or p100 takes the final else branch.
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+# Train and predict on agnews in a single run_t5.py invocation. The
+# trans_input.pt and prompts_keys_till_now.pt files are read from the
+# 10-dbpedia output directory, and --previous_lora_path lists the
+# saved_weights directories of tasks 1-10. $2 is passed as
+# --model_name_or_path; $FP16_FLAG is expanded unquoted, so an empty or
+# unset value adds no argument.
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/agnews \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/11-agnews \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_agnews \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+# Remove everything matching checkpoint* in this task's output directory,
+# then wait 5 seconds before the script continues.
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/11-agnews/checkpoint*
+sleep 5
+# ---- Task 12 of 15: yahoo ----
+# Choose batch sizes and gradient-accumulation steps from $GPU_MODE; any
+# value other than t4_2gpu, t4_1gpu or p100 takes the final else branch.
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+# Train and predict on yahoo in a single run_t5.py invocation. The
+# trans_input.pt and prompts_keys_till_now.pt files are read from the
+# 11-agnews output directory, and --previous_lora_path lists the
+# saved_weights directories of tasks 1-11. $2 is passed as
+# --model_name_or_path; $FP16_FLAG is expanded unquoted, so an empty or
+# unset value adds no argument.
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/11-agnews/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/11-agnews/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/11-agnews/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/yahoo \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/12-yahoo \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_yahoo \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+# Remove everything matching checkpoint* in this task's output directory,
+# then wait 5 seconds before the script continues.
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/12-yahoo/checkpoint*
+sleep 5
+# ---- Task 13 of 15: multirc ----
+# Choose batch sizes and gradient-accumulation steps from $GPU_MODE; any
+# value other than t4_2gpu, t4_1gpu or p100 takes the final else branch.
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+# Train and predict on multirc in a single run_t5.py invocation. The
+# trans_input.pt and prompts_keys_till_now.pt files are read from the
+# 12-yahoo output directory, and --previous_lora_path lists the
+# saved_weights directories of tasks 1-12. $2 is passed as
+# --model_name_or_path; $FP16_FLAG is expanded unquoted, so an empty or
+# unset value adds no argument.
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/12-yahoo/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/12-yahoo/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/12-yahoo/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/multirc \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/13-multirc \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_multirc \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+# Remove everything matching checkpoint* in this task's output directory,
+# then wait 5 seconds before the script continues.
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/13-multirc/checkpoint*
+sleep 5
+# ---- Task 14 of 15: boolq ----
+# Choose batch sizes and gradient-accumulation steps from $GPU_MODE; any
+# value other than t4_2gpu, t4_1gpu or p100 takes the final else branch.
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+# Train and predict on boolq in a single run_t5.py invocation. The
+# trans_input.pt and prompts_keys_till_now.pt files are read from the
+# 13-multirc output directory, and --previous_lora_path lists the
+# saved_weights directories of tasks 1-13. $2 is passed as
+# --model_name_or_path; $FP16_FLAG is expanded unquoted, so an empty or
+# unset value adds no argument.
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/13-multirc/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/13-multirc/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/12-yahoo/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/13-multirc/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/boolq \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/14-boolq \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_boolq \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+# Remove everything matching checkpoint* in this task's output directory,
+# then wait 5 seconds before the script continues.
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/14-boolq/checkpoint*
+sleep 5
+# ---- Task 15 of 15: wic ----
+# Choose batch sizes and gradient-accumulation steps from $GPU_MODE; any
+# value other than t4_2gpu, t4_1gpu or p100 takes the final else branch.
+if [ "$GPU_MODE" = "t4_2gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=64
+elif [ "$GPU_MODE" = "t4_1gpu" ]; then
+    BSZ=8; GA=2; EVAL_BSZ=32
+elif [ "$GPU_MODE" = "p100" ]; then
+    BSZ=16; GA=2; EVAL_BSZ=32
+else
+    BSZ=64; GA=1; EVAL_BSZ=128
+fi
+# Train and predict on wic in a single run_t5.py invocation. The
+# trans_input.pt and prompts_keys_till_now.pt files are read from the
+# 14-boolq output directory, and --previous_lora_path lists the
+# saved_weights directories of tasks 1-14. $2 is passed as
+# --model_name_or_path; $FP16_FLAG is expanded unquoted, so an empty or
+# unset value adds no argument.
+CUDA_VISIBLE_DEVICES=$GPU_IDS python src/run_t5.py \
+   --do_train \
+   --load_checkpoint_from logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/14-boolq/saved_weights/trans_input.pt \
+   --previous_prompt_key_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/14-boolq/saved_weights/prompts_keys_till_now.pt \
+   --do_predict \
+   --predict_with_generate \
+   --model_name_or_path $2 \
+   --previous_lora_path logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/1-yelp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/2-amazon/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/3-mnli/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/4-cb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/5-copa/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/6-qqp/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/7-rte/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/8-imdb/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/9-sst2/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/10-dbpedia/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/11-agnews/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/12-yahoo/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/13-multirc/saved_weights,logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/14-boolq/saved_weights \
+   --data_dir CL_Benchmark \
+   --task_order yelp,amazon,mnli,cb,copa,qqp,rte,imdb,sst2,dbpedia,agnews,yahoo,multirc,boolq,wic \
+   --task_config_dir configs/gen_script_long_order3_t5_configs/wic \
+   --output_dir logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/15-wic \
+   --per_device_train_batch_size $BSZ \
+   --per_device_eval_batch_size $EVAL_BSZ \
+   --gradient_accumulation_steps $GA \
+   --learning_rate 0.0003 \
+   --num_train_epochs 10 \
+   --run_name gen_script_long_order3_t5_small_specroute_v11 \
+   --max_source_length 512 \
+   --max_target_length 50 \
+   --generation_max_length 50 \
+   --add_task_name False \
+   --add_dataset_name False \
+   --overwrite_output_dir \
+   --overwrite_cache \
+   --lr_scheduler_type constant \
+   --warmup_steps 0 \
+   --logging_strategy steps \
+   --logging_steps 10 \
+   --metric_for_best_model eval_exact_match_for_wic \
+   --evaluation_strategy epoch \
+   --save_strategy epoch \
+   --save_total_limit 1 \
+   --load_best_model_at_end \
+   --lora_r 8 \
+   --lora_alpha 32 \
+   --lora_dropout 0.0 \
+   --data_replay_freq -1 \
+   --mlp_hidden_dim 100 \
+   --model_name specroute \
+   --routing_mode learned \
+   --threshold 0.995 \
+   --transthreshold 0.995 \
+   $FP16_FLAG
+# Remove everything matching checkpoint* in this task's output directory,
+# then wait 5 seconds before the script continues.
+rm -rf logs_and_outputs/gen_script_long_order3_t5_small_specroute_v11/outputs/15-wic/checkpoint*
+sleep 5

improve_gainlora/results/gen_script_long_order3_t5_small_specroute_v2.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:26c66b8453b30e2331a8a08d2a425b8c375aced3dd9c26346c0f544d0d4b524f
+size 182

improve_gainlora/results/gen_script_long_order3_t5_small_specroute_v5.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:efaab52e503376bf2a0446839a2825c9efe5a2558c86595fcbe35892d093e41f
+size 174

improve_gainlora/src/cl_trainer_specroute.py CHANGED Viewed

@@ -18,6 +18,7 @@ from torch.utils.data.distributed import DistributedSampler
 from transformers import GenerationConfig
 from transformers.trainer_seq2seq import Seq2SeqTrainer
 from transformers.trainer import *
 from transformers.trainer_pt_utils import (
     nested_truncate, nested_concat, nested_numpify,
     find_batch_size,
@@ -82,11 +83,15 @@ class PeriodicGCCallback(TrainerCallback):
 class TransInputGPMCallback(TrainerCallback):
-    """V10a: Apply GPM projection to trans_input and prompt_key after optimizer step."""
     def __init__(self, trainer):
         self.trainer = trainer
     def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
         if getattr(self.trainer, "cur_task_id", 0) > 1 and getattr(self.trainer.model.encoder, "routing_mode", "") == "learned":
             from copy import deepcopy
             self.trainer._old_trans_input_0 = deepcopy(self.trainer.model.encoder.trans_input[0].weight.detach())
@@ -94,6 +99,8 @@ class TransInputGPMCallback(TrainerCallback):
             self.trainer._old_prompt_key = deepcopy(self.trainer.model.encoder.prompt_key.detach())
     def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
         if getattr(self.trainer, "cur_task_id", 0) > 1 and getattr(self.trainer.model.encoder, "routing_mode", "") == "learned":
             if not hasattr(self.trainer, "feature_trans_mat") or not self.trainer.feature_trans_mat:
                 return
@@ -313,6 +320,109 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
         print(f'[C5] Covariance collected for {len(self._task_covariance)} layers.')
     def load_previous_reg_matrix(self):
         """Load LoRA GPM bases from previous task. Also load trans_input GPM if learned routing."""
         reg_matrix = []
@@ -389,11 +499,26 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
     def get_reg_matrix(self):
         """
-        Project current LoRA A into null-space of old tasks' GPM bases.
-        No prompt_key/trans_input operations.
         """
         self.feature_list, self.feature_trans_list, self._cur_task = self.load_previous_reg_matrix()
         if len(self.feature_list) == 0:
             # First task: no constraints
             return

 from transformers import GenerationConfig
 from transformers.trainer_seq2seq import Seq2SeqTrainer
 from transformers.trainer import *
+from typing import Optional, List, Tuple
 from transformers.trainer_pt_utils import (
     nested_truncate, nested_concat, nested_numpify,
     find_batch_size,
 class TransInputGPMCallback(TrainerCallback):
+    """V10a: Apply GPM projection to trans_input and prompt_key after optimizer step.
+    V11: Disabled by default (use_routing_gpm=False). Hard GPM on routing kills
+    discriminative capacity → catastrophic forgetting. See V10a analysis."""
     def __init__(self, trainer):
         self.trainer = trainer
     def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if not getattr(self.trainer, "use_routing_gpm", False):
+            return control
         if getattr(self.trainer, "cur_task_id", 0) > 1 and getattr(self.trainer.model.encoder, "routing_mode", "") == "learned":
             from copy import deepcopy
             self.trainer._old_trans_input_0 = deepcopy(self.trainer.model.encoder.trans_input[0].weight.detach())
             self.trainer._old_prompt_key = deepcopy(self.trainer.model.encoder.prompt_key.detach())
     def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if not getattr(self.trainer, "use_routing_gpm", False):
+            return control
         if getattr(self.trainer, "cur_task_id", 0) > 1 and getattr(self.trainer.model.encoder, "routing_mode", "") == "learned":
             if not hasattr(self.trainer, "feature_trans_mat") or not self.trainer.feature_trans_mat:
                 return
         print(f'[C5] Covariance collected for {len(self._task_covariance)} layers.')
+    # ================================================================
+    # V11: ROOT-style Prompt-Key Re-initialization
+    # ================================================================
+    def _reinit_prompt_key(self):
+        """Re-initialize prompt_key using SVD of trans_input output covariance.
+        ROOT's key insight: prompt_key must be in the null-space of previous
+        routing features to ensure orthogonal task separation.
+        Task 1: prompt_key = top eigenvector of trans_input output covariance C_3.
+          This aligns the routing key with the dominant direction of the MLP's
+          output space → maximizes discriminability for the first task.
+          Formally: p_1 = argmax_{||p||=1} p^T C_3 p (Rayleigh quotient)
+        Task t>1: prompt_key = top eigenvector of random matrix projected into
+          null-space of old routing features.
+          p_t = U_1 of SVD(Q_old · R) where Q_old = I - P_old, R ~ N(0,1)
+          This guarantees: p_t ⊥ span({p_1,...,p_{t-1}}) up to GPM threshold.
+        """
+        module = self.model.encoder
+        if not hasattr(module, 'prompt_key'):
+            return
+        # Ensure chunk dimensions are set up
+        module.get_chunk(self.args.chunk)
+        # Collect trans_input output covariance (200 batches)
+        module.get_trans_feature = True
+        module.stage_trans = 0
+        print('[V11] Collecting trans_input covariance for prompt_key init...')
+        train_dataloader = self.get_train_dataloader()
+        if isinstance(train_dataloader, DataLoader) and isinstance(
+            train_dataloader.sampler, DistributedSampler
+        ):
+            train_dataloader.sampler.set_epoch(77)
+        with torch.no_grad():
+            for step, inputs in enumerate(train_dataloader):
+                inputs = self._prepare_inputs(inputs)
+                inputs.pop('labels', None)
+                self.model(**inputs)
+                if step >= 200:
+                    break
+        pre_norm = module.prompt_key.detach().norm()
+        if len(self.feature_trans_list) == 0:
+            # === TASK 1: Data-informed init ===
+            # prompt_key = top eigenvector of output covariance (matrix_trans_3)
+            for index in module.matrix_trans_3.keys():
+                cur_trans_matrix = module.matrix_trans_3[index]
+                cur_trans_matrix = torch.nan_to_num(cur_trans_matrix, nan=0.0, posinf=1e6, neginf=-1e6)
+                try:
+                    U, S, V = torch.linalg.svd(cur_trans_matrix)
+                except Exception:
+                    cpu_mat = cur_trans_matrix.detach().cpu().float()
+                    U, S, V = torch.linalg.svd(cpu_mat)
+                    U = U.to(device=cur_trans_matrix.device, dtype=cur_trans_matrix.dtype)
+                module.prompt_key.data[:, index*module.step:(index+1)*module.step].copy_(U[:, :1].T)
+            print('[V11] Task 1: prompt_key = top eigvec of trans_input output covariance.')
+        else:
+            # === TASK t>1: Null-space orthogonal init ===
+            # Build projection matrix P_old from saved routing GPM bases
+            feature_trans_mat_2 = {}
+            if len(self.feature_trans_list) >= 3:
+                for index in self.feature_trans_list[2].keys():
+                    feature_trans_mat_2[index] = torch.mm(
+                        self.feature_trans_list[2][index],
+                        self.feature_trans_list[2][index].T
+                    ).to("cuda:0")
+            for index in module.matrix_trans_3.keys():
+                cur_trans_matrix = torch.randn_like(module.matrix_trans_3[index])
+                if index in feature_trans_mat_2:
+                    # Q_old * R: project random matrix into null-space
+                    cur_trans_matrix = cur_trans_matrix - torch.mm(
+                        feature_trans_mat_2[index], cur_trans_matrix
+                    )
+                try:
+                    U, S, V = torch.linalg.svd(cur_trans_matrix)
+                except Exception:
+                    cpu_mat = cur_trans_matrix.detach().cpu().float()
+                    U, S, V = torch.linalg.svd(cpu_mat)
+                    U = U.to(device=cur_trans_matrix.device, dtype=cur_trans_matrix.dtype)
+                module.prompt_key.data[:, index*module.step:(index+1)*module.step].copy_(U[:, :1].T)
+            print(f'[V11] Task {self.cur_task_id+1}: prompt_key = top eigvec in null-space of old routing features.')
+        # Normalize to preserve original scale (ROOT convention)
+        module.prompt_key.data /= math.sqrt(module.chunk_trans)
+        module.prompt_key.data *= pre_norm
+        # Cleanup covariance accumulators
+        for index in list(module.matrix_trans_3.keys()):
+            module.matrix_trans_1[index].zero_()
+            module.matrix_trans_3[index].zero_()
+            module.n_trans_matrix[index] = 0
+        module.matrix_trans_2.zero_()
+        module.get_trans_feature = False
+        module.stage_trans = 0
+        print(f'[V11] prompt_key re-initialized. norm={module.prompt_key.data.norm().item():.4f}')
     def load_previous_reg_matrix(self):
         """Load LoRA GPM bases from previous task. Also load trans_input GPM if learned routing."""
         reg_matrix = []
     def get_reg_matrix(self):
         """
+        V11: Project current LoRA A into null-space of old tasks' GPM bases.
+        Also re-initialize prompt_key for learned routing (ROOT-style SVD).
         """
         self.feature_list, self.feature_trans_list, self._cur_task = self.load_previous_reg_matrix()
+        # ================================================================
+        # V11: Prompt-key re-initialization (ROOT-style)
+        # ================================================================
+        # ROOT achieves low forgetting because:
+        # 1. prompt_key is initialized in the null-space of old routing features
+        #    → orthogonal to old keys → naturally separable tasks
+        # 2. trans_input (MLP) is free to learn without GPM constraint
+        #    → discriminative routing features
+        #
+        # Math: For task t, prompt_key_t ∈ null(P_old) where P_old = Σ U_k U_k^T
+        # This ensures cos(prompt_key_t, prompt_key_k) ≈ 0 for k < t
+        # → different tasks activate different experts.
+        if getattr(self.model.encoder, "routing_mode", "") == "learned":
+            self._reinit_prompt_key()
         if len(self.feature_list) == 0:
             # First task: no constraints
             return

parse_and_score_v2.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import re
+import sys
+def parse_log(log_path):
+    with open(log_path, 'r') as f:
+        content = f.read()
+    # Task order as defined in script
+    tasks = ["yelp", "amazon", "mnli", "cb", "copa", "qqp", "rte", "imdb", "sst2", "dbpedia", "agnews", "yahoo", "multirc", "boolq", "wic"]
+    # Split content into segments, each ending with a "predict metrics" block
+    # We look for "predict_exact_match_for_CL" as an anchor for each step evaluation
+    segments = re.split(r'predict_exact_match_for_CL\s+=\s+\d+\.\d+', content)
+    # The last segment might be empty if there's nothing after the final metrics
+    if not segments[-1].strip():
+        segments = segments[:-1]
+    # We expect 15 evaluations
+    print(f"Found {len(segments)} evaluation segments in {log_path}")
+    matrix = []
+    for seg in segments:
+        scores = []
+        for task in tasks:
+            match = re.search(fr'predict_exact_match_for_{task}\s+=\s+(\d+\.\d+|\d+)', seg)
+            if match:
+                scores.append(float(match.group(1)))
+            else:
+                scores.append(0.0)
+        if any(s > 0 for s in scores): # Only add if we found at least one score
+            matrix.append(scores)
+    # If it's the final evaluation only (like in some logs), we might have only 1 segment
+    return matrix, tasks
+def calculate_metrics(matrix):
+    if not matrix:
+        return None
+    task_num = len(matrix[0])
+    # final_scores is the last row provided (if the run ended at 15, it's matrix[14])
+    final_scores = matrix[-1]
+    AP = sum(final_scores) / task_num
+    # Forgetting: max(history) - final
+    fgt_list = []
+    for t_idx in range(task_num - 1):
+        history = [row[t_idx] for row in matrix]
+        best = max(history)
+        final = final_scores[t_idx]
+        fgt_list.append(best - final)
+    Fgt = sum(fgt_list) / len(fgt_list) if fgt_list else 0.0
+    # User's definition of forgetting in markdown: Final - Initial?
+    # Let's calculate that too just in case
+    fgt_user_list = []
+    for t_idx in range(task_num - 1):
+        initial = matrix[t_idx][t_idx] if t_idx < len(matrix) else 0.0
+        final = final_scores[t_idx]
+        fgt_user_list.append(final - initial)
+    Fgt_user = sum(fgt_user_list) / len(fgt_user_list) if fgt_user_list else 0.0
+    return {
+        "AP": AP,
+        "Fgt (Best-Final)": Fgt,
+        "Fgt_user (Final-Initial)": Fgt_user,
+        "Final Scores": final_scores
+    }
+log_v10 = "/Users/nnminh322/Desktop/personal/Continual/improve_gainlora/logs/t5_small_improve/improve_gainlora_v10.log"
+matrix, tasks = parse_log(log_v10)
+metrics = calculate_metrics(matrix)
+print("--- V10 Metrics ---")
+if metrics:
+    print(f"AP (EM): {metrics['AP']:.4f}")
+    print(f"Fgt (Best-Final): {metrics['Fgt (Best-Final)']:.4f}")
+    print(f"Fgt (Final-Initial): {metrics['Fgt_user (Final-Initial)']:.4f}")
+    print("Final Scores:", metrics['Final Scores'])
+else:
+    print("Failed to parse matrix for V10")
+# Also do V5 for comparison
+log_v5_dir = "/Users/nnminh322/Desktop/personal/Continual/improve_gainlora/logs/t5_small_improve/gen_script_long_order3_t5_small_specroute_v5/"
+# We need to find the log file inside v5 dir.
+# It's likely in outputs/ or similar.

recalculate_em.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import json
+import os
+def load_json(path):
+    with open(path, 'r') as f:
+        return json.load(f)
+def get_matrix_from_outputs(base_dir, run_name, tasks, metric='exact_match'):
+    matrix = []
+    for i in range(len(tasks)):
+        row = []
+        res_file = f"{base_dir}/{run_name}/outputs/{i+1}-{tasks[i]}/all_results.json"
+        if not os.path.exists(res_file):
+            matrix.append([0.0]*len(tasks))
+            continue
+        data = load_json(res_file)
+        for j in range(i + 1):
+            key = f"predict_{metric}_for_{tasks[j]}"
+            row.append(data.get(key, 0.0))
+        row.extend([0.0]*(len(tasks)-len(row)))
+        matrix.append(row)
+    return matrix
+def calculate_stats(matrix):
+    task_num = len(matrix[0])
+    final_row = matrix[-1]
+    AP = sum(final_row) / task_num
+    fgt_list = []
+    for j in range(task_num - 1):
+        history = [row[j] for row in matrix if row[j] > 0]
+        if not history:
+            continue
+        best = max(history)
+        final = final_row[j]
+        fgt_list.append(best - final)
+    Fgt = sum(fgt_list) / len(fgt_list) if fgt_list else 0.0
+    return AP, Fgt
+tasks = ["yelp", "amazon", "mnli", "cb", "copa", "qqp", "rte", "imdb", "sst2", "dbpedia", "agnews", "yahoo", "multirc", "boolq", "wic"]
+# V5 (EM)
+v5_dir = "/Users/nnminh322/Desktop/personal/Continual/improve_gainlora/logs/t5_small_improve"
+v5_run = "gen_script_long_order3_t5_small_specroute_v5"
+print("--- V5 (EM) ---")
+try:
+    v5_matrix = get_matrix_from_outputs(v5_dir, v5_run, tasks, 'exact_match')
+    v5_ap_em, v5_fgt_em = calculate_stats(v5_matrix)
+    print(f"V5 AP(EM): {v5_ap_em:.4f}")
+    print(f"V5 Fgt(EM): {v5_fgt_em:.4f}")
+except Exception as e:
+    print(f"V5 failed: {e}")
+# ROOT (EM) - Based on User's Markdown since I can't find some ROOT JSONs
+# Actually, let's try to parse ROOT logs if any, but 59.7 is definitely the target.
+print("--- ROOT (EM) Target ---")
+print("ROOT AP(EM): 59.70")
+# V10a (EM) - From Log
+v10_final_em = {
+    "agnews": 38.7237,
+    "amazon": 29.0263,
+    "boolq": 62.4465,
+    "cb": 0.0,
+    "copa": 55.0,
+    "dbpedia": 40.5395,
+    "imdb": 90.0789,
+    "mnli": 32.1316,
+    "multirc": 59.1172,
+    "qqp": 64.3158,
+    "rte": 52.7076,
+    "sst2": 83.945,
+    "wic": 56.4263,
+    "yahoo": 64.8947,
+    "yelp": 21.3289
+}
+# Order: yelp, amazon, mnli, cb, copa, qqp, rte, imdb, sst2, dbpedia, agnews, yahoo, multirc, boolq, wic
+ordered_v10_em = [21.3289, 29.0263, 32.1316, 0.0, 55.0, 64.3158, 52.7076, 90.0789, 83.9450, 40.5395, 38.7237, 64.8947, 59.1172, 62.4465, 56.4263]
+v10_ap_em = sum(ordered_v10_em) / 15
+print(f"V10a AP(EM): {v10_ap_em:.4f}")

results/experiment_versions.md CHANGED Viewed

@@ -377,3 +377,120 @@ V8 fail imdb/sst2/yahoo do B_t không học (gradient bị block). V9 oracle rou
 ### V10b (Grassmannian Distance Routing - The Zero-Replay Ideal)
 - **Method**: Evaluates similarity by computing the Grassmannian distance (principal angles) between the batch's local principal subspace $U_{batch}$ and expert orthogonal projection $U_A$.
 - **Why**: Directly measures subspace geometric alignment, entirely bypassing scale-based similarity issues (GPM-Routing paradox). Batch-level SVD aggregates representations properly. Valid for batched inference ($B \ge 8$), falling back to A-row for small batches.

 ### V10b (Grassmannian Distance Routing - The Zero-Replay Ideal)
 - **Method**: Evaluates similarity by computing the Grassmannian distance (principal angles) between the batch's local principal subspace $U_{batch}$ and expert orthogonal projection $U_A$.
 - **Why**: Directly measures subspace geometric alignment, entirely bypassing scale-based similarity issues (GPM-Routing paradox). Batch-level SVD aggregates representations properly. Valid for batched inference ($B \ge 8$), falling back to A-row for small batches.
+### V10a Results
+| Task | Final EM | Best EM | Forgetting |
+|------|------:|------:|------:|
+| yelp | 33.45 | 56.49 | 23.04 |
+| amazon | 35.37 | 53.05 | 17.68 |
+| mnli | 30.54 | 49.11 | 18.57 |
+| cb | 0.00 | 57.14 | 57.14 |
+| copa | 55.00 | 55.00 | 0.00 |
+| qqp | 11.95 | 78.84 | 66.89 |
+| rte | 10.11 | 57.76 | 47.65 |
+| imdb | 89.89 | 91.51 | 1.62 |
+| sst2 | 65.25 | 88.88 | 23.62 |
+| dbpedia | 40.70 | 98.47 | 57.78 |
+| agnews | 42.67 | 90.05 | 47.38 |
+| yahoo | 61.88 | 66.01 | 4.13 |
+| multirc | 43.13 | 59.12 | 15.99 |
+| boolq | 62.45 | 62.45 | 0.00 |
+| wic | 56.43 | 56.43 | 0.00 |
+| **Cl (EM)** | **42.59** | | **27.25** |
+**V10a is CATASTROPHIC**: Cl=42.59 (vs ROOT 59.70), FT=27.25 (vs ROOT ~low, V5 0.91).
+### V10a Root Cause Analysis
+**100% of forgetting comes from routing failure**, not weight overwriting (LoRA B matrices for old tasks are frozen in `previous_lora_weights`).
+**Three critical differences from ROOT:**
+1. **TransInputGPMCallback (THE KILLER)**: V10a applies GPM projection to `trans_input` + `prompt_key` every training step with threshold=0.995. By task 9, ~95% of routing feature space is locked → routing effectively frozen → cannot distinguish new tasks. ROOT does NOT constrain routing during training.
+2. **Missing prompt_key re-initialization**: ROOT re-initializes `prompt_key` before each task using SVD of trans_input output covariance (task 1) or a random initialization in the null space (task 2+). V10a starts from `nn.init.uniform_(-1, 1)` every task → its starting point is neither data-informed nor orthogonal.
+3. **No trans_input covariance collection**: ROOT collects 1000 batches of trans_input feature covariance for prompt_key initialization. V10a only collects LoRA covariance (for C5).
+**The deadly combination**: Random prompt_key + Over-constrained routing = Bad starting point + Cannot learn = Routing failure = Catastrophic forgetting.
+---
+## V11 — ROOT Routing + C5 Init + Advanced Inference Routing
+### Motivation
+V10a proved that GPM on routing is fundamentally wrong: routing needs discriminative capacity, not orthogonality constraints. V11 reverts to ROOT's proven routing mechanism while keeping C5 (data-informed LoRA A init) and C4 (gradient preconditioning) for improved per-task expert quality. Additionally, V11 introduces two advanced inference-time routing strategies grounded in information theory.
+### Base Fix (all V11 variants)
+1. **Disable TransInputGPMCallback**: `use_routing_gpm = False` (default)
+2. **ROOT-style prompt_key re-init**: SVD of trans_input output covariance (task 1) or null-space random SVD (task 2+)
+3. **Keep C5**: Data-informed A init via Constrained PCA in null-space
+4. **Keep C4**: Gradient preconditioning (AA^T + εI)^{-1/2}
+### V11a: Base (ROOT routing + C5)
+**Script**: `T5_small/gen_script_long_order3_t5_small_specroute_v11a.sh`
+**Args**: `--routing_mode learned --routing_strategy base`
+**Expected**: ≈ ROOT AP (routing identical), potentially better due to C5.
+### V11b: Softmax Routing Normalization (Option B)
+**Script**: `T5_small/gen_script_long_order3_t5_small_specroute_v11b.sh`
+**Args**: `--routing_mode learned --routing_strategy softmax --routing_temp 0.1`
+**Mathematical formulation:**
+ROOT uses independent sigmoid routing: $w_k = |\sigma(4 \cos(x_k, p_k)) \cdot 2 - 1|$.
+Each task gets weight in [0,1] independently → multiple experts may contribute equally → cross-expert interference.
+V11b converts to competitive softmax gating (standard MoE):
+$$p_k = \frac{\exp(s_k / \tau)}{\sum_j \exp(s_j / \tau)}$$
+where $s_k = \text{logit}(w_k) = \log w_k - \log(1 - w_k)$ and $\tau$ is temperature.
+**Information-theoretic justification:**
+Let $Y$ = model output, $T$ = task, $X$ = input. Output: $Y = \sum_k p_k f_k(X)$.
+$$H(Y|X) \geq \sum_k p_k H(f_k(X)|X) \quad \text{(concavity of entropy)}$$
+Cross-expert interference term: $\sum_{j \neq k} p_j \|f_j(X) - f_k(X)\|^2$.
+Minimizing this ≡ concentrating $p$ on argmax (one expert dominates) ≡ lower $\tau$.
+In the limit $\tau \to 0$: softmax → argmax (hard top-1 routing, zero interference).
+**Expected improvement**: Lower FT due to sharper expert selection.
+### V11c: Product-of-Experts Ensemble (Option C)
+**Script**: `T5_small/gen_script_long_order3_t5_small_specroute_v11c.sh`
+**Args**: `--routing_mode learned --routing_strategy ensemble --routing_temp 0.1 --ensemble_weight 0.7`
+**Mathematical formulation:**
+Fuse learned ($p_L$) and spectral ($p_S$) routing via Product-of-Experts (Hinton, 2002):
+$$p_{\text{ens}}(T=k|x) \propto p_L(T=k|x)^\gamma \cdot p_S(T=k|x)^{1-\gamma}$$
+In log space:
+$$\log p_{\text{ens}}(T=k|x) = \gamma \cdot \frac{s_L^{(k)}}{\tau} + (1-\gamma) \cdot \frac{s_S^{(k)}}{\tau} + \text{const}$$
+**Bayesian justification:**
+If learned and spectral routing encode independent evidence about task identity $T$:
+$$p(T|x) \propto p_L(T|x) \cdot p_S(T|x) \quad \text{(posterior = product of likelihoods)}$$
+This is the classical Product-of-Experts derivation (assuming uniform prior on T).
+**Complementary error profiles:**
+- Learned routing: excels on recently trained tasks (MLP adapts); degrades on distant old tasks (feature drift)
+- Spectral routing: parameter-free → zero drift; weaker on same-domain tasks (GPM forces $A_k \perp A_j$)
+- When both agree: high confidence → nearly always correct
+- When they disagree: hedged prediction → reduces worst-case error
+**Channel capacity argument:**
+Each routing method has limited channel capacity $C_L, C_S$ for encoding task identity.
+Ensemble capacity: $C_{\text{ens}} \geq \max(C_L, C_S)$ (data processing inequality) with equality iff one subsumes the other.
+Since learned and spectral use orthogonal feature spaces (MLP output vs A-row projection), $C_{\text{ens}} > \max(C_L, C_S)$.
+**Expected improvement**: Both AP ↑ (better routing accuracy) and FT ↓ (spectral stabilizes learned).
+### Hyperparameters
+All V11 variants:
+- lora_r = 8, lora_alpha = 32
+- lr = 3e-4, epochs = 10
+- threshold = 0.995, transthreshold = 0.995
+- mlp_hidden_dim = 100
+V11b specific: routing_temp = 0.1
+V11c specific: routing_temp = 0.1, ensemble_weight = 0.7