diff --git "a/ray_tune_logs/result.json" "b/ray_tune_logs/result.json"
--- "a/ray_tune_logs/result.json"
+++ "b/ray_tune_logs/result.json"
@@ -1,19 +1,19 @@
-{"gigaword/rouge1": 0.016626713906933873, "gigaword/rouge2": 0.0034357494717621785, "gigaword/rougeL": 0.01630322037591579, "gigaword/rougeLsum": 0.015539180130949517, "gigaword/bertscore_precision": 0.5684015281498432, "gigaword/bertscore_recall": 0.6505913433432579, "gigaword/bertscore_f1": 0.6057714109122753, "cnndm/rouge1": 0.13016523589045112, "cnndm/rouge2": 0.041991571069563326, "cnndm/rougeL": 0.10993746712966011, "cnndm/rougeLsum": 0.10715990114906183, "cnndm/bertscore_precision": 0.6816881348689398, "cnndm/bertscore_recall": 0.7389524877071381, "cnndm/bertscore_f1": 0.7080690910418829, "samsum/rouge1": 0.055488210861802195, "samsum/rouge2": 0.0148884485738399, "samsum/rougeL": 0.041491939300558646, "samsum/rougeLsum": 0.042507458030167145, "samsum/bertscore_precision": 0.6166475216547648, "samsum/bertscore_recall": 0.6834982683261236, "samsum/bertscore_f1": 0.647682785987854, "xsum/rouge1": 0.10035712986483537, "xsum/rouge2": 0.018761301067129577, "xsum/rougeL": 0.08373656314284877, "xsum/rougeLsum": 0.08393338429555226, "xsum/bertscore_precision": 0.6984640260537466, "xsum/bertscore_recall": 0.7191349615653356, "xsum/bertscore_f1": 0.7044563690821329, "eval_agg/avg_all_rougef": 0.05514521714131448, "eval_agg/avg_all_bertf": 0.6664949142560362, "eval_agg/avg_all": 0.36082006569867536, "num_rl_rollout": 0, "lm_epoch": 0, "rl_epoch": 0, "step": 0, "total_data_token": 0, "total_rl_token": 0, "total_lm_token": 0, "total_token": 0, "completed_steps": 0, "tune_objective": 0.8010626381450561, "timestamp": 1771737832, "checkpoint_dir_name": null, "done": false, "training_iteration": 1, "trial_id": "80c653c2", "date": "2026-02-22_13-23-52", "time_this_iter_s": 59.634682178497314, "time_total_s": 59.634682178497314, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 59.634682178497314, "iterations_since_restore": 1}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 354.84375, "rollout/std_q1_length": 194.7317352294922, "rollout/gen/avg_score": -848.685546875, "rollout/gen/std_score": 2042.1697998046875, "rollout/gen/avg_r1_length": 207.1796875, "rollout/gen/std_r1_length": 179.3109588623047, "rollout/gen/avg_r1_score": -233.50299072265625, "rollout/gen/std_r1_score": 214.56179809570312, "rollout/gen/avg_r1_accuracy": 0.15656834840774536, "rollout/gen/std_r1_accuracy": 0.04505638778209686, "rollout/gen/avg_r2_length": 363.19140625, "rollout/gen/std_r2_length": 170.4690399169922, "rollout/gen/avg_r2_score": -1002.4810791015625, "rollout/gen/std_r2_score": 2255.032958984375, "rollout/gen/avg_r2_accuracy": 0.1220901608467102, "rollout/gen/std_r2_accuracy": 0.05740785971283913, "rollout/best_game/query_1": "Kitty: I think Barbara is a bit angry about the holidays\nSharon: Really? Why? I didn't see that yesterday\nKitty: I know, she won't tell anything but she's disappointed\nKitty: But I just don't want to go there\nSharon: That's your choice, you have a right to go wherever you want\nKitty: I guess she thinks I don't want to go with her and it's not the case\nKitty: I just prefer to go for two weeks for the same money\nSharon: Don't feel guilty, maybe she's disappointed but I'm sure she can understand\nSharon: I think the problem is that Lucy set her up before and she thought she would convince you\nSharon: Where are you going finally?\nKitty: Not sure, I'll have to choose between Greece and Spain\nKitty: I have to book next week before the prices get too high\nTL;DR: ", "rollout/best_game/query_2": "Given the text: ????? for 2 weeks in that time frame\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " ????? for 2 weeks in that time frame\nRemember the table after each update: ????? for 2 weeks in that time frame\nThe post-script of the post-script is exactly as described at: ????? for 2 weeks in that time frame", "rollout/best_game/whole_game": "ROLE 0: Kitty: I think Barbara is a bit angry about the holidays\nSharon: Really? Why? I didn't see that yesterday\nKitty: I know, she won't tell anything but she's disappointed\nKitty: But I just don't want to go there\nSharon: That's your choice, you have a right to go wherever you want\nKitty: I guess she thinks I don't want to go with her and it's not the case\nKitty: I just prefer to go for two weeks for the same money\nSharon: Don't feel guilty, maybe she's disappointed but I'm sure she can understand\nSharon: I think the problem is that Lucy set her up before and she thought she would convince you\nSharon: Where are you going finally?\nKitty: Not sure, I'll have to choose between Greece and Spain\nKitty: I have to book next week before the prices get too high\nTL;DR: \nROLE 1: Given the text: ????? for 2 weeks in that time frame\nReconstruct the summarized text to the detailed:\nROLE 2:  ????? for 2 weeks in that time frame\nRemember the table after each update: ????? for 2 weeks in that time frame\nThe post-script of the post-script is exactly as described at: ????? for 2 weeks in that time frame", "rollout/best_game/overall_score": -63.72371631784661, "rollout/best_game/accuracy (r2)": 0.06563039723661486, "_this_batch_num_rl_token": 11355, "num_rl_rollout": 1, "lm_epoch": 0, "rl_epoch": 0, "step": 0, "total_data_token": 11355, "total_rl_token": 0, "total_lm_token": 0, "total_token": 0, "completed_steps": 0, "rollout/num_train_sample": 640, "timestamp": 1771737880, "checkpoint_dir_name": null, "done": false, "training_iteration": 2, "trial_id": "80c653c2", "date": "2026-02-22_13-24-40", "time_this_iter_s": 47.862690448760986, "time_total_s": 107.4973726272583, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 107.4973726272583, "iterations_since_restore": 2}
-{"rl_info/A2G": -0.0028987263794988394, "rl_info/entropy": 2.9352500438690186, "rl_info/total_token": 1354.0, "rl_info/advantage_b4_norm": -459.062255859375, "rl_info/advantage_after_gnorm": -0.42630162835121155, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.2895791232585907, "train/lm_loss": 6.76259708404541, "train/total_loss": 7.052175998687744, "gigaword/rouge1": 0.016013177969660477, "gigaword/rouge2": 0.002762057563957249, "gigaword/rougeL": 0.01573591161293745, "gigaword/rougeLsum": 0.015194304858536149, "gigaword/bertscore_precision": 0.568197977989912, "gigaword/bertscore_recall": 0.6478289809823036, "gigaword/bertscore_f1": 0.6045853333175182, "cnndm/rouge1": 0.17401867184178124, "cnndm/rouge2": 0.06761380405361257, "cnndm/rougeL": 0.14758575504189145, "cnndm/rougeLsum": 0.15810700157839988, "cnndm/bertscore_precision": 0.6928141514460245, "cnndm/bertscore_recall": 0.7393978635470072, "cnndm/bertscore_f1": 0.7132972627878189, "samsum/rouge1": 0.06416873007949071, "samsum/rouge2": 0.016495273247433145, "samsum/rougeL": 0.054610828697957435, "samsum/rougeLsum": 0.044927426648535794, "samsum/bertscore_precision": 0.6220896889766058, "samsum/bertscore_recall": 0.6947064747413, "samsum/bertscore_f1": 0.6557035942872366, "xsum/rouge1": 0.10139404995412049, "xsum/rouge2": 0.016878023484251766, "xsum/rougeL": 0.08254507771175792, "xsum/rougeLsum": 0.08460882112736891, "xsum/bertscore_precision": 0.6933661152919134, "xsum/bertscore_recall": 0.7164484759171804, "xsum/bertscore_f1": 0.7011150568723679, "eval_agg/avg_all_rougef": 0.06641618221698078, "eval_agg/avg_all_bertf": 0.6686753118162354, "eval_agg/avg_all": 0.3675457470166081, "num_rl_rollout": 1, "lm_epoch": 0, "rl_epoch": 0, "step": 100, "total_data_token": 172181, "total_rl_token": 212473, "total_lm_token": 160826, "total_token": 373299, "completed_steps": 100, "tune_objective": 0.8571104303622482, "timestamp": 1771737929, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "80c653c2", "date": "2026-02-22_13-25-29", "time_this_iter_s": 48.93348407745361, "time_total_s": 156.4308567047119, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 156.4308567047119, "iterations_since_restore": 3}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 348.9375, "rollout/std_q1_length": 203.6563262939453, "rollout/gen/avg_score": -1166.9957275390625, "rollout/gen/std_score": 2609.572265625, "rollout/gen/avg_r1_length": 195.15625, "rollout/gen/std_r1_length": 177.6351318359375, "rollout/gen/avg_r1_score": -452.06036376953125, "rollout/gen/std_r1_score": 1499.620849609375, "rollout/gen/avg_r1_accuracy": 0.150004044175148, "rollout/gen/std_r1_accuracy": 0.04979751259088516, "rollout/gen/avg_r2_length": 366.685546875, "rollout/gen/std_r2_length": 169.43368530273438, "rollout/gen/avg_r2_score": -1345.7296142578125, "rollout/gen/std_r2_score": 2792.259033203125, "rollout/gen/avg_r2_accuracy": 0.11780376732349396, "rollout/gen/std_r2_accuracy": 0.058007046580314636, "rollout/best_game/query_1": "Anita: Hey, are you joining us for dinner at Juan's parents' place tonight?\nEric: Sure! Am I invited? \ud83d\ude05\nAnita: Very much so\nAnita: They're asking if you like fish\nEric: Just tell them I'll learn to love anything they serve \ud83d\udc96\nAnita: I will\nEric: Wuu2?\nAnita: I'm on my long journey home from work. Passing Palos de la Frontera now\nEric: Oh fun. Are you reading or anything?\nAnita: I'm writing to you sweetheart \ud83d\ude0d\nEric: A ha ha ha SO true\nAnita: I was on facebook and stuff like this. I sometimes try to correct my students' work on my way back but am too tired for that today\nEric: I see. I still remember those never ending bus journeys to Las Rozas. The bus would usually go so fast that when I reached my student's place I would be literally on the verge of throwing up\nAnita: I remember that... Nah, Fortunately I can take a train and then the metro from where I work \nEric: Good for you\nEric: So what time are we seeing each other?\nAnita: 8 at Juan's parents' place. I'll send you the exact direction in a moment, ok\nEric: Cool. See you in a bit then xx\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Anita will bring me a beer to drink after we decide to get home. In this manner we could both finish up and be ready for brunch by 3:40pm. \ud83d\ude42\n\u00a0@eric\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " \u00a0Anita,\nMy partner\nSo I came up with the following template for making a simple \"anita berry\" for me.", "rollout/best_game/whole_game": "ROLE 0: Anita: Hey, are you joining us for dinner at Juan's parents' place tonight?\nEric: Sure! Am I invited? \ud83d\ude05\nAnita: Very much so\nAnita: They're asking if you like fish\nEric: Just tell them I'll learn to love anything they serve \ud83d\udc96\nAnita: I will\nEric: Wuu2?\nAnita: I'm on my long journey home from work. Passing Palos de la Frontera now\nEric: Oh fun. Are you reading or anything?\nAnita: I'm writing to you sweetheart \ud83d\ude0d\nEric: A ha ha ha SO true\nAnita: I was on facebook and stuff like this. I sometimes try to correct my students' work on my way back but am too tired for that today\nEric: I see. I still remember those never ending bus journeys to Las Rozas. The bus would usually go so fast that when I reached my student's place I would be literally on the verge of throwing up\nAnita: I remember that... Nah, Fortunately I can take a train and then the metro from where I work \nEric: Good for you\nEric: So what time are we seeing each other?\nAnita: 8 at Juan's parents' place. I'll send you the exact direction in a moment, ok\nEric: Cool. See you in a bit then xx\nTL;DR: \nROLE 1: Given the text: \u00a0Anita will bring me a beer to drink after we decide to get home. In this manner we could both finish up and be ready for brunch by 3:40pm. \ud83d\ude42\n\u00a0@eric\nReconstruct the summarized text to the detailed:\nROLE 2:  \u00a0Anita,\nMy partner\nSo I came up with the following template for making a simple \"anita berry\" for me.", "rollout/best_game/overall_score": -76.9575110607338, "rollout/best_game/accuracy (r2)": 0.051948051948051945, "_this_batch_num_rl_token": 11166, "num_rl_rollout": 2, "lm_epoch": 0, "rl_epoch": 0, "step": 100, "total_data_token": 183347, "total_rl_token": 212473, "total_lm_token": 160826, "total_token": 373299, "completed_steps": 100, "rollout/num_train_sample": 640, "timestamp": 1771737976, "checkpoint_dir_name": null, "done": false, "training_iteration": 4, "trial_id": "80c653c2", "date": "2026-02-22_13-26-16", "time_this_iter_s": 47.2177939414978, "time_total_s": 203.64865064620972, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 203.64865064620972, "iterations_since_restore": 4}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 344.625, "rollout/std_q1_length": 208.90306091308594, "rollout/gen/avg_score": -832.486328125, "rollout/gen/std_score": 1974.1527099609375, "rollout/gen/avg_r1_length": 210.1875, "rollout/gen/std_r1_length": 190.84388732910156, "rollout/gen/avg_r1_score": -243.53176879882812, "rollout/gen/std_r1_score": 232.0478515625, "rollout/gen/avg_r1_accuracy": 0.15966013073921204, "rollout/gen/std_r1_accuracy": 0.048853278160095215, "rollout/gen/avg_r2_length": 374.93359375, "rollout/gen/std_r2_length": 169.17861938476562, "rollout/gen/avg_r2_score": -979.7249755859375, "rollout/gen/std_r2_score": 2179.797119140625, "rollout/gen/avg_r2_accuracy": 0.12435680627822876, "rollout/gen/std_r2_accuracy": 0.05812831595540047, "rollout/best_game/query_1": "An International Olympic Committee panel will decide which Russian competitors can take part in the Games, amid claims of state-sponsored doping.\n\"I don't think it devalues the Games in any sense,\" Sweeney told BBC Sport. \"You'll see fantastic competition and see records tumble.\"\nThe Olympics begin in Brazil on Friday.\nInitially, the IOC said individual sports' governing bodies must decide if Russians could compete, but has since ruled the new panel \"will decide whether to accept or reject that final proposal\".\nMore than 250 Russian athletes have so far been cleared.\n\"It's not unsettling for us,\" Sweeney said. \"It's a shame the whole thing wasn't sorted out a lot earlier before the Games got started.\n\"We fully support the strongest possible sanctions for athletes who have been cheating.\"\nThe IOC's decision not to apply a blanket ban on Russian athletes was criticised by the World Anti-Doping Agency, which called for such a sanction after its independently commissioned report found evidence of a four-year \"doping programme\" across the \"vast majority\" of Olympic sports.\nSweeney said: \"It's not an easy decision to make.\n\"You've got the question of collective responsibility against individual justice and I'm sure the IOC president Thomas Bach had a number of very difficult legal issues to consider there.\"\nSweeney indicated the presence of Russian athletes at the Games would make little difference to Team GB's medal chances.\n\"It doesn't impact us so much actually when you look at the Russian athletes and where we are strong,\" Sweeney said.\n\"From a medal point of view, the impact upon us is quite marginal.\"\nSubscribe to the BBC Sport newsletter to get our pick of news, features and video sent to your inbox.\nTL;DR: ", "rollout/best_game/query_2": "Given the text:  \u00a0The Olympics 2015 Olympic Games in Rio de Janeiro\nMore Olympics coverage:\nTottenham vs Tottenham - London 2014 World Cup Qualifier Review\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " http://pinterest.com/pin/818531118752476/\nAnd remember this: \u00a0a good sign:", "rollout/best_game/whole_game": "ROLE 0: An International Olympic Committee panel will decide which Russian competitors can take part in the Games, amid claims of state-sponsored doping.\n\"I don't think it devalues the Games in any sense,\" Sweeney told BBC Sport. \"You'll see fantastic competition and see records tumble.\"\nThe Olympics begin in Brazil on Friday.\nInitially, the IOC said individual sports' governing bodies must decide if Russians could compete, but has since ruled the new panel \"will decide whether to accept or reject that final proposal\".\nMore than 250 Russian athletes have so far been cleared.\n\"It's not unsettling for us,\" Sweeney said. \"It's a shame the whole thing wasn't sorted out a lot earlier before the Games got started.\n\"We fully support the strongest possible sanctions for athletes who have been cheating.\"\nThe IOC's decision not to apply a blanket ban on Russian athletes was criticised by the World Anti-Doping Agency, which called for such a sanction after its independently commissioned report found evidence of a four-year \"doping programme\" across the \"vast majority\" of Olympic sports.\nSweeney said: \"It's not an easy decision to make.\n\"You've got the question of collective responsibility against individual justice and I'm sure the IOC president Thomas Bach had a number of very difficult legal issues to consider there.\"\nSweeney indicated the presence of Russian athletes at the Games would make little difference to Team GB's medal chances.\n\"It doesn't impact us so much actually when you look at the Russian athletes and where we are strong,\" Sweeney said.\n\"From a medal point of view, the impact upon us is quite marginal.\"\nSubscribe to the BBC Sport newsletter to get our pick of news, features and video sent to your inbox.\nTL;DR: \nROLE 1: Given the text:  \u00a0The Olympics 2015 Olympic Games in Rio de Janeiro\nMore Olympics coverage:\nTottenham vs Tottenham - London 2014 World Cup Qualifier Review\nReconstruct the summarized text to the detailed:\nROLE 2:  http://pinterest.com/pin/818531118752476/\nAnd remember this: \u00a0a good sign:", "rollout/best_game/overall_score": -64.74601849457548, "rollout/best_game/accuracy (r2)": 0.008686210640608037, "_this_batch_num_rl_token": 11028, "num_rl_rollout": 3, "lm_epoch": 0, "rl_epoch": 0, "step": 198, "total_data_token": 343257, "total_rl_token": 425196, "total_lm_token": 309708, "total_token": 734904, "completed_steps": 198, "rollout/num_train_sample": 640, "timestamp": 1771738065, "checkpoint_dir_name": null, "done": false, "training_iteration": 5, "trial_id": "80c653c2", "date": "2026-02-22_13-27-45", "time_this_iter_s": 88.91488075256348, "time_total_s": 292.5635313987732, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 292.5635313987732, "iterations_since_restore": 5}
-{"rl_info/A2G": -0.000723416218534112, "rl_info/entropy": 3.1257779598236084, "rl_info/total_token": 1738.0, "rl_info/advantage_b4_norm": -450.73809814453125, "rl_info/advantage_after_gnorm": 0.4287937581539154, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.07202904671430588, "train/lm_loss": 7.561070919036865, "train/total_loss": 7.6331000328063965, "gigaword/rouge1": 0.01696376300791144, "gigaword/rouge2": 0.0029117526314428256, "gigaword/rougeL": 0.016675980411059946, "gigaword/rougeLsum": 0.01598081092926056, "gigaword/bertscore_precision": 0.5702919979393483, "gigaword/bertscore_recall": 0.6485270391404628, "gigaword/bertscore_f1": 0.6061948582530021, "cnndm/rouge1": 0.11733453330718113, "cnndm/rouge2": 0.03614846073463531, "cnndm/rougeL": 0.10070172749985586, "cnndm/rougeLsum": 0.10347321279168627, "cnndm/bertscore_precision": 0.6736039221286774, "cnndm/bertscore_recall": 0.7326414783795675, "cnndm/bertscore_f1": 0.6995119551817576, "samsum/rouge1": 0.07216970194435735, "samsum/rouge2": 0.014474761919853807, "samsum/rougeL": 0.05825637799408758, "samsum/rougeLsum": 0.05041881928109349, "samsum/bertscore_precision": 0.6231516848007838, "samsum/bertscore_recall": 0.6885620107253393, "samsum/bertscore_f1": 0.6533763607343038, "xsum/rouge1": 0.10672752734994101, "xsum/rouge2": 0.021408935463353818, "xsum/rougeL": 0.08615790390478284, "xsum/rougeLsum": 0.07397912560041213, "xsum/bertscore_precision": 0.6893423696358999, "xsum/bertscore_recall": 0.7113095422585806, "xsum/bertscore_f1": 0.6958692520856857, "eval_agg/avg_all_rougef": 0.055861462173182205, "eval_agg/avg_all_bertf": 0.6637381065636874, "eval_agg/avg_all": 0.3597997843684348, "num_rl_rollout": 3, "lm_epoch": 0, "rl_epoch": 0, "step": 200, "total_data_token": 346786, "total_rl_token": 429550, "total_lm_token": 313237, "total_token": 742787, "completed_steps": 200, "tune_objective": 0.8084211546019162, "timestamp": 1771738075, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 6, "trial_id": "80c653c2", "date": "2026-02-22_13-27-55", "time_this_iter_s": 9.744225978851318, "time_total_s": 302.3077573776245, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 302.3077573776245, "iterations_since_restore": 6}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 311.90625, "rollout/std_q1_length": 197.28765869140625, "rollout/gen/avg_score": -901.7105712890625, "rollout/gen/std_score": 2133.822265625, "rollout/gen/avg_r1_length": 223.046875, "rollout/gen/std_r1_length": 191.7568817138672, "rollout/gen/avg_r1_score": -255.7704620361328, "rollout/gen/std_r1_score": 231.20826721191406, "rollout/gen/avg_r1_accuracy": 0.14641731977462769, "rollout/gen/std_r1_accuracy": 0.050308458507061005, "rollout/gen/avg_r2_length": 370.701171875, "rollout/gen/std_r2_length": 169.39187622070312, "rollout/gen/avg_r2_score": -1063.195556640625, "rollout/gen/std_r2_score": 2355.800537109375, "rollout/gen/avg_r2_accuracy": 0.11424624174833298, "rollout/gen/std_r2_accuracy": 0.05785362049937248, "rollout/best_game/query_1": "Tom Hiddleston has also been nominated for the best actor prize for the awards on 5 September.\nHiddleston's competition includes Doctor Who star Peter Capaldi and Peaky Blinders actor Cillian Murphy.\nITV's Marcella and the BBC's epic adaptation on War and Peace are also shortlisted in the new drama category.\nDownton Abbey mirrors Peaky Blinders by being nominated in the best actor, best actress and best drama series categories.\nJim Carter and Laura Carmichael are singled out for their Downton contribution, while Helen McCrory is cited for her work in Blinders.\nDoctor Foster's Suranne Jones and Happy Valley's Sarah Lancashire complete the line-up in the best actress category.\nEastEnders and Emmerdale lead the field overall with five nominations apiece in the four soap-oriented categories.\nMary Berry makes two appearances in the shortlist, with The Great British Bake Off up for best talent show and her BBC Two series on Foolproof Cooking nominated in the food show category.\nThe best daytime show category, meanwhile, sees one BBC show - teatime quiz Pointless - ranged against three ITV offerings - The Chase, Loose Women and This Morning.\nAll the nominations for this year's awards, to be hosted in London by comedian Jo Brand, can be found on the TV Choice magazine's official website.\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0BBC are bringing in their best actors to the award ceremony tomorrow for Downton Abbey!\nWhat else do you think of this year's drama nominees?\n[\u00a0twitter \u00a0or email for more feedback on BBC announcements]\u00a0\nBBC nominations for 2016 are available here\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " 2015\nThe Downton Abbey nominated actors received a total of 3 nominations", "rollout/best_game/whole_game": "ROLE 0: Tom Hiddleston has also been nominated for the best actor prize for the awards on 5 September.\nHiddleston's competition includes Doctor Who star Peter Capaldi and Peaky Blinders actor Cillian Murphy.\nITV's Marcella and the BBC's epic adaptation on War and Peace are also shortlisted in the new drama category.\nDownton Abbey mirrors Peaky Blinders by being nominated in the best actor, best actress and best drama series categories.\nJim Carter and Laura Carmichael are singled out for their Downton contribution, while Helen McCrory is cited for her work in Blinders.\nDoctor Foster's Suranne Jones and Happy Valley's Sarah Lancashire complete the line-up in the best actress category.\nEastEnders and Emmerdale lead the field overall with five nominations apiece in the four soap-oriented categories.\nMary Berry makes two appearances in the shortlist, with The Great British Bake Off up for best talent show and her BBC Two series on Foolproof Cooking nominated in the food show category.\nThe best daytime show category, meanwhile, sees one BBC show - teatime quiz Pointless - ranged against three ITV offerings - The Chase, Loose Women and This Morning.\nAll the nominations for this year's awards, to be hosted in London by comedian Jo Brand, can be found on the TV Choice magazine's official website.\nTL;DR: \nROLE 1: Given the text: \u00a0BBC are bringing in their best actors to the award ceremony tomorrow for Downton Abbey!\nWhat else do you think of this year's drama nominees?\n[\u00a0twitter \u00a0or email for more feedback on BBC announcements]\u00a0\nBBC nominations for 2016 are available here\nReconstruct the summarized text to the detailed:\nROLE 2:  2015\nThe Downton Abbey nominated actors received a total of 3 nominations", "rollout/best_game/overall_score": -76.27356024739989, "rollout/best_game/accuracy (r2)": 0.03787135537900211, "_this_batch_num_rl_token": 9981, "num_rl_rollout": 4, "lm_epoch": 0, "rl_epoch": 0, "step": 299, "total_data_token": 509516, "total_rl_token": 644066, "total_lm_token": 465986, "total_token": 1110052, "completed_steps": 299, "rollout/num_train_sample": 640, "timestamp": 1771738162, "checkpoint_dir_name": null, "done": false, "training_iteration": 7, "trial_id": "80c653c2", "date": "2026-02-22_13-29-22", "time_this_iter_s": 87.06702327728271, "time_total_s": 389.3747806549072, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 389.3747806549072, "iterations_since_restore": 7}
-{"rl_info/A2G": -0.00043263801489956677, "rl_info/entropy": 2.9620649814605713, "rl_info/total_token": 1933.0, "rl_info/advantage_b4_norm": -923.6953735351562, "rl_info/advantage_after_gnorm": -0.24334201216697693, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.04296759516000748, "train/lm_loss": 7.514994144439697, "train/total_loss": 7.557961940765381, "gigaword/rouge1": 0.016730989188071827, "gigaword/rouge2": 0.0032434759094886667, "gigaword/rougeL": 0.01659685581003064, "gigaword/rougeLsum": 0.015617369374122668, "gigaword/bertscore_precision": 0.5707477009296418, "gigaword/bertscore_recall": 0.6495094771683216, "gigaword/bertscore_f1": 0.6069053772091866, "cnndm/rouge1": 0.12614371312944914, "cnndm/rouge2": 0.03257506649175413, "cnndm/rougeL": 0.08882848462652783, "cnndm/rougeLsum": 0.10644552153229275, "cnndm/bertscore_precision": 0.6454352537790934, "cnndm/bertscore_recall": 0.7160761306683222, "cnndm/bertscore_f1": 0.6773408701022466, "samsum/rouge1": 0.06207299630819823, "samsum/rouge2": 0.0154618452694494, "samsum/rougeL": 0.054258291351671685, "samsum/rougeLsum": 0.04279134623156156, "samsum/bertscore_precision": 0.6156696031490961, "samsum/bertscore_recall": 0.6772823880116144, "samsum/bertscore_f1": 0.6441985418399175, "xsum/rouge1": 0.10168705430411755, "xsum/rouge2": 0.017805028233927562, "xsum/rougeL": 0.08382731508212986, "xsum/rougeLsum": 0.07806522265993317, "xsum/bertscore_precision": 0.6948461631933848, "xsum/bertscore_recall": 0.7072543054819107, "xsum/bertscore_f1": 0.6975390811761221, "eval_agg/avg_all_rougef": 0.053884410968920424, "eval_agg/avg_all_bertf": 0.6564959675818682, "eval_agg/avg_all": 0.3551901892753943, "num_rl_rollout": 4, "lm_epoch": 0, "rl_epoch": 0, "step": 300, "total_data_token": 511273, "total_rl_token": 645999, "total_lm_token": 467743, "total_token": 1113742, "completed_steps": 300, "tune_objective": 0.7946733870213365, "timestamp": 1771738171, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 8, "trial_id": "80c653c2", "date": "2026-02-22_13-29-31", "time_this_iter_s": 9.217209577560425, "time_total_s": 398.59199023246765, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 398.59199023246765, "iterations_since_restore": 8}
-{"rl_info/A2G": 0.000445768324425444, "rl_info/entropy": 3.070087432861328, "rl_info/total_token": 2515.0, "rl_info/advantage_b4_norm": -1792.0582275390625, "rl_info/advantage_after_gnorm": -0.21596713364124298, "rl_info/kl_w_ref": 0.0, "train/rl_loss": -0.044883839786052704, "train/lm_loss": 6.641994476318359, "train/total_loss": 6.597110748291016, "gigaword/rouge1": 0.018134485205981123, "gigaword/rouge2": 0.003243615805761912, "gigaword/rougeL": 0.017606685981479138, "gigaword/rougeLsum": 0.016679147660478497, "gigaword/bertscore_precision": 0.5689184932410717, "gigaword/bertscore_recall": 0.6535112462937832, "gigaword/bertscore_f1": 0.6074343921244144, "cnndm/rouge1": 0.1058129743257844, "cnndm/rouge2": 0.032465359942835025, "cnndm/rougeL": 0.0837551201208987, "cnndm/rougeLsum": 0.090762164703598, "cnndm/bertscore_precision": 0.6625458151102066, "cnndm/bertscore_recall": 0.7181179225444794, "cnndm/bertscore_f1": 0.6885107904672623, "samsum/rouge1": 0.06738821693329904, "samsum/rouge2": 0.015498746806957273, "samsum/rougeL": 0.060854471439504176, "samsum/rougeLsum": 0.04717981868014782, "samsum/bertscore_precision": 0.6330159058173498, "samsum/bertscore_recall": 0.6842034806807836, "samsum/bertscore_f1": 0.6572881489992142, "xsum/rouge1": 0.07224632429669989, "xsum/rouge2": 0.013741338084194083, "xsum/rougeL": 0.06209962606555427, "xsum/rougeLsum": 0.05663882343229595, "xsum/bertscore_precision": 0.662792757153511, "xsum/bertscore_recall": 0.7013147721687952, "xsum/bertscore_f1": 0.6775022198756536, "eval_agg/avg_all_rougef": 0.04775668246784183, "eval_agg/avg_all_bertf": 0.6576838878666361, "eval_agg/avg_all": 0.35272028516723897, "num_rl_rollout": 4, "lm_epoch": 0, "rl_epoch": 0, "step": 400, "total_data_token": 668982, "total_rl_token": 861927, "total_lm_token": 625452, "total_token": 1487379, "completed_steps": 400, "tune_objective": 0.7512728053913158, "timestamp": 1771738219, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 9, "trial_id": "80c653c2", "date": "2026-02-22_13-30-20", "time_this_iter_s": 48.25542688369751, "time_total_s": 446.84741711616516, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 446.84741711616516, "iterations_since_restore": 9}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 407.28125, "rollout/std_q1_length": 205.1195526123047, "rollout/gen/avg_score": -955.7808837890625, "rollout/gen/std_score": 2284.4404296875, "rollout/gen/avg_r1_length": 185.4296875, "rollout/gen/std_r1_length": 171.07997131347656, "rollout/gen/avg_r1_score": -210.0966339111328, "rollout/gen/std_r1_score": 204.57003784179688, "rollout/gen/avg_r1_accuracy": 0.16451680660247803, "rollout/gen/std_r1_accuracy": 0.03983278200030327, "rollout/gen/avg_r2_length": 345.576171875, "rollout/gen/std_r2_length": 181.7309112548828, "rollout/gen/avg_r2_score": -1142.201904296875, "rollout/gen/std_r2_score": 2518.21044921875, "rollout/gen/avg_r2_accuracy": 0.12622159719467163, "rollout/gen/std_r2_accuracy": 0.05890792980790138, "rollout/best_game/query_1": "The owners of the farm in the Meenavalley area are blaming what they call \"animal liberation terrorists\".\nThe farm held approximately 33,000 mink but most did not leave the perimeter.\nHowever, it is estimated that up to 5,000 of the animals may have made it into the wild.\nMany have since been killed by motorists.\nEimear Magee, a conservation ranger for the area, said she was \"extremely alarmed\" when she heard about the incident.\nShe said the animals were a major threat to bird colonies.\n\"The mink don't just take enough to feed themselves, they destroy the whole colony. Once that happens to a tern colony they'll move. They won't get a chance to breed successfully again that year,\" she said.\nAmerican mink were first brought to Ireland in the 1950s to be farmed for their fur. Ms Magee said that since then, many mink have escaped into the wild or have been intentionally released.\n\"They breed quite prolificly. They have no natural predators here. There's not a waterway in Ireland that doesn't have a mink population.\"\nShe said the animals have had a devastating impact on water fowl which are not very good fliers and tend to spend a lot of time on the ground. Populations of moorhen and coot have been seriously damaged by feral mink.\nThe Irish National Parks and Wildlife Service, local gun clubs and staff from a neighbouring mink farm are assisting with the effort to recapture the runaways.\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Irish mink are breeding in Irish State parks\nThis article was produced under cooperation with The Sunday Irish Times.\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": "\u00a0 Ireland State parks |\u00a0 The Sunday Irish Times Online website - Irish State Parks", "rollout/best_game/whole_game": "ROLE 0: The owners of the farm in the Meenavalley area are blaming what they call \"animal liberation terrorists\".\nThe farm held approximately 33,000 mink but most did not leave the perimeter.\nHowever, it is estimated that up to 5,000 of the animals may have made it into the wild.\nMany have since been killed by motorists.\nEimear Magee, a conservation ranger for the area, said she was \"extremely alarmed\" when she heard about the incident.\nShe said the animals were a major threat to bird colonies.\n\"The mink don't just take enough to feed themselves, they destroy the whole colony. Once that happens to a tern colony they'll move. They won't get a chance to breed successfully again that year,\" she said.\nAmerican mink were first brought to Ireland in the 1950s to be farmed for their fur. Ms Magee said that since then, many mink have escaped into the wild or have been intentionally released.\n\"They breed quite prolificly. They have no natural predators here. There's not a waterway in Ireland that doesn't have a mink population.\"\nShe said the animals have had a devastating impact on water fowl which are not very good fliers and tend to spend a lot of time on the ground. Populations of moorhen and coot have been seriously damaged by feral mink.\nThe Irish National Parks and Wildlife Service, local gun clubs and staff from a neighbouring mink farm are assisting with the effort to recapture the runaways.\nTL;DR: \nROLE 1: Given the text: \u00a0Irish mink are breeding in Irish State parks\nThis article was produced under cooperation with The Sunday Irish Times.\nReconstruct the summarized text to the detailed:\nROLE 2: \u00a0 Ireland State parks |\u00a0 The Sunday Irish Times Online website - Irish State Parks", "rollout/best_game/overall_score": -44.67503404729189, "rollout/best_game/accuracy (r2)": 0.025348542458808618, "_this_batch_num_rl_token": 13033, "num_rl_rollout": 5, "lm_epoch": 0, "rl_epoch": 0, "step": 401, "total_data_token": 683101, "total_rl_token": 862415, "total_lm_token": 626538, "total_token": 1488953, "completed_steps": 401, "rollout/num_train_sample": 640, "timestamp": 1771738269, "checkpoint_dir_name": null, "done": false, "training_iteration": 10, "trial_id": "80c653c2", "date": "2026-02-22_13-31-09", "time_this_iter_s": 49.04569149017334, "time_total_s": 495.8931086063385, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 495.8931086063385, "iterations_since_restore": 10}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 396.15625, "rollout/std_q1_length": 226.465087890625, "rollout/gen/avg_score": -925.3740234375, "rollout/gen/std_score": 2227.693603515625, "rollout/gen/avg_r1_length": 179.046875, "rollout/gen/std_r1_length": 168.98678588867188, "rollout/gen/avg_r1_score": -205.41366577148438, "rollout/gen/std_r1_score": 207.46783447265625, "rollout/gen/avg_r1_accuracy": 0.15420088171958923, "rollout/gen/std_r1_accuracy": 0.04921223223209381, "rollout/gen/avg_r2_length": 352.822265625, "rollout/gen/std_r2_length": 176.20263671875, "rollout/gen/avg_r2_score": -1105.3641357421875, "rollout/gen/std_r2_score": 2456.156494140625, "rollout/gen/avg_r2_accuracy": 0.11757387965917587, "rollout/gen/std_r2_accuracy": 0.06027965992689133, "rollout/best_game/query_1": "Controversial: Illusionist Derren Brown has denied reports that his next trick will be to turn a gay man straight . Television illusionist Derren Brown has denied reports that he will turn a gay man straight and make a straight man attracted to men for his latest stunt. The controversial star has made headlines by apparently playing Russian roulette live on television, by 'talking to the dead' in a staged live seance, hypnotising ordinary people to rob a security van at gunpoint and convincing a man the world had been taken over by zombies. But he has said that won't be courting more contention by using mind control to change people's sexuality. Brown, 41, was quoted in The Sun as saying: 'I was thinking about this the other day \u2014 it would be interesting wouldn\u2019t it? To take a gay guy and make him straight and a straight guy and make him gay.' But he took to Twitter this evening to deny reports that he will use the concept as the basis of his next show. He tweeted: 'Article in @thesunnewspaper saying that my next show is about changing sexuality: total rubbish. 'Thanks for that, being printed everywhere as fact.' Whatever his next stunt, Brown, who came out as gay in a newspaper interview in 2008, will be hoping that he will see a repeat of the ratings success of his last television special, Apocalypse. The show, which saw him convince Steven Brosnan that the world had been hit by a meteor shower and been taken over by zombies, attracted around 2.3million viewers - making it one of the highest rated Channel 4 shows over the past two years. Brown has never been far from controversy throughout his television career. He was criticised by police after playing Russian Roulette on Channel 4 watched by more than three million viewers in 2003. In 2004 his show Seance, where he brought students from Roehampton University together at Eton Hall, east London, for a seance claiming that a dozen people had killed themselves there in a suicide pact in 1974. Making headlines: Derren Brown has previously played Russian Roulette and performed a seance live on television . The programme received more than 700 complaints, mostly made prior to it airing. Several complaints were also made to regulatory body Ofcom in 2010 following his television special Derren Brown: Hero at 33,000 Feet. The complaints surrounded a scene which saw a man restrained by a straitjacket laying on a railway line. But he said in an interview with Gay Times magazine that making headlines has never been his aim. 'Controversy has never interested me for its own sake. It's always been about doing stuff that feels dramatic,' he said. The illusionist also told the magazine that he has no intention of marrying his long term partner despite proposed changes in the law to allow same sex couples to wed.\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Disaster, hoaxer, villain.\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " The words which I used to describe \"The Dark Web\" and \"The CIA\" are the same words from one text, another word (and so on.) So, these are the words, and so I repeat (and maybe just repeat, too):", "rollout/best_game/whole_game": "ROLE 0: Controversial: Illusionist Derren Brown has denied reports that his next trick will be to turn a gay man straight . Television illusionist Derren Brown has denied reports that he will turn a gay man straight and make a straight man attracted to men for his latest stunt. The controversial star has made headlines by apparently playing Russian roulette live on television, by 'talking to the dead' in a staged live seance, hypnotising ordinary people to rob a security van at gunpoint and convincing a man the world had been taken over by zombies. But he has said that won't be courting more contention by using mind control to change people's sexuality. Brown, 41, was quoted in The Sun as saying: 'I was thinking about this the other day \u2014 it would be interesting wouldn\u2019t it? To take a gay guy and make him straight and a straight guy and make him gay.' But he took to Twitter this evening to deny reports that he will use the concept as the basis of his next show. He tweeted: 'Article in @thesunnewspaper saying that my next show is about changing sexuality: total rubbish. 'Thanks for that, being printed everywhere as fact.' Whatever his next stunt, Brown, who came out as gay in a newspaper interview in 2008, will be hoping that he will see a repeat of the ratings success of his last television special, Apocalypse. The show, which saw him convince Steven Brosnan that the world had been hit by a meteor shower and been taken over by zombies, attracted around 2.3million viewers - making it one of the highest rated Channel 4 shows over the past two years. Brown has never been far from controversy throughout his television career. He was criticised by police after playing Russian Roulette on Channel 4 watched by more than three million viewers in 2003. In 2004 his show Seance, where he brought students from Roehampton University together at Eton Hall, east London, for a seance claiming that a dozen people had killed themselves there in a suicide pact in 1974. Making headlines: Derren Brown has previously played Russian Roulette and performed a seance live on television . The programme received more than 700 complaints, mostly made prior to it airing. Several complaints were also made to regulatory body Ofcom in 2010 following his television special Derren Brown: Hero at 33,000 Feet. The complaints surrounded a scene which saw a man restrained by a straitjacket laying on a railway line. But he said in an interview with Gay Times magazine that making headlines has never been his aim. 'Controversy has never interested me for its own sake. It's always been about doing stuff that feels dramatic,' he said. The illusionist also told the magazine that he has no intention of marrying his long term partner despite proposed changes in the law to allow same sex couples to wed.\nTL;DR: \nROLE 1: Given the text: \u00a0Disaster, hoaxer, villain.\nReconstruct the summarized text to the detailed:\nROLE 2:  The words which I used to describe \"The Dark Web\" and \"The CIA\" are the same words from one text, another word (and so on.) So, these are the words, and so I repeat (and maybe just repeat, too):", "rollout/best_game/overall_score": -65.29966096244273, "rollout/best_game/accuracy (r2)": 0.03689567430025446, "_this_batch_num_rl_token": 12677, "num_rl_rollout": 6, "lm_epoch": 0, "rl_epoch": 0, "step": 495, "total_data_token": 848359, "total_rl_token": 1063085, "total_lm_token": 779119, "total_token": 1842204, "completed_steps": 495, "rollout/num_train_sample": 640, "timestamp": 1771738354, "checkpoint_dir_name": null, "done": false, "training_iteration": 11, "trial_id": "80c653c2", "date": "2026-02-22_13-32-34", "time_this_iter_s": 85.3555269241333, "time_total_s": 581.2486355304718, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 581.2486355304718, "iterations_since_restore": 11}
-{"rl_info/A2G": -0.0008666236535646021, "rl_info/entropy": 3.104630708694458, "rl_info/total_token": 1859.0, "rl_info/advantage_b4_norm": -650.8872680664062, "rl_info/advantage_after_gnorm": 0.043550144881010056, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.0863519012928009, "train/lm_loss": 7.862515926361084, "train/total_loss": 7.9488677978515625, "gigaword/rouge1": 0.017573086200153378, "gigaword/rouge2": 0.002960760362256928, "gigaword/rougeL": 0.017293791871071985, "gigaword/rougeLsum": 0.01650867453933542, "gigaword/bertscore_precision": 0.5665487916767598, "gigaword/bertscore_recall": 0.6491950565576553, "gigaword/bertscore_f1": 0.604255186021328, "cnndm/rouge1": 0.1612345056647559, "cnndm/rouge2": 0.06682508424651264, "cnndm/rougeL": 0.13493700133429276, "cnndm/rougeLsum": 0.14153337249260978, "cnndm/bertscore_precision": 0.686937689781189, "cnndm/bertscore_recall": 0.7609556217988332, "cnndm/bertscore_f1": 0.7209661652644476, "samsum/rouge1": 0.0660791106390257, "samsum/rouge2": 0.01476257796853942, "samsum/rougeL": 0.058301392513384696, "samsum/rougeLsum": 0.04588821911670649, "samsum/bertscore_precision": 0.6225204616785049, "samsum/bertscore_recall": 0.686848446726799, "samsum/bertscore_f1": 0.6523814002672831, "xsum/rouge1": 0.13229334644937613, "xsum/rouge2": 0.04250236285881873, "xsum/rougeL": 0.11032072178709629, "xsum/rougeLsum": 0.10253025365398856, "xsum/bertscore_precision": 0.7124084085226059, "xsum/bertscore_recall": 0.7174938370784124, "xsum/bertscore_f1": 0.7111496527989706, "eval_agg/avg_all_rougef": 0.0707215163561203, "eval_agg/avg_all_bertf": 0.6721881010880073, "eval_agg/avg_all": 0.3714548087220638, "num_rl_rollout": 6, "lm_epoch": 0, "rl_epoch": 0, "step": 500, "total_data_token": 856126, "total_rl_token": 1073688, "total_lm_token": 786886, "total_token": 1860574, "completed_steps": 500, "tune_objective": 0.8807312673358331, "timestamp": 1771738365, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 12, "trial_id": "80c653c2", "date": "2026-02-22_13-32-45", "time_this_iter_s": 10.720883131027222, "time_total_s": 591.969518661499, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 591.969518661499, "iterations_since_restore": 12}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 337.1875, "rollout/std_q1_length": 233.89254760742188, "rollout/gen/avg_score": -1434.327880859375, "rollout/gen/std_score": 2987.498291015625, "rollout/gen/avg_r1_length": 175.1484375, "rollout/gen/std_r1_length": 175.53863525390625, "rollout/gen/avg_r1_score": -507.8160705566406, "rollout/gen/std_r1_score": 1724.4417724609375, "rollout/gen/avg_r1_accuracy": 0.14418037235736847, "rollout/gen/std_r1_accuracy": 0.053685106337070465, "rollout/gen/avg_r2_length": 364.4296875, "rollout/gen/std_r2_length": 176.17855834960938, "rollout/gen/avg_r2_score": -1665.9556884765625, "rollout/gen/std_r2_score": 3186.37060546875, "rollout/gen/avg_r2_accuracy": 0.11333893239498138, "rollout/gen/std_r2_accuracy": 0.06111219525337219, "rollout/best_game/query_1": "Misty: <file_video>\nMisty: the FORBIDDEN Independence March led by hundred-percenters\nBen: oh shiet\nBen: there are like thousands of people there :o\nRebecca: oh my....it looks dangerous\nMisty: the best thing is that there are two marches\nMisty: the first one led by the government and this one which began like 20min after it\nBen: I've just seen the instastory of my friend and she's there right now\nBen: she's on a wheelchair...I admire her bravery\nMisty: for me any kind of a march is ok, but if you do it on purpose and if you do it right\nMisty: unfortunately the majority of these people don't even understand the slogans and quotes they're chanting\nBen: yeah...a bald guy chanting \"God, Honor, Homeland\" who probably doesn't even go to church on Sunday\nRebecca: I'd never take part in such a thing, it's too dangerous, how is it even possible for these people to see anything if they're walking through the red smoke????????\nBen: well, they're going in one direction so maybe they just don't give a fuck and move forward\nRebecca: probably...\nMisty: I'm curious how many people this year will be injured after this march is over...they sometimes trample each other hairlessly\nBen: I hope that nobody. But as we all know the truth is different...we will see\nBen: btw, i there any shop open on Monday? \nMisty: I'm afraid you gotta wait until Tuesday, but it depends on what you need actually\nBen: I don't know, maybe some beer, chips or whatever\nRebecca: I recommend going to the petrol station\nBen: oh, you're right\nBen: girls, maybe some beer today?\nMisty: I'm home, maybe on Wednesday\nRebecca: unfortunately I'm home too :/ but I'd opt for Wednesday as well\nBen: ok, we're in touch then\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Here's a list of people to wear for us this year:\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " Please let them know that your text was copied, modified, or removed as appropriate.\u00a0 Make any comments about how you went about this.\nAnd for reference:", "rollout/best_game/whole_game": "ROLE 0: Misty: <file_video>\nMisty: the FORBIDDEN Independence March led by hundred-percenters\nBen: oh shiet\nBen: there are like thousands of people there :o\nRebecca: oh my....it looks dangerous\nMisty: the best thing is that there are two marches\nMisty: the first one led by the government and this one which began like 20min after it\nBen: I've just seen the instastory of my friend and she's there right now\nBen: she's on a wheelchair...I admire her bravery\nMisty: for me any kind of a march is ok, but if you do it on purpose and if you do it right\nMisty: unfortunately the majority of these people don't even understand the slogans and quotes they're chanting\nBen: yeah...a bald guy chanting \"God, Honor, Homeland\" who probably doesn't even go to church on Sunday\nRebecca: I'd never take part in such a thing, it's too dangerous, how is it even possible for these people to see anything if they're walking through the red smoke????????\nBen: well, they're going in one direction so maybe they just don't give a fuck and move forward\nRebecca: probably...\nMisty: I'm curious how many people this year will be injured after this march is over...they sometimes trample each other hairlessly\nBen: I hope that nobody. But as we all know the truth is different...we will see\nBen: btw, i there any shop open on Monday? \nMisty: I'm afraid you gotta wait until Tuesday, but it depends on what you need actually\nBen: I don't know, maybe some beer, chips or whatever\nRebecca: I recommend going to the petrol station\nBen: oh, you're right\nBen: girls, maybe some beer today?\nMisty: I'm home, maybe on Wednesday\nRebecca: unfortunately I'm home too :/ but I'd opt for Wednesday as well\nBen: ok, we're in touch then\nTL;DR: \nROLE 1: Given the text: \u00a0Here's a list of people to wear for us this year:\nReconstruct the summarized text to the detailed:\nROLE 2:  Please let them know that your text was copied, modified, or removed as appropriate.\u00a0 Make any comments about how you went about this.\nAnd for reference:", "rollout/best_game/overall_score": -52.549617386780426, "rollout/best_game/accuracy (r2)": 0.028735632183908042, "_this_batch_num_rl_token": 10790, "num_rl_rollout": 7, "lm_epoch": 0, "rl_epoch": 0, "step": 590, "total_data_token": 1010221, "total_rl_token": 1266648, "total_lm_token": 930191, "total_token": 2196839, "completed_steps": 590, "rollout/num_train_sample": 640, "timestamp": 1771738449, "checkpoint_dir_name": null, "done": false, "training_iteration": 13, "trial_id": "80c653c2", "date": "2026-02-22_13-34-09", "time_this_iter_s": 84.30660510063171, "time_total_s": 676.2761237621307, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 676.2761237621307, "iterations_since_restore": 13}
-{"rl_info/A2G": 0.0005464772111736238, "rl_info/entropy": 3.0745418071746826, "rl_info/total_token": 2092.0, "rl_info/advantage_b4_norm": -726.8453979492188, "rl_info/advantage_after_gnorm": 0.4029342532157898, "rl_info/kl_w_ref": 0.0, "train/rl_loss": -0.05495517700910568, "train/lm_loss": 7.029818534851074, "train/total_loss": 6.974863529205322, "gigaword/rouge1": 0.016229552364416998, "gigaword/rouge2": 0.003083286787264139, "gigaword/rougeL": 0.016120863115521564, "gigaword/rougeLsum": 0.015164334295002017, "gigaword/bertscore_precision": 0.5721275091171265, "gigaword/bertscore_recall": 0.6504069171845913, "gigaword/bertscore_f1": 0.6080123698711395, "cnndm/rouge1": 0.0986211055173068, "cnndm/rouge2": 0.02821382805787148, "cnndm/rougeL": 0.08572459694122653, "cnndm/rougeLsum": 0.08684094005943678, "cnndm/bertscore_precision": 0.6696244974931082, "cnndm/bertscore_recall": 0.7189286549886068, "cnndm/bertscore_f1": 0.6912944366534551, "samsum/rouge1": 0.0664216611109653, "samsum/rouge2": 0.015623054985404103, "samsum/rougeL": 0.05471870983951799, "samsum/rougeLsum": 0.046102124000495896, "samsum/bertscore_precision": 0.6373832523822784, "samsum/bertscore_recall": 0.6830787509679794, "samsum/bertscore_f1": 0.659131333231926, "xsum/rouge1": 0.10736891249330889, "xsum/rouge2": 0.01667271133892838, "xsum/rougeL": 0.08706305254170055, "xsum/rougeLsum": 0.08371026260354475, "xsum/bertscore_precision": 0.6957367062568665, "xsum/bertscore_recall": 0.7098958988984426, "xsum/bertscore_f1": 0.6992561866839727, "eval_agg/avg_all_rougef": 0.051729937253244505, "eval_agg/avg_all_bertf": 0.6644235816101234, "eval_agg/avg_all": 0.35807675943168393, "num_rl_rollout": 7, "lm_epoch": 0, "rl_epoch": 0, "step": 600, "total_data_token": 1025501, "total_rl_token": 1288656, "total_lm_token": 945471, "total_token": 2234127, "completed_steps": 600, "tune_objective": 0.7848321682639194, "timestamp": 1771738463, "checkpoint_dir_name": "checkpoint_000005", "should_checkpoint": true, "done": false, "training_iteration": 14, "trial_id": "80c653c2", "date": "2026-02-22_13-34-23", "time_this_iter_s": 14.056313276290894, "time_total_s": 690.3324370384216, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 690.3324370384216, "iterations_since_restore": 14}
-{"rl_info/A2G": 0.00010836026922333986, "rl_info/entropy": 3.078033924102783, "rl_info/total_token": 2475.0, "rl_info/advantage_b4_norm": -1974.4149169921875, "rl_info/advantage_after_gnorm": -0.31248408555984497, "rl_info/kl_w_ref": 0.0, "train/rl_loss": -0.011143829673528671, "train/lm_loss": 6.350021839141846, "train/total_loss": 6.338878154754639, "num_rl_rollout": 7, "lm_epoch": 0, "rl_epoch": 0, "step": 601, "total_data_token": 1026615, "total_rl_token": 1291131, "total_lm_token": 946585, "total_token": 2237716, "completed_steps": 601, "timestamp": 1771738464, "checkpoint_dir_name": null, "done": false, "training_iteration": 15, "trial_id": "80c653c2", "date": "2026-02-22_13-34-24", "time_this_iter_s": 0.5497317314147949, "time_total_s": 690.8821687698364, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 690.8821687698364, "iterations_since_restore": 15}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 355.40625, "rollout/std_q1_length": 242.0531463623047, "rollout/gen/avg_score": -1073.3428955078125, "rollout/gen/std_score": 2465.91552734375, "rollout/gen/avg_r1_length": 207.8828125, "rollout/gen/std_r1_length": 183.6314697265625, "rollout/gen/avg_r1_score": -391.49359130859375, "rollout/gen/std_r1_score": 1235.0821533203125, "rollout/gen/avg_r1_accuracy": 0.15199530124664307, "rollout/gen/std_r1_accuracy": 0.0465606227517128, "rollout/gen/avg_r2_length": 360.12109375, "rollout/gen/std_r2_length": 173.3202362060547, "rollout/gen/avg_r2_score": -1243.80517578125, "rollout/gen/std_r2_score": 2660.678466796875, "rollout/gen/avg_r2_accuracy": 0.11625359207391739, "rollout/gen/std_r2_accuracy": 0.0562685951590538, "rollout/best_game/query_1": "The centre, which was proposed by former rugby player and coach Ben Harvey, will allow elite athletes to train in sports from rugby to surfing.\nPlanners recommended it be rejected because of concerns about its community value and the loss of green space.\nBut the Planning Committee approved it on the condition it was made available long-term for everyone in the island.\nOnly 40% of the centre is earmarked for schools and sports groups to use while the rest will be a private members-only gym.\nHowever, Mr Harvey said it was aimed \"primarily at the children of the island\", with schools getting free access to the facility and to staff.\nIt will be built next to the rugby club in St Peter.\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Possible green space within the island. But construction is still two to three years away.\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " There are likely many more islands out there. How did your idea for the Island come about? Thanks to ks_bluespace . (Posted last week)", "rollout/best_game/whole_game": "ROLE 0: The centre, which was proposed by former rugby player and coach Ben Harvey, will allow elite athletes to train in sports from rugby to surfing.\nPlanners recommended it be rejected because of concerns about its community value and the loss of green space.\nBut the Planning Committee approved it on the condition it was made available long-term for everyone in the island.\nOnly 40% of the centre is earmarked for schools and sports groups to use while the rest will be a private members-only gym.\nHowever, Mr Harvey said it was aimed \"primarily at the children of the island\", with schools getting free access to the facility and to staff.\nIt will be built next to the rugby club in St Peter.\nTL;DR: \nROLE 1: Given the text: \u00a0Possible green space within the island. But construction is still two to three years away.\nReconstruct the summarized text to the detailed:\nROLE 2:  There are likely many more islands out there. How did your idea for the Island come about? Thanks to ks_bluespace . (Posted last week)", "rollout/best_game/overall_score": -56.91662983064849, "rollout/best_game/accuracy (r2)": 0.05411575959521164, "_this_batch_num_rl_token": 11373, "num_rl_rollout": 8, "lm_epoch": 0, "rl_epoch": 0, "step": 684, "total_data_token": 1170494, "total_rl_token": 1475655, "total_lm_token": 1079091, "total_token": 2554746, "completed_steps": 684, "rollout/num_train_sample": 640, "timestamp": 1771738548, "checkpoint_dir_name": null, "done": false, "training_iteration": 16, "trial_id": "80c653c2", "date": "2026-02-22_13-35-48", "time_this_iter_s": 83.53361582756042, "time_total_s": 774.4157845973969, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 774.4157845973969, "iterations_since_restore": 16}
-{"rl_info/A2G": 0.000144253543112427, "rl_info/entropy": 3.076401472091675, "rl_info/total_token": 2075.0, "rl_info/advantage_b4_norm": -374.17047119140625, "rl_info/advantage_after_gnorm": 0.7685792446136475, "rl_info/kl_w_ref": 0.0, "train/rl_loss": -0.014732994139194489, "train/lm_loss": 6.574279308319092, "train/total_loss": 6.55954647064209, "gigaword/rouge1": 0.01678159911781252, "gigaword/rouge2": 0.003521848705593743, "gigaword/rougeL": 0.016647694901104, "gigaword/rougeLsum": 0.015945450894018078, "gigaword/bertscore_precision": 0.5676357679069042, "gigaword/bertscore_recall": 0.6512336465716362, "gigaword/bertscore_f1": 0.6055430628359317, "cnndm/rouge1": 0.13956845426178713, "cnndm/rouge2": 0.05237083162784717, "cnndm/rougeL": 0.11535341108366752, "cnndm/rougeLsum": 0.12198777916874211, "cnndm/bertscore_precision": 0.675104558467865, "cnndm/bertscore_recall": 0.744004155198733, "cnndm/bertscore_f1": 0.7069717794656754, "samsum/rouge1": 0.06040965513276539, "samsum/rouge2": 0.012460499369291007, "samsum/rougeL": 0.05621209595928326, "samsum/rougeLsum": 0.04142731034037332, "samsum/bertscore_precision": 0.6329842408498129, "samsum/bertscore_recall": 0.6765935122966766, "samsum/bertscore_f1": 0.6536134531100591, "xsum/rouge1": 0.12155129625198115, "xsum/rouge2": 0.02767738059696496, "xsum/rougeL": 0.1006026926734953, "xsum/rougeLsum": 0.09795609173940084, "xsum/bertscore_precision": 0.7155728737513224, "xsum/bertscore_recall": 0.7222046852111816, "xsum/bertscore_f1": 0.715426449974378, "eval_agg/avg_all_rougef": 0.06252963073900797, "eval_agg/avg_all_bertf": 0.670388686346511, "eval_agg/avg_all": 0.3664591585427595, "num_rl_rollout": 8, "lm_epoch": 0, "rl_epoch": 0, "step": 700, "total_data_token": 1195583, "total_rl_token": 1506921, "total_lm_token": 1104180, "total_token": 2611101, "completed_steps": 700, "tune_objective": 0.8405323262840618, "timestamp": 1771738562, "checkpoint_dir_name": "checkpoint_000006", "should_checkpoint": true, "done": false, "training_iteration": 17, "trial_id": "80c653c2", "date": "2026-02-22_13-36-03", "time_this_iter_s": 14.997524976730347, "time_total_s": 789.4133095741272, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 789.4133095741272, "iterations_since_restore": 17}
-{"rollout/num_samples": 640, "rollout/avg_q1_length": 380.09375, "rollout/std_q1_length": 257.9590759277344, "rollout/gen/avg_score": -1046.4803466796875, "rollout/gen/std_score": 2414.06005859375, "rollout/gen/avg_r1_length": 209.1015625, "rollout/gen/std_r1_length": 177.32015991210938, "rollout/gen/avg_r1_score": -312.8531494140625, "rollout/gen/std_r1_score": 888.4923706054688, "rollout/gen/avg_r1_accuracy": 0.1560087352991104, "rollout/gen/std_r1_accuracy": 0.0509699210524559, "rollout/gen/avg_r2_length": 359.4140625, "rollout/gen/std_r2_length": 176.7113037109375, "rollout/gen/avg_r2_score": -1229.88720703125, "rollout/gen/std_r2_score": 2631.110595703125, "rollout/gen/avg_r2_accuracy": 0.12026350945234299, "rollout/gen/std_r2_accuracy": 0.06256274878978729, "rollout/best_game/query_1": "Sunderland were not alone in watching Virgil van Dijk of Celtic against Kilmarnock on Monday night. Tony Pulis had also sent scouts to watch the Holland Under 21 international as West Brom continue to assess some of the talent in the Scottish Premiership. Pulis wants another defender and has been linked with Robert Huth and Marc Wilson of former club Stoke City. Virgil van Dijk looks to get the better of Kilmarnock's Ross Barbour during the Bhoys 2-0 win . Van Dijk celebrates scoring in the Scottish Cup for Celtic against Hearts in November . The Dutchman would cost around \u00a38m though which would be a big chunk of Albion's budget. Van Dijk has been an ever present for the Bhoys so far this season, scoring six goals in 32 appearances. Celtic are preparing to set off for a week-long training camp in Gran Canaria and first-team coach John Kennedy said: 'Until something is firm and concrete it is just rumours. Virgil is a massive player for us, we don\u2019t want to lose him.'\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Merry Christmas and Virgil van Dijk seems to be flying up my queue\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " http://archive.is/8JqO\nHappy reading!\nPosted by", "rollout/best_game/whole_game": "ROLE 0: Sunderland were not alone in watching Virgil van Dijk of Celtic against Kilmarnock on Monday night. Tony Pulis had also sent scouts to watch the Holland Under 21 international as West Brom continue to assess some of the talent in the Scottish Premiership. Pulis wants another defender and has been linked with Robert Huth and Marc Wilson of former club Stoke City. Virgil van Dijk looks to get the better of Kilmarnock's Ross Barbour during the Bhoys 2-0 win . Van Dijk celebrates scoring in the Scottish Cup for Celtic against Hearts in November . The Dutchman would cost around \u00a38m though which would be a big chunk of Albion's budget. Van Dijk has been an ever present for the Bhoys so far this season, scoring six goals in 32 appearances. Celtic are preparing to set off for a week-long training camp in Gran Canaria and first-team coach John Kennedy said: 'Until something is firm and concrete it is just rumours. Virgil is a massive player for us, we don\u2019t want to lose him.'\nTL;DR: \nROLE 1: Given the text: \u00a0Merry Christmas and Virgil van Dijk seems to be flying up my queue\nReconstruct the summarized text to the detailed:\nROLE 2:  http://archive.is/8JqO\nHappy reading!\nPosted by", "rollout/best_game/overall_score": -40.938064601261416, "rollout/best_game/accuracy (r2)": 0.007168458781362006, "_this_batch_num_rl_token": 12163, "num_rl_rollout": 9, "lm_epoch": 0, "rl_epoch": 0, "step": 782, "total_data_token": 1332593, "total_rl_token": 1686646, "total_lm_token": 1229027, "total_token": 2915673, "completed_steps": 782, "rollout/num_train_sample": 640, "timestamp": 1771738646, "checkpoint_dir_name": null, "done": false, "training_iteration": 18, "trial_id": "80c653c2", "date": "2026-02-22_13-37-26", "time_this_iter_s": 83.12565183639526, "time_total_s": 872.5389614105225, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 872.5389614105225, "iterations_since_restore": 18}
-{"rl_info/A2G": -0.005676234140992165, "rl_info/entropy": 3.057483434677124, "rl_info/total_token": 2614.0, "rl_info/advantage_b4_norm": -880.7837524414062, "rl_info/advantage_after_gnorm": 0.10586999356746674, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.5673176646232605, "train/lm_loss": 6.9868316650390625, "train/total_loss": 7.554149150848389, "gigaword/rouge1": 0.01648265370509889, "gigaword/rouge2": 0.003114122584486642, "gigaword/rougeL": 0.01623665466445699, "gigaword/rougeLsum": 0.015218818192531007, "gigaword/bertscore_precision": 0.5688816042244434, "gigaword/bertscore_recall": 0.6476464053988457, "gigaword/bertscore_f1": 0.6049929152429104, "cnndm/rouge1": 0.1151758642706008, "cnndm/rouge2": 0.027972645528327917, "cnndm/rougeL": 0.09094039114803643, "cnndm/rougeLsum": 0.1007161089229668, "cnndm/bertscore_precision": 0.6520844499270121, "cnndm/bertscore_recall": 0.7017364849646887, "cnndm/bertscore_f1": 0.6746843506892523, "samsum/rouge1": 0.06453785980233263, "samsum/rouge2": 0.012460499369291007, "samsum/rougeL": 0.05808726790931062, "samsum/rougeLsum": 0.04289246588741428, "samsum/bertscore_precision": 0.6333132336537043, "samsum/bertscore_recall": 0.6905734539031982, "samsum/bertscore_f1": 0.6601707090934118, "xsum/rouge1": 0.08500695915982258, "xsum/rouge2": 0.019527094559701107, "xsum/rougeL": 0.06870201289179924, "xsum/rougeLsum": 0.0630639762731056, "xsum/bertscore_precision": 0.6792652855316798, "xsum/bertscore_recall": 0.7117783824602762, "xsum/bertscore_f1": 0.6909400274356207, "eval_agg/avg_all_rougef": 0.05000846217933015, "eval_agg/avg_all_bertf": 0.6576970006152988, "eval_agg/avg_all": 0.3538527313973145, "num_rl_rollout": 9, "lm_epoch": 0, "rl_epoch": 0, "step": 800, "total_data_token": 1358837, "total_rl_token": 1726542, "total_lm_token": 1255271, "total_token": 2981813, "completed_steps": 800, "tune_objective": 0.7711694550373273, "timestamp": 1771738662, "checkpoint_dir_name": "checkpoint_000007", "should_checkpoint": true, "done": true, "training_iteration": 19, "trial_id": "80c653c2", "date": "2026-02-22_13-37-42", "time_this_iter_s": 16.645416975021362, "time_total_s": 889.1843783855438, "pid": 3467488, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 1.5119336467641016, "len_pen": 1.0, "accuracy_w2": 0.49501595537331966, "len_pen2": 1.0, "threshold": 0.019578897201213006, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 889.1843783855438, "iterations_since_restore": 19}
+{"xsum/rouge1": 0.10035712986483537, "xsum/rouge2": 0.018761301067129577, "xsum/rougeL": 0.08373656314284877, "xsum/rougeLsum": 0.08393338429555226, "xsum/bertscore_precision": 0.6984640409549078, "xsum/bertscore_recall": 0.7191349615653356, "xsum/bertscore_f1": 0.7044563740491867, "gigaword/rouge1": 0.016626713906933873, "gigaword/rouge2": 0.0034357494717621785, "gigaword/rougeL": 0.01630322037591579, "gigaword/rougeLsum": 0.015539180130949517, "gigaword/bertscore_precision": 0.5684015303850174, "gigaword/bertscore_recall": 0.6505913439393044, "gigaword/bertscore_f1": 0.6057714138925075, "cnndm/rouge1": 0.13016523589045112, "cnndm/rouge2": 0.041991571069563326, "cnndm/rougeL": 0.10993746712966011, "cnndm/rougeLsum": 0.10715990114906183, "cnndm/bertscore_precision": 0.681688129901886, "cnndm/bertscore_recall": 0.7389524877071381, "cnndm/bertscore_f1": 0.7080690910418829, "samsum/rouge1": 0.055488210861802195, "samsum/rouge2": 0.0148884485738399, "samsum/rougeL": 0.041491939300558646, "samsum/rougeLsum": 0.042507458030167145, "samsum/bertscore_precision": 0.6166475166877111, "samsum/bertscore_recall": 0.6834982534249624, "samsum/bertscore_f1": 0.6476827810208002, "eval_agg/avg_all_rougef": 0.05514521714131448, "eval_agg/avg_all_bertf": 0.6664949150010944, "eval_agg/avg_all": 0.3608200660712044, "num_rl_rollout": 0, "lm_epoch": 0, "rl_epoch": 0, "step": 0, "total_data_token": 0, "total_rl_token": 0, "total_lm_token": 0, "total_token": 0, "completed_steps": 0, "tune_objective": 0.8010626381450561, "timestamp": 1771738707, "checkpoint_dir_name": null, "done": false, "training_iteration": 1, "trial_id": "24c8f244", "date": "2026-02-22_13-38-27", "time_this_iter_s": 58.30225729942322, "time_total_s": 58.30225729942322, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 58.30225729942322, "iterations_since_restore": 1}
+{"rollout/num_samples": 640, "rollout/avg_q1_length": 354.84375, "rollout/std_q1_length": 194.7317352294922, "rollout/gen/avg_score": -810.1735229492188, "rollout/gen/std_score": 1905.1258544921875, "rollout/gen/avg_r1_length": 207.1796875, "rollout/gen/std_r1_length": 179.3109588623047, "rollout/gen/avg_r1_score": -231.2695770263672, "rollout/gen/std_r1_score": 214.47215270996094, "rollout/gen/avg_r1_accuracy": 0.15656834840774536, "rollout/gen/std_r1_accuracy": 0.04505638778209686, "rollout/gen/avg_r2_length": 363.19140625, "rollout/gen/std_r2_length": 170.4690399169922, "rollout/gen/avg_r2_score": -954.8995361328125, "rollout/gen/std_r2_score": 2102.924560546875, "rollout/gen/avg_r2_accuracy": 0.1220901608467102, "rollout/gen/std_r2_accuracy": 0.05740785971283913, "rollout/best_game/query_1": "Kitty: I think Barbara is a bit angry about the holidays\nSharon: Really? Why? I didn't see that yesterday\nKitty: I know, she won't tell anything but she's disappointed\nKitty: But I just don't want to go there\nSharon: That's your choice, you have a right to go wherever you want\nKitty: I guess she thinks I don't want to go with her and it's not the case\nKitty: I just prefer to go for two weeks for the same money\nSharon: Don't feel guilty, maybe she's disappointed but I'm sure she can understand\nSharon: I think the problem is that Lucy set her up before and she thought she would convince you\nSharon: Where are you going finally?\nKitty: Not sure, I'll have to choose between Greece and Spain\nKitty: I have to book next week before the prices get too high\nTL;DR: ", "rollout/best_game/query_2": "Given the text: ????? for 2 weeks in that time frame\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " ????? for 2 weeks in that time frame\nRemember the table after each update: ????? for 2 weeks in that time frame\nThe post-script of the post-script is exactly as described at: ????? for 2 weeks in that time frame", "rollout/best_game/whole_game": "ROLE 0: Kitty: I think Barbara is a bit angry about the holidays\nSharon: Really? Why? I didn't see that yesterday\nKitty: I know, she won't tell anything but she's disappointed\nKitty: But I just don't want to go there\nSharon: That's your choice, you have a right to go wherever you want\nKitty: I guess she thinks I don't want to go with her and it's not the case\nKitty: I just prefer to go for two weeks for the same money\nSharon: Don't feel guilty, maybe she's disappointed but I'm sure she can understand\nSharon: I think the problem is that Lucy set her up before and she thought she would convince you\nSharon: Where are you going finally?\nKitty: Not sure, I'll have to choose between Greece and Spain\nKitty: I have to book next week before the prices get too high\nTL;DR: \nROLE 1: Given the text: ????? for 2 weeks in that time frame\nReconstruct the summarized text to the detailed:\nROLE 2:  ????? for 2 weeks in that time frame\nRemember the table after each update: ????? for 2 weeks in that time frame\nThe post-script of the post-script is exactly as described at: ????? for 2 weeks in that time frame", "rollout/best_game/overall_score": -63.72371631784661, "rollout/best_game/accuracy (r2)": 0.06563039723661486, "_this_batch_num_rl_token": 11355, "num_rl_rollout": 1, "lm_epoch": 0, "rl_epoch": 0, "step": 0, "total_data_token": 11355, "total_rl_token": 0, "total_lm_token": 0, "total_token": 0, "completed_steps": 0, "rollout/num_train_sample": 640, "timestamp": 1771738755, "checkpoint_dir_name": null, "done": false, "training_iteration": 2, "trial_id": "24c8f244", "date": "2026-02-22_13-39-15", "time_this_iter_s": 48.52219104766846, "time_total_s": 106.82444834709167, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 106.82444834709167, "iterations_since_restore": 2}
+{"rl_info/A2G": -0.002909204224124551, "rl_info/entropy": 2.936209201812744, "rl_info/total_token": 1354.0, "rl_info/advantage_b4_norm": -487.5804748535156, "rl_info/advantage_after_gnorm": -0.42413362860679626, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.2906268239021301, "train/lm_loss": 6.7540283203125, "train/total_loss": 7.0446553230285645, "xsum/rouge1": 0.1162274661322621, "xsum/rouge2": 0.027474360291697753, "xsum/rougeL": 0.10013578333781037, "xsum/rougeLsum": 0.09537387857590561, "xsum/bertscore_precision": 0.7213796228170395, "xsum/bertscore_recall": 0.7238291104634603, "xsum/bertscore_f1": 0.7195459554592768, "gigaword/rouge1": 0.017656487537269245, "gigaword/rouge2": 0.0033937626233792734, "gigaword/rougeL": 0.017515325311752938, "gigaword/rougeLsum": 0.01647710938016997, "gigaword/bertscore_precision": 0.5710781536996364, "gigaword/bertscore_recall": 0.6542460723221302, "gigaword/bertscore_f1": 0.6090571476519108, "cnndm/rouge1": 0.12513595904184585, "cnndm/rouge2": 0.04656551254165275, "cnndm/rougeL": 0.10061663483623491, "cnndm/rougeLsum": 0.11239511389841987, "cnndm/bertscore_precision": 0.6577402402957281, "cnndm/bertscore_recall": 0.7267044087251028, "cnndm/bertscore_f1": 0.6897566119829813, "samsum/rouge1": 0.06625883001877467, "samsum/rouge2": 0.013417548569107915, "samsum/rougeL": 0.05946395867667662, "samsum/rougeLsum": 0.045040602701500544, "samsum/bertscore_precision": 0.6321127961079279, "samsum/bertscore_recall": 0.6891191999117533, "samsum/bertscore_f1": 0.6590451846520106, "eval_agg/avg_all_rougef": 0.060196770842153774, "eval_agg/avg_all_bertf": 0.6693512249365449, "eval_agg/avg_all": 0.36477399788934933, "num_rl_rollout": 1, "lm_epoch": 0, "rl_epoch": 0, "step": 100, "total_data_token": 172181, "total_rl_token": 212473, "total_lm_token": 160826, "total_token": 373299, "completed_steps": 100, "tune_objective": 0.8247091932164498, "timestamp": 1771738804, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "24c8f244", "date": "2026-02-22_13-40-05", "time_this_iter_s": 49.212021589279175, "time_total_s": 156.03646993637085, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 156.03646993637085, "iterations_since_restore": 3}
+{"rollout/num_samples": 640, "rollout/avg_q1_length": 348.9375, "rollout/std_q1_length": 203.6563262939453, "rollout/gen/avg_score": -1027.9443359375, "rollout/gen/std_score": 2328.249267578125, "rollout/gen/avg_r1_length": 207.03125, "rollout/gen/std_r1_length": 183.385009765625, "rollout/gen/avg_r1_score": -385.5540771484375, "rollout/gen/std_r1_score": 1235.941650390625, "rollout/gen/avg_r1_accuracy": 0.14731857180595398, "rollout/gen/std_r1_accuracy": 0.048324983566999435, "rollout/gen/avg_r2_length": 372.16015625, "rollout/gen/std_r2_length": 165.5988006591797, "rollout/gen/avg_r2_score": -1188.541748046875, "rollout/gen/std_r2_score": 2503.942138671875, "rollout/gen/avg_r2_accuracy": 0.11734634637832642, "rollout/gen/std_r2_accuracy": 0.05583496391773224, "rollout/best_game/query_1": "John: Hi Hun, just please don't panic.\nAllison: What happened?\nJohn: Were @ the emergency, Tommy fell off the monkey bars.\nJohn: It seems like he broke his arm.\nAllison: OK, I'm leaving the office.\nJohn: Just please don't drive, take a taxi.\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u314b\u314b\u314b\u314b.. Hmmmm...\nH/T: AJI\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " \u314b\u314b\u314b\u314b\u314b\nOh wow\nThis is amazing. I think that the video will turn into more of an interview so that it will come across more like an episode.", "rollout/best_game/whole_game": "ROLE 0: John: Hi Hun, just please don't panic.\nAllison: What happened?\nJohn: Were @ the emergency, Tommy fell off the monkey bars.\nJohn: It seems like he broke his arm.\nAllison: OK, I'm leaving the office.\nJohn: Just please don't drive, take a taxi.\nTL;DR: \nROLE 1: Given the text: \u314b\u314b\u314b\u314b.. Hmmmm...\nH/T: AJI\nReconstruct the summarized text to the detailed:\nROLE 2:  \u314b\u314b\u314b\u314b\u314b\nOh wow\nThis is amazing. I think that the video will turn into more of an interview so that it will come across more like an episode.", "rollout/best_game/overall_score": -53.73622107806891, "rollout/best_game/accuracy (r2)": 0.06481481481481481, "_this_batch_num_rl_token": 11166, "num_rl_rollout": 2, "lm_epoch": 0, "rl_epoch": 0, "step": 100, "total_data_token": 183347, "total_rl_token": 212473, "total_lm_token": 160826, "total_token": 373299, "completed_steps": 100, "rollout/num_train_sample": 640, "timestamp": 1771738854, "checkpoint_dir_name": null, "done": false, "training_iteration": 4, "trial_id": "24c8f244", "date": "2026-02-22_13-40-54", "time_this_iter_s": 49.27564072608948, "time_total_s": 205.31211066246033, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 205.31211066246033, "iterations_since_restore": 4}
+{"rl_info/A2G": -0.00016842207696754485, "rl_info/entropy": 3.1138434410095215, "rl_info/total_token": 2297.0, "rl_info/advantage_b4_norm": -967.6898193359375, "rl_info/advantage_after_gnorm": -0.5989461541175842, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.016530822962522507, "train/lm_loss": 7.39721155166626, "train/total_loss": 7.413742542266846, "xsum/rouge1": 0.11472936960387321, "xsum/rouge2": 0.026811872647112923, "xsum/rougeL": 0.09556944092710885, "xsum/rougeLsum": 0.09762959605065202, "xsum/bertscore_precision": 0.7031270662943522, "xsum/bertscore_recall": 0.7165216406186422, "xsum/bertscore_f1": 0.70625073214372, "gigaword/rouge1": 0.01678969245972369, "gigaword/rouge2": 0.0033758058025182423, "gigaword/rougeL": 0.016560119221651924, "gigaword/rougeLsum": 0.015149818472537218, "gigaword/bertscore_precision": 0.5680998438596725, "gigaword/bertscore_recall": 0.6528828796744347, "gigaword/bertscore_f1": 0.6064777792990208, "cnndm/rouge1": 0.16558439592370075, "cnndm/rouge2": 0.05976232759714233, "cnndm/rougeL": 0.13592438645712251, "cnndm/rougeLsum": 0.15024550193903416, "cnndm/bertscore_precision": 0.7100442300240198, "cnndm/bertscore_recall": 0.7544575482606888, "cnndm/bertscore_f1": 0.7307416200637817, "samsum/rouge1": 0.06481104128427728, "samsum/rouge2": 0.012460499369291007, "samsum/rougeL": 0.05980442062212202, "samsum/rougeLsum": 0.043023192840472485, "samsum/bertscore_precision": 0.6338588247696558, "samsum/bertscore_recall": 0.6799678206443787, "samsum/bertscore_f1": 0.6556075513362885, "eval_agg/avg_all_rougef": 0.06738946757614629, "eval_agg/avg_all_bertf": 0.6747694207107028, "eval_agg/avg_all": 0.37107944414342453, "num_rl_rollout": 2, "lm_epoch": 0, "rl_epoch": 0, "step": 200, "total_data_token": 335758, "total_rl_token": 427967, "total_lm_token": 313237, "total_token": 741204, "completed_steps": 200, "tune_objective": 0.8682747826977989, "timestamp": 1771738903, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 5, "trial_id": "24c8f244", "date": "2026-02-22_13-41-43", "time_this_iter_s": 49.55768299102783, "time_total_s": 254.86979365348816, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 254.86979365348816, "iterations_since_restore": 5}
+{"rollout/num_samples": 640, "rollout/avg_q1_length": 344.625, "rollout/std_q1_length": 208.90306091308594, "rollout/gen/avg_score": -925.3970947265625, "rollout/gen/std_score": 2128.60791015625, "rollout/gen/avg_r1_length": 216.1484375, "rollout/gen/std_r1_length": 185.76388549804688, "rollout/gen/avg_r1_score": -243.36654663085938, "rollout/gen/std_r1_score": 223.63047790527344, "rollout/gen/avg_r1_accuracy": 0.1560140997171402, "rollout/gen/std_r1_accuracy": 0.048137858510017395, "rollout/gen/avg_r2_length": 366.07421875, "rollout/gen/std_r2_length": 172.9806365966797, "rollout/gen/avg_r2_score": -1095.90478515625, "rollout/gen/std_r2_score": 2346.8818359375, "rollout/gen/avg_r2_accuracy": 0.12232579290866852, "rollout/gen/std_r2_accuracy": 0.05995332449674606, "rollout/best_game/query_1": "By . Daily Mail Reporter . PUBLISHED: . 19:53 EST, 26 July 2013 . | . UPDATED: . 19:53 EST, 26 July 2013 . The doctor found to be responsible for the death of pop icon Michael Jackson will be freed from jail in just three months - after serving less than two years following his 2011 involuntary manslaughter conviction for giving the 'King of Pop' a lethal dose of the powerful sedative propofol. Conrad Murray is scheduled to be freed from the Los Angeles County Jail on October 28, after serving less than two years of a four-year sentence - and members of Jackson's family reportedly are livid. Murray served his time in county jail - rather than state prison - due to overcrowding in the the California prison system. Free man: Dr. Conrad Murray is expected to be released from prison in October after serving less than two years of a four-year sentence . 'This is It': 'The King of Pop' announcing what was supposed to be his final tour . 'It is not right I will never be able to see my son again, while his mother can see him,' sources close to the Jackson family told the Daily Mirror. The source went on to say the family - including his devastated children, Paris, 15, Prince, 16, Blanket, 11, and his 83-year-old mother, Katherine - feels Murray 'should serve his time for his crime.' 'Murray is the killer of Michael in their eyes. He cost them their son and father,' the source said, adding that 'for [Murray] to get out in two years is an insult. They still feel he should have been tried for murder.' Murray was sentenced to four years for Jackson's death, but is expected to get time off from his sentence for good behavior. Grieving: Jackson's children - Blanket (left) Paris (center) and Prince - will grow up without a father . 'He has been a model inmate and the authorities have granted him an early . release due to that good behaviour and the overcrowded California jail . system,' a friend of the doctor's told the paper. Murray reportedly has been serving his time in solitary confinement because of the high-profile nature of his case - and because the victim was a beloved figure in pop culture. Despite being in solitary confinement, Murray has both a television and a cell phone in his cell. In addition to the reduced sentence, the Jackson family is enraged that Murray has refused to provide testimony in their wrongful death lawsuit against concert promoter AEG Live, which was promoting Jackson's 2009 'This Is It' tour. Outraged: Jackson's mother, Katherine, is furious that Murray is being released after less than two years . The tour was pegged as Jackson's final tour. The family claims that the grueling rehearsal schedule - and AEG Live's negligence in hiring Murray - contributed to Jackson's death in June of 2009. This week, Murray made headlines when he said that if he was forced to testify in the wrongful death case, he would reveal information about Jackson's relationship with his family that would be like a 'nuclear' bombshell. It's unclear to what he might be referring. Murray says he wants nothing to do with the lawsuit because he fears he could be incriminated. Murray is appealing his criminal conviction. Prisoner: Murray is said to have both a television and a cell phone in his jail cell .\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Cindy Jackson has become the new 'King of Pop' for Popland fans .\nPosted 1 years Ago\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " \u00a0Cindy Jackson [sic] has become the new King of Pop at Popland.\nPosted 1 years Ago", "rollout/best_game/whole_game": "ROLE 0: By . Daily Mail Reporter . PUBLISHED: . 19:53 EST, 26 July 2013 . | . UPDATED: . 19:53 EST, 26 July 2013 . The doctor found to be responsible for the death of pop icon Michael Jackson will be freed from jail in just three months - after serving less than two years following his 2011 involuntary manslaughter conviction for giving the 'King of Pop' a lethal dose of the powerful sedative propofol. Conrad Murray is scheduled to be freed from the Los Angeles County Jail on October 28, after serving less than two years of a four-year sentence - and members of Jackson's family reportedly are livid. Murray served his time in county jail - rather than state prison - due to overcrowding in the the California prison system. Free man: Dr. Conrad Murray is expected to be released from prison in October after serving less than two years of a four-year sentence . 'This is It': 'The King of Pop' announcing what was supposed to be his final tour . 'It is not right I will never be able to see my son again, while his mother can see him,' sources close to the Jackson family told the Daily Mirror. The source went on to say the family - including his devastated children, Paris, 15, Prince, 16, Blanket, 11, and his 83-year-old mother, Katherine - feels Murray 'should serve his time for his crime.' 'Murray is the killer of Michael in their eyes. He cost them their son and father,' the source said, adding that 'for [Murray] to get out in two years is an insult. They still feel he should have been tried for murder.' Murray was sentenced to four years for Jackson's death, but is expected to get time off from his sentence for good behavior. Grieving: Jackson's children - Blanket (left) Paris (center) and Prince - will grow up without a father . 'He has been a model inmate and the authorities have granted him an early . release due to that good behaviour and the overcrowded California jail . system,' a friend of the doctor's told the paper. Murray reportedly has been serving his time in solitary confinement because of the high-profile nature of his case - and because the victim was a beloved figure in pop culture. Despite being in solitary confinement, Murray has both a television and a cell phone in his cell. In addition to the reduced sentence, the Jackson family is enraged that Murray has refused to provide testimony in their wrongful death lawsuit against concert promoter AEG Live, which was promoting Jackson's 2009 'This Is It' tour. Outraged: Jackson's mother, Katherine, is furious that Murray is being released after less than two years . The tour was pegged as Jackson's final tour. The family claims that the grueling rehearsal schedule - and AEG Live's negligence in hiring Murray - contributed to Jackson's death in June of 2009. This week, Murray made headlines when he said that if he was forced to testify in the wrongful death case, he would reveal information about Jackson's relationship with his family that would be like a 'nuclear' bombshell. It's unclear to what he might be referring. Murray says he wants nothing to do with the lawsuit because he fears he could be incriminated. Murray is appealing his criminal conviction. Prisoner: Murray is said to have both a television and a cell phone in his jail cell .\nTL;DR: \nROLE 1: Given the text: \u00a0Cindy Jackson has become the new 'King of Pop' for Popland fans .\nPosted 1 years Ago\nReconstruct the summarized text to the detailed:\nROLE 2:  \u00a0Cindy Jackson [sic] has become the new King of Pop at Popland.\nPosted 1 years Ago", "rollout/best_game/overall_score": -55.065143219882756, "rollout/best_game/accuracy (r2)": 0.01716053136830586, "_this_batch_num_rl_token": 11028, "num_rl_rollout": 3, "lm_epoch": 0, "rl_epoch": 0, "step": 201, "total_data_token": 348518, "total_rl_token": 429519, "total_lm_token": 314969, "total_token": 744488, "completed_steps": 201, "rollout/num_train_sample": 640, "timestamp": 1771738953, "checkpoint_dir_name": null, "done": false, "training_iteration": 6, "trial_id": "24c8f244", "date": "2026-02-22_13-42-33", "time_this_iter_s": 49.630918979644775, "time_total_s": 304.50071263313293, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 304.50071263313293, "iterations_since_restore": 6}
+{"rl_info/A2G": -3.637878762674518e-05, "rl_info/entropy": 3.051198959350586, "rl_info/total_token": 2321.0, "rl_info/advantage_b4_norm": -1101.4930419921875, "rl_info/advantage_after_gnorm": -0.05225572735071182, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.0033327587880194187, "train/lm_loss": 7.549774646759033, "train/total_loss": 7.553107261657715, "xsum/rouge1": 0.07604731536925, "xsum/rouge2": 0.017396655267254982, "xsum/rougeL": 0.06521226726464041, "xsum/rougeLsum": 0.058325693615018694, "xsum/bertscore_precision": 0.6832815955082575, "xsum/bertscore_recall": 0.7083765814701716, "xsum/bertscore_f1": 0.6918817758560181, "gigaword/rouge1": 0.018751545429211453, "gigaword/rouge2": 0.003321049007320801, "gigaword/rougeL": 0.01841833571425141, "gigaword/rougeLsum": 0.017469806660227613, "gigaword/bertscore_precision": 0.5731927151978016, "gigaword/bertscore_recall": 0.6515496721863747, "gigaword/bertscore_f1": 0.6090207310020923, "cnndm/rouge1": 0.13558569461406303, "cnndm/rouge2": 0.04739061678756995, "cnndm/rougeL": 0.11148991447136525, "cnndm/rougeLsum": 0.12171265717858565, "cnndm/bertscore_precision": 0.6824639389912287, "cnndm/bertscore_recall": 0.7190692375103632, "cnndm/bertscore_f1": 0.6992726822694143, "samsum/rouge1": 0.07389264191010704, "samsum/rouge2": 0.014794509957842964, "samsum/rougeL": 0.05947199147422732, "samsum/rougeLsum": 0.04732802968093871, "samsum/bertscore_precision": 0.6210540433724722, "samsum/bertscore_recall": 0.6869649738073349, "samsum/bertscore_f1": 0.6516486207644144, "eval_agg/avg_all_rougef": 0.0554130452751172, "eval_agg/avg_all_bertf": 0.6629559524729848, "eval_agg/avg_all": 0.359184498874051, "num_rl_rollout": 3, "lm_epoch": 0, "rl_epoch": 0, "step": 300, "total_data_token": 501292, "total_rl_token": 640402, "total_lm_token": 467743, "total_token": 1108145, "completed_steps": 300, "tune_objective": 0.7962264216158818, "timestamp": 1771739000, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 7, "trial_id": "24c8f244", "date": "2026-02-22_13-43-21", "time_this_iter_s": 47.40696406364441, "time_total_s": 351.90767669677734, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 351.90767669677734, "iterations_since_restore": 7}
+{"rl_info/A2G": 0.0025603696703910828, "rl_info/entropy": 2.9738070964813232, "rl_info/total_token": 1967.0, "rl_info/advantage_b4_norm": -838.8681030273438, "rl_info/advantage_after_gnorm": -0.035387616604566574, "rl_info/kl_w_ref": 0.0, "train/rl_loss": -0.2563343346118927, "train/lm_loss": 6.59817361831665, "train/total_loss": 6.34183931350708, "num_rl_rollout": 3, "lm_epoch": 0, "rl_epoch": 0, "step": 301, "total_data_token": 502431, "total_rl_token": 642369, "total_lm_token": 468882, "total_token": 1111251, "completed_steps": 301, "timestamp": 1771739001, "checkpoint_dir_name": null, "done": false, "training_iteration": 8, "trial_id": "24c8f244", "date": "2026-02-22_13-43-21", "time_this_iter_s": 0.46466779708862305, "time_total_s": 352.37234449386597, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 352.37234449386597, "iterations_since_restore": 8}
+{"rollout/num_samples": 640, "rollout/avg_q1_length": 311.90625, "rollout/std_q1_length": 197.28765869140625, "rollout/gen/avg_score": -953.8209838867188, "rollout/gen/std_score": 2188.5009765625, "rollout/gen/avg_r1_length": 224.1875, "rollout/gen/std_r1_length": 193.4598388671875, "rollout/gen/avg_r1_score": -258.484375, "rollout/gen/std_r1_score": 233.9423370361328, "rollout/gen/avg_r1_accuracy": 0.14026010036468506, "rollout/gen/std_r1_accuracy": 0.05141757056117058, "rollout/gen/avg_r2_length": 365.1484375, "rollout/gen/std_r2_length": 173.01319885253906, "rollout/gen/avg_r2_score": -1127.6551513671875, "rollout/gen/std_r2_score": 2413.353271484375, "rollout/gen/avg_r2_accuracy": 0.10783617943525314, "rollout/gen/std_r2_accuracy": 0.05779047682881355, "rollout/best_game/query_1": "Sam: Hi, long time no see! How are you? We\u2019re planning a board games night, are you into it?\nAlice: Hi :) Not really, but I\u2019ve always wanted to try. What type of games do you play?\nSam: Mostly Game of Thrones, but it takes a whole lot of time\u2026 I think you might enjoy Days of Wonder more! It\u2019s about collection of stones and planning, it\u2019s quite easy to learn\nAlice: Wow, sounds nice! I\u2019ve only played basics, such as Scrabble or Monopoly, but I like some rivalry! \nSam: Hah, so you\u2019ve caught a bug before. Time to refresh your skills\nAlice: Yeah, so when are you playing?\nSam: Friday night at 9pm at the Game Pub situated near Palace of Science and Culture. \nAlice: Ok, seems fine with me. Anyone joining us?\nSam: Yeah, Paul, Christie and Nate are coming\nAlice: Nice squad! I didn\u2019t know they all play board games. It seemed a bit dull to me\u2026\nSam: Never! Games are really entertaining!  \nAlice: I was just joking :) It\u2019ll be great to catch up! \nSam: Later we can catch some drinks, so dress up!\nAlice: Oh no! But still, sounds fantastic! See you then! Bye!\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0I love when Sam's friends introduce him to new people, and play with them. I'll definitely see you there :)\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": "\nI love when Sam's friends introduce him to new people, and play with them. I'll definitely see you there :)\nEnjoy!", "rollout/best_game/whole_game": "ROLE 0: Sam: Hi, long time no see! How are you? We\u2019re planning a board games night, are you into it?\nAlice: Hi :) Not really, but I\u2019ve always wanted to try. What type of games do you play?\nSam: Mostly Game of Thrones, but it takes a whole lot of time\u2026 I think you might enjoy Days of Wonder more! It\u2019s about collection of stones and planning, it\u2019s quite easy to learn\nAlice: Wow, sounds nice! I\u2019ve only played basics, such as Scrabble or Monopoly, but I like some rivalry! \nSam: Hah, so you\u2019ve caught a bug before. Time to refresh your skills\nAlice: Yeah, so when are you playing?\nSam: Friday night at 9pm at the Game Pub situated near Palace of Science and Culture. \nAlice: Ok, seems fine with me. Anyone joining us?\nSam: Yeah, Paul, Christie and Nate are coming\nAlice: Nice squad! I didn\u2019t know they all play board games. It seemed a bit dull to me\u2026\nSam: Never! Games are really entertaining!  \nAlice: I was just joking :) It\u2019ll be great to catch up! \nSam: Later we can catch some drinks, so dress up!\nAlice: Oh no! But still, sounds fantastic! See you then! Bye!\nTL;DR: \nROLE 1: Given the text: \u00a0I love when Sam's friends introduce him to new people, and play with them. I'll definitely see you there :)\nReconstruct the summarized text to the detailed:\nROLE 2: \nI love when Sam's friends introduce him to new people, and play with them. I'll definitely see you there :)\nEnjoy!", "rollout/best_game/overall_score": -57.65199099344143, "rollout/best_game/accuracy (r2)": 0.0705106868983521, "_this_batch_num_rl_token": 9981, "num_rl_rollout": 4, "lm_epoch": 0, "rl_epoch": 0, "step": 302, "total_data_token": 513967, "total_rl_token": 644616, "total_lm_token": 470437, "total_token": 1115053, "completed_steps": 302, "rollout/num_train_sample": 640, "timestamp": 1771739046, "checkpoint_dir_name": null, "done": false, "training_iteration": 9, "trial_id": "24c8f244", "date": "2026-02-22_13-44-06", "time_this_iter_s": 45.46229529380798, "time_total_s": 397.83463978767395, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 397.83463978767395, "iterations_since_restore": 9}
+{"rl_info/A2G": -0.0005509508191607893, "rl_info/entropy": 2.9813852310180664, "rl_info/total_token": 1750.0, "rl_info/advantage_b4_norm": -788.64404296875, "rl_info/advantage_after_gnorm": 0.8544918298721313, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.054796941578388214, "train/lm_loss": 6.702639102935791, "train/total_loss": 6.757436275482178, "xsum/rouge1": 0.11132541589921706, "xsum/rouge2": 0.021415716147478156, "xsum/rougeL": 0.09028962063845176, "xsum/rougeLsum": 0.0887879972618284, "xsum/bertscore_precision": 0.7086698263883591, "xsum/bertscore_recall": 0.7090469151735306, "xsum/bertscore_f1": 0.7059575368960699, "gigaword/rouge1": 0.016863871758588247, "gigaword/rouge2": 0.003450220386292167, "gigaword/rougeL": 0.01663164950602096, "gigaword/rougeLsum": 0.015859456998356187, "gigaword/bertscore_precision": 0.5694834250211716, "gigaword/bertscore_recall": 0.6511477509140968, "gigaword/bertscore_f1": 0.6067759798467159, "cnndm/rouge1": 0.18224313640478348, "cnndm/rouge2": 0.0679428805559371, "cnndm/rougeL": 0.14648908218446857, "cnndm/rougeLsum": 0.1633139600398624, "cnndm/bertscore_precision": 0.699204201499621, "cnndm/bertscore_recall": 0.7569622049729029, "cnndm/bertscore_f1": 0.7252626369396845, "samsum/rouge1": 0.06538320074022787, "samsum/rouge2": 0.014773334166031387, "samsum/rougeL": 0.05405134424223368, "samsum/rougeLsum": 0.04469748203772903, "samsum/bertscore_precision": 0.6258922417958578, "samsum/bertscore_recall": 0.6917758882045746, "samsum/bertscore_f1": 0.6566124707460403, "eval_agg/avg_all_rougef": 0.06896989806046916, "eval_agg/avg_all_bertf": 0.6736521561071277, "eval_agg/avg_all": 0.37131102708379843, "num_rl_rollout": 4, "lm_epoch": 0, "rl_epoch": 0, "step": 400, "total_data_token": 668982, "total_rl_token": 855170, "total_lm_token": 625452, "total_token": 1480622, "completed_steps": 400, "tune_objective": 0.8809099141896771, "timestamp": 1771739094, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 10, "trial_id": "24c8f244", "date": "2026-02-22_13-44-54", "time_this_iter_s": 47.85594701766968, "time_total_s": 445.6905868053436, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 445.6905868053436, "iterations_since_restore": 10}
+{"rollout/num_samples": 640, "rollout/avg_q1_length": 407.28125, "rollout/std_q1_length": 205.1195526123047, "rollout/gen/avg_score": -939.7592163085938, "rollout/gen/std_score": 2191.927001953125, "rollout/gen/avg_r1_length": 193.65625, "rollout/gen/std_r1_length": 175.49459838867188, "rollout/gen/avg_r1_score": -216.16012573242188, "rollout/gen/std_r1_score": 209.059814453125, "rollout/gen/avg_r1_accuracy": 0.16719722747802734, "rollout/gen/std_r1_accuracy": 0.04291506111621857, "rollout/gen/avg_r2_length": 360.47265625, "rollout/gen/std_r2_length": 176.47999572753906, "rollout/gen/avg_r2_score": -1120.658935546875, "rollout/gen/std_r2_score": 2415.206787109375, "rollout/gen/avg_r2_accuracy": 0.13006523251533508, "rollout/gen/std_r2_accuracy": 0.060750868171453476, "rollout/best_game/query_1": "Greg: Without giving details and jinxing anything, how is the Polish academic scene and how friendly/functional would it be for a foreigner, like me, who can't shut up in a timely fashion some days?\nGreg: Someone wants to bring me out as a visiting prof.\nSam: Visiting profs have quite a nice time in Poland\nSam: Red tape is hellish\nSam: But you won't have much of it as a visiting prof\nSam: Which uni invites u?\nGreg: I'll have to look up the name again. They sent a message after I was at a conference with their Chair or Vice-Chair, but I was afraid to just jump enthusiastically in.\nSam: Right\nSam: University and city matters and some faculties are more messy than others\nSam: But most Poles speak English so you won't have troubles communicating with tgem\nGreg: Thank you <3\nSam: Warsaw, Cracow and Pozna\u0144 are the best for foreigners\nSam: Poles, like Americans I guess, are pretty divided right now and you can feel the right wing getting stronger\nSam: if you were to move in here, I would recommend postpone it till 2020, till new elections\nSam: but for a visiting prof it won't be an issue\nGreg: I'm such a chicken. I'm afraid of every town in the world.\nGreg: Thank you. I'll have to pull it up and tell them I'm interested. I'll send you an update around the same time.\nSam: it would be great if u were in Warsaw\nSam: I just told my bf that you would have lots to talk about, he studied Chinese language and culture and we could hang out!\nSam: but whenever you'll be in Poland, I would meet you up <3\nGreg: That would be fantastic! :D\nSam: :)\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Poland, Poland and American tourists will not get to say sorry for being Polish.\nThank you all for participating in the conversation.\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " \u00a0Poland, Poland, Poland and American tourists will not get to say sorry for being Polish.\nThank you all you for participating in the conversation.", "rollout/best_game/whole_game": "ROLE 0: Greg: Without giving details and jinxing anything, how is the Polish academic scene and how friendly/functional would it be for a foreigner, like me, who can't shut up in a timely fashion some days?\nGreg: Someone wants to bring me out as a visiting prof.\nSam: Visiting profs have quite a nice time in Poland\nSam: Red tape is hellish\nSam: But you won't have much of it as a visiting prof\nSam: Which uni invites u?\nGreg: I'll have to look up the name again. They sent a message after I was at a conference with their Chair or Vice-Chair, but I was afraid to just jump enthusiastically in.\nSam: Right\nSam: University and city matters and some faculties are more messy than others\nSam: But most Poles speak English so you won't have troubles communicating with tgem\nGreg: Thank you <3\nSam: Warsaw, Cracow and Pozna\u0144 are the best for foreigners\nSam: Poles, like Americans I guess, are pretty divided right now and you can feel the right wing getting stronger\nSam: if you were to move in here, I would recommend postpone it till 2020, till new elections\nSam: but for a visiting prof it won't be an issue\nGreg: I'm such a chicken. I'm afraid of every town in the world.\nGreg: Thank you. I'll have to pull it up and tell them I'm interested. I'll send you an update around the same time.\nSam: it would be great if u were in Warsaw\nSam: I just told my bf that you would have lots to talk about, he studied Chinese language and culture and we could hang out!\nSam: but whenever you'll be in Poland, I would meet you up <3\nGreg: That would be fantastic! :D\nSam: :)\nTL;DR: \nROLE 1: Given the text: \u00a0Poland, Poland and American tourists will not get to say sorry for being Polish.\nThank you all for participating in the conversation.\nReconstruct the summarized text to the detailed:\nROLE 2:  \u00a0Poland, Poland, Poland and American tourists will not get to say sorry for being Polish.\nThank you all you for participating in the conversation.", "rollout/best_game/overall_score": -64.87191780714384, "rollout/best_game/accuracy (r2)": 0.05659029317565902, "_this_batch_num_rl_token": 13033, "num_rl_rollout": 5, "lm_epoch": 0, "rl_epoch": 0, "step": 403, "total_data_token": 685903, "total_rl_token": 860268, "total_lm_token": 629340, "total_token": 1489608, "completed_steps": 403, "rollout/num_train_sample": 640, "timestamp": 1771739144, "checkpoint_dir_name": null, "done": false, "training_iteration": 11, "trial_id": "24c8f244", "date": "2026-02-22_13-45-44", "time_this_iter_s": 49.643423557281494, "time_total_s": 495.3340103626251, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 495.3340103626251, "iterations_since_restore": 11}
+{"rl_info/A2G": -0.001162507920525968, "rl_info/entropy": 3.034398317337036, "rl_info/total_token": 977.0, "rl_info/advantage_b4_norm": -403.0190734863281, "rl_info/advantage_after_gnorm": 0.6921422481536865, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.11594735085964203, "train/lm_loss": 7.919494152069092, "train/total_loss": 8.035441398620605, "xsum/rouge1": 0.13065050002526268, "xsum/rouge2": 0.0345531870399425, "xsum/rougeL": 0.11209863064358033, "xsum/rougeLsum": 0.10592896340721943, "xsum/bertscore_precision": 0.7064413626988729, "xsum/bertscore_recall": 0.7165375004212061, "xsum/bertscore_f1": 0.7078358183304468, "gigaword/rouge1": 0.017874515721601925, "gigaword/rouge2": 0.0037481453632686236, "gigaword/rougeL": 0.017548047740118596, "gigaword/rougeLsum": 0.016839400834062795, "gigaword/bertscore_precision": 0.5712652222812176, "gigaword/bertscore_recall": 0.6543654787540436, "gigaword/bertscore_f1": 0.6091660499572754, "cnndm/rouge1": 0.13242038501644895, "cnndm/rouge2": 0.04551204647411738, "cnndm/rougeL": 0.1090006894525135, "cnndm/rougeLsum": 0.12013041818029008, "cnndm/bertscore_precision": 0.6689551721016566, "cnndm/bertscore_recall": 0.732322151462237, "cnndm/bertscore_f1": 0.69817482928435, "samsum/rouge1": 0.07243922219327306, "samsum/rouge2": 0.015573979728793727, "samsum/rougeL": 0.06542760756721964, "samsum/rougeLsum": 0.04732735279565792, "samsum/bertscore_precision": 0.6312932521104813, "samsum/bertscore_recall": 0.6853651652733485, "samsum/bertscore_f1": 0.6564675023158392, "eval_agg/avg_all_rougef": 0.06544206826146069, "eval_agg/avg_all_bertf": 0.6679110499719779, "eval_agg/avg_all": 0.3666765591167193, "num_rl_rollout": 5, "lm_epoch": 0, "rl_epoch": 0, "step": 500, "total_data_token": 843449, "total_rl_token": 1069618, "total_lm_token": 786886, "total_token": 1856504, "completed_steps": 500, "tune_objective": 0.8511296447176437, "timestamp": 1771739190, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 12, "trial_id": "24c8f244", "date": "2026-02-22_13-46-31", "time_this_iter_s": 46.491575479507446, "time_total_s": 541.8255858421326, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 541.8255858421326, "iterations_since_restore": 12}
+{"rollout/num_samples": 640, "rollout/avg_q1_length": 396.15625, "rollout/std_q1_length": 226.465087890625, "rollout/gen/avg_score": -809.8009643554688, "rollout/gen/std_score": 1867.0928955078125, "rollout/gen/avg_r1_length": 206.390625, "rollout/gen/std_r1_length": 172.63612365722656, "rollout/gen/avg_r1_score": -232.83004760742188, "rollout/gen/std_r1_score": 210.2153778076172, "rollout/gen/avg_r1_accuracy": 0.156185120344162, "rollout/gen/std_r1_accuracy": 0.05059858039021492, "rollout/gen/avg_r2_length": 375.16015625, "rollout/gen/std_r2_length": 168.98590087890625, "rollout/gen/avg_r2_score": -954.043701171875, "rollout/gen/std_r2_score": 2060.10498046875, "rollout/gen/avg_r2_accuracy": 0.12462785840034485, "rollout/gen/std_r2_accuracy": 0.060481369495391846, "rollout/best_game/query_1": "(CNN) -- The U.S. and Russian ambassadors to the United Nations have exchanged a flurry of Twitter posts about the controversial punk rock band Pussy Riot. Meanwhile, Russian media suggested a Russian athlete in the Sochi Games may have been sporting an image supporting Pussy Riot on his snowboard Thursday in what could be the Olympics' first protest. In the diplomatic dispute, Samantha Power, the U.S. ambassador to the United Nations, apparently initiated the exchange with a Russian counterpart when she tweeted Wednesday about meeting formerly imprisoned band members Maria Alyokhina and Nadezhda Tolokonnikova, both of whom were released in December. Power posted a photograph of herself with the two punk rockers, who \"came by to discuss their time in jail,\" Power tweeted. She also stated: \"Met some brave 'troublemakers' today.\" . Power subsequently tweeted: \"I asked #PussyRiot if they were afraid of prison. Response: No. In prison we could see the terrible conditions. It's human rights fieldwork.\" Then the Russian ambassador to the United Nations, Vitaly Churkin, told a U.N. press conference that Power should join the band and invite them to play at the National Cathedral in Washington. Power responded on Twitter: \"Ambassador Churkin, I'd be honored to go on tour with #PussyRiot -- a group of girls who speak up & stand for human rights. Will you join us?\" Power also added: \"I can't sing, but if #PussyRiot will have me, Amb Churkin, I say our 1st concert is for Russia's pol. prisoners. #LiveFromMatrosskayaTishina.\" Matrosskaya Tishina is a notorious Moscow prison where opposition activists have been held. Pussy Riot tells Christiane Amanpour: 'We are free people, and free people feel no fear.\" A third member of Pussy Riot, Yekaterina Samutsevich, was released in 2012. The three members were sentenced to prison after performing a song critical of Russian President Vladimir Putin in one of the Russian Orthodox Church's most important cathedrals in February 2012. The performance was carried out in a flash-mob style. Meanwhile, the Russian state-run media agency RIA Novosti reported Thursday that Russian athlete Alexei Sobolev sported an image on his snowboard resembling \"a female figure in a balaclava wielding a knife.\" That image purports to resemble members of Pussy Riot because the anti-Putin, all-female band perform while wearing balaclavas, the news agency reported. The headline stated: \"Sochi Snowboarder Coy on Possible Pussy Riot Protest.\" When asked if the design was an homage to Pussy Riot, Sobolev responded: \"Anything is possible.\" He added: \"I wasn't the designer.\" Sobolev, a slopestyle rider, was also the first Russian to compete in the Winter Olympics in Sochi, Russia, and finished 10th in a qualifying heat Thursday. The drawing on his snowboard was described as \"what could be the first protest by an athlete\" in the games, the Russian news agency said. CNN's Brian Walker contributed to this report.\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Russian Olympic women compete in skanking events... and we get it.\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " \u00a0Russian Russian Olympic women compete in skanking events ... and we get it.", "rollout/best_game/whole_game": "ROLE 0: (CNN) -- The U.S. and Russian ambassadors to the United Nations have exchanged a flurry of Twitter posts about the controversial punk rock band Pussy Riot. Meanwhile, Russian media suggested a Russian athlete in the Sochi Games may have been sporting an image supporting Pussy Riot on his snowboard Thursday in what could be the Olympics' first protest. In the diplomatic dispute, Samantha Power, the U.S. ambassador to the United Nations, apparently initiated the exchange with a Russian counterpart when she tweeted Wednesday about meeting formerly imprisoned band members Maria Alyokhina and Nadezhda Tolokonnikova, both of whom were released in December. Power posted a photograph of herself with the two punk rockers, who \"came by to discuss their time in jail,\" Power tweeted. She also stated: \"Met some brave 'troublemakers' today.\" . Power subsequently tweeted: \"I asked #PussyRiot if they were afraid of prison. Response: No. In prison we could see the terrible conditions. It's human rights fieldwork.\" Then the Russian ambassador to the United Nations, Vitaly Churkin, told a U.N. press conference that Power should join the band and invite them to play at the National Cathedral in Washington. Power responded on Twitter: \"Ambassador Churkin, I'd be honored to go on tour with #PussyRiot -- a group of girls who speak up & stand for human rights. Will you join us?\" Power also added: \"I can't sing, but if #PussyRiot will have me, Amb Churkin, I say our 1st concert is for Russia's pol. prisoners. #LiveFromMatrosskayaTishina.\" Matrosskaya Tishina is a notorious Moscow prison where opposition activists have been held. Pussy Riot tells Christiane Amanpour: 'We are free people, and free people feel no fear.\" A third member of Pussy Riot, Yekaterina Samutsevich, was released in 2012. The three members were sentenced to prison after performing a song critical of Russian President Vladimir Putin in one of the Russian Orthodox Church's most important cathedrals in February 2012. The performance was carried out in a flash-mob style. Meanwhile, the Russian state-run media agency RIA Novosti reported Thursday that Russian athlete Alexei Sobolev sported an image on his snowboard resembling \"a female figure in a balaclava wielding a knife.\" That image purports to resemble members of Pussy Riot because the anti-Putin, all-female band perform while wearing balaclavas, the news agency reported. The headline stated: \"Sochi Snowboarder Coy on Possible Pussy Riot Protest.\" When asked if the design was an homage to Pussy Riot, Sobolev responded: \"Anything is possible.\" He added: \"I wasn't the designer.\" Sobolev, a slopestyle rider, was also the first Russian to compete in the Winter Olympics in Sochi, Russia, and finished 10th in a qualifying heat Thursday. The drawing on his snowboard was described as \"what could be the first protest by an athlete\" in the games, the Russian news agency said. CNN's Brian Walker contributed to this report.\nTL;DR: \nROLE 1: Given the text: \u00a0Russian Olympic women compete in skanking events... and we get it.\nReconstruct the summarized text to the detailed:\nROLE 2:  \u00a0Russian Russian Olympic women compete in skanking events ... and we get it.", "rollout/best_game/overall_score": -37.831100032852866, "rollout/best_game/accuracy (r2)": 0.0216857474697196, "_this_batch_num_rl_token": 12677, "num_rl_rollout": 6, "lm_epoch": 0, "rl_epoch": 0, "step": 500, "total_data_token": 856126, "total_rl_token": 1069618, "total_lm_token": 786886, "total_token": 1856504, "completed_steps": 500, "rollout/num_train_sample": 640, "timestamp": 1771739240, "checkpoint_dir_name": null, "done": false, "training_iteration": 13, "trial_id": "24c8f244", "date": "2026-02-22_13-47-20", "time_this_iter_s": 49.49760961532593, "time_total_s": 591.3231954574585, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 591.3231954574585, "iterations_since_restore": 13}
+{"rl_info/A2G": 0.0010830735554918647, "rl_info/entropy": 3.095104455947876, "rl_info/total_token": 1250.0, "rl_info/advantage_b4_norm": -382.64013671875, "rl_info/advantage_after_gnorm": 0.16023267805576324, "rl_info/kl_w_ref": 0.0, "train/rl_loss": -0.10861686617136002, "train/lm_loss": 7.111542224884033, "train/total_loss": 7.002925395965576, "xsum/rouge1": 0.12194566900936528, "xsum/rouge2": 0.02686538271837727, "xsum/rougeL": 0.10242970535507062, "xsum/rougeLsum": 0.0992788043904586, "xsum/bertscore_precision": 0.7137015660603842, "xsum/bertscore_recall": 0.7209101816018423, "xsum/bertscore_f1": 0.7136130084594091, "gigaword/rouge1": 0.01889458253565279, "gigaword/rouge2": 0.003861049554763455, "gigaword/rougeL": 0.01874957332672237, "gigaword/rougeLsum": 0.01738436102380755, "gigaword/bertscore_precision": 0.5687203773856163, "gigaword/bertscore_recall": 0.6520576258003712, "gigaword/bertscore_f1": 0.6066739176213741, "cnndm/rouge1": 0.1657257526388983, "cnndm/rouge2": 0.052541812454781074, "cnndm/rougeL": 0.13501585967599106, "cnndm/rougeLsum": 0.15323929238278336, "cnndm/bertscore_precision": 0.7005942811568578, "cnndm/bertscore_recall": 0.7368564456701279, "cnndm/bertscore_f1": 0.7172105014324188, "samsum/rouge1": 0.06387501833091434, "samsum/rouge2": 0.015471666649440807, "samsum/rougeL": 0.05641832209094814, "samsum/rougeLsum": 0.045404737020725776, "samsum/bertscore_precision": 0.6243261396884918, "samsum/bertscore_recall": 0.700822576880455, "samsum/bertscore_f1": 0.6594508985678355, "eval_agg/avg_all_rougef": 0.0685688493224188, "eval_agg/avg_all_bertf": 0.6742370815202593, "eval_agg/avg_all": 0.37140296542133905, "num_rl_rollout": 6, "lm_epoch": 0, "rl_epoch": 0, "step": 600, "total_data_token": 1014711, "total_rl_token": 1283612, "total_lm_token": 945471, "total_token": 2229083, "completed_steps": 600, "tune_objective": 0.8741150420940939, "timestamp": 1771739287, "checkpoint_dir_name": "checkpoint_000005", "should_checkpoint": true, "done": false, "training_iteration": 14, "trial_id": "24c8f244", "date": "2026-02-22_13-48-07", "time_this_iter_s": 46.97838807106018, "time_total_s": 638.3015835285187, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 638.3015835285187, "iterations_since_restore": 14}
+{"rollout/num_samples": 640, "rollout/avg_q1_length": 337.1875, "rollout/std_q1_length": 233.89254760742188, "rollout/gen/avg_score": -1155.7095947265625, "rollout/gen/std_score": 2557.78125, "rollout/gen/avg_r1_length": 213.1796875, "rollout/gen/std_r1_length": 179.69529724121094, "rollout/gen/avg_r1_score": -391.1427001953125, "rollout/gen/std_r1_score": 1234.009765625, "rollout/gen/avg_r1_accuracy": 0.14171180129051208, "rollout/gen/std_r1_accuracy": 0.05560818314552307, "rollout/gen/avg_r2_length": 355.767578125, "rollout/gen/std_r2_length": 177.6969757080078, "rollout/gen/avg_r2_score": -1346.851318359375, "rollout/gen/std_r2_score": 2760.3466796875, "rollout/gen/avg_r2_accuracy": 0.11164454370737076, "rollout/gen/std_r2_accuracy": 0.06305090337991714, "rollout/best_game/query_1": "(CNN) -- Poet, dramatist and novelist, Wole Soyinka's work has left a large imprint on the literary landscape of Africa. Poltical activist and Nobel Laureat, Wole Soyinka. His work has been performed internationally and in 1986 was awarded the Nobel Prize for Literature. Regardless of the global profile and recognition, writing for Soyinka has always been, and always will be, a part of his life. \"I began writing early, very, very early. ... I was already writing short stories for the radio, and selling poems to poetry and art festivals, I was involved in school plays, I wrote essays, so there was no definite moment when I said, now I'm a writer. I've always been a writer,\" he told CNN. Before being honored with a Nobel Prize, Soyinka had written plays performed across the world, set up drama groups and held academic positions in the UK, the U.S. and Nigeria. Poetry and novels make up the rest of his work, but it is with the medium of drama that he feels most comfortable. \"When I write plays, I'm already seeing the shapes on stage, of the actors and their interaction, and so on and so forth. I don't think I've ever written one play as an abstract piece, as a literary piece, floating in the air somewhere, to be flushed out later on,\" he said. Now 75, Soyinka claims it is a natural curiosity with life and humanity that has always compelled him to write, and should be a pre-requisite for all aspiring writers. \"Before you're a writer, you're a citizen, a human being, and therefore the weapons of the citizen are at your disposal to use, or not use,\" he said. Often he's used those \"weapons\" to take aim at oppression, corruption, the legacy of colonialism and slavery, both in his work and more overtly in his political activism. At times he's suffered for his outspoken views; in 1967 he was imprisoned for 22 months for publicly calling for a truce during Nigeria's civil war and in 1994 was forced to leave the country when General Sani Abacha came to power. \"One of my earliest short stories in fact had to do with the story of a family which escaped enslavement by Portuguese slave leaders, so you can see how early I was preoccupied by that theme of power and freedom, and domination,\" he said. \"If there's one overriding political theme it's my continuing emphasis on the axis of freedom and power. I think that the history of the world, of all societies, has spun on those two axes.\" Soyinka spends his time between Nigeria, Europe and the U.S., much of it on the lecture circuit, but the need to write remains. \"I've always written plays for the purpose of getting something out of my system. That's the first stage. And the next stage is, I want to get it onto the platform -- onto the stage. And once I've done that, no matter where, I'm satisfied.\" Watch the show on CNN on Saturday, August 1, 12.30, 21.30 GMT and Sunday, August 2, 18.00 GMT.\nTL;DR: ", "rollout/best_game/query_2": "Given the text: ------------ Wole Soyinka's Writing for Poets: The Power To Turn Into a Writer\nPosted By Kristianne Diamandis On August 25, 2002 at 11:57 AM By Kristianne Diamandis\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " ------------\u00a0 Wole Soyinka's writing for Poets:\u00a0 The Power To Turn Into a Writer", "rollout/best_game/whole_game": "ROLE 0: (CNN) -- Poet, dramatist and novelist, Wole Soyinka's work has left a large imprint on the literary landscape of Africa. Poltical activist and Nobel Laureat, Wole Soyinka. His work has been performed internationally and in 1986 was awarded the Nobel Prize for Literature. Regardless of the global profile and recognition, writing for Soyinka has always been, and always will be, a part of his life. \"I began writing early, very, very early. ... I was already writing short stories for the radio, and selling poems to poetry and art festivals, I was involved in school plays, I wrote essays, so there was no definite moment when I said, now I'm a writer. I've always been a writer,\" he told CNN. Before being honored with a Nobel Prize, Soyinka had written plays performed across the world, set up drama groups and held academic positions in the UK, the U.S. and Nigeria. Poetry and novels make up the rest of his work, but it is with the medium of drama that he feels most comfortable. \"When I write plays, I'm already seeing the shapes on stage, of the actors and their interaction, and so on and so forth. I don't think I've ever written one play as an abstract piece, as a literary piece, floating in the air somewhere, to be flushed out later on,\" he said. Now 75, Soyinka claims it is a natural curiosity with life and humanity that has always compelled him to write, and should be a pre-requisite for all aspiring writers. \"Before you're a writer, you're a citizen, a human being, and therefore the weapons of the citizen are at your disposal to use, or not use,\" he said. Often he's used those \"weapons\" to take aim at oppression, corruption, the legacy of colonialism and slavery, both in his work and more overtly in his political activism. At times he's suffered for his outspoken views; in 1967 he was imprisoned for 22 months for publicly calling for a truce during Nigeria's civil war and in 1994 was forced to leave the country when General Sani Abacha came to power. \"One of my earliest short stories in fact had to do with the story of a family which escaped enslavement by Portuguese slave leaders, so you can see how early I was preoccupied by that theme of power and freedom, and domination,\" he said. \"If there's one overriding political theme it's my continuing emphasis on the axis of freedom and power. I think that the history of the world, of all societies, has spun on those two axes.\" Soyinka spends his time between Nigeria, Europe and the U.S., much of it on the lecture circuit, but the need to write remains. \"I've always written plays for the purpose of getting something out of my system. That's the first stage. And the next stage is, I want to get it onto the platform -- onto the stage. And once I've done that, no matter where, I'm satisfied.\" Watch the show on CNN on Saturday, August 1, 12.30, 21.30 GMT and Sunday, August 2, 18.00 GMT.\nTL;DR: \nROLE 1: Given the text: ------------ Wole Soyinka's Writing for Poets: The Power To Turn Into a Writer\nPosted By Kristianne Diamandis On August 25, 2002 at 11:57 AM By Kristianne Diamandis\nReconstruct the summarized text to the detailed:\nROLE 2:  ------------\u00a0 Wole Soyinka's writing for Poets:\u00a0 The Power To Turn Into a Writer", "rollout/best_game/overall_score": -72.53490052034746, "rollout/best_game/accuracy (r2)": 0.02916165783669605, "_this_batch_num_rl_token": 10790, "num_rl_rollout": 7, "lm_epoch": 0, "rl_epoch": 0, "step": 603, "total_data_token": 1030379, "total_rl_token": 1288118, "total_lm_token": 950349, "total_token": 2238467, "completed_steps": 603, "rollout/num_train_sample": 640, "timestamp": 1771739334, "checkpoint_dir_name": null, "done": false, "training_iteration": 15, "trial_id": "24c8f244", "date": "2026-02-22_13-48-54", "time_this_iter_s": 46.67833948135376, "time_total_s": 684.9799230098724, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 684.9799230098724, "iterations_since_restore": 15}
+{"rl_info/A2G": -0.0016631183680146933, "rl_info/entropy": 3.0155723094940186, "rl_info/total_token": 1724.0, "rl_info/advantage_b4_norm": -765.47900390625, "rl_info/advantage_after_gnorm": -0.3265053927898407, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.1660102754831314, "train/lm_loss": 7.119722366333008, "train/total_loss": 7.285732746124268, "num_rl_rollout": 7, "lm_epoch": 0, "rl_epoch": 0, "step": 604, "total_data_token": 1031781, "total_rl_token": 1289842, "total_lm_token": 951751, "total_token": 2241593, "completed_steps": 604, "timestamp": 1771739334, "checkpoint_dir_name": null, "done": false, "training_iteration": 16, "trial_id": "24c8f244", "date": "2026-02-22_13-48-54", "time_this_iter_s": 0.42021775245666504, "time_total_s": 685.4001407623291, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 685.4001407623291, "iterations_since_restore": 16}
+{"rl_info/A2G": -0.0007514198659919202, "rl_info/entropy": 3.0055315494537354, "rl_info/total_token": 1869.0, "rl_info/advantage_b4_norm": -828.5433349609375, "rl_info/advantage_after_gnorm": -0.35548320412635803, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.07484143227338791, "train/lm_loss": 6.692391872406006, "train/total_loss": 6.767233371734619, "xsum/rouge1": 0.08561986477758597, "xsum/rouge2": 0.017859728998567474, "xsum/rougeL": 0.07106736100810528, "xsum/rougeLsum": 0.06976267886545791, "xsum/bertscore_precision": 0.673381045460701, "xsum/bertscore_recall": 0.7094724029302597, "xsum/bertscore_f1": 0.6868990957736969, "gigaword/rouge1": 0.01792436167122676, "gigaword/rouge2": 0.003345956332300886, "gigaword/rougeL": 0.017691869455089663, "gigaword/rougeLsum": 0.016934618667207704, "gigaword/bertscore_precision": 0.5690893116593361, "gigaword/bertscore_recall": 0.6516902428865433, "gigaword/bertscore_f1": 0.6067471693456173, "cnndm/rouge1": 0.11920229939533233, "cnndm/rouge2": 0.04171313526591555, "cnndm/rougeL": 0.09703887203070154, "cnndm/rougeLsum": 0.10671084450652886, "cnndm/bertscore_precision": 0.654333084821701, "cnndm/bertscore_recall": 0.7152138104041418, "cnndm/bertscore_f1": 0.6821664472421011, "samsum/rouge1": 0.06289126675067525, "samsum/rouge2": 0.012882282338243789, "samsum/rougeL": 0.05707222564552983, "samsum/rougeLsum": 0.04276697239831435, "samsum/bertscore_precision": 0.6353396276632944, "samsum/bertscore_recall": 0.6803758343060812, "samsum/bertscore_f1": 0.6566480696201324, "eval_agg/avg_all_rougef": 0.05253027113167395, "eval_agg/avg_all_bertf": 0.658115195495387, "eval_agg/avg_all": 0.3553227333135305, "num_rl_rollout": 7, "lm_epoch": 0, "rl_epoch": 0, "step": 700, "total_data_token": 1184210, "total_rl_token": 1492463, "total_lm_token": 1104180, "total_token": 2596643, "completed_steps": 700, "tune_objective": 0.7741418340825762, "timestamp": 1771739380, "checkpoint_dir_name": "checkpoint_000006", "should_checkpoint": true, "done": false, "training_iteration": 17, "trial_id": "24c8f244", "date": "2026-02-22_13-49-41", "time_this_iter_s": 46.26986765861511, "time_total_s": 731.6700084209442, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 731.6700084209442, "iterations_since_restore": 17}
+{"rollout/num_samples": 640, "rollout/avg_q1_length": 355.40625, "rollout/std_q1_length": 242.0531463623047, "rollout/gen/avg_score": -1011.3029174804688, "rollout/gen/std_score": 2270.206298828125, "rollout/gen/avg_r1_length": 211.515625, "rollout/gen/std_r1_length": 184.01100158691406, "rollout/gen/avg_r1_score": -465.8551025390625, "rollout/gen/std_r1_score": 1498.89013671875, "rollout/gen/avg_r1_accuracy": 0.15066833794116974, "rollout/gen/std_r1_accuracy": 0.05091378465294838, "rollout/gen/avg_r2_length": 382.609375, "rollout/gen/std_r2_length": 163.38697814941406, "rollout/gen/avg_r2_score": -1147.6650390625, "rollout/gen/std_r2_score": 2406.92578125, "rollout/gen/avg_r2_accuracy": 0.11800452321767807, "rollout/gen/std_r2_accuracy": 0.05841098353266716, "rollout/best_game/query_1": "The centre, which was proposed by former rugby player and coach Ben Harvey, will allow elite athletes to train in sports from rugby to surfing.\nPlanners recommended it be rejected because of concerns about its community value and the loss of green space.\nBut the Planning Committee approved it on the condition it was made available long-term for everyone in the island.\nOnly 40% of the centre is earmarked for schools and sports groups to use while the rest will be a private members-only gym.\nHowever, Mr Harvey said it was aimed \"primarily at the children of the island\", with schools getting free access to the facility and to staff.\nIt will be built next to the rugby club in St Peter.\nTL;DR: ", "rollout/best_game/query_2": "Given the text: \u00a0Trying something new for this island, if you just need some help with your own energy use needs, can easily turn you around.\nReconstruct the summarized text to the detailed:", "rollout/best_game/response_2": " \u00a0If you still need help with getting the words right you can look here .", "rollout/best_game/whole_game": "ROLE 0: The centre, which was proposed by former rugby player and coach Ben Harvey, will allow elite athletes to train in sports from rugby to surfing.\nPlanners recommended it be rejected because of concerns about its community value and the loss of green space.\nBut the Planning Committee approved it on the condition it was made available long-term for everyone in the island.\nOnly 40% of the centre is earmarked for schools and sports groups to use while the rest will be a private members-only gym.\nHowever, Mr Harvey said it was aimed \"primarily at the children of the island\", with schools getting free access to the facility and to staff.\nIt will be built next to the rugby club in St Peter.\nTL;DR: \nROLE 1: Given the text: \u00a0Trying something new for this island, if you just need some help with your own energy use needs, can easily turn you around.\nReconstruct the summarized text to the detailed:\nROLE 2:  \u00a0If you still need help with getting the words right you can look here .", "rollout/best_game/overall_score": -50.54095932403732, "rollout/best_game/accuracy (r2)": 0.028985507246376805, "_this_batch_num_rl_token": 11373, "num_rl_rollout": 8, "lm_epoch": 0, "rl_epoch": 0, "step": 703, "total_data_token": 1201068, "total_rl_token": 1497558, "total_lm_token": 1109665, "total_token": 2607223, "completed_steps": 703, "rollout/num_train_sample": 640, "timestamp": 1771739430, "checkpoint_dir_name": null, "done": false, "training_iteration": 18, "trial_id": "24c8f244", "date": "2026-02-22_13-50-30", "time_this_iter_s": 49.42563319206238, "time_total_s": 781.0956416130066, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 781.0956416130066, "iterations_since_restore": 18}
+{"rl_info/A2G": -0.0006135866860859096, "rl_info/entropy": 3.0641775131225586, "rl_info/total_token": 1672.0, "rl_info/advantage_b4_norm": -452.65997314453125, "rl_info/advantage_after_gnorm": 0.6946559548377991, "rl_info/kl_w_ref": 0.0, "train/rl_loss": 0.06105225160717964, "train/lm_loss": 6.900911808013916, "train/total_loss": 6.961964130401611, "xsum/rouge1": 0.08483803299918875, "xsum/rouge2": 0.019623212928474902, "xsum/rougeL": 0.08195133977624622, "xsum/rougeLsum": 0.07218261351250692, "xsum/bertscore_precision": 0.6857275863488516, "xsum/bertscore_recall": 0.7028991182645162, "xsum/bertscore_f1": 0.6907219886779785, "gigaword/rouge1": 0.018297189427215782, "gigaword/rouge2": 0.0036472783996404174, "gigaword/rougeL": 0.018197179426215678, "gigaword/rougeLsum": 0.01677477526128704, "gigaword/bertscore_precision": 0.5718533997237683, "gigaword/bertscore_recall": 0.658249785900116, "gigaword/bertscore_f1": 0.6110759866237641, "cnndm/rouge1": 0.1247738967534499, "cnndm/rouge2": 0.04371345899194809, "cnndm/rougeL": 0.11208631550081943, "cnndm/rougeLsum": 0.1075812720885397, "cnndm/bertscore_precision": 0.6611810823281606, "cnndm/bertscore_recall": 0.7160364538431168, "cnndm/bertscore_f1": 0.6858159353335699, "samsum/rouge1": 0.061221848337877464, "samsum/rouge2": 0.016417385442513716, "samsum/rougeL": 0.052437415443351086, "samsum/rougeLsum": 0.0402346169201337, "samsum/bertscore_precision": 0.6325927923123041, "samsum/bertscore_recall": 0.6893507689237595, "samsum/bertscore_f1": 0.658875048160553, "eval_agg/avg_all_rougef": 0.054623614450588054, "eval_agg/avg_all_bertf": 0.6616222396989664, "eval_agg/avg_all": 0.3581229270747772, "num_rl_rollout": 8, "lm_epoch": 0, "rl_epoch": 0, "step": 800, "total_data_token": 1346674, "total_rl_token": 1707847, "total_lm_token": 1255271, "total_token": 2963118, "completed_steps": 800, "tune_objective": 0.7796870211335415, "timestamp": 1771739476, "checkpoint_dir_name": "checkpoint_000007", "should_checkpoint": true, "done": true, "training_iteration": 19, "trial_id": "24c8f244", "date": "2026-02-22_13-51-17", "time_this_iter_s": 46.6529655456543, "time_total_s": 827.7486071586609, "pid": 3469485, "hostname": "lagoon", "node_ip": "10.2.1.31", "config": {"train_loop_config": {"dataset_name": "nbtpj/summ_ds_train", "dataset_config_name": null, "train_split_name": "merge36_cnndmsamsumxsum", "text_template": "{text}\nTL;DR: {summary}", "label_col": "summary", "freeze_role2": false, "only_train_role1": false, "model_name_or_path": "gpt2", "ref_role1_name_or_path": "gpt2", "ref_role2_name_or_path": "gpt2", "pretrained_role2_name_or_path": "none", "config_name": null, "vectorizer_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/vectorizer/wikitext103_tfidf_full.joblib", "tokenizer_name": null, "use_slow_tokenizer": false, "per_device_train_batch_size": 4, "per_device_query_rollout_batch_size": 32, "per_device_eval_batch_size": 4, "vllm_vram_ratio": 0.3, "learning_rate": 3e-07, "grad_norm": 0.5, "weight_decay": 1e-05, "max_train_steps": 40000, "max_train_rollouts": 100000, "gradient_accumulation_steps": 1, "lr_scheduler_type": "constant", "num_warmup_steps": 200, "seed": 0, "model_type": null, "block_size": 1024, "mini_epoch": 1, "rollout_game": "baseline3v2", "rl_algo": "on_policy", "constraint_type": "kl", "clamp_update": false, "rl_w": 1.0, "lm_w": 1.0, "n_generate": 4, "n_augment": 0, "gradient_checkpoint": false, "group_relative_norm": true, "sample_config": {"do_sample": true, "min_new_tokens": 20, "temperature": 1.0}, "inference_config": {"do_sample": true, "temperature": 0.0, "min_new_tokens": 5, "max_new_tokens": 250}, "rollout_config": {"accuracy_w": 0.34179529120610125, "len_pen": 1.0, "accuracy_w2": 14.68588598920087, "len_pen2": 1.0, "threshold": 0.011423254155608374, "similarity_fn": "rouge"}, "ent_coef": 0.0001, "beta_coef": "0.0", "prompt_0": "{text}", "prompt_1": "{text}\nTL;DR: ", "prompt_2": "Given the text: {role1_output}\nReconstruct the summarized text to the detailed:", "prompt_eval": "{text}\nTL;DR:", "epsilon": 0.2, "a2g_norm": true, "vllm_sleep": true, "lora": true, "need_attn_mask": true, "gamma": 0.95, "trust_remote_code": true, "test_glue": false, "test_clm": false, "causal_model": true, "test_gen": true, "log_rollout_txt": true, "trunc_eval": 256, "trunc_evals": ["cnndm___12", "samsum___12", "xsum___12", "gigaword___200", "duc___50"], "use_deepspeed": false, "zero_config": 2, "log_interval": "5m", "eval_interval": "100", "checkpoint_interval": "100", "lm_fraction": -1.0, "push_to_hub": null, "keep_eval_size": false, "mixed_precision": "bf16", "tune_metrics": ["cnndm/rouge1___1.0", "cnndm/bertscore_f1___0.25", "samsum/rouge1___1.0", "samsum/bertscore_f1___0.25", "xsum/rouge1___1.0", "xsum/bertscore_f1___0.25"], "base_path": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2", "script": "/common/home/users/m/mq.nguyen.2023/testcode/SAC_LM/module9_clmv2/execute/utune/debug.py", "train_from_raw": true, "world_size": 1, "cpu_per_worker": 7, "gpu_per_worker": 1}}, "time_since_restore": 827.7486071586609, "iterations_since_restore": 19}