a12354 committed 16 days ago

Commit

7d22995

verified ·

1 Parent(s): 8d2b389

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
img/architecture.png +3 -0
rats40k_adapter/README.md +70 -0
rats40k_adapter/eval_rats40k.py +244 -0
rats40k_adapter/finetune_rats40k_lora.py +314 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/README.md +202 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/adapter_config.json +37 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/added_tokens.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_0.pth +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_1.pth +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_2.pth +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_3.pth +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/scaler.pt +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/scheduler.pt +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/tokenizer.model +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/training_args.bin +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/added_tokens.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/scheduler.pt +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/special_tokens_map.json +30 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/tokenizer.model +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/tokenizer_config.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/trainer_state.json +692 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/training_args.bin +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/special_tokens_map.json +30 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer.model +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer_config.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/training_args.bin +3 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/TSAD_test_metrics.json +16 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank0.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank1.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank2.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank3.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/TSAD_test_metrics.json +16 -0
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank0.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank1.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank2.json +0 -0
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank3.json +0 -0
rats40k_adapter/rats40k_common.py +347 -0
rats40k_adapter/run_sft_4gpu.sh +145 -0
rats40k_adapter/run_zeroshot_4gpu.sh +81 -0
rats40k_adapter/run_zeroshot_then_sft_4gpu.sh +75 -0
training/finetune.py +133 -0
training/finetune.sh +37 -0
training/pretrain.py +154 -0
training/pretrain.sh +37 -0
tsqa_adapter/logs/sft_4gpu_20260615_140322.log +875 -0
tsqa_adapter/logs/sft_4gpu_20260615_141604.log +210 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+img/architecture.png filter=lfs diff=lfs merge=lfs -text

img/architecture.png ADDED Viewed

Git LFS Details

SHA256: acc14adddd5e8986d8857509d1f3f731020ee423970c0848e488b865c6c6231b
Pointer size: 131 Bytes
Size of remote file: 310 kB

rats40k_adapter/README.md ADDED Viewed

	@@ -0,0 +1,70 @@

+# RATs40K Adapter for ChatTime
+This folder adapts ChatTime to the RATs40K univariate anomaly QA task.
+It intentionally uses the numeric `Observation` field only. It does not use
+`FigurePath`, so the resulting baseline should be compared with numeric-only
+Time-RA settings rather than VLM image-input settings.
+## Required inputs
+- `MODEL_PATH`: local ChatTime model directory. This is required by default.
+- `PYTHON_BIN`: Python executable. The shell scripts default to
+  `/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python`.
+- `DATA_PATH`: defaults to
+  `/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json`.
+The scripts do not download Hugging Face weights unless `ALLOW_HF_DOWNLOAD=1`
+is set explicitly.
+The default precision is FP16 because the configured four-GPU machine uses
+Tesla V100 GPUs. SFT defaults to regular FP16 LoRA with
+`LOAD_IN_4BIT=0`, `PER_DEVICE_TRAIN_BATCH_SIZE=1`, and
+`GRADIENT_ACCUMULATION_STEPS=16`.
+Evaluation defaults to `EVAL_BATCH_SIZE=4` per GPU. With four GPUs, the
+maximum global evaluation batch size is 16.
+The task prompt is aligned with Time-RA's univariate
+`USER_DETECTION_PROMPT`; ChatTime still receives the normalized/discretized
+series through its native `### Input` section. Evaluation checks prompt token
+lengths and fails instead of truncating. Defaults are `MAX_INPUT_TOKENS=3936`,
+`MAX_NEW_TOKENS=160`, and `MAX_SEQ_LENGTH=4096` for SFT.
+## Zero-shot
+```bash
+cd /mnt/share01/sqk/ChatTime
+MODEL_PATH=/mnt/share01/sqk/models/ChatTime-1-7B-Chat \
+bash rats40k_adapter/run_zeroshot_4gpu.sh
+```
+## SFT + Eval
+```bash
+cd /mnt/share01/sqk/ChatTime
+MODEL_PATH=/mnt/share01/sqk/models/ChatTime-1-7B-Chat \
+bash rats40k_adapter/run_sft_4gpu.sh
+```
+## Zero-shot Then SFT + Eval
+```bash
+cd /mnt/share01/sqk/ChatTime
+bash rats40k_adapter/run_zeroshot_then_sft_4gpu.sh
+```
+## Saved Results
+- Zero-shot outputs:
+  `/mnt/share01/sqk/ChatTime/rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot`
+- SFT outputs:
+  `/mnt/share01/sqk/ChatTime/rats40k_adapter/outputs/pipeline_20260608_175250/sft`
+Useful smoke-test knobs:
+```bash
+MAX_TRAIN_SAMPLES=128 MAX_EVAL_SAMPLES=64 bash rats40k_adapter/run_zeroshot_then_sft_4gpu.sh
+MAX_TRAIN_SAMPLES=128 MAX_EVAL_SAMPLES=64 bash rats40k_adapter/run_sft_4gpu.sh
+MAX_EVAL_SAMPLES=64 bash rats40k_adapter/run_zeroshot_4gpu.sh
+```

rats40k_adapter/eval_rats40k.py ADDED Viewed

	@@ -0,0 +1,244 @@

+import argparse
+import copy
+import os
+from pathlib import Path
+import torch
+import torch.distributed as dist
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from rats40k_common import (
+    atomic_write_json,
+    build_prediction,
+    build_prompt,
+    compute_metrics,
+    load_dataset_json,
+    valid_split_items,
+)
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Evaluate ChatTime on RATs40K univariate anomaly QA."
+    )
+    parser.add_argument("--model_path", required=True)
+    parser.add_argument("--adapter_path", default=None)
+    parser.add_argument(
+        "--data_path",
+        default="/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json",
+    )
+    parser.add_argument("--split", default="TSAD_test")
+    parser.add_argument("--output_dir", required=True)
+    parser.add_argument("--result_name", default=None)
+    parser.add_argument("--max_eval_samples", type=int, default=None)
+    parser.add_argument("--eval_batch_size", type=int, default=4)
+    parser.add_argument("--max_input_tokens", type=int, default=3936)
+    parser.add_argument("--max_new_tokens", type=int, default=160)
+    parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--top_p", type=float, default=1.0)
+    parser.add_argument("--top_k", type=int, default=50)
+    parser.add_argument("--torch_dtype", choices=["auto", "bf16", "fp16", "fp32"], default="fp16")
+    parser.add_argument("--allow_hf_download", action="store_true")
+    return parser.parse_args()
+def init_distributed():
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    rank = int(os.environ.get("RANK", "0"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    if torch.cuda.is_available():
+        torch.cuda.set_device(local_rank)
+    if world_size > 1 and not dist.is_initialized():
+        dist.init_process_group(backend="nccl")
+    return rank, local_rank, world_size
+def dtype_from_arg(value):
+    if value == "auto":
+        return "auto"
+    if value == "bf16":
+        return torch.bfloat16
+    if value == "fp16":
+        return torch.float16
+    return torch.float32
+def load_model_and_tokenizer(args, local_rank):
+    local_files_only = not args.allow_hf_download
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model_path,
+        trust_remote_code=True,
+        local_files_only=local_files_only,
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "left"
+    device = f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu"
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_path,
+        trust_remote_code=True,
+        torch_dtype=dtype_from_arg(args.torch_dtype),
+        low_cpu_mem_usage=True,
+        device_map={"": device} if torch.cuda.is_available() else None,
+        local_files_only=local_files_only,
+    )
+    if args.adapter_path:
+        try:
+            from peft import PeftModel
+        except Exception as exc:
+            raise RuntimeError(
+                "peft is required when --adapter_path is provided. "
+                "Install peft in the selected Python environment."
+            ) from exc
+        model = PeftModel.from_pretrained(
+            model,
+            args.adapter_path,
+            local_files_only=local_files_only,
+        )
+    model.eval()
+    return model, tokenizer, device
+def generate_responses(model, tokenizer, device, prompts, args):
+    raw_encodings = tokenizer(
+        prompts,
+        add_special_tokens=True,
+        truncation=False,
+    )
+    prompt_lengths = [len(input_ids) for input_ids in raw_encodings["input_ids"]]
+    max_prompt_length = max(prompt_lengths) if prompt_lengths else 0
+    if max_prompt_length > args.max_input_tokens:
+        longest = max(range(len(prompt_lengths)), key=lambda idx: prompt_lengths[idx])
+        raise RuntimeError(
+            "Prompt token length exceeds max_input_tokens. "
+            f"max_prompt_length={max_prompt_length}, "
+            f"max_input_tokens={args.max_input_tokens}, "
+            f"batch_index={longest}. Increase MAX_INPUT_TOKENS or shorten the prompt."
+        )
+    model_context = getattr(model.config, "max_position_embeddings", None)
+    if model_context and max_prompt_length + args.max_new_tokens > model_context:
+        raise RuntimeError(
+            "Prompt plus generation budget exceeds model context length. "
+            f"max_prompt_length={max_prompt_length}, "
+            f"max_new_tokens={args.max_new_tokens}, "
+            f"model_context={model_context}. "
+            "Lower MAX_NEW_TOKENS or MAX_INPUT_TOKENS."
+        )
+    inputs = tokenizer(
+        prompts,
+        return_tensors="pt",
+        padding=True,
+        truncation=False,
+    )
+    inputs = {key: value.to(device) for key, value in inputs.items()}
+    do_sample = args.temperature > 0
+    generation_config = copy.deepcopy(model.generation_config)
+    generation_config.do_sample = do_sample
+    generation_config.pad_token_id = tokenizer.pad_token_id
+    generation_config.eos_token_id = tokenizer.eos_token_id
+    if do_sample:
+        generation_config.temperature = args.temperature
+        generation_config.top_p = args.top_p
+        generation_config.top_k = args.top_k
+    else:
+        generation_config.temperature = None
+        generation_config.top_p = None
+        generation_config.top_k = None
+    generation_kwargs = {
+        "max_new_tokens": args.max_new_tokens,
+    }
+    with torch.inference_mode():
+        output = model.generate(
+            **inputs,
+            generation_config=generation_config,
+            **generation_kwargs,
+        )
+    new_tokens = output[:, inputs["input_ids"].shape[-1] :]
+    return [
+        response.strip()
+        for response in tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+    ]
+def main():
+    args = parse_args()
+    rank, local_rank, world_size = init_distributed()
+    output_dir = Path(args.output_dir)
+    shard_dir = output_dir / "shards"
+    shard_dir.mkdir(parents=True, exist_ok=True)
+    data = load_dataset_json(args.data_path)
+    items = valid_split_items(data, args.split)
+    if args.max_eval_samples is not None and args.max_eval_samples >= 0:
+        items = items[: args.max_eval_samples]
+    shard_items = items[rank::world_size]
+    if rank == 0:
+        print(f"Dataset: {args.data_path}")
+        print(f"Split: {args.split}")
+        print(f"Total samples: {len(items)}")
+        print(f"World size: {world_size}")
+        print(f"Per-device eval batch size: {args.eval_batch_size}")
+        print(f"Maximum global eval batch size: {args.eval_batch_size * world_size}")
+        print(f"Output dir: {output_dir}")
+    model, tokenizer, device = load_model_and_tokenizer(args, local_rank)
+    predictions = {}
+    batch_size = max(1, args.eval_batch_size)
+    batch_starts = range(0, len(shard_items), batch_size)
+    for start in tqdm(
+        batch_starts,
+        total=(len(shard_items) + batch_size - 1) // batch_size,
+        desc=f"rank {rank}",
+        disable=rank != 0,
+    ):
+        batch_items = shard_items[start : start + batch_size]
+        prompts = [
+            build_prompt(item["Observation"], item.get("Source"))
+            for _, item in batch_items
+        ]
+        responses = generate_responses(model, tokenizer, device, prompts, args)
+        for (idx, _), response in zip(batch_items, responses):
+            predictions[idx] = build_prediction(response)
+    result_name = args.result_name or f"{args.split}_predictions.json"
+    shard_path = shard_dir / f"{Path(result_name).stem}.rank{rank}.json"
+    atomic_write_json(predictions, shard_path)
+    if world_size > 1:
+        dist.barrier()
+    if rank == 0:
+        merged = {}
+        for shard_rank in range(world_size):
+            path = shard_dir / f"{Path(result_name).stem}.rank{shard_rank}.json"
+            shard = load_dataset_json(path)
+            merged.update(shard)
+        def sort_key(pair):
+            key = pair[0]
+            return (0, int(key)) if key.isdigit() else (1, key)
+        merged = dict(sorted(merged.items(), key=sort_key))
+        result_path = output_dir / result_name
+        metrics_path = output_dir / f"{args.split}_metrics.json"
+        atomic_write_json({args.split: merged}, result_path)
+        atomic_write_json(compute_metrics(data, merged, args.split), metrics_path)
+        print(f"Saved predictions: {result_path}")
+        print(f"Saved metrics: {metrics_path}")
+    if world_size > 1:
+        dist.barrier()
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

rats40k_adapter/finetune_rats40k_lora.py ADDED Viewed

	@@ -0,0 +1,314 @@

+import argparse
+import inspect
+import os
+from pathlib import Path
+import torch
+from torch.utils.data import Dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
+from rats40k_common import build_prompt, build_response, load_dataset_json, valid_split_items
+def patch_accelerate_compatibility():
+    from accelerate import Accelerator
+    signature = inspect.signature(Accelerator.unwrap_model)
+    if "keep_torch_compile" in signature.parameters:
+        return
+    original_unwrap_model = Accelerator.unwrap_model
+    def unwrap_model_compat(
+        self,
+        model,
+        keep_fp32_wrapper=True,
+        keep_torch_compile=True,
+    ):
+        del keep_torch_compile
+        return original_unwrap_model(
+            self,
+            model,
+            keep_fp32_wrapper=keep_fp32_wrapper,
+        )
+    Accelerator.unwrap_model = unwrap_model_compat
+    print(
+        "Applied accelerate compatibility patch: "
+        "Accelerator.unwrap_model accepts keep_torch_compile."
+    )
+class PromptResponseDataset(Dataset):
+    def __init__(self, rows, tokenizer, max_seq_length):
+        self.rows = rows
+        self.tokenizer = tokenizer
+        self.max_seq_length = max_seq_length
+        self._validate_lengths()
+    def __len__(self):
+        return len(self.rows)
+    def __getitem__(self, index):
+        _, prompt, response = self.rows[index]
+        prompt_ids = self.tokenizer(
+            prompt,
+            add_special_tokens=True,
+            truncation=False,
+        )["input_ids"]
+        response_ids = self.tokenizer(
+            response + self.tokenizer.eos_token,
+            add_special_tokens=False,
+            truncation=False,
+        )["input_ids"]
+        input_ids = prompt_ids + response_ids
+        prompt_len = len(prompt_ids)
+        labels = [-100] * prompt_len + input_ids[prompt_len:]
+        attention_mask = [1] * len(input_ids)
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+        }
+    def _validate_lengths(self):
+        max_prompt_len = 0
+        max_total_len = 0
+        too_long = []
+        for idx, prompt, response in self.rows:
+            prompt_ids = self.tokenizer(
+                prompt,
+                add_special_tokens=True,
+                truncation=False,
+            )["input_ids"]
+            response_ids = self.tokenizer(
+                response + self.tokenizer.eos_token,
+                add_special_tokens=False,
+                truncation=False,
+            )["input_ids"]
+            prompt_len = len(prompt_ids)
+            total_len = prompt_len + len(response_ids)
+            max_prompt_len = max(max_prompt_len, prompt_len)
+            max_total_len = max(max_total_len, total_len)
+            if total_len > self.max_seq_length:
+                too_long.append((idx, prompt_len, total_len))
+                if len(too_long) >= 5:
+                    break
+        print(
+            "SFT token length check: "
+            f"max_prompt_len={max_prompt_len}, "
+            f"max_total_len={max_total_len}, "
+            f"max_seq_length={self.max_seq_length}"
+        )
+        if too_long:
+            examples = ", ".join(
+                f"{idx}:prompt={prompt_len},total={total_len}"
+                for idx, prompt_len, total_len in too_long
+            )
+            raise RuntimeError(
+                "Some SFT samples exceed max_seq_length and would be truncated. "
+                f"Examples: {examples}. Increase MAX_SEQ_LENGTH or shorten the prompt."
+            )
+class CausalLMCollator:
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+    def __call__(self, features):
+        max_len = max(len(feature["input_ids"]) for feature in features)
+        input_ids = []
+        attention_mask = []
+        labels = []
+        for feature in features:
+            pad_len = max_len - len(feature["input_ids"])
+            input_ids.append(feature["input_ids"] + [self.tokenizer.pad_token_id] * pad_len)
+            attention_mask.append(feature["attention_mask"] + [0] * pad_len)
+            labels.append(feature["labels"] + [-100] * pad_len)
+        return {
+            "input_ids": torch.tensor(input_ids, dtype=torch.long),
+            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
+            "labels": torch.tensor(labels, dtype=torch.long),
+        }
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="LoRA SFT for ChatTime on RATs40K univariate anomaly QA."
+    )
+    parser.add_argument("--model_path", required=True)
+    parser.add_argument(
+        "--data_path",
+        default="/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json",
+    )
+    parser.add_argument("--train_split", default="TSAD_train")
+    parser.add_argument("--output_dir", required=True)
+    parser.add_argument("--max_train_samples", type=int, default=None)
+    parser.add_argument("--max_seq_length", type=int, default=4096)
+    parser.add_argument("--per_device_train_batch_size", type=int, default=2)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=8)
+    parser.add_argument("--num_train_epochs", type=float, default=2.0)
+    parser.add_argument("--learning_rate", type=float, default=2e-4)
+    parser.add_argument("--warmup_ratio", type=float, default=0.05)
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--logging_steps", type=int, default=10)
+    parser.add_argument("--save_steps", type=int, default=200)
+    parser.add_argument("--save_total_limit", type=int, default=2)
+    parser.add_argument("--dataloader_num_workers", type=int, default=4)
+    parser.add_argument("--lora_rank", type=int, default=16)
+    parser.add_argument("--lora_alpha", type=int, default=32)
+    parser.add_argument("--lora_dropout", type=float, default=0.05)
+    parser.add_argument("--load_in_4bit", action="store_true")
+    parser.add_argument("--gradient_checkpointing", action="store_true")
+    parser.add_argument("--torch_dtype", choices=["bf16", "fp16", "fp32"], default="fp16")
+    parser.add_argument("--allow_hf_download", action="store_true")
+    return parser.parse_args()
+def dtype_from_arg(value):
+    if value == "bf16":
+        return torch.bfloat16
+    if value == "fp16":
+        return torch.float16
+    return torch.float32
+def local_rank():
+    return int(os.environ.get("LOCAL_RANK", "0"))
+def build_rows(data_path, split, max_samples):
+    data = load_dataset_json(data_path)
+    items = valid_split_items(data, split)
+    if max_samples is not None and max_samples >= 0:
+        items = items[:max_samples]
+    rows = []
+    for idx, item in items:
+        prompt = build_prompt(item["Observation"], item.get("Source"))
+        response = build_response(item.get("Thought", ""), item.get("ActionID"))
+        rows.append((idx, prompt, response))
+    return rows
+def load_model(args):
+    try:
+        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+    except Exception as exc:
+        raise RuntimeError(
+            "peft is required for SFT. Install peft in the selected Python environment."
+        ) from exc
+    dtype = dtype_from_arg(args.torch_dtype)
+    local_files_only = not args.allow_hf_download
+    device = f"cuda:{local_rank()}" if torch.cuda.is_available() else "cpu"
+    quantization_config = None
+    if args.load_in_4bit:
+        try:
+            from transformers import BitsAndBytesConfig
+        except Exception as exc:
+            raise RuntimeError(
+                "transformers BitsAndBytesConfig is required for --load_in_4bit."
+            ) from exc
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=dtype,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+        )
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_path,
+        trust_remote_code=True,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+        quantization_config=quantization_config,
+        device_map={"": device} if torch.cuda.is_available() and args.load_in_4bit else None,
+        local_files_only=local_files_only,
+    )
+    if not args.load_in_4bit and torch.cuda.is_available():
+        model.to(device)
+    if args.load_in_4bit:
+        model = prepare_model_for_kbit_training(
+            model,
+            use_gradient_checkpointing=args.gradient_checkpointing,
+        )
+    elif args.gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+    lora_config = LoraConfig(
+        r=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        bias="none",
+        task_type="CAUSAL_LM",
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+    )
+    model = get_peft_model(model, lora_config)
+    model.print_trainable_parameters()
+    return model
+def main():
+    args = parse_args()
+    patch_accelerate_compatibility()
+    if torch.cuda.is_available():
+        torch.cuda.set_device(local_rank())
+    local_files_only = not args.allow_hf_download
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model_path,
+        trust_remote_code=True,
+        local_files_only=local_files_only,
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+    rows = build_rows(args.data_path, args.train_split, args.max_train_samples)
+    train_dataset = PromptResponseDataset(rows, tokenizer, args.max_seq_length)
+    model = load_model(args)
+    bf16 = args.torch_dtype == "bf16"
+    fp16 = args.torch_dtype == "fp16"
+    training_args = TrainingArguments(
+        output_dir=args.output_dir,
+        per_device_train_batch_size=args.per_device_train_batch_size,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        num_train_epochs=args.num_train_epochs,
+        learning_rate=args.learning_rate,
+        warmup_ratio=args.warmup_ratio,
+        weight_decay=args.weight_decay,
+        logging_steps=args.logging_steps,
+        save_steps=args.save_steps,
+        save_total_limit=args.save_total_limit,
+        dataloader_num_workers=args.dataloader_num_workers,
+        bf16=bf16,
+        fp16=fp16,
+        report_to="none",
+        remove_unused_columns=False,
+        ddp_find_unused_parameters=False,
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        data_collator=CausalLMCollator(tokenizer),
+    )
+    trainer.train()
+    trainer.save_model(args.output_dir)
+    tokenizer.save_pretrained(args.output_dir)
+if __name__ == "__main__":
+    main()

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: /mnt/share01/sqk/models/ChatTime-1-7B-Chat
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/mnt/share01/sqk/models/ChatTime-1-7B-Chat",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/added_tokens.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4c9bdb7f1fdf439aa0f3c5fb41c3ce23e5e6e873bea3f378cf26a709d3a3d22
+size 15024

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f195cb3d44682c30ad9c0e1b320d29a952dc22676a666d5b7c0a105f554e012b
+size 15024

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d644f876963b59d7d58322d0dbd4f84b5f005eb85a095c14ef20d7e8528948b
+size 15024

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2835319bf595568b23d432fbbab931291be0d746234b19ee4344a5852238e357
+size 15024

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba165e391bcfa2e1188f6c4a775e972bb6f49e4c5970a96da748324529cedb20
+size 988

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a175fc835d2784e8615427cf828af918ee04b274e34925b9edf89d29106ab1c1
+size 1064

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:277bbf113ecf76ec5b62586e2b4fa91501b2571b1380f4721de69ef68675511f
+size 5432

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/added_tokens.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:617fa12ac6cb39783256991c6577b58ec2981bdfd4cdfb58008163c743049429
+size 1064

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/trainer_state.json ADDED Viewed

	@@ -0,0 +1,692 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 946,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.021144442976080348,
+      "grad_norm": 6.2254319190979,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 3.0149,
+      "step": 10
+    },
+    {
+      "epoch": 0.042288885952160696,
+      "grad_norm": 0.7327573895454407,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 1.623,
+      "step": 20
+    },
+    {
+      "epoch": 0.06343332892824105,
+      "grad_norm": 0.5618261098861694,
+      "learning_rate": 0.00011250000000000001,
+      "loss": 1.1099,
+      "step": 30
+    },
+    {
+      "epoch": 0.08457777190432139,
+      "grad_norm": 0.48980122804641724,
+      "learning_rate": 0.00015416666666666668,
+      "loss": 0.9131,
+      "step": 40
+    },
+    {
+      "epoch": 0.10572221488040175,
+      "grad_norm": 0.488565593957901,
+      "learning_rate": 0.00019583333333333334,
+      "loss": 0.7716,
+      "step": 50
+    },
+    {
+      "epoch": 0.1268666578564821,
+      "grad_norm": 0.4297373592853546,
+      "learning_rate": 0.0001979955456570156,
+      "loss": 0.723,
+      "step": 60
+    },
+    {
+      "epoch": 0.14801110083256244,
+      "grad_norm": 0.4536869525909424,
+      "learning_rate": 0.0001957683741648107,
+      "loss": 0.6879,
+      "step": 70
+    },
+    {
+      "epoch": 0.16915554380864278,
+      "grad_norm": 0.41550788283348083,
+      "learning_rate": 0.0001935412026726058,
+      "loss": 0.6586,
+      "step": 80
+    },
+    {
+      "epoch": 0.19029998678472315,
+      "grad_norm": 0.38494592905044556,
+      "learning_rate": 0.0001913140311804009,
+      "loss": 0.6324,
+      "step": 90
+    },
+    {
+      "epoch": 0.2114444297608035,
+      "grad_norm": 0.3633956015110016,
+      "learning_rate": 0.000189086859688196,
+      "loss": 0.631,
+      "step": 100
+    },
+    {
+      "epoch": 0.23258887273688383,
+      "grad_norm": 0.3775959312915802,
+      "learning_rate": 0.0001868596881959911,
+      "loss": 0.6103,
+      "step": 110
+    },
+    {
+      "epoch": 0.2537333157129642,
+      "grad_norm": 0.35080480575561523,
+      "learning_rate": 0.0001846325167037862,
+      "loss": 0.6159,
+      "step": 120
+    },
+    {
+      "epoch": 0.2748777586890445,
+      "grad_norm": 0.4399946928024292,
+      "learning_rate": 0.0001824053452115813,
+      "loss": 0.5983,
+      "step": 130
+    },
+    {
+      "epoch": 0.2960222016651249,
+      "grad_norm": 0.4049876928329468,
+      "learning_rate": 0.0001801781737193764,
+      "loss": 0.5881,
+      "step": 140
+    },
+    {
+      "epoch": 0.31716664464120525,
+      "grad_norm": 0.3834834396839142,
+      "learning_rate": 0.0001779510022271715,
+      "loss": 0.5703,
+      "step": 150
+    },
+    {
+      "epoch": 0.33831108761728557,
+      "grad_norm": 0.3201199471950531,
+      "learning_rate": 0.0001757238307349666,
+      "loss": 0.5777,
+      "step": 160
+    },
+    {
+      "epoch": 0.35945553059336594,
+      "grad_norm": 0.3475135564804077,
+      "learning_rate": 0.0001734966592427617,
+      "loss": 0.5627,
+      "step": 170
+    },
+    {
+      "epoch": 0.3805999735694463,
+      "grad_norm": 0.3944849371910095,
+      "learning_rate": 0.0001712694877505568,
+      "loss": 0.569,
+      "step": 180
+    },
+    {
+      "epoch": 0.4017444165455266,
+      "grad_norm": 0.3674592673778534,
+      "learning_rate": 0.0001690423162583519,
+      "loss": 0.5621,
+      "step": 190
+    },
+    {
+      "epoch": 0.422888859521607,
+      "grad_norm": 0.3651335835456848,
+      "learning_rate": 0.000166815144766147,
+      "loss": 0.5597,
+      "step": 200
+    },
+    {
+      "epoch": 0.4440333024976873,
+      "grad_norm": 0.3435162901878357,
+      "learning_rate": 0.0001645879732739421,
+      "loss": 0.5538,
+      "step": 210
+    },
+    {
+      "epoch": 0.46517774547376767,
+      "grad_norm": 0.3870578408241272,
+      "learning_rate": 0.0001623608017817372,
+      "loss": 0.5574,
+      "step": 220
+    },
+    {
+      "epoch": 0.48632218844984804,
+      "grad_norm": 0.40335071086883545,
+      "learning_rate": 0.0001601336302895323,
+      "loss": 0.5394,
+      "step": 230
+    },
+    {
+      "epoch": 0.5074666314259284,
+      "grad_norm": 0.3105282187461853,
+      "learning_rate": 0.0001579064587973274,
+      "loss": 0.5403,
+      "step": 240
+    },
+    {
+      "epoch": 0.5286110744020087,
+      "grad_norm": 0.3729188144207001,
+      "learning_rate": 0.00015567928730512252,
+      "loss": 0.5466,
+      "step": 250
+    },
+    {
+      "epoch": 0.549755517378089,
+      "grad_norm": 0.3619287312030792,
+      "learning_rate": 0.0001534521158129176,
+      "loss": 0.5305,
+      "step": 260
+    },
+    {
+      "epoch": 0.5708999603541695,
+      "grad_norm": 0.34232136607170105,
+      "learning_rate": 0.0001512249443207127,
+      "loss": 0.5319,
+      "step": 270
+    },
+    {
+      "epoch": 0.5920444033302498,
+      "grad_norm": 0.38660332560539246,
+      "learning_rate": 0.0001489977728285078,
+      "loss": 0.5242,
+      "step": 280
+    },
+    {
+      "epoch": 0.6131888463063301,
+      "grad_norm": 0.35314109921455383,
+      "learning_rate": 0.0001467706013363029,
+      "loss": 0.5255,
+      "step": 290
+    },
+    {
+      "epoch": 0.6343332892824105,
+      "grad_norm": 0.3418401777744293,
+      "learning_rate": 0.00014454342984409802,
+      "loss": 0.5357,
+      "step": 300
+    },
+    {
+      "epoch": 0.6554777322584908,
+      "grad_norm": 0.357149213552475,
+      "learning_rate": 0.0001423162583518931,
+      "loss": 0.5131,
+      "step": 310
+    },
+    {
+      "epoch": 0.6766221752345711,
+      "grad_norm": 0.3720100224018097,
+      "learning_rate": 0.0001400890868596882,
+      "loss": 0.5072,
+      "step": 320
+    },
+    {
+      "epoch": 0.6977666182106516,
+      "grad_norm": 0.342650443315506,
+      "learning_rate": 0.0001378619153674833,
+      "loss": 0.5194,
+      "step": 330
+    },
+    {
+      "epoch": 0.7189110611867319,
+      "grad_norm": 0.34781211614608765,
+      "learning_rate": 0.00013563474387527841,
+      "loss": 0.5094,
+      "step": 340
+    },
+    {
+      "epoch": 0.7400555041628122,
+      "grad_norm": 0.3401576280593872,
+      "learning_rate": 0.00013340757238307352,
+      "loss": 0.5192,
+      "step": 350
+    },
+    {
+      "epoch": 0.7611999471388926,
+      "grad_norm": 0.3490856885910034,
+      "learning_rate": 0.0001311804008908686,
+      "loss": 0.5045,
+      "step": 360
+    },
+    {
+      "epoch": 0.7823443901149729,
+      "grad_norm": 0.3488720655441284,
+      "learning_rate": 0.0001289532293986637,
+      "loss": 0.502,
+      "step": 370
+    },
+    {
+      "epoch": 0.8034888330910532,
+      "grad_norm": 0.37278613448143005,
+      "learning_rate": 0.00012672605790645878,
+      "loss": 0.5038,
+      "step": 380
+    },
+    {
+      "epoch": 0.8246332760671337,
+      "grad_norm": 0.3677748441696167,
+      "learning_rate": 0.00012449888641425391,
+      "loss": 0.505,
+      "step": 390
+    },
+    {
+      "epoch": 0.845777719043214,
+      "grad_norm": 0.3815574049949646,
+      "learning_rate": 0.00012227171492204902,
+      "loss": 0.4997,
+      "step": 400
+    },
+    {
+      "epoch": 0.8669221620192943,
+      "grad_norm": 0.37245893478393555,
+      "learning_rate": 0.0001200445434298441,
+      "loss": 0.4989,
+      "step": 410
+    },
+    {
+      "epoch": 0.8880666049953746,
+      "grad_norm": 0.3642374277114868,
+      "learning_rate": 0.0001178173719376392,
+      "loss": 0.4992,
+      "step": 420
+    },
+    {
+      "epoch": 0.909211047971455,
+      "grad_norm": 0.32838189601898193,
+      "learning_rate": 0.0001155902004454343,
+      "loss": 0.4947,
+      "step": 430
+    },
+    {
+      "epoch": 0.9303554909475353,
+      "grad_norm": 0.36527854204177856,
+      "learning_rate": 0.00011336302895322941,
+      "loss": 0.4952,
+      "step": 440
+    },
+    {
+      "epoch": 0.9514999339236156,
+      "grad_norm": 0.3686304986476898,
+      "learning_rate": 0.0001111358574610245,
+      "loss": 0.4964,
+      "step": 450
+    },
+    {
+      "epoch": 0.9726443768996961,
+      "grad_norm": 0.3496793210506439,
+      "learning_rate": 0.0001089086859688196,
+      "loss": 0.4827,
+      "step": 460
+    },
+    {
+      "epoch": 0.9937888198757764,
+      "grad_norm": 0.3722958266735077,
+      "learning_rate": 0.0001066815144766147,
+      "loss": 0.4838,
+      "step": 470
+    },
+    {
+      "epoch": 1.0148011100832564,
+      "grad_norm": 0.3902372717857361,
+      "learning_rate": 0.00010445434298440981,
+      "loss": 0.4696,
+      "step": 480
+    },
+    {
+      "epoch": 1.0359455530593367,
+      "grad_norm": 0.3780229687690735,
+      "learning_rate": 0.00010222717149220491,
+      "loss": 0.4686,
+      "step": 490
+    },
+    {
+      "epoch": 1.057089996035417,
+      "grad_norm": 0.3552299737930298,
+      "learning_rate": 0.0001,
+      "loss": 0.457,
+      "step": 500
+    },
+    {
+      "epoch": 1.0782344390114973,
+      "grad_norm": 0.3887428045272827,
+      "learning_rate": 9.77728285077951e-05,
+      "loss": 0.4735,
+      "step": 510
+    },
+    {
+      "epoch": 1.0993788819875776,
+      "grad_norm": 0.3928622603416443,
+      "learning_rate": 9.55456570155902e-05,
+      "loss": 0.4675,
+      "step": 520
+    },
+    {
+      "epoch": 1.120523324963658,
+      "grad_norm": 0.3686327636241913,
+      "learning_rate": 9.331848552338531e-05,
+      "loss": 0.4804,
+      "step": 530
+    },
+    {
+      "epoch": 1.1416677679397385,
+      "grad_norm": 0.35772374272346497,
+      "learning_rate": 9.109131403118041e-05,
+      "loss": 0.4609,
+      "step": 540
+    },
+    {
+      "epoch": 1.1628122109158188,
+      "grad_norm": 0.35283800959587097,
+      "learning_rate": 8.88641425389755e-05,
+      "loss": 0.4693,
+      "step": 550
+    },
+    {
+      "epoch": 1.183956653891899,
+      "grad_norm": 0.37653160095214844,
+      "learning_rate": 8.663697104677061e-05,
+      "loss": 0.4551,
+      "step": 560
+    },
+    {
+      "epoch": 1.2051010968679794,
+      "grad_norm": 0.35314637422561646,
+      "learning_rate": 8.44097995545657e-05,
+      "loss": 0.4539,
+      "step": 570
+    },
+    {
+      "epoch": 1.2262455398440597,
+      "grad_norm": 0.35260340571403503,
+      "learning_rate": 8.21826280623608e-05,
+      "loss": 0.4531,
+      "step": 580
+    },
+    {
+      "epoch": 1.24738998282014,
+      "grad_norm": 0.3616096079349518,
+      "learning_rate": 7.995545657015591e-05,
+      "loss": 0.4645,
+      "step": 590
+    },
+    {
+      "epoch": 1.2685344257962203,
+      "grad_norm": 0.3933924436569214,
+      "learning_rate": 7.7728285077951e-05,
+      "loss": 0.4469,
+      "step": 600
+    },
+    {
+      "epoch": 1.2896788687723006,
+      "grad_norm": 0.3878353536128998,
+      "learning_rate": 7.550111358574611e-05,
+      "loss": 0.467,
+      "step": 610
+    },
+    {
+      "epoch": 1.3108233117483812,
+      "grad_norm": 0.41165846586227417,
+      "learning_rate": 7.32739420935412e-05,
+      "loss": 0.4504,
+      "step": 620
+    },
+    {
+      "epoch": 1.3319677547244615,
+      "grad_norm": 0.36190614104270935,
+      "learning_rate": 7.10467706013363e-05,
+      "loss": 0.4517,
+      "step": 630
+    },
+    {
+      "epoch": 1.3531121977005418,
+      "grad_norm": 0.3983185887336731,
+      "learning_rate": 6.881959910913141e-05,
+      "loss": 0.444,
+      "step": 640
+    },
+    {
+      "epoch": 1.3742566406766221,
+      "grad_norm": 0.38672661781311035,
+      "learning_rate": 6.659242761692652e-05,
+      "loss": 0.4488,
+      "step": 650
+    },
+    {
+      "epoch": 1.3954010836527027,
+      "grad_norm": 0.36232879757881165,
+      "learning_rate": 6.436525612472161e-05,
+      "loss": 0.4371,
+      "step": 660
+    },
+    {
+      "epoch": 1.416545526628783,
+      "grad_norm": 0.40571126341819763,
+      "learning_rate": 6.21380846325167e-05,
+      "loss": 0.4427,
+      "step": 670
+    },
+    {
+      "epoch": 1.4376899696048633,
+      "grad_norm": 0.36234796047210693,
+      "learning_rate": 5.9910913140311805e-05,
+      "loss": 0.4439,
+      "step": 680
+    },
+    {
+      "epoch": 1.4588344125809436,
+      "grad_norm": 0.4014786183834076,
+      "learning_rate": 5.7683741648106904e-05,
+      "loss": 0.4548,
+      "step": 690
+    },
+    {
+      "epoch": 1.479978855557024,
+      "grad_norm": 0.3884125053882599,
+      "learning_rate": 5.545657015590201e-05,
+      "loss": 0.4531,
+      "step": 700
+    },
+    {
+      "epoch": 1.5011232985331042,
+      "grad_norm": 0.3621061146259308,
+      "learning_rate": 5.322939866369711e-05,
+      "loss": 0.4407,
+      "step": 710
+    },
+    {
+      "epoch": 1.5222677415091845,
+      "grad_norm": 0.3601549565792084,
+      "learning_rate": 5.100222717149221e-05,
+      "loss": 0.439,
+      "step": 720
+    },
+    {
+      "epoch": 1.5434121844852648,
+      "grad_norm": 0.37766754627227783,
+      "learning_rate": 4.8775055679287305e-05,
+      "loss": 0.4397,
+      "step": 730
+    },
+    {
+      "epoch": 1.5645566274613452,
+      "grad_norm": 0.38728606700897217,
+      "learning_rate": 4.654788418708241e-05,
+      "loss": 0.4455,
+      "step": 740
+    },
+    {
+      "epoch": 1.5857010704374257,
+      "grad_norm": 0.3532933294773102,
+      "learning_rate": 4.432071269487751e-05,
+      "loss": 0.4375,
+      "step": 750
+    },
+    {
+      "epoch": 1.606845513413506,
+      "grad_norm": 0.37484633922576904,
+      "learning_rate": 4.209354120267261e-05,
+      "loss": 0.4386,
+      "step": 760
+    },
+    {
+      "epoch": 1.6279899563895863,
+      "grad_norm": 0.40252485871315,
+      "learning_rate": 3.986636971046771e-05,
+      "loss": 0.4394,
+      "step": 770
+    },
+    {
+      "epoch": 1.6491343993656669,
+      "grad_norm": 0.3895283043384552,
+      "learning_rate": 3.7639198218262804e-05,
+      "loss": 0.4356,
+      "step": 780
+    },
+    {
+      "epoch": 1.6702788423417472,
+      "grad_norm": 0.4058088958263397,
+      "learning_rate": 3.541202672605791e-05,
+      "loss": 0.4461,
+      "step": 790
+    },
+    {
+      "epoch": 1.6914232853178275,
+      "grad_norm": 0.40314358472824097,
+      "learning_rate": 3.318485523385301e-05,
+      "loss": 0.4311,
+      "step": 800
+    },
+    {
+      "epoch": 1.7125677282939078,
+      "grad_norm": 0.384658545255661,
+      "learning_rate": 3.095768374164811e-05,
+      "loss": 0.4363,
+      "step": 810
+    },
+    {
+      "epoch": 1.7337121712699881,
+      "grad_norm": 0.3810129463672638,
+      "learning_rate": 2.873051224944321e-05,
+      "loss": 0.4383,
+      "step": 820
+    },
+    {
+      "epoch": 1.7548566142460684,
+      "grad_norm": 0.39279329776763916,
+      "learning_rate": 2.650334075723831e-05,
+      "loss": 0.4228,
+      "step": 830
+    },
+    {
+      "epoch": 1.7760010572221487,
+      "grad_norm": 0.39959919452667236,
+      "learning_rate": 2.427616926503341e-05,
+      "loss": 0.4262,
+      "step": 840
+    },
+    {
+      "epoch": 1.797145500198229,
+      "grad_norm": 0.3827113211154938,
+      "learning_rate": 2.2048997772828508e-05,
+      "loss": 0.4311,
+      "step": 850
+    },
+    {
+      "epoch": 1.8182899431743094,
+      "grad_norm": 0.39276352524757385,
+      "learning_rate": 1.982182628062361e-05,
+      "loss": 0.4341,
+      "step": 860
+    },
+    {
+      "epoch": 1.83943438615039,
+      "grad_norm": 0.38558751344680786,
+      "learning_rate": 1.759465478841871e-05,
+      "loss": 0.4207,
+      "step": 870
+    },
+    {
+      "epoch": 1.8605788291264702,
+      "grad_norm": 0.4052915573120117,
+      "learning_rate": 1.5367483296213807e-05,
+      "loss": 0.4254,
+      "step": 880
+    },
+    {
+      "epoch": 1.8817232721025505,
+      "grad_norm": 0.3884909749031067,
+      "learning_rate": 1.3140311804008909e-05,
+      "loss": 0.4198,
+      "step": 890
+    },
+    {
+      "epoch": 1.902867715078631,
+      "grad_norm": 0.39251548051834106,
+      "learning_rate": 1.091314031180401e-05,
+      "loss": 0.4312,
+      "step": 900
+    },
+    {
+      "epoch": 1.9240121580547114,
+      "grad_norm": 0.382098525762558,
+      "learning_rate": 8.685968819599109e-06,
+      "loss": 0.4257,
+      "step": 910
+    },
+    {
+      "epoch": 1.9451566010307917,
+      "grad_norm": 0.3773449957370758,
+      "learning_rate": 6.45879732739421e-06,
+      "loss": 0.4215,
+      "step": 920
+    },
+    {
+      "epoch": 1.966301044006872,
+      "grad_norm": 0.39837542176246643,
+      "learning_rate": 4.231625835189309e-06,
+      "loss": 0.4232,
+      "step": 930
+    },
+    {
+      "epoch": 1.9874454869829523,
+      "grad_norm": 0.38558995723724365,
+      "learning_rate": 2.00445434298441e-06,
+      "loss": 0.4254,
+      "step": 940
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 946,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.876005077664924e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:277bbf113ecf76ec5b62586e2b4fa91501b2571b1380f4721de69ef68675511f
+size 5432

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:277bbf113ecf76ec5b62586e2b4fa91501b2571b1380f4721de69ef68675511f
+size 5432

rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/TSAD_test_metrics.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "split": "TSAD_test",
+    "num_dataset_samples": 6034,
+    "num_prediction_samples": 6034,
+    "num_valid_samples": 5980,
+    "num_missing_predictions": 0,
+    "num_invalid_predictions": 54,
+    "type_accuracy": 0.19581939799331102,
+    "type_precision_macro": 0.14972735341824286,
+    "type_recall_macro": 0.2321495128168519,
+    "type_f1_macro": 0.11749066864105973,
+    "binary_accuracy": 0.8377926421404682,
+    "binary_precision_macro": 0.7210006797954495,
+    "binary_recall_macro": 0.7288534724234521,
+    "binary_f1_macro": 0.7247792958267192
+}

rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank1.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank2.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank3.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/TSAD_test_metrics.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "split": "TSAD_test",
+    "num_dataset_samples": 6034,
+    "num_prediction_samples": 6034,
+    "num_valid_samples": 6034,
+    "num_missing_predictions": 0,
+    "num_invalid_predictions": 0,
+    "type_accuracy": 0.14053695724229368,
+    "type_precision_macro": 0.016068805185920968,
+    "type_recall_macro": 0.0665676819309319,
+    "type_f1_macro": 0.025105704354246135,
+    "binary_accuracy": 0.3667550546900895,
+    "binary_precision_macro": 0.519821130521865,
+    "binary_recall_macro": 0.5270888471072895,
+    "binary_f1_macro": 0.36091116180192573
+}

rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank1.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank2.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank3.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rats40k_adapter/rats40k_common.py ADDED Viewed

	@@ -0,0 +1,347 @@

+import importlib.util
+import json
+import os
+import re
+import sys
+from pathlib import Path
+import numpy as np
+# Parent of the directory that holds this file (presumably the ChatTime
+# repository root — confirm against the checkout layout).
+CHAT_TIME_DIR = Path(__file__).resolve().parents[1]
+# Location of an optional prompt module inside a "Time-RA" directory that sits
+# next to CHAT_TIME_DIR. load_time_ra_prompt_template() checks that it exists.
+TIME_RA_PROMPT_PATH = (
+    CHAT_TIME_DIR.parent / "Time-RA" / "prompts" / "prompt_llama_anoclf_reason.py"
+)
+# One-shot cache used by load_time_ra_prompt_template(): the cached template
+# (None until loaded) and a flag recording that a load was already attempted.
+_TIME_RA_PROMPT_TEMPLATE = None
+_TIME_RA_PROMPT_TEMPLATE_LOADED = False
+# Prepend CHAT_TIME_DIR to sys.path once (presumably so the `utils.*` imports
+# that follow can be resolved — confirm).
+if str(CHAT_TIME_DIR) not in sys.path:
+    sys.path.insert(0, str(CHAT_TIME_DIR))
+from utils.prompt import getPrompt  # noqa: E402
+from utils.tools import Discretizer, Serializer  # noqa: E402
+# Integer ActionID -> category name. ID 0 is the "Normal Sequence" class and
+# IDs 1-14 are the individual anomaly categories.
+ACTION_ID_TO_NAME = {
+    0: "Normal Sequence",
+    1: "Point Anomaly",
+    2: "Periodic Change Anomaly",
+    3: "Trend Change Anomaly",
+    4: "Change Point Anomaly",
+    5: "Distributional Change Anomaly",
+    6: "Amplitude Anomaly",
+    7: "Pattern Change Anomaly",
+    8: "Sparse Anomaly",
+    9: "Repeated Value Anomaly",
+    10: "Sudden Flatline Anomaly",
+    11: "Drift Anomaly",
+    12: "Sudden Spike Anomaly",
+    13: "Continuous Segment Anomaly",
+    14: "Nonlinear Pattern Anomaly",
+}
+# Integer ActionID -> one-sentence description, keyed by the same IDs as
+# ACTION_ID_TO_NAME. action_mapping_text() indexes both dicts with every key
+# of ACTION_ID_TO_NAME, so the two key sets must stay in sync.
+ACTION_DESCRIPTIONS = {
+    0: "There are no abnormal situations in this time series.",
+    1: "A single data point significantly deviates from the local or global pattern.",
+    2: "The original periodic pattern is disrupted.",
+    3: "A sudden change appears in the long-term trend.",
+    4: "Statistical properties such as mean or variance change abruptly.",
+    5: "The statistical distribution changes significantly.",
+    6: "The amplitude exceeds normal upper or lower bounds.",
+    7: "The pattern suddenly changes from one form to another.",
+    8: "Isolated anomalous patterns occasionally appear in a long series.",
+    9: "Continuous or intermittent repeated values disrupt normal fluctuations.",
+    10: "The series suddenly becomes a flat line with no normal fluctuations.",
+    11: "The data gradually drifts away from the normal level.",
+    12: "The data suddenly spikes or drops briefly and then returns to normal.",
+    13: "A continuous segment deviates from the normal pattern.",
+    14: "Nonlinear changes break the original linear rule.",
+}
+def action_mapping_text():
+    lines = []
+    for action_id in sorted(ACTION_ID_TO_NAME):
+        lines.append(
+            f"{action_id}. {ACTION_ID_TO_NAME[action_id]}: "
+            f"{ACTION_DESCRIPTIONS[action_id]}"
+        )
+    return "\n".join(lines)
+def build_instruction(source):
+    source_text = source or "unknown"
+    time_ra_template = load_time_ra_prompt_template()
+    if time_ra_template:
+        return time_ra_template.format(
+            our_source=source_text,
+            our_observation="the serialized time series provided in the ### Input section",
+        )
+    source_text = source or "unknown"
+    return (
+        "Classify the provided univariate time series for anomaly detection. "
+        "The sequence is from the domain of "
+        f"{source_text}.\n\n"
+        "Use exactly one ActionID from the following mapping:\n"
+        f"{action_mapping_text()}\n\n"
+        "Return exactly two fields: Thought and ActionID. "
+        "Do not return a category name instead of ActionID."
+    )
+def load_time_ra_prompt_template():
+    """Return the optional Time-RA prompt template, attempting the load only once.
+
+    The first call executes the Python file at ``TIME_RA_PROMPT_PATH`` as a
+    module and caches its ``USER_DETECTION_PROMPT`` attribute in the
+    module-level globals. Every later call returns the cached value.
+
+    Returns:
+        The value of ``USER_DETECTION_PROMPT``, or ``None`` when the file does
+        not exist, no import spec/loader can be created, or the attribute is
+        missing.
+    """
+    global _TIME_RA_PROMPT_TEMPLATE, _TIME_RA_PROMPT_TEMPLATE_LOADED
+    if _TIME_RA_PROMPT_TEMPLATE_LOADED:
+        return _TIME_RA_PROMPT_TEMPLATE
+    # The flag is set before the load is attempted, so a failed attempt
+    # (including an exception raised by exec_module) is never retried: later
+    # calls return the cached None.
+    _TIME_RA_PROMPT_TEMPLATE_LOADED = True
+    if not TIME_RA_PROMPT_PATH.exists():
+        return None
+    spec = importlib.util.spec_from_file_location(
+        "time_ra_prompt_llama_anoclf_reason",
+        TIME_RA_PROMPT_PATH,
+    )
+    if spec is None or spec.loader is None:
+        return None
+    module = importlib.util.module_from_spec(spec)
+    # Runs the prompt file's top-level code; the module is not added to sys.modules here.
+    spec.loader.exec_module(module)
+    _TIME_RA_PROMPT_TEMPLATE = getattr(module, "USER_DETECTION_PROMPT", None)
+    return _TIME_RA_PROMPT_TEMPLATE
+def _to_float_array(series):
+    if isinstance(series, np.ndarray):
+        arr = series.astype(float, copy=False)
+    else:
+        arr = np.asarray(series, dtype=float)
+    if arr.ndim != 1:
+        arr = arr.reshape(-1)
+    return arr
+def serialize_observation(series):
+    arr = _to_float_array(series)
+    discretizer = Discretizer()
+    serializer = Serializer()
+    return serializer.serialize(discretizer.discretize(arr))
+def build_prompt(series, source, response=None):
+    instruction = build_instruction(source)
+    serialized = serialize_observation(series)
+    return getPrompt(
+        flag="analysis",
+        instruction=instruction,
+        input=serialized,
+        response="" if response is None else response,
+    )
+def build_response(thought, action_id):
+    thought = (thought or "").strip()
+    try:
+        action_id = int(action_id)
+    except (TypeError, ValueError):
+        action_id = -1
+    return f"Thought: {thought}\nActionID: {action_id}"
+def load_dataset_json(path):
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+def valid_split_items(data, split):
+    split_data = data.get(split, {})
+    if not isinstance(split_data, dict):
+        return []
+    items = [
+        (str(idx), item)
+        for idx, item in split_data.items()
+        if isinstance(item, dict) and isinstance(item.get("Observation"), list)
+    ]
+    return sorted(items, key=lambda pair: _sort_key(pair[0]))
+def _sort_key(value):
+    try:
+        return (0, int(value))
+    except (TypeError, ValueError):
+        return (1, str(value))
+def _json_candidates(text):
+    text = "" if text is None else str(text).strip()
+    yield text
+    fenced = re.findall(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.S | re.I)
+    for snippet in fenced:
+        yield snippet
+    match = re.search(r"\{.*\}", text, flags=re.S)
+    if match:
+        yield match.group(0)
+def parse_model_response(response):
+    text = "" if response is None else str(response)
+    thought = ""
+    action_id = None
+    for snippet in _json_candidates(text):
+        try:
+            obj = json.loads(snippet)
+        except (TypeError, ValueError):
+            continue
+        if not isinstance(obj, dict):
+            continue
+        for key in ("Thought", "thought", "Reason", "reason"):
+            if key in obj:
+                thought = str(obj[key]).strip()
+                break
+        for key in ("ActionID", "action_id", "actionId", "Action", "Label", "label"):
+            if key in obj:
+                action_id = _parse_action_id(obj[key])
+                break
+        if action_id is not None:
+            return thought, action_id
+    thought_match = re.search(
+        r"Thought\s*[:：]\s*(.*?)(?=\n\s*(?:ActionID|Action\s*ID|Action|Label)\s*[:：]|$)",
+        text,
+        flags=re.I | re.S,
+    )
+    if thought_match:
+        thought = thought_match.group(1).strip()
+    patterns = [
+        r"(?:ActionID|Action\s*ID)\s*[:：]\s*(-?\d{1,2})",
+        r'"ActionID"\s*:\s*(-?\d{1,2})',
+        r"\bAction\s*[:：]\s*(-?\d{1,2})",
+        r"\bLabel\s*[:：]\s*(-?\d{1,2})",
+        r"\b(-?\d{1,2})\b",
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, text, flags=re.I)
+        if not match:
+            continue
+        action_id = _parse_action_id(match.group(1))
+        if action_id is not None:
+            return thought, action_id
+    lowered = re.sub(r"[^a-z0-9]+", " ", text.lower())
+    for candidate_id, name in ACTION_ID_TO_NAME.items():
+        normalized_name = re.sub(r"[^a-z0-9]+", " ", name.lower())
+        if normalized_name in lowered:
+            return thought, candidate_id
+    return thought, None
+def _parse_action_id(value):
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value if 0 <= value <= 14 else None
+    match = re.search(r"-?\d{1,2}", str(value))
+    if not match:
+        return None
+    action_id = int(match.group(0))
+    return action_id if 0 <= action_id <= 14 else None
+def build_prediction(response):
+    thought, action_id = parse_model_response(response)
+    if action_id is None:
+        return {
+            "Thought": thought,
+            "RawResponse": "" if response is None else str(response),
+            "ParseError": "unrecognized_action_id",
+        }
+    return {
+        "Thought": thought,
+        "ActionID": action_id,
+        "Action": ACTION_ID_TO_NAME[action_id],
+        "Label": 0 if action_id == 0 else 1,
+        "RawResponse": "" if response is None else str(response),
+    }
+def compute_metrics(data, predictions, split):
+    items = dict(valid_split_items(data, split))
+    y_true = []
+    y_pred = []
+    valid_keys = []
+    for idx, item in items.items():
+        pred = predictions.get(idx)
+        if not isinstance(pred, dict):
+            continue
+        pred_id = _parse_action_id(pred.get("ActionID"))
+        true_id = _parse_action_id(item.get("ActionID"))
+        if pred_id is None or true_id is None:
+            continue
+        y_true.append(true_id)
+        y_pred.append(pred_id)
+        valid_keys.append(idx)
+    metrics = {
+        "split": split,
+        "num_dataset_samples": len(items),
+        "num_prediction_samples": len(predictions),
+        "num_valid_samples": len(valid_keys),
+        "num_missing_predictions": len(items) - len(set(items) & set(predictions)),
+        "num_invalid_predictions": len(set(items) & set(predictions)) - len(valid_keys),
+    }
+    if not y_true:
+        return metrics
+    try:
+        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+        y_true_binary = [0 if x == 0 else 1 for x in y_true]
+        y_pred_binary = [0 if x == 0 else 1 for x in y_pred]
+        metrics.update(
+            {
+                "type_accuracy": float(accuracy_score(y_true, y_pred)),
+                "type_precision_macro": float(
+                    precision_score(y_true, y_pred, average="macro", zero_division=0)
+                ),
+                "type_recall_macro": float(
+                    recall_score(y_true, y_pred, average="macro", zero_division=0)
+                ),
+                "type_f1_macro": float(
+                    f1_score(y_true, y_pred, average="macro", zero_division=0)
+                ),
+                "binary_accuracy": float(accuracy_score(y_true_binary, y_pred_binary)),
+                "binary_precision_macro": float(
+                    precision_score(
+                        y_true_binary, y_pred_binary, average="macro", zero_division=0
+                    )
+                ),
+                "binary_recall_macro": float(
+                    recall_score(
+                        y_true_binary, y_pred_binary, average="macro", zero_division=0
+                    )
+                ),
+                "binary_f1_macro": float(
+                    f1_score(y_true_binary, y_pred_binary, average="macro", zero_division=0)
+                ),
+            }
+        )
+    except Exception:
+        correct = sum(int(a == b) for a, b in zip(y_true, y_pred))
+        metrics["type_accuracy"] = correct / len(y_true)
+        true_binary = [0 if x == 0 else 1 for x in y_true]
+        pred_binary = [0 if x == 0 else 1 for x in y_pred]
+        metrics["binary_accuracy"] = sum(
+            int(a == b) for a, b in zip(true_binary, pred_binary)
+        ) / len(y_true)
+        metrics["metric_warning"] = "sklearn unavailable; only accuracy was computed."
+    return metrics
+def atomic_write_json(obj, path):
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
+    with open(tmp, "w", encoding="utf-8") as f:
+        json.dump(obj, f, indent=4, ensure_ascii=False)
+    os.replace(tmp, path)

rats40k_adapter/run_sft_4gpu.sh ADDED Viewed

	@@ -0,0 +1,145 @@

+#!/bin/bash
+set -Eeuo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "$PROJECT_DIR"
+RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
+PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-/mnt/share01/sqk/ITFormer/accelerate_config.yaml}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
+MODEL_PATH="${MODEL_PATH:-/mnt/share01/sqk/models/ChatTime-1-7B-Chat}"
+ALLOW_HF_DOWNLOAD="${ALLOW_HF_DOWNLOAD:-0}"
+DATA_PATH="${DATA_PATH:-/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"
+TRAIN_SPLIT="${TRAIN_SPLIT:-TSAD_train}"
+EVAL_SPLIT="${EVAL_SPLIT:-TSAD_test}"
+OUTPUT_ROOT="${OUTPUT_ROOT:-${PROJECT_DIR}/rats40k_adapter/outputs/sft_${RUN_ID}}"
+ADAPTER_OUTPUT_DIR="${ADAPTER_OUTPUT_DIR:-${OUTPUT_ROOT}/adapter}"
+EVAL_OUTPUT_DIR="${EVAL_OUTPUT_DIR:-${OUTPUT_ROOT}/eval}"
+RUN_EVAL_AFTER_SFT="${RUN_EVAL_AFTER_SFT:-1}"
+RESULT_NAME="${RESULT_NAME:-RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.json}"
+MAX_TRAIN_SAMPLES="${MAX_TRAIN_SAMPLES:-}"
+MAX_EVAL_SAMPLES="${MAX_EVAL_SAMPLES:-}"
+EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
+MAX_SEQ_LENGTH="${MAX_SEQ_LENGTH:-4096}"
+PER_DEVICE_TRAIN_BATCH_SIZE="${PER_DEVICE_TRAIN_BATCH_SIZE:-1}"
+GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-16}"
+NUM_TRAIN_EPOCHS="${NUM_TRAIN_EPOCHS:-2}"
+LEARNING_RATE="${LEARNING_RATE:-2e-4}"
+LORA_RANK="${LORA_RANK:-16}"
+LORA_ALPHA="${LORA_ALPHA:-32}"
+LORA_DROPOUT="${LORA_DROPOUT:-0.05}"
+SAVE_STEPS="${SAVE_STEPS:-200}"
+LOGGING_STEPS="${LOGGING_STEPS:-10}"
+SAVE_TOTAL_LIMIT="${SAVE_TOTAL_LIMIT:-2}"
+DATALOADER_NUM_WORKERS="${DATALOADER_NUM_WORKERS:-4}"
+LOAD_IN_4BIT="${LOAD_IN_4BIT:-0}"
+GRADIENT_CHECKPOINTING="${GRADIENT_CHECKPOINTING:-1}"
+TORCH_DTYPE="${TORCH_DTYPE:-fp16}"
+MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-160}"
+MAX_INPUT_TOKENS="${MAX_INPUT_TOKENS:-3936}"
+LOG_DIR="${LOG_DIR:-${PROJECT_DIR}/rats40k_adapter/logs}"
+LOG_FILE="${LOG_FILE:-${LOG_DIR}/sft_4gpu_${RUN_ID}.log}"
+mkdir -p "$LOG_DIR" "$OUTPUT_ROOT"
+# Print the given message on stderr and terminate the script with exit status 1.
+fail() {
+    local message="$*"
+    echo "$message" >&2
+    exit 1
+}
+# From here on, stdout and stderr both go through tee, which appends them to LOG_FILE
+# while still echoing them to the original stdout.
+exec > >(tee -a "$LOG_FILE") 2>&1
+# Environment for the child Python processes.
+export CUDA_VISIBLE_DEVICES
+export PYTHONPATH="${PROJECT_DIR}:${PYTHONPATH:-}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
+export WANDB_MODE=offline
+# Preflight checks: fail early with a clear message instead of deep inside the launch.
+[ -x "$PYTHON_BIN" ] || fail "Python executable not found: $PYTHON_BIN. Set PYTHON_BIN=/path/to/bin/python."
+[ -f "$ACCELERATE_CONFIG" ] || fail "Accelerate config not found: $ACCELERATE_CONFIG"
+[ -f "$DATA_PATH" ] || fail "RATs40K data file not found: $DATA_PATH"
+[ -n "$MODEL_PATH" ] || fail "MODEL_PATH is required. Use a local ChatTime model path, or set ALLOW_HF_DOWNLOAD=1 with a HuggingFace model id."
+# A non-directory MODEL_PATH is only accepted when downloads were explicitly allowed.
+if [ ! -d "$MODEL_PATH" ] && [ "$ALLOW_HF_DOWNLOAD" != "1" ]; then
+    fail "MODEL_PATH is not a local directory: $MODEL_PATH. Set ALLOW_HF_DOWNLOAD=1 if you intentionally want HuggingFace downloads."
+fi
+"$PYTHON_BIN" -c "import accelerate; print('accelerate:', accelerate.__version__)" || \
+    fail "The selected Python cannot import accelerate: $PYTHON_BIN"
+# Invoke accelerate as a module of the selected interpreter rather than via a PATH lookup.
+ACCELERATE_CMD=("$PYTHON_BIN" -m accelerate.commands.accelerate_cli)
+# 4-bit loading needs bitsandbytes; only its installed version metadata is queried here.
+if [ "$LOAD_IN_4BIT" = "1" ]; then
+    "$PYTHON_BIN" -c "import importlib.metadata as m; print('bitsandbytes:', m.version('bitsandbytes'))" || \
+        fail "LOAD_IN_4BIT=1 requires bitsandbytes in $PYTHON_BIN. Install it with: $PYTHON_BIN -m pip install bitsandbytes. To run without downloading it, set LOAD_IN_4BIT=0 PER_DEVICE_TRAIN_BATCH_SIZE=1 GRADIENT_ACCUMULATION_STEPS=16."
+fi
+# Optional training flags, added only when the corresponding setting is enabled/non-empty.
+TRAIN_EXTRA_ARGS=()
+if [ -n "$MAX_TRAIN_SAMPLES" ]; then
+    TRAIN_EXTRA_ARGS+=(--max_train_samples "$MAX_TRAIN_SAMPLES")
+fi
+if [ "$ALLOW_HF_DOWNLOAD" = "1" ]; then
+    TRAIN_EXTRA_ARGS+=(--allow_hf_download)
+fi
+if [ "$LOAD_IN_4BIT" = "1" ]; then
+    TRAIN_EXTRA_ARGS+=(--load_in_4bit)
+fi
+if [ "$GRADIENT_CHECKPOINTING" = "1" ]; then
+    TRAIN_EXTRA_ARGS+=(--gradient_checkpointing)
+fi
+# Summary of the effective configuration (also captured in the log file).
+echo "Run id: $RUN_ID"
+echo "Python: $PYTHON_BIN"
+echo "Accelerate: ${ACCELERATE_CMD[*]}"
+echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
+echo "Model path: $MODEL_PATH"
+echo "Data path: $DATA_PATH"
+echo "Adapter output dir: $ADAPTER_OUTPUT_DIR"
+echo "Eval output dir: $EVAL_OUTPUT_DIR"
+echo "Log file: $LOG_FILE"
+"${ACCELERATE_CMD[@]}" launch --config_file "$ACCELERATE_CONFIG" \
+    rats40k_adapter/finetune_rats40k_lora.py \
+    --model_path "$MODEL_PATH" \
+    --data_path "$DATA_PATH" \
+    --train_split "$TRAIN_SPLIT" \
+    --output_dir "$ADAPTER_OUTPUT_DIR" \
+    --max_seq_length "$MAX_SEQ_LENGTH" \
+    --per_device_train_batch_size "$PER_DEVICE_TRAIN_BATCH_SIZE" \
+    --gradient_accumulation_steps "$GRADIENT_ACCUMULATION_STEPS" \
+    --num_train_epochs "$NUM_TRAIN_EPOCHS" \
+    --learning_rate "$LEARNING_RATE" \
+    --lora_rank "$LORA_RANK" \
+    --lora_alpha "$LORA_ALPHA" \
+    --lora_dropout "$LORA_DROPOUT" \
+    --save_steps "$SAVE_STEPS" \
+    --logging_steps "$LOGGING_STEPS" \
+    --save_total_limit "$SAVE_TOTAL_LIMIT" \
+    --dataloader_num_workers "$DATALOADER_NUM_WORKERS" \
+    --torch_dtype "$TORCH_DTYPE" \
+    "${TRAIN_EXTRA_ARGS[@]}"
+if [ "$RUN_EVAL_AFTER_SFT" = "1" ]; then
+    EVAL_EXTRA_ARGS=()
+    if [ -n "$MAX_EVAL_SAMPLES" ]; then
+        EVAL_EXTRA_ARGS+=(--max_eval_samples "$MAX_EVAL_SAMPLES")
+    fi
+    if [ "$ALLOW_HF_DOWNLOAD" = "1" ]; then
+        EVAL_EXTRA_ARGS+=(--allow_hf_download)
+    fi
+    "${ACCELERATE_CMD[@]}" launch --config_file "$ACCELERATE_CONFIG" \
+        rats40k_adapter/eval_rats40k.py \
+        --model_path "$MODEL_PATH" \
+        --adapter_path "$ADAPTER_OUTPUT_DIR" \
+        --data_path "$DATA_PATH" \
+        --split "$EVAL_SPLIT" \
+        --output_dir "$EVAL_OUTPUT_DIR" \
+        --result_name "$RESULT_NAME" \
+        --eval_batch_size "$EVAL_BATCH_SIZE" \
+        --max_new_tokens "$MAX_NEW_TOKENS" \
+        --max_input_tokens "$MAX_INPUT_TOKENS" \
+        --torch_dtype "$TORCH_DTYPE" \
+        "${EVAL_EXTRA_ARGS[@]}"
+fi

rats40k_adapter/run_zeroshot_4gpu.sh ADDED Viewed

	@@ -0,0 +1,81 @@

+#!/bin/bash
+set -Eeuo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "$PROJECT_DIR"
+RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
+PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-/mnt/share01/sqk/ITFormer/accelerate_config.yaml}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
+MODEL_PATH="${MODEL_PATH:-/mnt/share01/sqk/models/ChatTime-1-7B-Chat}"
+ALLOW_HF_DOWNLOAD="${ALLOW_HF_DOWNLOAD:-0}"
+DATA_PATH="${DATA_PATH:-/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"
+SPLIT="${SPLIT:-TSAD_test}"
+OUTPUT_DIR="${OUTPUT_DIR:-${PROJECT_DIR}/rats40k_adapter/outputs/zeroshot_${RUN_ID}}"
+RESULT_NAME="${RESULT_NAME:-RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.json}"
+MAX_EVAL_SAMPLES="${MAX_EVAL_SAMPLES:-}"
+EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
+MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-160}"
+MAX_INPUT_TOKENS="${MAX_INPUT_TOKENS:-3936}"
+TORCH_DTYPE="${TORCH_DTYPE:-fp16}"
+LOG_DIR="${LOG_DIR:-${PROJECT_DIR}/rats40k_adapter/logs}"
+LOG_FILE="${LOG_FILE:-${LOG_DIR}/zeroshot_4gpu_${RUN_ID}.log}"
+mkdir -p "$LOG_DIR" "$OUTPUT_DIR"
+# Print the given message on stderr and terminate the script with exit status 1.
+fail() {
+    local message="$*"
+    echo "$message" >&2
+    exit 1
+}
+# From here on, stdout and stderr both go through tee, which appends them to LOG_FILE
+# while still echoing them to the original stdout.
+exec > >(tee -a "$LOG_FILE") 2>&1
+# Environment for the child Python processes.
+export CUDA_VISIBLE_DEVICES
+export PYTHONPATH="${PROJECT_DIR}:${PYTHONPATH:-}"
+export TOKENIZERS_PARALLELISM=false
+export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
+# Preflight checks: fail early with a clear message instead of deep inside the launch.
+[ -x "$PYTHON_BIN" ] || fail "Python executable not found: $PYTHON_BIN. Set PYTHON_BIN=/path/to/bin/python."
+[ -f "$ACCELERATE_CONFIG" ] || fail "Accelerate config not found: $ACCELERATE_CONFIG"
+[ -f "$DATA_PATH" ] || fail "RATs40K data file not found: $DATA_PATH"
+[ -n "$MODEL_PATH" ] || fail "MODEL_PATH is required. Use a local ChatTime model path, or set ALLOW_HF_DOWNLOAD=1 with a HuggingFace model id."
+# A non-directory MODEL_PATH is only accepted when downloads were explicitly allowed.
+if [ ! -d "$MODEL_PATH" ] && [ "$ALLOW_HF_DOWNLOAD" != "1" ]; then
+    fail "MODEL_PATH is not a local directory: $MODEL_PATH. Set ALLOW_HF_DOWNLOAD=1 if you intentionally want HuggingFace downloads."
+fi
+"$PYTHON_BIN" -c "import accelerate; print('accelerate:', accelerate.__version__)" || \
+    fail "The selected Python cannot import accelerate: $PYTHON_BIN"
+# Invoke accelerate as a module of the selected interpreter rather than via a PATH lookup.
+ACCELERATE_CMD=("$PYTHON_BIN" -m accelerate.commands.accelerate_cli)
+# Optional eval flags, added only when the corresponding setting is enabled/non-empty.
+EXTRA_ARGS=()
+if [ -n "$MAX_EVAL_SAMPLES" ]; then
+    EXTRA_ARGS+=(--max_eval_samples "$MAX_EVAL_SAMPLES")
+fi
+if [ "$ALLOW_HF_DOWNLOAD" = "1" ]; then
+    EXTRA_ARGS+=(--allow_hf_download)
+fi
+# Summary of the effective configuration (also captured in the log file).
+echo "Run id: $RUN_ID"
+echo "Python: $PYTHON_BIN"
+echo "Accelerate: ${ACCELERATE_CMD[*]}"
+echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
+echo "Model path: $MODEL_PATH"
+echo "Data path: $DATA_PATH"
+echo "Output dir: $OUTPUT_DIR"
+echo "Log file: $LOG_FILE"
+# Launch the evaluation. EXTRA_ARGS is empty by default; the ${ARR[@]+"${ARR[@]}"} form expands
+# to nothing in that case, whereas a plain "${ARR[@]}" aborts with "unbound variable" under
+# `set -u` on bash < 4.4.
+"${ACCELERATE_CMD[@]}" launch --config_file "$ACCELERATE_CONFIG" \
+    rats40k_adapter/eval_rats40k.py \
+    --model_path "$MODEL_PATH" \
+    --data_path "$DATA_PATH" \
+    --split "$SPLIT" \
+    --output_dir "$OUTPUT_DIR" \
+    --result_name "$RESULT_NAME" \
+    --eval_batch_size "$EVAL_BATCH_SIZE" \
+    --max_new_tokens "$MAX_NEW_TOKENS" \
+    --max_input_tokens "$MAX_INPUT_TOKENS" \
+    --torch_dtype "$TORCH_DTYPE" \
+    ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}

rats40k_adapter/run_zeroshot_then_sft_4gpu.sh ADDED Viewed

	@@ -0,0 +1,75 @@

+#!/bin/bash
+set -Eeuo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "$PROJECT_DIR"
+RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
+PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-/mnt/share01/sqk/ITFormer/accelerate_config.yaml}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
+MODEL_PATH="${MODEL_PATH:-/mnt/share01/sqk/models/ChatTime-1-7B-Chat}"
+DATA_PATH="${DATA_PATH:-/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"
+ALLOW_HF_DOWNLOAD="${ALLOW_HF_DOWNLOAD:-0}"
+LOAD_IN_4BIT="${LOAD_IN_4BIT:-0}"
+PER_DEVICE_TRAIN_BATCH_SIZE="${PER_DEVICE_TRAIN_BATCH_SIZE:-1}"
+GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-16}"
+OUTPUT_BASE="${OUTPUT_BASE:-${PROJECT_DIR}/rats40k_adapter/outputs/pipeline_${RUN_ID}}"
+LOG_DIR="${LOG_DIR:-${PROJECT_DIR}/rats40k_adapter/logs}"
+ZERO_SHOT_OUTPUT_DIR="${ZERO_SHOT_OUTPUT_DIR:-${OUTPUT_BASE}/zeroshot}"
+SFT_OUTPUT_ROOT="${SFT_OUTPUT_ROOT:-${OUTPUT_BASE}/sft}"
+ZERO_SHOT_LOG_FILE="${ZERO_SHOT_LOG_FILE:-${LOG_DIR}/pipeline_${RUN_ID}_zeroshot.log}"
+SFT_LOG_FILE="${SFT_LOG_FILE:-${LOG_DIR}/pipeline_${RUN_ID}_sft.log}"
+mkdir -p "$OUTPUT_BASE" "$LOG_DIR"
+echo "Pipeline run id: $RUN_ID"
+echo "Project dir: $PROJECT_DIR"
+echo "Python: $PYTHON_BIN"
+echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
+echo "Model path: $MODEL_PATH"
+echo "Data path: $DATA_PATH"
+echo "LOAD_IN_4BIT: $LOAD_IN_4BIT"
+echo "Per-device train batch size: $PER_DEVICE_TRAIN_BATCH_SIZE"
+echo "Gradient accumulation steps: $GRADIENT_ACCUMULATION_STEPS"
+echo "Zero-shot output dir: $ZERO_SHOT_OUTPUT_DIR"
+echo "SFT output root: $SFT_OUTPUT_ROOT"
+echo ""
+echo "========== Stage 1/2: Zero-shot eval =========="
+RUN_ID="$RUN_ID" \
+PYTHON_BIN="$PYTHON_BIN" \
+ACCELERATE_CONFIG="$ACCELERATE_CONFIG" \
+CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" \
+MODEL_PATH="$MODEL_PATH" \
+DATA_PATH="$DATA_PATH" \
+ALLOW_HF_DOWNLOAD="$ALLOW_HF_DOWNLOAD" \
+OUTPUT_DIR="$ZERO_SHOT_OUTPUT_DIR" \
+LOG_FILE="$ZERO_SHOT_LOG_FILE" \
+bash rats40k_adapter/run_zeroshot_4gpu.sh
+echo ""
+echo "========== Stage 2/2: SFT + eval =========="
+RUN_ID="$RUN_ID" \
+PYTHON_BIN="$PYTHON_BIN" \
+ACCELERATE_CONFIG="$ACCELERATE_CONFIG" \
+CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" \
+MODEL_PATH="$MODEL_PATH" \
+DATA_PATH="$DATA_PATH" \
+ALLOW_HF_DOWNLOAD="$ALLOW_HF_DOWNLOAD" \
+LOAD_IN_4BIT="$LOAD_IN_4BIT" \
+PER_DEVICE_TRAIN_BATCH_SIZE="$PER_DEVICE_TRAIN_BATCH_SIZE" \
+GRADIENT_ACCUMULATION_STEPS="$GRADIENT_ACCUMULATION_STEPS" \
+OUTPUT_ROOT="$SFT_OUTPUT_ROOT" \
+LOG_FILE="$SFT_LOG_FILE" \
+RUN_EVAL_AFTER_SFT="${RUN_EVAL_AFTER_SFT:-1}" \
+bash rats40k_adapter/run_sft_4gpu.sh
+echo ""
+echo "Pipeline finished."
+echo "Zero-shot outputs: $ZERO_SHOT_OUTPUT_DIR"
+echo "SFT outputs: $SFT_OUTPUT_ROOT"

training/finetune.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import argparse
+import sys
+import torch
+from datasets import load_dataset
+from transformers import TrainingArguments, LlamaTokenizer
+from trl import SFTTrainer
+from unsloth import FastLanguageModel, is_bfloat16_supported
+# Script entry point: LoRA fine-tuning of a Llama-style model with unsloth + TRL's SFTTrainer,
+# followed by saving the model via ``save_pretrained_merged``.
+if __name__ == "__main__":
+    # Command-line interface: five required paths plus LoRA / trainer hyper-parameters.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--code_path", type=str, required=True, default=None)
+    parser.add_argument("--model_path", type=str, required=True, default=None)
+    parser.add_argument("--dataset_path", type=str, required=True, default=None)
+    parser.add_argument("--log_path", type=str, required=True, default=None)
+    parser.add_argument("--output_path", type=str, required=True, default=None)
+    parser.add_argument("--max_seq_length", type=int, default=2048)
+    parser.add_argument("--load_in_4bit", action="store_true", default=False)
+    parser.add_argument("--lora_rank", type=int, default=16)
+    parser.add_argument("--lora_alpha", type=int, default=16)
+    parser.add_argument("--lora_dropout", type=float, default=0.00)
+    parser.add_argument("--random_seed", type=int, default=3407)
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument("--per_device_train_batch_size", type=int, default=64)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=2)
+    parser.add_argument("--save_steps", type=int, default=2)
+    parser.add_argument("--logging_steps", type=int, default=2)
+    parser.add_argument("--max_steps", type=int, default=-1)
+    args = parser.parse_args()
+    # Makes the project importable; nothing below in this script imports from it.
+    sys.path.append(args.code_path)
+    # load tokenizer (EOS doubles as the padding token; padding is applied on the right)
+    tokenizer = LlamaTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+    print(f"\nVocabulary number: {len(tokenizer.get_vocab())}\n")
+    EOS_TOKEN = tokenizer.eos_token
+    # load model (the tokenizer returned by unsloth is discarded in favour of the one above)
+    model, _ = FastLanguageModel.from_pretrained(
+        model_name=args.model_path,
+        max_seq_length=args.max_seq_length,
+        dtype=None,
+        load_in_4bit=args.load_in_4bit,
+    )
+    # add lora to llama model
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", ],
+        # modules_to_save=["embed_tokens", "lm_head", ],
+        bias="none",
+        use_gradient_checkpointing="unsloth",
+        random_state=args.random_seed,
+        max_seq_length=args.max_seq_length,
+    )
+    # load dataset
+    def formatting_func(example):
+        # Append EOS to each sample's "text" field.
+        return example["text"] + EOS_TOKEN
+    print(f"\nLoading dataset in {args.dataset_path}")
+    dataset = load_dataset(args.dataset_path, split="train")
+    print(f"Dataset example: \n{dataset[0]['text']}\n")
+    # train model (learning rate, scheduler, optimizer and warm-up are fixed here, not CLI options)
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=args.max_seq_length,
+        dataset_num_proc=64,
+        packing=False,
+        formatting_func=formatting_func,
+        args=TrainingArguments(
+            per_device_train_batch_size=args.per_device_train_batch_size,
+            gradient_accumulation_steps=args.gradient_accumulation_steps,
+            num_train_epochs=args.num_train_epochs,
+            weight_decay=0.01,
+            warmup_ratio=0.05,
+            max_grad_norm=1.0,
+            learning_rate=2e-4,
+            logging_strategy="steps",
+            logging_steps=args.logging_steps,
+            save_strategy="steps",
+            save_steps=args.save_steps,
+            max_steps=args.max_steps,
+            save_total_limit=1,
+            logging_first_step=True,
+            optim="adamw_8bit",
+            lr_scheduler_type="cosine",
+            seed=args.random_seed,
+            output_dir=args.log_path,
+            # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
+            fp16=not is_bfloat16_supported(),
+            bf16=is_bfloat16_supported(),
+        ),
+    )
+    # Show current memory stats for CUDA device 0 (values converted from bytes to GB)
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"\nGPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.\n")
+    trainer_stats = trainer.train()
+    # Show final memory and time stats; the "for training" figures are the peak minus the
+    # amount that was already reserved before training started.
+    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+    used_percentage = round(used_memory / max_memory * 100, 3)
+    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+    print(f"\n{trainer_stats.metrics['train_runtime']} seconds used for training.")
+    print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
+    print(f"Peak reserved memory = {used_memory} GB.")
+    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.\n")
+    # save model and tokenizer
+    model.save_pretrained_merged(args.output_path, tokenizer)

training/finetune.sh ADDED Viewed

	@@ -0,0 +1,37 @@

+DATA_PATH=""
+CODE_PATH=""
+MODEL_PATH=""
+code_path=$CODE_PATH
+model_path=$MODEL_PATH/ChatTime-1-7B-Base/
+dataset_path=$DATA_PATH/ChatTime-1-Finetune-100K/
+log_path=$MODEL_PATH/log_finetune/
+output_path=$MODEL_PATH/ChatTime-1-7B-Chat/
+lora_rank=8
+lora_alpha=16
+lora_dropout=0.00
+num_train_epochs=4
+per_device_train_batch_size=8
+gradient_accumulation_steps=32
+save_steps=40
+logging_steps=4
+max_steps=-1
+python "$code_path/training/source/finetune.py" \
+  --code_path "$code_path" \
+  --model_path "$model_path" \
+  --dataset_path "$dataset_path" \
+  --log_path "$log_path" \
+  --output_path "$output_path" \
+  --lora_rank $lora_rank \
+  --lora_alpha $lora_alpha \
+  --lora_dropout $lora_dropout \
+  --num_train_epochs $num_train_epochs \
+  --per_device_train_batch_size $per_device_train_batch_size \
+  --gradient_accumulation_steps $gradient_accumulation_steps \
+  --save_steps $save_steps \
+  --logging_steps $logging_steps \
+  --max_steps $max_steps \
+  --load_in_4bit

training/pretrain.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import argparse
+import sys
+import numpy as np
+import torch
+from datasets import load_dataset
+from transformers import TrainingArguments, LlamaTokenizer
+from trl import SFTTrainer
+from unsloth import FastLanguageModel, is_bfloat16_supported
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--code_path", type=str, required=True, default=None)
+    parser.add_argument("--model_path", type=str, required=True, default=None)
+    parser.add_argument("--dataset_path", type=str, required=True, default=None)
+    parser.add_argument("--log_path", type=str, required=True, default=None)
+    parser.add_argument("--output_path", type=str, required=True, default=None)
+    parser.add_argument("--max_seq_length", type=int, default=2048)
+    parser.add_argument("--load_in_4bit", action="store_true", default=False)
+    parser.add_argument("--lora_rank", type=int, default=16)
+    parser.add_argument("--lora_alpha", type=int, default=16)
+    parser.add_argument("--lora_dropout", type=float, default=0.00)
+    parser.add_argument("--random_seed", type=int, default=3407)
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument("--per_device_train_batch_size", type=int, default=64)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=2)
+    parser.add_argument("--save_steps", type=int, default=2)
+    parser.add_argument("--logging_steps", type=int, default=2)
+    parser.add_argument("--max_steps", type=int, default=-1)
+    parser.add_argument("--low_limit", type=float, default=-1)
+    parser.add_argument("--high_limit", type=float, default=1)
+    parser.add_argument("--n_tokens", type=int, default=10002)
+    parser.add_argument("--prec", type=int, default=4)
+    parser.add_argument("--time_sep", type=str, default=" ")
+    parser.add_argument("--time_flag", type=str, default="###")
+    parser.add_argument("--nan_flag", type=str, default="Nan")
+    args = parser.parse_args()
+    sys.path.append(args.code_path)
+    from utils.tools import Discretizer, Serializer
+    # construct vocabulary
+    discretizer = Discretizer(low_limit=args.low_limit, high_limit=args.high_limit, n_tokens=args.n_tokens)
+    serializer = Serializer(prec=args.prec, time_sep=args.time_sep, time_flag=args.time_flag, nan_flag=args.nan_flag)
+    vocabulary = np.concatenate((discretizer.centers[1:-1], [np.NaN])).reshape(-1, 1)
+    vocabulary = np.array([serializer.serialize(i) for i in vocabulary])
+    print(f"\nVocabulary: \n{vocabulary}\n")
+    # add token to llama tokenizer
+    tokenizer = LlamaTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+    print(f"Old model pieces: {len(tokenizer.get_vocab())}")
+    tokenizer.add_tokens(vocabulary.tolist())
+    print(f"New model pieces: {len(tokenizer.get_vocab())}")
+    EOS_TOKEN = tokenizer.eos_token
+    # load model
+    model, _ = FastLanguageModel.from_pretrained(
+        model_name=args.model_path,
+        max_seq_length=args.max_seq_length,
+        dtype=None,
+        load_in_4bit=args.load_in_4bit,
+        resize_model_vocab=len(tokenizer.get_vocab()),
+    )
+    # add lora to llama model
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", ],
+        modules_to_save=["embed_tokens", "lm_head", ],
+        bias="none",
+        use_gradient_checkpointing="unsloth",
+        random_state=args.random_seed,
+        max_seq_length=args.max_seq_length,
+    )
+    # load dataset
+    def formatting_func(example):
+        return example["text"] + EOS_TOKEN
+    print(f"\nLoading dataset in {args.dataset_path}")
+    dataset = load_dataset(args.dataset_path, split="train")
+    print(f"Dataset example: \n{dataset[0]['text']}\n")
+    # train model
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=args.max_seq_length,
+        dataset_num_proc=64,
+        packing=False,
+        formatting_func=formatting_func,
+        args=TrainingArguments(
+            per_device_train_batch_size=args.per_device_train_batch_size,
+            gradient_accumulation_steps=args.gradient_accumulation_steps,
+            num_train_epochs=args.num_train_epochs,
+            weight_decay=0.01,
+            warmup_ratio=0.05,
+            max_grad_norm=1.0,
+            learning_rate=2e-4,
+            logging_strategy="steps",
+            logging_steps=args.logging_steps,
+            save_strategy="steps",
+            save_steps=args.save_steps,
+            max_steps=args.max_steps,
+            save_total_limit=1,
+            logging_first_step=True,
+            optim="adamw_8bit",
+            lr_scheduler_type="cosine",
+            seed=args.random_seed,
+            output_dir=args.log_path,
+            fp16=not is_bfloat16_supported(),
+            bf16=is_bfloat16_supported(),
+        ),
+    )
+    # title Show current memory stats
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"\nGPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.\n")
+    trainer_stats = trainer.train()
+    # title Show final memory and time stats
+    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+    used_percentage = round(used_memory / max_memory * 100, 3)
+    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+    print(f"\n{trainer_stats.metrics['train_runtime']} seconds used for training.")
+    print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
+    print(f"Peak reserved memory = {used_memory} GB.")
+    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.\n")
+    # save model and tokenizer
+    model.save_pretrained_merged(args.output_path, tokenizer)

training/pretrain.sh ADDED Viewed

	@@ -0,0 +1,37 @@

+DATA_PATH=""
+CODE_PATH=""
+MODEL_PATH=""
+code_path=$CODE_PATH
+model_path=meta-llama/Llama-2-7b-hf
+dataset_path=$DATA_PATH/ChatTime-1-Pretrain-1M/
+log_path=$MODEL_PATH/log_pretrain/
+output_path=$MODEL_PATH/ChatTime-1-7B-Base/
+lora_rank=8
+lora_alpha=16
+lora_dropout=0.00
+num_train_epochs=2
+per_device_train_batch_size=8
+gradient_accumulation_steps=32
+save_steps=200
+logging_steps=20
+max_steps=-1
+python "$code_path/training/source/pretrain.py" \
+  --code_path "$code_path" \
+  --model_path "$model_path" \
+  --dataset_path "$dataset_path" \
+  --log_path "$log_path" \
+  --output_path "$output_path" \
+  --lora_rank $lora_rank \
+  --lora_alpha $lora_alpha \
+  --lora_dropout $lora_dropout \
+  --num_train_epochs $num_train_epochs \
+  --per_device_train_batch_size $per_device_train_batch_size \
+  --gradient_accumulation_steps $gradient_accumulation_steps \
+  --save_steps $save_steps \
+  --logging_steps $logging_steps \
+  --max_steps $max_steps \
+  --load_in_4bit

tsqa_adapter/logs/sft_4gpu_20260615_140322.log ADDED Viewed

	@@ -0,0 +1,875 @@












































0	0%\| \| 0/250 [00:00<?, ?it/s]Traceback (most recent call last):

+accelerate: 0.34.2
+Run id: 20260615_140322
+Python: /dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python
+Accelerate: /dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python -m accelerate.commands.accelerate_cli
+CUDA_VISIBLE_DEVICES: 0,1,2,3
+Model path: /mnt/share01/sqk/models/ChatTime-1-7B-Chat
+Data root: /mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp (train=train.jsonl eval=eval.jsonl)
+Adapter output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_140322/adapter
+Eval output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_140322/eval
+Log file: /mnt/share01/sqk/ChatTime/tsqa_adapter/logs/sft_4gpu_20260615_140322.log
+⚙️  Running in WANDB offline mode⚙️  Running in WANDB offline mode
+⚙️  Running in WANDB offline mode
+⚙️  Running in WANDB offline mode
+Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
+Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
+Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
+Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
+SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
+SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
+SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
+SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
+trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
+trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
+/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
+No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
+trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
+/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
+trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
+/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  0%|          | 0/250 [00:00<?, ?it/s]Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+PermissionError: [Errno 1] Operation not permitted
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted
+Traceback (most recent call last):
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
+    obj = _ForkingPickler.dumps(obj)
+          ^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
+    cls(buf, protocol).dump(obj)
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
+    df = multiprocessing.reduction.DupFd(fd)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
+    return resource_sharer.DupFd(fd)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
+    self._id = _resource_sharer.register(send, close)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
+    self._start()
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
+    self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
+    self._listener = SocketListener(address, family, backlog)
+                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
+    self._socket.bind(address)
+PermissionError: [Errno 1] Operation not permitted

tsqa_adapter/logs/sft_4gpu_20260615_141604.log ADDED Viewed

@@ -0,0 +1,210 @@
  0%|          | 0/250 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  0%|          | 1/250 [00:14<58:06, 14.00s/it]
  1%|          | 2/250 [00:24<50:27, 12.21s/it]
  1%|          | 3/250 [00:37<50:29, 12.27s/it]
  2%|▏         | 4/250 [00:49<49:31, 12.08s/it]
  2%|▏         | 5/250 [01:00<48:41, 11.92s/it]
  2%|▏         | 6/250 [01:14<51:41, 12.71s/it]
  3%|▎         | 7/250 [01:27<51:10, 12.64s/it]
  3%|▎         | 8/250 [01:41<53:20, 13.22s/it]
  4%|▎         | 9/250 [01:52<50:23, 12.55s/it]
  4%|▍         | 10/250 [02:08<53:48, 13.45s/it]
  4%|▍         | 10/250 [02:08<53:48, 13.45s/it]
  4%|▍         | 11/250 [02:19<50:52, 12.77s/it]
  5%|▍         | 12/250 [02:32<51:15, 12.92s/it]
  5%|▌         | 13/250 [02:46<51:17, 12.99s/it]
  6%|▌         | 14/250 [02:59<51:03, 12.98s/it]
  6%|▌         | 15/250 [03:11<50:07, 12.80s/it]
  6%|▋         | 16/250 [03:22<48:26, 12.42s/it]
  7%|▋         | 17/250 [03:36<50:04, 12.89s/it]
  7%|▋         | 18/250 [03:48<48:20, 12.50s/it]
  8%|▊         | 19/250 [03:58<45:26, 11.80s/it]
  8%|▊         | 20/250 [04:09<44:36, 11.64s/it]
  8%|▊         | 20/250 [04:09<44:36, 11.64s/it]
  8%|▊         | 21/250 [04:24<48:03, 12.59s/it]
  9%|▉         | 22/250 [04:36<46:53, 12.34s/it]
  9%|▉         | 23/250 [04:50<49:03, 12.97s/it]
 10%|▉         | 24/250 [05:07<52:25, 13.92s/it]
 10%|█         | 25/250 [05:20<52:07, 13.90s/it]
 10%|█         | 26/250 [05:36<53:41, 14.38s/it]
 11%|█         | 27/250 [05:52<54:58, 14.79s/it]
 11%|█         | 28/250 [06:09<57:57, 15.66s/it]
 12%|█▏        | 29/250 [06:23<55:16, 15.01s/it]
 12%|█▏        | 30/250 [06:36<53:20, 14.55s/it]
 12%|█▏        | 30/250 [06:36<53:20, 14.55s/it]
 12%|█▏        | 31/250 [06:51<52:42, 14.44s/it]
 13%|█▎        | 32/250 [07:02<49:31, 13.63s/it]
 13%|█▎        | 33/250 [07:14<47:28, 13.13s/it]
 14%|█▎        | 34/250 [07:25<44:37, 12.39s/it]
 14%|█▍        | 35/250 [07:42<48:57, 13.66s/it]
 14%|█▍        | 36/250 [07:56<49:01, 13.74s/it]
 15%|█▍        | 37/250 [08:06<45:37, 12.85s/it]
 15%|█▌        | 38/250 [08:22<48:08, 13.62s/it]
 16%|█▌        | 39/250 [08:33<45:37, 12.97s/it]
 16%|█▌        | 40/250 [08:46<44:47, 12.80s/it]
 16%|█▌        | 40/250 [08:46<44:47, 12.80s/it]
 16%|█▋        | 41/250 [08:55<41:16, 11.85s/it]
 17%|█▋        | 42/250 [09:06<40:00, 11.54s/it]
 17%|█▋        | 43/250 [09:24<46:47, 13.56s/it]
 18%|█▊        | 44/250 [09:36<44:43, 13.03s/it]
 18%|█▊        | 45/250 [09:50<45:32, 13.33s/it]
 18%|█▊        | 46/250 [10:03<44:21, 13.05s/it]
 19%|█▉        | 47/250 [10:19<47:26, 14.02s/it]
 19%|█▉        | 48/250 [10:36<50:17, 14.94s/it]
 20%|█▉        | 49/250 [10:54<53:33, 15.99s/it]
 20%|██        | 50/250 [11:06<49:05, 14.73s/it]
 20%|██        | 50/250 [11:06<49:05, 14.73s/it]
 20%|██        | 51/250 [11:20<47:34, 14.34s/it]
 21%|██        | 52/250 [11:32<45:08, 13.68s/it]
 21%|██        | 53/250 [11:44<43:53, 13.37s/it]
 22%|██▏       | 54/250 [11:58<43:35, 13.34s/it]
 22%|██▏       | 55/250 [12:11<43:18, 13.32s/it]
 22%|██▏       | 56/250 [12:21<40:23, 12.49s/it]
 23%|██▎       | 57/250 [12:35<41:07, 12.78s/it]
 23%|██▎       | 58/250 [12:50<43:07, 13.48s/it]
 24%|██▎       | 59/250 [13:05<44:15, 13.90s/it]
 24%|██▍       | 60/250 [13:18<42:56, 13.56s/it]
 24%|██▍       | 60/250 [13:18<42:56, 13.56s/it]
 24%|██▍       | 61/250 [13:30<41:38, 13.22s/it]
 25%|██▍       | 62/250 [13:43<40:45, 13.01s/it]
 25%|██▌       | 63/250 [13:57<41:43, 13.39s/it]
 26%|██▌       | 64/250 [14:09<40:29, 13.06s/it]
 26%|██▌       | 65/250 [14:24<41:52, 13.58s/it]
 26%|██▋       | 66/250 [14:38<42:00, 13.70s/it]
 27%|██▋       | 67/250 [14:53<43:03, 14.12s/it]
 27%|██▋       | 68/250 [15:08<43:21, 14.29s/it]
 28%|██▊       | 69/250 [15:21<42:37, 14.13s/it]
 28%|██▊       | 70/250 [15:34<41:12, 13.74s/it]
 28%|██▊       | 70/250 [15:34<41:12, 13.74s/it]
 28%|██▊       | 71/250 [15:51<43:27, 14.57s/it]
 29%|██▉       | 72/250 [16:03<41:16, 13.91s/it]
 29%|██▉       | 73/250 [16:15<39:02, 13.24s/it]
 30%|██▉       | 74/250 [16:25<35:48, 12.21s/it]
 30%|███       | 75/250 [16:36<34:30, 11.83s/it]
 30%|███       | 76/250 [16:49<35:39, 12.29s/it]
 31%|███       | 77/250 [17:00<34:42, 12.04s/it]
 31%|███       | 78/250 [17:14<35:30, 12.38s/it]
 32%|███▏      | 79/250 [17:25<34:33, 12.12s/it]
 32%|███▏      | 80/250 [17:41<37:08, 13.11s/it]
 32%|███▏      | 80/250 [17:41<37:08, 13.11s/it]
 32%|███▏      | 81/250 [17:53<36:08, 12.83s/it]
 33%|███▎      | 82/250 [18:08<37:48, 13.50s/it]
 33%|███▎      | 83/250 [18:20<36:12, 13.01s/it]
 34%|███▎      | 84/250 [18:38<40:30, 14.64s/it]
 34%|███▍      | 85/250 [18:52<39:39, 14.42s/it]
 34%|███▍      | 86/250 [19:10<42:03, 15.39s/it]
 35%|███▍      | 87/250 [19:24<41:21, 15.23s/it]
 35%|███▌      | 88/250 [19:36<38:19, 14.19s/it]
 36%|███▌      | 89/250 [19:53<40:26, 15.07s/it]
 36%|███▌      | 90/250 [20:07<38:51, 14.57s/it]
 36%|███▌      | 90/250 [20:07<38:51, 14.57s/it]
 36%|███▋      | 91/250 [20:19<36:35, 13.81s/it]
 37%|███▋      | 92/250 [20:31<34:47, 13.21s/it]
 37%|███▋      | 93/250 [20:44<34:36, 13.22s/it]
 38%|███▊      | 94/250 [20:56<33:31, 12.90s/it]
 38%|███▊      | 95/250 [21:07<31:39, 12.25s/it]
 38%|███▊      | 96/250 [21:21<33:07, 12.91s/it]
 39%|███▉      | 97/250 [21:35<33:24, 13.10s/it]
 39%|███▉      | 98/250 [21:47<32:27, 12.81s/it]
 40%|███▉      | 99/250 [21:59<32:04, 12.74s/it]
 40%|████      | 100/250 [22:15<33:41, 13.47s/it]
 40%|████      | 100/250 [22:15<33:41, 13.47s/it]
 40%|████      | 101/250 [22:26<32:09, 12.95s/it]
 41%|████      | 102/250 [22:41<32:55, 13.35s/it]
 41%|████      | 103/250 [22:54<32:21, 13.21s/it]
 42%|████▏     | 104/250 [23:05<30:35, 12.57s/it]
 42%|████▏     | 105/250 [23:15<29:06, 12.05s/it]
 42%|████▏     | 106/250 [23:28<29:07, 12.14s/it]
 43%|████▎     | 107/250 [23:39<28:36, 12.00s/it]
 43%|████▎     | 108/250 [23:55<30:41, 12.97s/it]
 44%|████▎     | 109/250 [24:08<30:29, 12.97s/it]
 44%|████▍     | 110/250 [24:19<28:46, 12.33s/it]
 44%|████▍     | 110/250 [24:19<28:46, 12.33s/it]
 44%|████▍     | 111/250 [24:31<28:51, 12.46s/it]
 45%|████▍     | 112/250 [24:45<29:12, 12.70s/it]
 45%|████▌     | 113/250 [24:59<30:04, 13.17s/it]
 46%|████▌     | 114/250 [25:13<30:19, 13.38s/it]
 46%|████▌     | 115/250 [25:26<30:21, 13.49s/it]
 46%|████▋     | 116/250 [25:39<29:41, 13.30s/it]
 47%|████▋     | 117/250 [25:54<30:07, 13.59s/it]
 47%|████▋     | 118/250 [26:10<31:36, 14.37s/it]
 48%|████▊     | 119/250 [26:24<31:30, 14.43s/it]
 48%|████▊     | 120/250 [26:40<32:11, 14.86s/it]
 48%|████▊     | 120/250 [26:40<32:11, 14.86s/it]
 48%|████▊     | 121/250 [26:54<31:23, 14.60s/it]
 49%|████▉     | 122/250 [27:06<29:25, 13.79s/it]
 49%|████▉     | 123/250 [27:19<28:45, 13.58s/it]
 50%|████▉     | 124/250 [27:33<28:21, 13.50s/it]
 50%|█████     | 125/250 [27:45<27:38, 13.27s/it]
 50%|█████     | 126/250 [27:58<27:08, 13.13s/it]
 51%|█████     | 127/250 [28:11<27:02, 13.19s/it]
 51%|█████     | 128/250 [28:22<25:27, 12.52s/it]
 52%|█████▏    | 129/250 [28:41<29:07, 14.44s/it]
 52%|█████▏    | 130/250 [28:55<28:43, 14.36s/it]
 52%|█████▏    | 130/250 [28:55<28:43, 14.36s/it]
 52%|█████▏    | 131/250 [29:08<27:21, 13.79s/it]
 53%|█████▎    | 132/250 [29:20<26:09, 13.30s/it]
 53%|█████▎    | 133/250 [29:33<25:54, 13.29s/it]
 54%|█████▎    | 134/250 [29:46<25:12, 13.04s/it]
 54%|█████▍    | 135/250 [30:00<25:46, 13.44s/it]
 54%|█████▍    | 136/250 [30:11<24:14, 12.76s/it]
 55%|█████▍    | 137/250 [30:24<23:43, 12.60s/it]
 55%|█████▌    | 138/250 [30:40<25:54, 13.88s/it]
 56%|█████▌    | 139/250 [30:54<25:16, 13.66s/it]
 56%|█████▌    | 140/250 [31:07<24:42, 13.48s/it]
 56%|█████▌    | 140/250 [31:07<24:42, 13.48s/it]
 56%|█████▋    | 141/250 [31:24<26:26, 14.55s/it]
 57%|█████▋    | 142/250 [31:38<26:20, 14.64s/it]
 57%|█████▋    | 143/250 [31:50<24:34, 13.78s/it]
 58%|█████▊    | 144/250 [32:03<23:47, 13.46s/it]
 58%|█████▊    | 145/250 [32:13<21:46, 12.45s/it]
 58%|█████▊    | 146/250 [32:26<21:41, 12.51s/it]
 59%|█████▉    | 147/250 [32:39<21:56, 12.78s/it]
 59%|█████▉    | 148/250 [32:55<23:31, 13.84s/it]
 60%|█████▉    | 149/250 [33:09<23:13, 13.80s/it]
 60%|██████    | 150/250 [33:22<22:24, 13.45s/it]
 60%|██████    | 150/250 [33:22<22:24, 13.45s/it]
 60%|██████    | 151/250 [33:37<23:04, 13.98s/it]
 61%|██████    | 152/250 [33:49<21:46, 13.34s/it]
 61%|██████    | 153/250 [33:59<20:08, 12.46s/it]
 62%|██████▏   | 154/250 [34:15<21:29, 13.43s/it]
 62%|██████▏   | 155/250 [34:28<21:07, 13.34s/it]
 62%|██████▏   | 156/250 [34:38<19:31, 12.46s/it]
 63%|██████▎   | 157/250 [34:51<19:25, 12.53s/it]
 63%|██████▎   | 158/250 [35:05<19:40, 12.83s/it]
 64%|██████▎   | 159/250 [35:20<20:21, 13.43s/it]
 64%|██████▍   | 160/250 [35:33<20:09, 13.44s/it]
 64%|██████▍   | 160/250 [35:33<20:09, 13.44s/it]
 64%|██████▍   | 161/250 [35:44<18:52, 12.72s/it]
 65%|██████▍   | 162/250 [35:57<18:36, 12.69s/it]
 65%|██████▌   | 163/250 [36:08<17:39, 12.18s/it]
 66%|██████▌   | 164/250 [36:21<18:00, 12.56s/it]
 66%|██████▌   | 165/250 [36:35<18:22, 12.97s/it]
 66%|██████▋   | 166/250 [36:48<18:19, 13.09s/it]
 67%|██████▋   | 167/250 [37:01<18:04, 13.06s/it]
 67%|██████▋   | 168/250 [37:12<16:44, 12.25s/it]
 68%|██████▊   | 169/250 [37:25<16:57, 12.56s/it]
 68%|██████▊   | 170/250 [37:38<16:46, 12.58s/it]
 68%|██████▊   | 170/250 [37:38<16:46, 12.58s/it]
 68%|██████▊   | 171/250 [37:50<16:35, 12.60s/it]
 69%|██████▉   | 172/250 [38:05<17:17, 13.30s/it]
 69%|██████▉   | 173/250 [38:21<17:52, 13.93s/it]
 70%|██████▉   | 174/250 [38:37<18:29, 14.60s/it]
 70%|███████   | 175/250 [38:50<17:50, 14.27s/it]
 70%|███████   | 176/250 [39:04<17:30, 14.20s/it]
 71%|███████   | 177/250 [39:16<16:22, 13.46s/it]
 71%|███████   | 178/250 [39:28<15:42, 13.09s/it]
 72%|███████▏  | 179/250 [39:40<15:06, 12.77s/it]
 72%|███████▏  | 180/250 [39:54<15:17, 13.10s/it]
 72%|███████▏  | 180/250 [39:54<15:17, 13.10s/it]
 72%|███████▏  | 181/250 [40:04<14:04, 12.24s/it]
 73%|███████▎  | 182/250 [40:18<14:27, 12.75s/it]
 73%|███████▎  | 183/250 [40:32<14:31, 13.01s/it]
 74%|███████▎  | 184/250 [40:47<14:53, 13.53s/it]
 74%|███████▍  | 185/250 [41:02<15:19, 14.15s/it]
 74%|███████▍  | 186/250 [41:16<14:48, 13.88s/it]
 75%|███████▍  | 187/250 [41:31<14:55, 14.22s/it]
 75%|███████▌  | 188/250 [41:42<13:57, 13.51s/it]
 76%|███████▌  | 189/250 [41:54<13:05, 12.87s/it]
 76%|███████▌  | 190/250 [42:07<12:50, 12.83s/it]
 76%|███████▌  | 190/250 [42:07<12:50, 12.83s/it]
 76%|███████▋  | 191/250 [42:21<13:14, 13.46s/it]
 77%|███████▋  | 192/250 [42:37<13:29, 13.96s/it]
 77%|███████▋  | 193/250 [42:50<13:01, 13.71s/it]
 78%|███████▊  | 194/250 [43:02<12:17, 13.16s/it]
 78%|███████▊  | 195/250 [43:15<12:14, 13.35s/it]
 78%|███████▊  | 196/250 [43:29<12:06, 13.45s/it]
 79%|███████▉  | 197/250 [43:42<11:45, 13.31s/it]
 79%|███████▉  | 198/250 [43:53<10:47, 12.45s/it]
 80%|███████▉  | 199/250 [44:09<11:29, 13.52s/it]
 80%|████████  | 200/250 [44:21<10:54, 13.08s/it]
 80%|████████  | 200/250 [44:21<10:54, 13.08s/it]
 80%|████████  | 201/250 [44:39<12:06, 14.83s/it]
 81%|████████  | 202/250 [44:55<11:56, 14.92s/it]
 81%|████████  | 203/250 [45:07<11:07, 14.20s/it]
 82%|████████▏ | 204/250 [45:22<11:06, 14.49s/it]
 82%|████████▏ | 205/250 [45:35<10:33, 14.08s/it]
 82%|████████▏ | 206/250 [45:48<10:04, 13.75s/it]
 83%|████████▎ | 207/250 [45:59<09:16, 12.95s/it]
 83%|████████▎ | 208/250 [46:16<09:42, 13.88s/it]
 84%|████████▎ | 209/250 [46:29<09:23, 13.75s/it]
 84%|████████▍ | 210/250 [46:41<08:47, 13.18s/it]
 84%|████████▍ | 210/250 [46:41<08:47, 13.18s/it]
 84%|████████▍ | 211/250 [46:53<08:27, 13.02s/it]
 85%|████████▍ | 212/250 [47:05<08:01, 12.66s/it]
 85%|████████▌ | 213/250 [47:19<07:58, 12.93s/it]
 86%|████████▌ | 214/250 [47:31<07:38, 12.74s/it]
 86%|████████▌ | 215/250 [47:46<07:45, 13.30s/it]
 86%|████████▋ | 216/250 [47:59<07:28, 13.18s/it]
 87%|████████▋ | 217/250 [48:12<07:15, 13.18s/it]
 87%|████████▋ | 218/250 [48:29<07:38, 14.32s/it]
 88%|████████▊ | 219/250 [48:43<07:26, 14.41s/it]
 88%|████████▊ | 220/250 [48:57<07:06, 14.23s/it]
 88%|████████▊ | 220/250 [48:57<07:06, 14.23s/it]
 88%|████████▊ | 221/250 [49:10<06:37, 13.71s/it]
 89%|████████▉ | 222/250 [49:29<07:08, 15.32s/it]
 89%|████████▉ | 223/250 [49:44<06:55, 15.38s/it]
 90%|████████▉ | 224/250 [49:55<06:03, 14.00s/it]
 90%|█████████ | 225/250 [50:09<05:51, 14.04s/it]
 90%|█████████ | 226/250 [50:26<05:56, 14.84s/it]
 91%|█████████ | 227/250 [50:36<05:06, 13.34s/it]
 91%|█████████ | 228/250 [50:52<05:14, 14.31s/it]
 92%|█████████▏| 229/250 [51:04<04:44, 13.54s/it]
 92%|█████████▏| 230/250 [51:17<04:24, 13.22s/it]
 92%|█████████▏| 230/250 [51:17<04:24, 13.22s/it]
 92%|█████████▏| 231/250 [51:29<04:08, 13.09s/it]
 93%|█████████▎| 232/250 [51:41<03:48, 12.71s/it]
 93%|█████████▎| 233/250 [51:55<03:39, 12.90s/it]
 94%|█████████▎| 234/250 [52:06<03:21, 12.58s/it]
 94%|█████████▍| 235/250 [52:22<03:23, 13.54s/it]
 94%|█████████▍| 236/250 [52:35<03:06, 13.35s/it]
 95%|█████████▍| 237/250 [52:49<02:56, 13.58s/it]
 95%|█████████▌| 238/250 [53:03<02:44, 13.67s/it]
 96%|█████████▌| 239/250 [53:16<02:26, 13.31s/it]
 96%|█████████▌| 240/250 [53:29<02:12, 13.29s/it]
 96%|█████████▌| 240/250 [53:29<02:12, 13.29s/it]
 96%|█████████▋| 241/250 [53:40<01:54, 12.74s/it]
 97%|█████████▋| 242/250 [53:54<01:44, 13.02s/it]
 97%|█████████▋| 243/250 [54:09<01:34, 13.53s/it]
 98%|█████████▊| 244/250 [54:22<01:21, 13.58s/it]
 98%|█████████▊| 245/250 [54:34<01:04, 12.89s/it]
 98%|█████████▊| 246/250 [54:46<00:51, 12.81s/it]
 99%|█████████▉| 247/250 [54:59<00:38, 12.85s/it]
 99%|█████████▉| 248/250 [55:15<00:27, 13.65s/it]

+accelerate: 0.34.2
+Run id: 20260615_141604
+Python: /dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python
+Accelerate: /dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python -m accelerate.commands.accelerate_cli
+CUDA_VISIBLE_DEVICES: 0,1,2,3
+Model path: /mnt/share01/sqk/models/ChatTime-1-7B-Chat
+Data root: /mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp (train=train.jsonl eval=eval.jsonl)
+Adapter output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/adapter
+Eval output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/eval
+Log file: /mnt/share01/sqk/ChatTime/tsqa_adapter/logs/sft_4gpu_20260615_141604.log
+⚙️  Running in WANDB offline mode⚙️  Running in WANDB offline mode
+⚙️  Running in WANDB offline mode
+⚙️  Running in WANDB offline mode
+Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
+Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
+Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
+Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
+SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
+SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
+SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
+SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
+trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
+/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
+trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
+trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
+trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
+/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
+/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
+No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
+No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  0%|          | 0/250 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
+`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
+`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
+`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  0%|          | 1/250 [00:14<58:06, 14.00s/it]
  1%|          | 2/250 [00:24<50:27, 12.21s/it]
  1%|          | 3/250 [00:37<50:29, 12.27s/it]
  2%|▏         | 4/250 [00:49<49:31, 12.08s/it]
  2%|▏         | 5/250 [01:00<48:41, 11.92s/it]
  2%|▏         | 6/250 [01:14<51:41, 12.71s/it]
  3%|▎         | 7/250 [01:27<51:10, 12.64s/it]
  3%|▎         | 8/250 [01:41<53:20, 13.22s/it]
  4%|▎         | 9/250 [01:52<50:23, 12.55s/it]
  4%|▍         | 10/250 [02:08<53:48, 13.45s/it]
  4%|▍         | 10/250 [02:08<53:48, 13.45s/it]
  4%|▍         | 11/250 [02:19<50:52, 12.77s/it]
  5%|▍         | 12/250 [02:32<51:15, 12.92s/it]
  5%|▌         | 13/250 [02:46<51:17, 12.99s/it]
  6%|▌         | 14/250 [02:59<51:03, 12.98s/it]
  6%|▌         | 15/250 [03:11<50:07, 12.80s/it]
  6%|▋         | 16/250 [03:22<48:26, 12.42s/it]
  7%|▋         | 17/250 [03:36<50:04, 12.89s/it]
  7%|▋         | 18/250 [03:48<48:20, 12.50s/it]
  8%|▊         | 19/250 [03:58<45:26, 11.80s/it]
  8%|▊         | 20/250 [04:09<44:36, 11.64s/it]
  8%|▊         | 20/250 [04:09<44:36, 11.64s/it]
  8%|▊         | 21/250 [04:24<48:03, 12.59s/it]
  9%|▉         | 22/250 [04:36<46:53, 12.34s/it]
  9%|▉         | 23/250 [04:50<49:03, 12.97s/it]
 10%|▉         | 24/250 [05:07<52:25, 13.92s/it]
 10%|█         | 25/250 [05:20<52:07, 13.90s/it]
 10%|█         | 26/250 [05:36<53:41, 14.38s/it]
 11%|█         | 27/250 [05:52<54:58, 14.79s/it]
 11%|█         | 28/250 [06:09<57:57, 15.66s/it]
 12%|█▏        | 29/250 [06:23<55:16, 15.01s/it]
 12%|█▏        | 30/250 [06:36<53:20, 14.55s/it]
 12%|█▏        | 30/250 [06:36<53:20, 14.55s/it]
 12%|█▏        | 31/250 [06:51<52:42, 14.44s/it]
 13%|█▎        | 32/250 [07:02<49:31, 13.63s/it]
 13%|█▎        | 33/250 [07:14<47:28, 13.13s/it]
 14%|█▎        | 34/250 [07:25<44:37, 12.39s/it]
 14%|█▍        | 35/250 [07:42<48:57, 13.66s/it]
 14%|█▍        | 36/250 [07:56<49:01, 13.74s/it]
 15%|█▍        | 37/250 [08:06<45:37, 12.85s/it]
 15%|█▌        | 38/250 [08:22<48:08, 13.62s/it]
 16%|█▌        | 39/250 [08:33<45:37, 12.97s/it]
 16%|█▌        | 40/250 [08:46<44:47, 12.80s/it]
 16%|█▌        | 40/250 [08:46<44:47, 12.80s/it]
 16%|█▋        | 41/250 [08:55<41:16, 11.85s/it]
 17%|█▋        | 42/250 [09:06<40:00, 11.54s/it]
 17%|█▋        | 43/250 [09:24<46:47, 13.56s/it]
 18%|█▊        | 44/250 [09:36<44:43, 13.03s/it]
 18%|█▊        | 45/250 [09:50<45:32, 13.33s/it]
 18%|█▊        | 46/250 [10:03<44:21, 13.05s/it]
 19%|█▉        | 47/250 [10:19<47:26, 14.02s/it]
 19%|█▉        | 48/250 [10:36<50:17, 14.94s/it]
 20%|█▉        | 49/250 [10:54<53:33, 15.99s/it]
 20%|██        | 50/250 [11:06<49:05, 14.73s/it]
 20%|██        | 50/250 [11:06<49:05, 14.73s/it]
 20%|██        | 51/250 [11:20<47:34, 14.34s/it]
 21%|██        | 52/250 [11:32<45:08, 13.68s/it]
 21%|██        | 53/250 [11:44<43:53, 13.37s/it]
 22%|██▏       | 54/250 [11:58<43:35, 13.34s/it]
 22%|██▏       | 55/250 [12:11<43:18, 13.32s/it]
 22%|██▏       | 56/250 [12:21<40:23, 12.49s/it]
 23%|██▎       | 57/250 [12:35<41:07, 12.78s/it]
 23%|██▎       | 58/250 [12:50<43:07, 13.48s/it]
 24%|██▎       | 59/250 [13:05<44:15, 13.90s/it]
 24%|██▍       | 60/250 [13:18<42:56, 13.56s/it]
 24%|██▍       | 60/250 [13:18<42:56, 13.56s/it]
 24%|██▍       | 61/250 [13:30<41:38, 13.22s/it]
 25%|██▍       | 62/250 [13:43<40:45, 13.01s/it]
 25%|██▌       | 63/250 [13:57<41:43, 13.39s/it]
 26%|██▌       | 64/250 [14:09<40:29, 13.06s/it]
 26%|██▌       | 65/250 [14:24<41:52, 13.58s/it]
 26%|██▋       | 66/250 [14:38<42:00, 13.70s/it]
 27%|██▋       | 67/250 [14:53<43:03, 14.12s/it]
 27%|██▋       | 68/250 [15:08<43:21, 14.29s/it]
 28%|██▊       | 69/250 [15:21<42:37, 14.13s/it]
 28%|██▊       | 70/250 [15:34<41:12, 13.74s/it]
 28%|██▊       | 70/250 [15:34<41:12, 13.74s/it]
 28%|██▊       | 71/250 [15:51<43:27, 14.57s/it]
 29%|██▉       | 72/250 [16:03<41:16, 13.91s/it]
 29%|██▉       | 73/250 [16:15<39:02, 13.24s/it]
 30%|██▉       | 74/250 [16:25<35:48, 12.21s/it]
 30%|███       | 75/250 [16:36<34:30, 11.83s/it]
 30%|███       | 76/250 [16:49<35:39, 12.29s/it]
 31%|███       | 77/250 [17:00<34:42, 12.04s/it]
 31%|███       | 78/250 [17:14<35:30, 12.38s/it]
 32%|███▏      | 79/250 [17:25<34:33, 12.12s/it]
 32%|███▏      | 80/250 [17:41<37:08, 13.11s/it]
 32%|███▏      | 80/250 [17:41<37:08, 13.11s/it]
 32%|███▏      | 81/250 [17:53<36:08, 12.83s/it]
 33%|███▎      | 82/250 [18:08<37:48, 13.50s/it]
 33%|███▎      | 83/250 [18:20<36:12, 13.01s/it]
 34%|███▎      | 84/250 [18:38<40:30, 14.64s/it]
 34%|███▍      | 85/250 [18:52<39:39, 14.42s/it]
 34%|███▍      | 86/250 [19:10<42:03, 15.39s/it]
 35%|███▍      | 87/250 [19:24<41:21, 15.23s/it]
 35%|███▌      | 88/250 [19:36<38:19, 14.19s/it]
 36%|███▌      | 89/250 [19:53<40:26, 15.07s/it]
 36%|███▌      | 90/250 [20:07<38:51, 14.57s/it]
 36%|███▌      | 90/250 [20:07<38:51, 14.57s/it]
 36%|███▋      | 91/250 [20:19<36:35, 13.81s/it]
 37%|███▋      | 92/250 [20:31<34:47, 13.21s/it]
 37%|███▋      | 93/250 [20:44<34:36, 13.22s/it]
 38%|███▊      | 94/250 [20:56<33:31, 12.90s/it]
 38%|███▊      | 95/250 [21:07<31:39, 12.25s/it]
 38%|███▊      | 96/250 [21:21<33:07, 12.91s/it]
 39%|███▉      | 97/250 [21:35<33:24, 13.10s/it]
 39%|███▉      | 98/250 [21:47<32:27, 12.81s/it]
 40%|███▉      | 99/250 [21:59<32:04, 12.74s/it]
 40%|████      | 100/250 [22:15<33:41, 13.47s/it]
 40%|████      | 100/250 [22:15<33:41, 13.47s/it]
 40%|████      | 101/250 [22:26<32:09, 12.95s/it]
 41%|████      | 102/250 [22:41<32:55, 13.35s/it]
 41%|████      | 103/250 [22:54<32:21, 13.21s/it]
 42%|████▏     | 104/250 [23:05<30:35, 12.57s/it]
 42%|████▏     | 105/250 [23:15<29:06, 12.05s/it]
 42%|████▏     | 106/250 [23:28<29:07, 12.14s/it]
 43%|████▎     | 107/250 [23:39<28:36, 12.00s/it]
 43%|████▎     | 108/250 [23:55<30:41, 12.97s/it]
 44%|████▎     | 109/250 [24:08<30:29, 12.97s/it]
 44%|████▍     | 110/250 [24:19<28:46, 12.33s/it]
 44%|████▍     | 110/250 [24:19<28:46, 12.33s/it]
 44%|████▍     | 111/250 [24:31<28:51, 12.46s/it]
 45%|████▍     | 112/250 [24:45<29:12, 12.70s/it]
 45%|████▌     | 113/250 [24:59<30:04, 13.17s/it]
 46%|████▌     | 114/250 [25:13<30:19, 13.38s/it]
 46%|████▌     | 115/250 [25:26<30:21, 13.49s/it]
 46%|████▋     | 116/250 [25:39<29:41, 13.30s/it]
 47%|████▋     | 117/250 [25:54<30:07, 13.59s/it]
 47%|████▋     | 118/250 [26:10<31:36, 14.37s/it]
 48%|████▊     | 119/250 [26:24<31:30, 14.43s/it]
 48%|████▊     | 120/250 [26:40<32:11, 14.86s/it]
 48%|████▊     | 120/250 [26:40<32:11, 14.86s/it]
 48%|████▊     | 121/250 [26:54<31:23, 14.60s/it]
 49%|████▉     | 122/250 [27:06<29:25, 13.79s/it]
 49%|████▉     | 123/250 [27:19<28:45, 13.58s/it]
 50%|████▉     | 124/250 [27:33<28:21, 13.50s/it]
 50%|█████     | 125/250 [27:45<27:38, 13.27s/it]
 50%|█████     | 126/250 [27:58<27:08, 13.13s/it]
 51%|█████     | 127/250 [28:11<27:02, 13.19s/it]
 51%|█████     | 128/250 [28:22<25:27, 12.52s/it]
 52%|█████▏    | 129/250 [28:41<29:07, 14.44s/it]
 52%|█████▏    | 130/250 [28:55<28:43, 14.36s/it]
 52%|█████▏    | 130/250 [28:55<28:43, 14.36s/it]
 52%|█████▏    | 131/250 [29:08<27:21, 13.79s/it]
 53%|█████▎    | 132/250 [29:20<26:09, 13.30s/it]
 53%|█████▎    | 133/250 [29:33<25:54, 13.29s/it]
 54%|█████▎    | 134/250 [29:46<25:12, 13.04s/it]
 54%|█████▍    | 135/250 [30:00<25:46, 13.44s/it]
 54%|█████▍    | 136/250 [30:11<24:14, 12.76s/it]
 55%|█████▍    | 137/250 [30:24<23:43, 12.60s/it]
 55%|█████▌    | 138/250 [30:40<25:54, 13.88s/it]
 56%|█████▌    | 139/250 [30:54<25:16, 13.66s/it]
 56%|█████▌    | 140/250 [31:07<24:42, 13.48s/it]
 56%|█████▌    | 140/250 [31:07<24:42, 13.48s/it]
 56%|█████▋    | 141/250 [31:24<26:26, 14.55s/it]
 57%|█████▋    | 142/250 [31:38<26:20, 14.64s/it]
 57%|█████▋    | 143/250 [31:50<24:34, 13.78s/it]
 58%|█████▊    | 144/250 [32:03<23:47, 13.46s/it]
 58%|█████▊    | 145/250 [32:13<21:46, 12.45s/it]
 58%|█████▊    | 146/250 [32:26<21:41, 12.51s/it]
 59%|█████▉    | 147/250 [32:39<21:56, 12.78s/it]
 59%|█████▉    | 148/250 [32:55<23:31, 13.84s/it]
 60%|█████▉    | 149/250 [33:09<23:13, 13.80s/it]
 60%|██████    | 150/250 [33:22<22:24, 13.45s/it]
 60%|██████    | 150/250 [33:22<22:24, 13.45s/it]
 60%|██████    | 151/250 [33:37<23:04, 13.98s/it]
 61%|██████    | 152/250 [33:49<21:46, 13.34s/it]
 61%|██████    | 153/250 [33:59<20:08, 12.46s/it]
 62%|██████▏   | 154/250 [34:15<21:29, 13.43s/it]
 62%|██████▏   | 155/250 [34:28<21:07, 13.34s/it]
 62%|██████▏   | 156/250 [34:38<19:31, 12.46s/it]
 63%|██████▎   | 157/250 [34:51<19:25, 12.53s/it]
 63%|██████▎   | 158/250 [35:05<19:40, 12.83s/it]
 64%|██████▎   | 159/250 [35:20<20:21, 13.43s/it]
 64%|██████▍   | 160/250 [35:33<20:09, 13.44s/it]
 64%|██████▍   | 160/250 [35:33<20:09, 13.44s/it]
 64%|██████▍   | 161/250 [35:44<18:52, 12.72s/it]
 65%|██████▍   | 162/250 [35:57<18:36, 12.69s/it]
 65%|██████▌   | 163/250 [36:08<17:39, 12.18s/it]
 66%|██████▌   | 164/250 [36:21<18:00, 12.56s/it]
 66%|██████▌   | 165/250 [36:35<18:22, 12.97s/it]
 66%|██████▋   | 166/250 [36:48<18:19, 13.09s/it]
 67%|██████▋   | 167/250 [37:01<18:04, 13.06s/it]
 67%|██████▋   | 168/250 [37:12<16:44, 12.25s/it]
 68%|██████▊   | 169/250 [37:25<16:57, 12.56s/it]
 68%|██████▊   | 170/250 [37:38<16:46, 12.58s/it]
 68%|██████▊   | 170/250 [37:38<16:46, 12.58s/it]
 68%|██████▊   | 171/250 [37:50<16:35, 12.60s/it]
 69%|██████▉   | 172/250 [38:05<17:17, 13.30s/it]
 69%|██████▉   | 173/250 [38:21<17:52, 13.93s/it]
 70%|██████▉   | 174/250 [38:37<18:29, 14.60s/it]
 70%|███████   | 175/250 [38:50<17:50, 14.27s/it]
 70%|███████   | 176/250 [39:04<17:30, 14.20s/it]
 71%|███████   | 177/250 [39:16<16:22, 13.46s/it]
 71%|███████   | 178/250 [39:28<15:42, 13.09s/it]
 72%|███████▏  | 179/250 [39:40<15:06, 12.77s/it]
 72%|███████▏  | 180/250 [39:54<15:17, 13.10s/it]
 72%|███████▏  | 180/250 [39:54<15:17, 13.10s/it]
 72%|███████▏  | 181/250 [40:04<14:04, 12.24s/it]
 73%|███████▎  | 182/250 [40:18<14:27, 12.75s/it]
 73%|███████▎  | 183/250 [40:32<14:31, 13.01s/it]
 74%|███████▎  | 184/250 [40:47<14:53, 13.53s/it]
 74%|███████▍  | 185/250 [41:02<15:19, 14.15s/it]
 74%|███████▍  | 186/250 [41:16<14:48, 13.88s/it]
 75%|███████▍  | 187/250 [41:31<14:55, 14.22s/it]
 75%|███████▌  | 188/250 [41:42<13:57, 13.51s/it]
 76%|███████▌  | 189/250 [41:54<13:05, 12.87s/it]
 76%|███████▌  | 190/250 [42:07<12:50, 12.83s/it]
 76%|███████▌  | 190/250 [42:07<12:50, 12.83s/it]
 76%|███████▋  | 191/250 [42:21<13:14, 13.46s/it]
 77%|███████▋  | 192/250 [42:37<13:29, 13.96s/it]
 77%|███████▋  | 193/250 [42:50<13:01, 13.71s/it]
 78%|███████▊  | 194/250 [43:02<12:17, 13.16s/it]
 78%|███████▊  | 195/250 [43:15<12:14, 13.35s/it]
 78%|███████▊  | 196/250 [43:29<12:06, 13.45s/it]
 79%|███████▉  | 197/250 [43:42<11:45, 13.31s/it]
 79%|███████▉  | 198/250 [43:53<10:47, 12.45s/it]
 80%|███████▉  | 199/250 [44:09<11:29, 13.52s/it]
 80%|████████  | 200/250 [44:21<10:54, 13.08s/it]
 80%|████████  | 200/250 [44:21<10:54, 13.08s/it]
 80%|████████  | 201/250 [44:39<12:06, 14.83s/it]
 81%|████████  | 202/250 [44:55<11:56, 14.92s/it]
 81%|████████  | 203/250 [45:07<11:07, 14.20s/it]
 82%|████████▏ | 204/250 [45:22<11:06, 14.49s/it]
 82%|████████▏ | 205/250 [45:35<10:33, 14.08s/it]
 82%|████████▏ | 206/250 [45:48<10:04, 13.75s/it]
 83%|████████▎ | 207/250 [45:59<09:16, 12.95s/it]
 83%|████████▎ | 208/250 [46:16<09:42, 13.88s/it]
 84%|████████▎ | 209/250 [46:29<09:23, 13.75s/it]
 84%|████████▍ | 210/250 [46:41<08:47, 13.18s/it]
 84%|████████▍ | 210/250 [46:41<08:47, 13.18s/it]
 84%|████████▍ | 211/250 [46:53<08:27, 13.02s/it]
 85%|████████▍ | 212/250 [47:05<08:01, 12.66s/it]
 85%|████████▌ | 213/250 [47:19<07:58, 12.93s/it]
 86%|████████▌ | 214/250 [47:31<07:38, 12.74s/it]
 86%|████████▌ | 215/250 [47:46<07:45, 13.30s/it]
 86%|████████▋ | 216/250 [47:59<07:28, 13.18s/it]
 87%|████████▋ | 217/250 [48:12<07:15, 13.18s/it]
 87%|████████▋ | 218/250 [48:29<07:38, 14.32s/it]
 88%|████████▊ | 219/250 [48:43<07:26, 14.41s/it]
 88%|████████▊ | 220/250 [48:57<07:06, 14.23s/it]
 88%|████████▊ | 220/250 [48:57<07:06, 14.23s/it]
 88%|████████▊ | 221/250 [49:10<06:37, 13.71s/it]
 89%|████████▉ | 222/250 [49:29<07:08, 15.32s/it]
 89%|████████▉ | 223/250 [49:44<06:55, 15.38s/it]
 90%|████████▉ | 224/250 [49:55<06:03, 14.00s/it]
 90%|█████████ | 225/250 [50:09<05:51, 14.04s/it]
 90%|█████████ | 226/250 [50:26<05:56, 14.84s/it]
 91%|█████████ | 227/250 [50:36<05:06, 13.34s/it]
 91%|█████████ | 228/250 [50:52<05:14, 14.31s/it]
 92%|█████████▏| 229/250 [51:04<04:44, 13.54s/it]
 92%|█████████▏| 230/250 [51:17<04:24, 13.22s/it]
 92%|█████████▏| 230/250 [51:17<04:24, 13.22s/it]
 92%|█████████▏| 231/250 [51:29<04:08, 13.09s/it]
 93%|█████████▎| 232/250 [51:41<03:48, 12.71s/it]
 93%|█████████▎| 233/250 [51:55<03:39, 12.90s/it]
 94%|█████████▎| 234/250 [52:06<03:21, 12.58s/it]
 94%|█████████▍| 235/250 [52:22<03:23, 13.54s/it]
 94%|█████████▍| 236/250 [52:35<03:06, 13.35s/it]
 95%|█████████▍| 237/250 [52:49<02:56, 13.58s/it]
 95%|█████████▌| 238/250 [53:03<02:44, 13.67s/it]
 96%|█████████▌| 239/250 [53:16<02:26, 13.31s/it]
 96%|█████████▌| 240/250 [53:29<02:12, 13.29s/it]
 96%|█████████▌| 240/250 [53:29<02:12, 13.29s/it]
 96%|█████████▋| 241/250 [53:40<01:54, 12.74s/it]
 97%|█████████▋| 242/250 [53:54<01:44, 13.02s/it]
 97%|█████████▋| 243/250 [54:09<01:34, 13.53s/it]
 98%|█████████▊| 244/250 [54:22<01:21, 13.58s/it]
 98%|█████████▊| 245/250 [54:34<01:04, 12.89s/it]
 98%|█████████▊| 246/250 [54:46<00:51, 12.81s/it]
 99%|█████████▉| 247/250 [54:59<00:38, 12.85s/it]
 99%|█████████▉| 248/250 [55:15<00:27, 13.65s/it]
+[rank0]:[W615 15:17:28.247429157 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+Dataset: /mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp/eval.jsonl
+Total samples: 800
+World size: 4
+Per-device eval batch size: 4
+Maximum global eval batch size: 16
+Output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/eval
+[rank0]:[W615 15:26:42.394095285 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank2]:[W615 15:26:46.615147024 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2]  using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank3]:[W615 15:27:02.342169152 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3]  using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank1]:[W615 15:27:07.925204166 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+{
+  "by_group": {
+    "anomaly_detection": {
+      "count": 200,
+      "accuracy": 0.82,
+      "correct": 164,
+      "parsed": 200
+    },
+    "classification": {
+      "count": 200,
+      "accuracy": 0.74,
+      "correct": 148,
+      "parsed": 200
+    },
+    "forecasting": {
+      "count": 200,
+      "valid_samples": 170,
+      "valid_points": 3164,
+      "mse": 825899.547396054,
+      "mae": 151.69107746555417
+    },
+    "open_ended": {
+      "count": 200,
+      "accuracy": 0.45,
+      "parsed_accuracy": 0.45685279187817257,
+      "parse_rate": 0.985,
+      "correct": 90,
+      "parsed": 197,
+      "unparsed": 3,
+      "by_format": {
+        "multiple_choice": {
+          "count": 67,
+          "accuracy": 0.2835820895522388,
+          "parsed_accuracy": 0.2878787878787879,
+          "correct": 19,
+          "parsed": 66,
+          "unparsed": 1
+        },
+        "open_ended_question": {
+          "count": 67,
+          "accuracy": 0.417910447761194,
+          "parsed_accuracy": 0.4307692307692308,
+          "correct": 28,
+          "parsed": 65,
+          "unparsed": 2
+        },
+        "true_false": {
+          "count": 66,
+          "accuracy": 0.6515151515151515,
+          "parsed_accuracy": 0.6515151515151515,
+          "correct": 43,
+          "parsed": 66,
+          "unparsed": 0
+        }
+      },
+      "by_method": {
+        "anomaly": {
+          "count": 1,
+          "accuracy": 1.0,
+          "correct": 1
+        },
+        "cyclical": {
+          "count": 4,
+          "accuracy": 0.75,
+          "correct": 3
+        },
+        "multiple_choice": {
+          "count": 64,
+          "accuracy": 0.296875,
+          "correct": 19
+        },
+        "numeric_scalar": {
+          "count": 31,
+          "accuracy": 0.25806451612903225,
+          "correct": 8
+        },
+        "numeric_sequence": {
+          "count": 1,
+          "accuracy": 0.0,
+          "correct": 0
+        },
+        "seasonality": {
+          "count": 3,
+          "accuracy": 1.0,
+          "correct": 3
+        },
+        "trend": {
+          "count": 20,
+          "accuracy": 0.45,
+          "correct": 9
+        },
+        "true_false": {
+          "count": 65,
+          "accuracy": 0.6461538461538462,
+          "correct": 42
+        },
+        "volatility": {
+          "count": 8,
+          "accuracy": 0.625,
+          "correct": 5
+        }
+      }
+    }
+  },
+  "text_overall": {
+    "count": 800,
+    "exact_match": 0.39,
+    "normalized_exact_match": 0.39,
+    "token_f1": 0.6366794082162608
+  },
+  "num_samples": 800,
+  "counts_by_group": {
+    "anomaly_detection": 200,
+    "classification": 200,
+    "forecasting": 200,
+    "open_ended": 200
+  }
+}
+Saved predictions: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/eval/predictions.jsonl
+Saved metrics: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/eval/metrics.json