Add files using upload-large-folder tool
Browse files · This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- img/architecture.png +3 -0
- rats40k_adapter/README.md +70 -0
- rats40k_adapter/eval_rats40k.py +244 -0
- rats40k_adapter/finetune_rats40k_lora.py +314 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/README.md +202 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/adapter_config.json +37 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/added_tokens.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_0.pth +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_1.pth +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_2.pth +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_3.pth +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/scaler.pt +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/scheduler.pt +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/tokenizer.model +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/training_args.bin +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/added_tokens.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/scheduler.pt +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/special_tokens_map.json +30 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/tokenizer.model +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/tokenizer_config.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/trainer_state.json +692 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/training_args.bin +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/special_tokens_map.json +30 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer.model +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer_config.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/training_args.bin +3 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/TSAD_test_metrics.json +16 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank0.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank1.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank2.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank3.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/TSAD_test_metrics.json +16 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank0.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank1.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank2.json +0 -0
- rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank3.json +0 -0
- rats40k_adapter/rats40k_common.py +347 -0
- rats40k_adapter/run_sft_4gpu.sh +145 -0
- rats40k_adapter/run_zeroshot_4gpu.sh +81 -0
- rats40k_adapter/run_zeroshot_then_sft_4gpu.sh +75 -0
- training/finetune.py +133 -0
- training/finetune.sh +37 -0
- training/pretrain.py +154 -0
- training/pretrain.sh +37 -0
- tsqa_adapter/logs/sft_4gpu_20260615_140322.log +875 -0
- tsqa_adapter/logs/sft_4gpu_20260615_141604.log +210 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
img/architecture.png filter=lfs diff=lfs merge=lfs -text
|
img/architecture.png
ADDED
|
Git LFS Details
|
rats40k_adapter/README.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RATs40K Adapter for ChatTime
|
| 2 |
+
|
| 3 |
+
This folder adapts ChatTime to the RATs40K univariate anomaly QA task.
|
| 4 |
+
|
| 5 |
+
It intentionally uses the numeric `Observation` field only. It does not use
|
| 6 |
+
`FigurePath`, so the resulting baseline should be compared with numeric-only
|
| 7 |
+
Time-RA settings rather than VLM image-input settings.
|
| 8 |
+
|
| 9 |
+
## Required inputs
|
| 10 |
+
|
| 11 |
+
- `MODEL_PATH`: local ChatTime model directory. This is required by default.
|
| 12 |
+
- `PYTHON_BIN`: Python executable. The shell scripts default to
|
| 13 |
+
`/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python`.
|
| 14 |
+
- `DATA_PATH`: defaults to
|
| 15 |
+
`/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json`.
|
| 16 |
+
|
| 17 |
+
The scripts do not download HuggingFace weights unless `ALLOW_HF_DOWNLOAD=1`
|
| 18 |
+
is set explicitly.
|
| 19 |
+
|
| 20 |
+
The default precision is FP16 because the configured four-GPU machine uses
|
| 21 |
+
Tesla V100 GPUs. SFT defaults to regular FP16 LoRA with
|
| 22 |
+
`LOAD_IN_4BIT=0`, `PER_DEVICE_TRAIN_BATCH_SIZE=1`, and
|
| 23 |
+
`GRADIENT_ACCUMULATION_STEPS=16`.
|
| 24 |
+
|
| 25 |
+
Evaluation defaults to `EVAL_BATCH_SIZE=4` per GPU. With four GPUs, the
|
| 26 |
+
maximum global evaluation batch size is 16.
|
| 27 |
+
|
| 28 |
+
The task prompt is aligned with Time-RA's univariate
|
| 29 |
+
`USER_DETECTION_PROMPT`; ChatTime still receives the normalized/discretized
|
| 30 |
+
series through its native `### Input` section. Evaluation checks prompt token
|
| 31 |
+
lengths and fails instead of truncating. Defaults are `MAX_INPUT_TOKENS=3936`,
|
| 32 |
+
`MAX_NEW_TOKENS=160`, and `MAX_SEQ_LENGTH=4096` for SFT.
|
| 33 |
+
|
| 34 |
+
## Zero-shot
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
cd /mnt/share01/sqk/ChatTime
|
| 38 |
+
MODEL_PATH=/mnt/share01/sqk/models/ChatTime-1-7B-Chat \
|
| 39 |
+
bash rats40k_adapter/run_zeroshot_4gpu.sh
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
## SFT + Eval
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
cd /mnt/share01/sqk/ChatTime
|
| 46 |
+
MODEL_PATH=/mnt/share01/sqk/models/ChatTime-1-7B-Chat \
|
| 47 |
+
bash rats40k_adapter/run_sft_4gpu.sh
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Zero-shot Then SFT + Eval
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
cd /mnt/share01/sqk/ChatTime
|
| 54 |
+
bash rats40k_adapter/run_zeroshot_then_sft_4gpu.sh
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
## Saved Results
|
| 58 |
+
|
| 59 |
+
- Zero-shot outputs:
|
| 60 |
+
`/mnt/share01/sqk/ChatTime/rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot`
|
| 61 |
+
- SFT outputs:
|
| 62 |
+
`/mnt/share01/sqk/ChatTime/rats40k_adapter/outputs/pipeline_20260608_175250/sft`
|
| 63 |
+
|
| 64 |
+
Useful smoke-test knobs:
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
MAX_TRAIN_SAMPLES=128 MAX_EVAL_SAMPLES=64 bash rats40k_adapter/run_zeroshot_then_sft_4gpu.sh
|
| 68 |
+
MAX_TRAIN_SAMPLES=128 MAX_EVAL_SAMPLES=64 bash rats40k_adapter/run_sft_4gpu.sh
|
| 69 |
+
MAX_EVAL_SAMPLES=64 bash rats40k_adapter/run_zeroshot_4gpu.sh
|
| 70 |
+
```
|
rats40k_adapter/eval_rats40k.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import copy
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.distributed as dist
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 10 |
+
|
| 11 |
+
from rats40k_common import (
|
| 12 |
+
atomic_write_json,
|
| 13 |
+
build_prediction,
|
| 14 |
+
build_prompt,
|
| 15 |
+
compute_metrics,
|
| 16 |
+
load_dataset_json,
|
| 17 |
+
valid_split_items,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def parse_args(argv=None):
    """Parse the command-line options of the RATs40K evaluation script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, which is the behaviour
            existing callers rely on. Passing an explicit list allows the
            parser to be driven programmatically.

    Returns:
        argparse.Namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate ChatTime on RATs40K univariate anomaly QA."
    )
    # Model / data locations.
    parser.add_argument("--model_path", required=True)
    parser.add_argument("--adapter_path", default=None)
    parser.add_argument(
        "--data_path",
        default="/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json",
    )
    parser.add_argument("--split", default="TSAD_test")
    parser.add_argument("--output_dir", required=True)
    parser.add_argument("--result_name", default=None)
    # Evaluation size and token budgets.
    parser.add_argument("--max_eval_samples", type=int, default=None)
    parser.add_argument("--eval_batch_size", type=int, default=4)
    parser.add_argument("--max_input_tokens", type=int, default=3936)
    parser.add_argument("--max_new_tokens", type=int, default=160)
    # Decoding options; a temperature of 0 selects non-sampling decoding
    # in generate_responses.
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--top_p", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=50)
    parser.add_argument("--torch_dtype", choices=["auto", "bf16", "fp16", "fp32"], default="fp16")
    parser.add_argument("--allow_hf_download", action="store_true")
    return parser.parse_args(argv)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def init_distributed():
    """Read the launcher environment and join the process group if needed.

    ``WORLD_SIZE``, ``RANK`` and ``LOCAL_RANK`` are read from the environment
    (defaulting to a single process with rank 0). When CUDA is available the
    current device is set to ``LOCAL_RANK``. A process group is created only
    when more than one process participates and none exists yet.

    Returns:
        Tuple ``(rank, local_rank, world_size)`` of ints.
    """
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    rank = int(os.environ.get("RANK", "0"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.set_device(local_rank)
    if world_size > 1 and not dist.is_initialized():
        # NCCL requires CUDA devices; use Gloo so multi-process CPU runs
        # do not crash while initialising the process group.
        backend = "nccl" if cuda_available else "gloo"
        dist.init_process_group(backend=backend)
    return rank, local_rank, world_size
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def dtype_from_arg(value):
    """Translate a ``--torch_dtype`` option into a ``torch_dtype`` argument.

    ``"auto"`` is passed through as the string ``"auto"``; ``"bf16"`` and
    ``"fp16"`` map to the matching torch dtypes; every other value yields
    ``torch.float32``.
    """
    if value == "auto":
        return "auto"
    half_precision = (
        ("bf16", torch.bfloat16),
        ("fp16", torch.float16),
    )
    for option, dtype in half_precision:
        if value == option:
            return dtype
    # Anything else (including "fp32") resolves to full precision.
    return torch.float32
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def load_model_and_tokenizer(args, local_rank):
    """Load the tokenizer and causal LM, optionally wrapped with a PEFT adapter.

    Args:
        args: Parsed options; reads ``model_path``, ``adapter_path``,
            ``torch_dtype`` and ``allow_hf_download``.
        local_rank: Index of the CUDA device this process should use.

    Returns:
        Tuple ``(model, tokenizer, device)`` where ``device`` is the string
        ``"cuda:<local_rank>"`` when CUDA is available and ``"cpu"`` otherwise.
        The model is returned in eval mode.

    Raises:
        RuntimeError: if ``--adapter_path`` is given but ``peft`` cannot be
            imported.
    """
    # Hub access is opt-in: without --allow_hf_download only local files are used.
    offline_only = not args.allow_hf_download
    has_cuda = torch.cuda.is_available()
    device = f"cuda:{local_rank}" if has_cuda else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(
        args.model_path,
        trust_remote_code=True,
        local_files_only=offline_only,
    )
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token when the tokenizer defines none.
        tokenizer.pad_token = tokenizer.eos_token
    # Batches are padded on the left for generation.
    tokenizer.padding_side = "left"

    placement = {"": device} if has_cuda else None
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path,
        trust_remote_code=True,
        torch_dtype=dtype_from_arg(args.torch_dtype),
        low_cpu_mem_usage=True,
        device_map=placement,
        local_files_only=offline_only,
    )

    if args.adapter_path:
        # peft is imported lazily so that adapter-free runs do not need it.
        try:
            from peft import PeftModel
        except Exception as exc:
            raise RuntimeError(
                "peft is required when --adapter_path is provided. "
                "Install peft in the selected Python environment."
            ) from exc
        model = PeftModel.from_pretrained(
            model,
            args.adapter_path,
            local_files_only=offline_only,
        )

    model.eval()
    return model, tokenizer, device
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def generate_responses(model, tokenizer, device, prompts, args):
    """Generate one response per prompt for a batch of prompts.

    The prompts are first tokenized without padding to measure their true
    lengths; the batch is rejected rather than truncated when a prompt is
    longer than ``args.max_input_tokens`` or when prompt plus generation
    budget exceeds the model's ``max_position_embeddings`` (if the config
    declares one).

    Args:
        model: Causal LM exposing ``config``, ``generation_config`` and
            ``generate``.
        tokenizer: Tokenizer used for encoding and decoding.
        device: Device the input tensors are moved to.
        prompts: List of prompt strings. An empty list returns ``[]``
            without touching the model or tokenizer.
        args: Parsed options; reads ``max_input_tokens``, ``max_new_tokens``,
            ``temperature``, ``top_p`` and ``top_k``.

    Returns:
        List of stripped response strings, in the order of ``prompts``.

    Raises:
        RuntimeError: if a prompt exceeds the configured or model limits.
    """
    if not prompts:
        # Nothing to generate; avoids calling the tokenizer/model on an
        # empty batch.
        return []

    # Unpadded pass: only used to obtain the real token count per prompt.
    raw_encodings = tokenizer(
        prompts,
        add_special_tokens=True,
        truncation=False,
    )
    prompt_lengths = [len(input_ids) for input_ids in raw_encodings["input_ids"]]
    max_prompt_length = max(prompt_lengths)
    if max_prompt_length > args.max_input_tokens:
        longest = max(range(len(prompt_lengths)), key=lambda idx: prompt_lengths[idx])
        raise RuntimeError(
            "Prompt token length exceeds max_input_tokens. "
            f"max_prompt_length={max_prompt_length}, "
            f"max_input_tokens={args.max_input_tokens}, "
            f"batch_index={longest}. Increase MAX_INPUT_TOKENS or shorten the prompt."
        )

    model_context = getattr(model.config, "max_position_embeddings", None)
    if model_context and max_prompt_length + args.max_new_tokens > model_context:
        raise RuntimeError(
            "Prompt plus generation budget exceeds model context length. "
            f"max_prompt_length={max_prompt_length}, "
            f"max_new_tokens={args.max_new_tokens}, "
            f"model_context={model_context}. "
            "Lower MAX_NEW_TOKENS or MAX_INPUT_TOKENS."
        )

    # Padded pass: builds the tensors actually fed to the model.
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=False,
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Work on a copy so the model's own generation config is left untouched.
    do_sample = args.temperature > 0
    generation_config = copy.deepcopy(model.generation_config)
    generation_config.do_sample = do_sample
    generation_config.pad_token_id = tokenizer.pad_token_id
    generation_config.eos_token_id = tokenizer.eos_token_id
    if do_sample:
        generation_config.temperature = args.temperature
        generation_config.top_p = args.top_p
        generation_config.top_k = args.top_k
    else:
        # Clear the sampling knobs when sampling is disabled.
        generation_config.temperature = None
        generation_config.top_p = None
        generation_config.top_k = None

    with torch.inference_mode():
        output = model.generate(
            **inputs,
            generation_config=generation_config,
            max_new_tokens=args.max_new_tokens,
        )

    # Drop the (padded) prompt columns so only newly generated tokens remain.
    new_tokens = output[:, inputs["input_ids"].shape[-1] :]
    return [
        response.strip()
        for response in tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    ]
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def main():
    """Run sharded evaluation and, on rank 0, merge shards and write metrics.

    Each process evaluates every ``world_size``-th item of the selected split,
    writes its predictions to ``<output_dir>/shards/<stem>.rank<rank>.json``,
    and waits at a barrier. Rank 0 then merges all shard files, writes the
    combined predictions and the metrics file, and all ranks synchronise once
    more before the process group is destroyed.
    """
    args = parse_args()
    rank, local_rank, world_size = init_distributed()
    output_dir = Path(args.output_dir)
    shard_dir = output_dir / "shards"
    shard_dir.mkdir(parents=True, exist_ok=True)

    data = load_dataset_json(args.data_path)
    items = valid_split_items(data, args.split)
    # A negative or missing --max_eval_samples keeps the full split.
    if args.max_eval_samples is not None and args.max_eval_samples >= 0:
        items = items[: args.max_eval_samples]
    # Round-robin sharding: this rank takes items rank, rank + world_size, ...
    shard_items = items[rank::world_size]

    if rank == 0:
        print(f"Dataset: {args.data_path}")
        print(f"Split: {args.split}")
        print(f"Total samples: {len(items)}")
        print(f"World size: {world_size}")
        print(f"Per-device eval batch size: {args.eval_batch_size}")
        print(f"Maximum global eval batch size: {args.eval_batch_size * world_size}")
        print(f"Output dir: {output_dir}")

    model, tokenizer, device = load_model_and_tokenizer(args, local_rank)

    predictions = {}
    # Guard against a zero or negative batch size from the command line.
    batch_size = max(1, args.eval_batch_size)
    batch_starts = range(0, len(shard_items), batch_size)
    for start in tqdm(
        batch_starts,
        total=(len(shard_items) + batch_size - 1) // batch_size,
        desc=f"rank {rank}",
        disable=rank != 0,  # only rank 0 shows a progress bar
    ):
        batch_items = shard_items[start : start + batch_size]
        # Each entry of batch_items is unpacked as an (index, item) pair.
        prompts = [
            build_prompt(item["Observation"], item.get("Source"))
            for _, item in batch_items
        ]
        responses = generate_responses(model, tokenizer, device, prompts, args)
        for (idx, _), response in zip(batch_items, responses):
            predictions[idx] = build_prediction(response)

    result_name = args.result_name or f"{args.split}_predictions.json"
    shard_path = shard_dir / f"{Path(result_name).stem}.rank{rank}.json"
    atomic_write_json(predictions, shard_path)

    # Every rank must have written its shard before rank 0 starts merging.
    if world_size > 1:
        dist.barrier()

    if rank == 0:
        merged = {}
        for shard_rank in range(world_size):
            path = shard_dir / f"{Path(result_name).stem}.rank{shard_rank}.json"
            shard = load_dataset_json(path)
            merged.update(shard)
        def sort_key(pair):
            # Digit-only keys sort numerically and before all other keys.
            # NOTE(review): relies on keys being strings after the JSON
            # round trip through the shard files - confirm.
            key = pair[0]
            return (0, int(key)) if key.isdigit() else (1, key)

        merged = dict(sorted(merged.items(), key=sort_key))

        result_path = output_dir / result_name
        metrics_path = output_dir / f"{args.split}_metrics.json"
        atomic_write_json({args.split: merged}, result_path)
        atomic_write_json(compute_metrics(data, merged, args.split), metrics_path)
        print(f"Saved predictions: {result_path}")
        print(f"Saved metrics: {metrics_path}")

    # Keep the other ranks alive until rank 0 has finished writing results.
    if world_size > 1:
        dist.barrier()
        dist.destroy_process_group()


if __name__ == "__main__":
    main()
|
rats40k_adapter/finetune_rats40k_lora.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import inspect
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from torch.utils.data import Dataset
|
| 8 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
|
| 9 |
+
|
| 10 |
+
from rats40k_common import build_prompt, build_response, load_dataset_json, valid_split_items
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def patch_accelerate_compatibility():
    """Let ``Accelerator.unwrap_model`` accept a ``keep_torch_compile`` keyword.

    If the installed ``Accelerator.unwrap_model`` already declares that
    parameter nothing happens. Otherwise the method is replaced by a wrapper
    that accepts and ignores ``keep_torch_compile`` and forwards the remaining
    arguments to the original implementation; a notice is printed.
    """
    from accelerate import Accelerator

    legacy_unwrap = Accelerator.unwrap_model
    accepted_params = inspect.signature(legacy_unwrap).parameters
    if "keep_torch_compile" not in accepted_params:

        def unwrap_model_compat(
            self,
            model,
            keep_fp32_wrapper=True,
            keep_torch_compile=True,
        ):
            # keep_torch_compile is accepted only for call compatibility and
            # is intentionally not forwarded to the original method.
            return legacy_unwrap(
                self,
                model,
                keep_fp32_wrapper=keep_fp32_wrapper,
            )

        Accelerator.unwrap_model = unwrap_model_compat
        print(
            "Applied accelerate compatibility patch: "
            "Accelerator.unwrap_model accepts keep_torch_compile."
        )
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class PromptResponseDataset(Dataset):
    """Map-style dataset of tokenized prompt/response pairs for causal-LM SFT.

    Each row is an ``(idx, prompt, response)`` tuple. Items are tokenized on
    access; the loss labels mask the prompt tokens with ``-100`` so that only
    the response (plus the EOS token appended to it) is supervised.

    The constructor tokenizes every row once to verify that no sample is
    longer than ``max_seq_length`` and raises instead of truncating.
    """

    # Maximum number of offending samples listed in the error message.
    _MAX_REPORTED_EXAMPLES = 5

    def __init__(self, rows, tokenizer, max_seq_length):
        """Store the rows and validate their token lengths.

        Args:
            rows: Sequence of ``(idx, prompt, response)`` tuples.
            tokenizer: Callable tokenizer returning a mapping with an
                ``"input_ids"`` list and exposing ``eos_token``.
            max_seq_length: Maximum allowed prompt + response token count.

        Raises:
            RuntimeError: if any sample exceeds ``max_seq_length``.
        """
        self.rows = rows
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self._validate_lengths()

    def __len__(self):
        """Return the number of rows."""
        return len(self.rows)

    def _encode_pair(self, prompt, response):
        """Tokenize one prompt/response pair without truncation.

        Shared by ``__getitem__`` and ``_validate_lengths`` so that the
        length check always measures exactly what training will see.

        Returns:
            Tuple ``(prompt_ids, response_ids)``; special tokens are added to
            the prompt only, and EOS is appended to the response text.
        """
        prompt_ids = self.tokenizer(
            prompt,
            add_special_tokens=True,
            truncation=False,
        )["input_ids"]
        response_ids = self.tokenizer(
            response + self.tokenizer.eos_token,
            add_special_tokens=False,
            truncation=False,
        )["input_ids"]
        return prompt_ids, response_ids

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` lists for a row."""
        _, prompt, response = self.rows[index]
        prompt_ids, response_ids = self._encode_pair(prompt, response)

        input_ids = prompt_ids + response_ids
        prompt_len = len(prompt_ids)
        # -100 on prompt positions: only response tokens carry a label.
        labels = [-100] * prompt_len + input_ids[prompt_len:]
        attention_mask = [1] * len(input_ids)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    def _validate_lengths(self):
        """Print length statistics and fail if any sample is too long.

        All rows are scanned so the printed maxima describe the whole
        dataset; only the first few offending samples are listed in the
        error message.

        Raises:
            RuntimeError: if at least one sample exceeds ``max_seq_length``.
        """
        max_prompt_len = 0
        max_total_len = 0
        too_long = []
        for idx, prompt, response in self.rows:
            prompt_ids, response_ids = self._encode_pair(prompt, response)
            prompt_len = len(prompt_ids)
            total_len = prompt_len + len(response_ids)
            max_prompt_len = max(max_prompt_len, prompt_len)
            max_total_len = max(max_total_len, total_len)
            if (
                total_len > self.max_seq_length
                and len(too_long) < self._MAX_REPORTED_EXAMPLES
            ):
                too_long.append((idx, prompt_len, total_len))

        print(
            "SFT token length check: "
            f"max_prompt_len={max_prompt_len}, "
            f"max_total_len={max_total_len}, "
            f"max_seq_length={self.max_seq_length}"
        )
        if too_long:
            examples = ", ".join(
                f"{idx}:prompt={prompt_len},total={total_len}"
                for idx, prompt_len, total_len in too_long
            )
            raise RuntimeError(
                "Some SFT samples exceed max_seq_length and would be truncated. "
                f"Examples: {examples}. Increase MAX_SEQ_LENGTH or shorten the prompt."
            )
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class CausalLMCollator:
    """Right-pad a list of tokenized features into equally sized long tensors."""

    def __init__(self, tokenizer):
        """Keep the tokenizer, whose ``pad_token_id`` fills padded input ids."""
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Collate features into ``input_ids``, ``attention_mask`` and ``labels``.

        Every sequence is padded on the right up to the longest ``input_ids``
        in the batch: input ids with the pad token id, the attention mask
        with ``0`` and the labels with ``-100``.
        """
        target_len = max(len(feature["input_ids"]) for feature in features)
        # Fill value used for each field when extending a sequence.
        fill_values = {
            "input_ids": self.tokenizer.pad_token_id,
            "attention_mask": 0,
            "labels": -100,
        }
        batch = {}
        for field, fill in fill_values.items():
            padded_rows = []
            for feature in features:
                missing = target_len - len(feature["input_ids"])
                padded_rows.append(feature[field] + [fill] * missing)
            batch[field] = torch.tensor(padded_rows, dtype=torch.long)
        return batch
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def parse_args(argv=None):
    """Parse the command-line options of the RATs40K LoRA fine-tuning script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, which is the behaviour
            existing callers rely on. Passing an explicit list allows the
            parser to be driven programmatically.

    Returns:
        argparse.Namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(
        description="LoRA SFT for ChatTime on RATs40K univariate anomaly QA."
    )
    # Model / data locations.
    parser.add_argument("--model_path", required=True)
    parser.add_argument(
        "--data_path",
        default="/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json",
    )
    parser.add_argument("--train_split", default="TSAD_train")
    parser.add_argument("--output_dir", required=True)
    parser.add_argument("--max_train_samples", type=int, default=None)
    parser.add_argument("--max_seq_length", type=int, default=4096)
    # Optimisation schedule.
    parser.add_argument("--per_device_train_batch_size", type=int, default=2)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8)
    parser.add_argument("--num_train_epochs", type=float, default=2.0)
    parser.add_argument("--learning_rate", type=float, default=2e-4)
    parser.add_argument("--warmup_ratio", type=float, default=0.05)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    # Logging, checkpointing and data loading.
    parser.add_argument("--logging_steps", type=int, default=10)
    parser.add_argument("--save_steps", type=int, default=200)
    parser.add_argument("--save_total_limit", type=int, default=2)
    parser.add_argument("--dataloader_num_workers", type=int, default=4)
    # LoRA hyper-parameters.
    parser.add_argument("--lora_rank", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=32)
    parser.add_argument("--lora_dropout", type=float, default=0.05)
    # Memory / precision switches.
    parser.add_argument("--load_in_4bit", action="store_true")
    parser.add_argument("--gradient_checkpointing", action="store_true")
    parser.add_argument("--torch_dtype", choices=["bf16", "fp16", "fp32"], default="fp16")
    parser.add_argument("--allow_hf_download", action="store_true")
    return parser.parse_args(argv)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def dtype_from_arg(value):
    """Translate a ``--torch_dtype`` option into a torch dtype.

    ``"bf16"`` and ``"fp16"`` map to the matching half-precision dtypes;
    every other value yields ``torch.float32``.
    """
    half_precision = (
        ("bf16", torch.bfloat16),
        ("fp16", torch.float16),
    )
    for option, dtype in half_precision:
        if value == option:
            return dtype
    # Anything else (including "fp32") resolves to full precision.
    return torch.float32
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def local_rank():
    """Return the ``LOCAL_RANK`` environment variable as an int (0 if unset)."""
    raw_value = os.environ.get("LOCAL_RANK", "0")
    return int(raw_value)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def build_rows(data_path, split, max_samples):
    """Build ``(idx, prompt, response)`` training rows for one dataset split.

    Args:
        data_path: Path of the dataset JSON file.
        split: Name of the split to read.
        max_samples: Keep only the first ``max_samples`` items; ``None`` or a
            negative value keeps the whole split.

    Returns:
        List of ``(idx, prompt, response)`` tuples, one per retained item.
    """
    dataset = load_dataset_json(data_path)
    selected = valid_split_items(dataset, split)
    if max_samples is not None and max_samples >= 0:
        selected = selected[:max_samples]
    # Prompt comes from the observation (and optional source); the target
    # response from the item's thought text and action id.
    return [
        (
            idx,
            build_prompt(item["Observation"], item.get("Source")),
            build_response(item.get("Thought", ""), item.get("ActionID")),
        )
        for idx, item in selected
    ]
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def load_model(args):
    """Load the base causal LM and wrap it with LoRA adapters.

    Args:
        args: Parsed options; reads ``model_path``, ``torch_dtype``,
            ``allow_hf_download``, ``load_in_4bit``,
            ``gradient_checkpointing`` and the ``lora_*`` hyper-parameters.

    Returns:
        The PEFT-wrapped model with LoRA applied to the attention and MLP
        projection modules listed below.

    Raises:
        RuntimeError: if ``peft`` (or, with ``--load_in_4bit``,
            ``BitsAndBytesConfig``) cannot be imported.
    """
    try:
        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    except Exception as exc:
        raise RuntimeError(
            "peft is required for SFT. Install peft in the selected Python environment."
        ) from exc

    dtype = dtype_from_arg(args.torch_dtype)
    # Hub access is opt-in: without --allow_hf_download only local files are used.
    local_files_only = not args.allow_hf_download
    device = f"cuda:{local_rank()}" if torch.cuda.is_available() else "cpu"

    quantization_config = None
    if args.load_in_4bit:
        try:
            from transformers import BitsAndBytesConfig
        except Exception as exc:
            raise RuntimeError(
                "transformers BitsAndBytesConfig is required for --load_in_4bit."
            ) from exc
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

    model = AutoModelForCausalLM.from_pretrained(
        args.model_path,
        trust_remote_code=True,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        quantization_config=quantization_config,
        # Quantized weights are placed on the target GPU at load time;
        # otherwise the model is moved explicitly below.
        device_map={"": device} if torch.cuda.is_available() and args.load_in_4bit else None,
        local_files_only=local_files_only,
    )
    if not args.load_in_4bit and torch.cuda.is_available():
        model.to(device)
    if args.load_in_4bit:
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=args.gradient_checkpointing,
        )
    elif args.gradient_checkpointing:
        model.gradient_checkpointing_enable()
        # With LoRA the base weights (including the embeddings) are frozen,
        # so the checkpointed segments would receive inputs that do not
        # require grad and the backward pass would have nothing to
        # differentiate. Making the embedding outputs require grad mirrors
        # what prepare_model_for_kbit_training does on the 4-bit path.
        # The hasattr guard keeps custom (trust_remote_code) models that do
        # not expose the hook working as before.
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()

    lora_config = LoraConfig(
        r=args.lora_rank,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def main():
    """Fine-tune a LoRA adapter on prompt/response rows and save the result.

    Steps, in order:
      1. Parse CLI arguments and apply the accelerate compatibility patch.
      2. Bind this process to its local CUDA device when CUDA is available.
      3. Load the tokenizer (offline unless ``--allow_hf_download`` is set),
         falling back to EOS as the pad token when none is defined.
      4. Build the training rows / dataset and load the LoRA-wrapped model.
      5. Train with ``transformers.Trainer`` and write the adapter and the
         tokenizer to ``args.output_dir``.
    """
    args = parse_args()
    patch_accelerate_compatibility()
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank())

    # Hub access is opt-in: by default only local files are used.
    local_files_only = not args.allow_hf_download
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_path,
        trust_remote_code=True,
        local_files_only=local_files_only,
    )
    if tokenizer.pad_token is None:
        # No dedicated pad token: reuse EOS so batches can be padded.
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    rows = build_rows(args.data_path, args.train_split, args.max_train_samples)
    train_dataset = PromptResponseDataset(rows, tokenizer, args.max_seq_length)
    model = load_model(args)

    # Mixed-precision flags follow the --torch_dtype choice; any other value
    # leaves both disabled.
    bf16 = args.torch_dtype == "bf16"
    fp16 = args.torch_dtype == "fp16"
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        num_train_epochs=args.num_train_epochs,
        learning_rate=args.learning_rate,
        warmup_ratio=args.warmup_ratio,
        weight_decay=args.weight_decay,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_total_limit=args.save_total_limit,
        dataloader_num_workers=args.dataloader_num_workers,
        bf16=bf16,
        fp16=fp16,
        report_to="none",
        remove_unused_columns=False,
        ddp_find_unused_parameters=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=CausalLMCollator(tokenizer),
    )
    trainer.train()
    trainer.save_model(args.output_dir)
    # In a multi-process launch every rank reaches this point. Only the
    # process that is responsible for writing to disk should save the
    # tokenizer, otherwise all ranks write the same files concurrently.
    if training_args.should_save:
        tokenizer.save_pretrained(args.output_dir)


if __name__ == "__main__":
    main()
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /mnt/share01/sqk/models/ChatTime-1-7B-Chat
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for the RATS40K LoRA Adapter (base model: ChatTime-1-7B-Chat)
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** LoRA adapter (PEFT) for a causal language model
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** ChatTime-1-7B-Chat (loaded from the local path `/mnt/share01/sqk/models/ChatTime-1-7B-Chat`)
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.14.0
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/adapter_config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/mnt/share01/sqk/models/ChatTime-1-7B-Chat",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"eva_config": null,
|
| 7 |
+
"exclude_modules": null,
|
| 8 |
+
"fan_in_fan_out": false,
|
| 9 |
+
"inference_mode": true,
|
| 10 |
+
"init_lora_weights": true,
|
| 11 |
+
"layer_replication": null,
|
| 12 |
+
"layers_pattern": null,
|
| 13 |
+
"layers_to_transform": null,
|
| 14 |
+
"loftq_config": {},
|
| 15 |
+
"lora_alpha": 32,
|
| 16 |
+
"lora_bias": false,
|
| 17 |
+
"lora_dropout": 0.05,
|
| 18 |
+
"megatron_config": null,
|
| 19 |
+
"megatron_core": "megatron.core",
|
| 20 |
+
"modules_to_save": null,
|
| 21 |
+
"peft_type": "LORA",
|
| 22 |
+
"r": 16,
|
| 23 |
+
"rank_pattern": {},
|
| 24 |
+
"revision": null,
|
| 25 |
+
"target_modules": [
|
| 26 |
+
"gate_proj",
|
| 27 |
+
"o_proj",
|
| 28 |
+
"v_proj",
|
| 29 |
+
"up_proj",
|
| 30 |
+
"q_proj",
|
| 31 |
+
"k_proj",
|
| 32 |
+
"down_proj"
|
| 33 |
+
],
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_rslora": false
|
| 37 |
+
}
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/added_tokens.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4c9bdb7f1fdf439aa0f3c5fb41c3ce23e5e6e873bea3f378cf26a709d3a3d22
|
| 3 |
+
size 15024
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_1.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f195cb3d44682c30ad9c0e1b320d29a952dc22676a666d5b7c0a105f554e012b
|
| 3 |
+
size 15024
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_2.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d644f876963b59d7d58322d0dbd4f84b5f005eb85a095c14ef20d7e8528948b
|
| 3 |
+
size 15024
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/rng_state_3.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2835319bf595568b23d432fbbab931291be0d746234b19ee4344a5852238e357
|
| 3 |
+
size 15024
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/scaler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba165e391bcfa2e1188f6c4a775e972bb6f49e4c5970a96da748324529cedb20
|
| 3 |
+
size 988
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a175fc835d2784e8615427cf828af918ee04b274e34925b9edf89d29106ab1c1
|
| 3 |
+
size 1064
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-800/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:277bbf113ecf76ec5b62586e2b4fa91501b2571b1380f4721de69ef68675511f
|
| 3 |
+
size 5432
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/added_tokens.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:617fa12ac6cb39783256991c6577b58ec2981bdfd4cdfb58008163c743049429
|
| 3 |
+
size 1064
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/special_tokens_map.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"unk_token": {
|
| 24 |
+
"content": "<unk>",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
}
|
| 30 |
+
}
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/tokenizer_config.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/trainer_state.json
ADDED
|
@@ -0,0 +1,692 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 946,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.021144442976080348,
|
| 14 |
+
"grad_norm": 6.2254319190979,
|
| 15 |
+
"learning_rate": 3.3333333333333335e-05,
|
| 16 |
+
"loss": 3.0149,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.042288885952160696,
|
| 21 |
+
"grad_norm": 0.7327573895454407,
|
| 22 |
+
"learning_rate": 7.500000000000001e-05,
|
| 23 |
+
"loss": 1.623,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.06343332892824105,
|
| 28 |
+
"grad_norm": 0.5618261098861694,
|
| 29 |
+
"learning_rate": 0.00011250000000000001,
|
| 30 |
+
"loss": 1.1099,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.08457777190432139,
|
| 35 |
+
"grad_norm": 0.48980122804641724,
|
| 36 |
+
"learning_rate": 0.00015416666666666668,
|
| 37 |
+
"loss": 0.9131,
|
| 38 |
+
"step": 40
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.10572221488040175,
|
| 42 |
+
"grad_norm": 0.488565593957901,
|
| 43 |
+
"learning_rate": 0.00019583333333333334,
|
| 44 |
+
"loss": 0.7716,
|
| 45 |
+
"step": 50
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.1268666578564821,
|
| 49 |
+
"grad_norm": 0.4297373592853546,
|
| 50 |
+
"learning_rate": 0.0001979955456570156,
|
| 51 |
+
"loss": 0.723,
|
| 52 |
+
"step": 60
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.14801110083256244,
|
| 56 |
+
"grad_norm": 0.4536869525909424,
|
| 57 |
+
"learning_rate": 0.0001957683741648107,
|
| 58 |
+
"loss": 0.6879,
|
| 59 |
+
"step": 70
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.16915554380864278,
|
| 63 |
+
"grad_norm": 0.41550788283348083,
|
| 64 |
+
"learning_rate": 0.0001935412026726058,
|
| 65 |
+
"loss": 0.6586,
|
| 66 |
+
"step": 80
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.19029998678472315,
|
| 70 |
+
"grad_norm": 0.38494592905044556,
|
| 71 |
+
"learning_rate": 0.0001913140311804009,
|
| 72 |
+
"loss": 0.6324,
|
| 73 |
+
"step": 90
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2114444297608035,
|
| 77 |
+
"grad_norm": 0.3633956015110016,
|
| 78 |
+
"learning_rate": 0.000189086859688196,
|
| 79 |
+
"loss": 0.631,
|
| 80 |
+
"step": 100
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.23258887273688383,
|
| 84 |
+
"grad_norm": 0.3775959312915802,
|
| 85 |
+
"learning_rate": 0.0001868596881959911,
|
| 86 |
+
"loss": 0.6103,
|
| 87 |
+
"step": 110
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.2537333157129642,
|
| 91 |
+
"grad_norm": 0.35080480575561523,
|
| 92 |
+
"learning_rate": 0.0001846325167037862,
|
| 93 |
+
"loss": 0.6159,
|
| 94 |
+
"step": 120
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.2748777586890445,
|
| 98 |
+
"grad_norm": 0.4399946928024292,
|
| 99 |
+
"learning_rate": 0.0001824053452115813,
|
| 100 |
+
"loss": 0.5983,
|
| 101 |
+
"step": 130
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.2960222016651249,
|
| 105 |
+
"grad_norm": 0.4049876928329468,
|
| 106 |
+
"learning_rate": 0.0001801781737193764,
|
| 107 |
+
"loss": 0.5881,
|
| 108 |
+
"step": 140
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.31716664464120525,
|
| 112 |
+
"grad_norm": 0.3834834396839142,
|
| 113 |
+
"learning_rate": 0.0001779510022271715,
|
| 114 |
+
"loss": 0.5703,
|
| 115 |
+
"step": 150
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.33831108761728557,
|
| 119 |
+
"grad_norm": 0.3201199471950531,
|
| 120 |
+
"learning_rate": 0.0001757238307349666,
|
| 121 |
+
"loss": 0.5777,
|
| 122 |
+
"step": 160
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.35945553059336594,
|
| 126 |
+
"grad_norm": 0.3475135564804077,
|
| 127 |
+
"learning_rate": 0.0001734966592427617,
|
| 128 |
+
"loss": 0.5627,
|
| 129 |
+
"step": 170
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.3805999735694463,
|
| 133 |
+
"grad_norm": 0.3944849371910095,
|
| 134 |
+
"learning_rate": 0.0001712694877505568,
|
| 135 |
+
"loss": 0.569,
|
| 136 |
+
"step": 180
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.4017444165455266,
|
| 140 |
+
"grad_norm": 0.3674592673778534,
|
| 141 |
+
"learning_rate": 0.0001690423162583519,
|
| 142 |
+
"loss": 0.5621,
|
| 143 |
+
"step": 190
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.422888859521607,
|
| 147 |
+
"grad_norm": 0.3651335835456848,
|
| 148 |
+
"learning_rate": 0.000166815144766147,
|
| 149 |
+
"loss": 0.5597,
|
| 150 |
+
"step": 200
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.4440333024976873,
|
| 154 |
+
"grad_norm": 0.3435162901878357,
|
| 155 |
+
"learning_rate": 0.0001645879732739421,
|
| 156 |
+
"loss": 0.5538,
|
| 157 |
+
"step": 210
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.46517774547376767,
|
| 161 |
+
"grad_norm": 0.3870578408241272,
|
| 162 |
+
"learning_rate": 0.0001623608017817372,
|
| 163 |
+
"loss": 0.5574,
|
| 164 |
+
"step": 220
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.48632218844984804,
|
| 168 |
+
"grad_norm": 0.40335071086883545,
|
| 169 |
+
"learning_rate": 0.0001601336302895323,
|
| 170 |
+
"loss": 0.5394,
|
| 171 |
+
"step": 230
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.5074666314259284,
|
| 175 |
+
"grad_norm": 0.3105282187461853,
|
| 176 |
+
"learning_rate": 0.0001579064587973274,
|
| 177 |
+
"loss": 0.5403,
|
| 178 |
+
"step": 240
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.5286110744020087,
|
| 182 |
+
"grad_norm": 0.3729188144207001,
|
| 183 |
+
"learning_rate": 0.00015567928730512252,
|
| 184 |
+
"loss": 0.5466,
|
| 185 |
+
"step": 250
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.549755517378089,
|
| 189 |
+
"grad_norm": 0.3619287312030792,
|
| 190 |
+
"learning_rate": 0.0001534521158129176,
|
| 191 |
+
"loss": 0.5305,
|
| 192 |
+
"step": 260
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.5708999603541695,
|
| 196 |
+
"grad_norm": 0.34232136607170105,
|
| 197 |
+
"learning_rate": 0.0001512249443207127,
|
| 198 |
+
"loss": 0.5319,
|
| 199 |
+
"step": 270
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.5920444033302498,
|
| 203 |
+
"grad_norm": 0.38660332560539246,
|
| 204 |
+
"learning_rate": 0.0001489977728285078,
|
| 205 |
+
"loss": 0.5242,
|
| 206 |
+
"step": 280
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.6131888463063301,
|
| 210 |
+
"grad_norm": 0.35314109921455383,
|
| 211 |
+
"learning_rate": 0.0001467706013363029,
|
| 212 |
+
"loss": 0.5255,
|
| 213 |
+
"step": 290
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.6343332892824105,
|
| 217 |
+
"grad_norm": 0.3418401777744293,
|
| 218 |
+
"learning_rate": 0.00014454342984409802,
|
| 219 |
+
"loss": 0.5357,
|
| 220 |
+
"step": 300
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.6554777322584908,
|
| 224 |
+
"grad_norm": 0.357149213552475,
|
| 225 |
+
"learning_rate": 0.0001423162583518931,
|
| 226 |
+
"loss": 0.5131,
|
| 227 |
+
"step": 310
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.6766221752345711,
|
| 231 |
+
"grad_norm": 0.3720100224018097,
|
| 232 |
+
"learning_rate": 0.0001400890868596882,
|
| 233 |
+
"loss": 0.5072,
|
| 234 |
+
"step": 320
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.6977666182106516,
|
| 238 |
+
"grad_norm": 0.342650443315506,
|
| 239 |
+
"learning_rate": 0.0001378619153674833,
|
| 240 |
+
"loss": 0.5194,
|
| 241 |
+
"step": 330
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 0.7189110611867319,
|
| 245 |
+
"grad_norm": 0.34781211614608765,
|
| 246 |
+
"learning_rate": 0.00013563474387527841,
|
| 247 |
+
"loss": 0.5094,
|
| 248 |
+
"step": 340
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 0.7400555041628122,
|
| 252 |
+
"grad_norm": 0.3401576280593872,
|
| 253 |
+
"learning_rate": 0.00013340757238307352,
|
| 254 |
+
"loss": 0.5192,
|
| 255 |
+
"step": 350
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 0.7611999471388926,
|
| 259 |
+
"grad_norm": 0.3490856885910034,
|
| 260 |
+
"learning_rate": 0.0001311804008908686,
|
| 261 |
+
"loss": 0.5045,
|
| 262 |
+
"step": 360
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 0.7823443901149729,
|
| 266 |
+
"grad_norm": 0.3488720655441284,
|
| 267 |
+
"learning_rate": 0.0001289532293986637,
|
| 268 |
+
"loss": 0.502,
|
| 269 |
+
"step": 370
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 0.8034888330910532,
|
| 273 |
+
"grad_norm": 0.37278613448143005,
|
| 274 |
+
"learning_rate": 0.00012672605790645878,
|
| 275 |
+
"loss": 0.5038,
|
| 276 |
+
"step": 380
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 0.8246332760671337,
|
| 280 |
+
"grad_norm": 0.3677748441696167,
|
| 281 |
+
"learning_rate": 0.00012449888641425391,
|
| 282 |
+
"loss": 0.505,
|
| 283 |
+
"step": 390
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 0.845777719043214,
|
| 287 |
+
"grad_norm": 0.3815574049949646,
|
| 288 |
+
"learning_rate": 0.00012227171492204902,
|
| 289 |
+
"loss": 0.4997,
|
| 290 |
+
"step": 400
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 0.8669221620192943,
|
| 294 |
+
"grad_norm": 0.37245893478393555,
|
| 295 |
+
"learning_rate": 0.0001200445434298441,
|
| 296 |
+
"loss": 0.4989,
|
| 297 |
+
"step": 410
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 0.8880666049953746,
|
| 301 |
+
"grad_norm": 0.3642374277114868,
|
| 302 |
+
"learning_rate": 0.0001178173719376392,
|
| 303 |
+
"loss": 0.4992,
|
| 304 |
+
"step": 420
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 0.909211047971455,
|
| 308 |
+
"grad_norm": 0.32838189601898193,
|
| 309 |
+
"learning_rate": 0.0001155902004454343,
|
| 310 |
+
"loss": 0.4947,
|
| 311 |
+
"step": 430
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 0.9303554909475353,
|
| 315 |
+
"grad_norm": 0.36527854204177856,
|
| 316 |
+
"learning_rate": 0.00011336302895322941,
|
| 317 |
+
"loss": 0.4952,
|
| 318 |
+
"step": 440
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 0.9514999339236156,
|
| 322 |
+
"grad_norm": 0.3686304986476898,
|
| 323 |
+
"learning_rate": 0.0001111358574610245,
|
| 324 |
+
"loss": 0.4964,
|
| 325 |
+
"step": 450
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 0.9726443768996961,
|
| 329 |
+
"grad_norm": 0.3496793210506439,
|
| 330 |
+
"learning_rate": 0.0001089086859688196,
|
| 331 |
+
"loss": 0.4827,
|
| 332 |
+
"step": 460
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 0.9937888198757764,
|
| 336 |
+
"grad_norm": 0.3722958266735077,
|
| 337 |
+
"learning_rate": 0.0001066815144766147,
|
| 338 |
+
"loss": 0.4838,
|
| 339 |
+
"step": 470
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 1.0148011100832564,
|
| 343 |
+
"grad_norm": 0.3902372717857361,
|
| 344 |
+
"learning_rate": 0.00010445434298440981,
|
| 345 |
+
"loss": 0.4696,
|
| 346 |
+
"step": 480
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 1.0359455530593367,
|
| 350 |
+
"grad_norm": 0.3780229687690735,
|
| 351 |
+
"learning_rate": 0.00010222717149220491,
|
| 352 |
+
"loss": 0.4686,
|
| 353 |
+
"step": 490
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 1.057089996035417,
|
| 357 |
+
"grad_norm": 0.3552299737930298,
|
| 358 |
+
"learning_rate": 0.0001,
|
| 359 |
+
"loss": 0.457,
|
| 360 |
+
"step": 500
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 1.0782344390114973,
|
| 364 |
+
"grad_norm": 0.3887428045272827,
|
| 365 |
+
"learning_rate": 9.77728285077951e-05,
|
| 366 |
+
"loss": 0.4735,
|
| 367 |
+
"step": 510
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 1.0993788819875776,
|
| 371 |
+
"grad_norm": 0.3928622603416443,
|
| 372 |
+
"learning_rate": 9.55456570155902e-05,
|
| 373 |
+
"loss": 0.4675,
|
| 374 |
+
"step": 520
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 1.120523324963658,
|
| 378 |
+
"grad_norm": 0.3686327636241913,
|
| 379 |
+
"learning_rate": 9.331848552338531e-05,
|
| 380 |
+
"loss": 0.4804,
|
| 381 |
+
"step": 530
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"epoch": 1.1416677679397385,
|
| 385 |
+
"grad_norm": 0.35772374272346497,
|
| 386 |
+
"learning_rate": 9.109131403118041e-05,
|
| 387 |
+
"loss": 0.4609,
|
| 388 |
+
"step": 540
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 1.1628122109158188,
|
| 392 |
+
"grad_norm": 0.35283800959587097,
|
| 393 |
+
"learning_rate": 8.88641425389755e-05,
|
| 394 |
+
"loss": 0.4693,
|
| 395 |
+
"step": 550
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"epoch": 1.183956653891899,
|
| 399 |
+
"grad_norm": 0.37653160095214844,
|
| 400 |
+
"learning_rate": 8.663697104677061e-05,
|
| 401 |
+
"loss": 0.4551,
|
| 402 |
+
"step": 560
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 1.2051010968679794,
|
| 406 |
+
"grad_norm": 0.35314637422561646,
|
| 407 |
+
"learning_rate": 8.44097995545657e-05,
|
| 408 |
+
"loss": 0.4539,
|
| 409 |
+
"step": 570
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"epoch": 1.2262455398440597,
|
| 413 |
+
"grad_norm": 0.35260340571403503,
|
| 414 |
+
"learning_rate": 8.21826280623608e-05,
|
| 415 |
+
"loss": 0.4531,
|
| 416 |
+
"step": 580
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"epoch": 1.24738998282014,
|
| 420 |
+
"grad_norm": 0.3616096079349518,
|
| 421 |
+
"learning_rate": 7.995545657015591e-05,
|
| 422 |
+
"loss": 0.4645,
|
| 423 |
+
"step": 590
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"epoch": 1.2685344257962203,
|
| 427 |
+
"grad_norm": 0.3933924436569214,
|
| 428 |
+
"learning_rate": 7.7728285077951e-05,
|
| 429 |
+
"loss": 0.4469,
|
| 430 |
+
"step": 600
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 1.2896788687723006,
|
| 434 |
+
"grad_norm": 0.3878353536128998,
|
| 435 |
+
"learning_rate": 7.550111358574611e-05,
|
| 436 |
+
"loss": 0.467,
|
| 437 |
+
"step": 610
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"epoch": 1.3108233117483812,
|
| 441 |
+
"grad_norm": 0.41165846586227417,
|
| 442 |
+
"learning_rate": 7.32739420935412e-05,
|
| 443 |
+
"loss": 0.4504,
|
| 444 |
+
"step": 620
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"epoch": 1.3319677547244615,
|
| 448 |
+
"grad_norm": 0.36190614104270935,
|
| 449 |
+
"learning_rate": 7.10467706013363e-05,
|
| 450 |
+
"loss": 0.4517,
|
| 451 |
+
"step": 630
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 1.3531121977005418,
|
| 455 |
+
"grad_norm": 0.3983185887336731,
|
| 456 |
+
"learning_rate": 6.881959910913141e-05,
|
| 457 |
+
"loss": 0.444,
|
| 458 |
+
"step": 640
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 1.3742566406766221,
|
| 462 |
+
"grad_norm": 0.38672661781311035,
|
| 463 |
+
"learning_rate": 6.659242761692652e-05,
|
| 464 |
+
"loss": 0.4488,
|
| 465 |
+
"step": 650
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"epoch": 1.3954010836527027,
|
| 469 |
+
"grad_norm": 0.36232879757881165,
|
| 470 |
+
"learning_rate": 6.436525612472161e-05,
|
| 471 |
+
"loss": 0.4371,
|
| 472 |
+
"step": 660
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"epoch": 1.416545526628783,
|
| 476 |
+
"grad_norm": 0.40571126341819763,
|
| 477 |
+
"learning_rate": 6.21380846325167e-05,
|
| 478 |
+
"loss": 0.4427,
|
| 479 |
+
"step": 670
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"epoch": 1.4376899696048633,
|
| 483 |
+
"grad_norm": 0.36234796047210693,
|
| 484 |
+
"learning_rate": 5.9910913140311805e-05,
|
| 485 |
+
"loss": 0.4439,
|
| 486 |
+
"step": 680
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"epoch": 1.4588344125809436,
|
| 490 |
+
"grad_norm": 0.4014786183834076,
|
| 491 |
+
"learning_rate": 5.7683741648106904e-05,
|
| 492 |
+
"loss": 0.4548,
|
| 493 |
+
"step": 690
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"epoch": 1.479978855557024,
|
| 497 |
+
"grad_norm": 0.3884125053882599,
|
| 498 |
+
"learning_rate": 5.545657015590201e-05,
|
| 499 |
+
"loss": 0.4531,
|
| 500 |
+
"step": 700
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"epoch": 1.5011232985331042,
|
| 504 |
+
"grad_norm": 0.3621061146259308,
|
| 505 |
+
"learning_rate": 5.322939866369711e-05,
|
| 506 |
+
"loss": 0.4407,
|
| 507 |
+
"step": 710
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"epoch": 1.5222677415091845,
|
| 511 |
+
"grad_norm": 0.3601549565792084,
|
| 512 |
+
"learning_rate": 5.100222717149221e-05,
|
| 513 |
+
"loss": 0.439,
|
| 514 |
+
"step": 720
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 1.5434121844852648,
|
| 518 |
+
"grad_norm": 0.37766754627227783,
|
| 519 |
+
"learning_rate": 4.8775055679287305e-05,
|
| 520 |
+
"loss": 0.4397,
|
| 521 |
+
"step": 730
|
| 522 |
+
},
|
| 523 |
+
{
|
| 524 |
+
"epoch": 1.5645566274613452,
|
| 525 |
+
"grad_norm": 0.38728606700897217,
|
| 526 |
+
"learning_rate": 4.654788418708241e-05,
|
| 527 |
+
"loss": 0.4455,
|
| 528 |
+
"step": 740
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"epoch": 1.5857010704374257,
|
| 532 |
+
"grad_norm": 0.3532933294773102,
|
| 533 |
+
"learning_rate": 4.432071269487751e-05,
|
| 534 |
+
"loss": 0.4375,
|
| 535 |
+
"step": 750
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"epoch": 1.606845513413506,
|
| 539 |
+
"grad_norm": 0.37484633922576904,
|
| 540 |
+
"learning_rate": 4.209354120267261e-05,
|
| 541 |
+
"loss": 0.4386,
|
| 542 |
+
"step": 760
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"epoch": 1.6279899563895863,
|
| 546 |
+
"grad_norm": 0.40252485871315,
|
| 547 |
+
"learning_rate": 3.986636971046771e-05,
|
| 548 |
+
"loss": 0.4394,
|
| 549 |
+
"step": 770
|
| 550 |
+
},
|
| 551 |
+
{
|
| 552 |
+
"epoch": 1.6491343993656669,
|
| 553 |
+
"grad_norm": 0.3895283043384552,
|
| 554 |
+
"learning_rate": 3.7639198218262804e-05,
|
| 555 |
+
"loss": 0.4356,
|
| 556 |
+
"step": 780
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"epoch": 1.6702788423417472,
|
| 560 |
+
"grad_norm": 0.4058088958263397,
|
| 561 |
+
"learning_rate": 3.541202672605791e-05,
|
| 562 |
+
"loss": 0.4461,
|
| 563 |
+
"step": 790
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"epoch": 1.6914232853178275,
|
| 567 |
+
"grad_norm": 0.40314358472824097,
|
| 568 |
+
"learning_rate": 3.318485523385301e-05,
|
| 569 |
+
"loss": 0.4311,
|
| 570 |
+
"step": 800
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 1.7125677282939078,
|
| 574 |
+
"grad_norm": 0.384658545255661,
|
| 575 |
+
"learning_rate": 3.095768374164811e-05,
|
| 576 |
+
"loss": 0.4363,
|
| 577 |
+
"step": 810
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"epoch": 1.7337121712699881,
|
| 581 |
+
"grad_norm": 0.3810129463672638,
|
| 582 |
+
"learning_rate": 2.873051224944321e-05,
|
| 583 |
+
"loss": 0.4383,
|
| 584 |
+
"step": 820
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"epoch": 1.7548566142460684,
|
| 588 |
+
"grad_norm": 0.39279329776763916,
|
| 589 |
+
"learning_rate": 2.650334075723831e-05,
|
| 590 |
+
"loss": 0.4228,
|
| 591 |
+
"step": 830
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"epoch": 1.7760010572221487,
|
| 595 |
+
"grad_norm": 0.39959919452667236,
|
| 596 |
+
"learning_rate": 2.427616926503341e-05,
|
| 597 |
+
"loss": 0.4262,
|
| 598 |
+
"step": 840
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"epoch": 1.797145500198229,
|
| 602 |
+
"grad_norm": 0.3827113211154938,
|
| 603 |
+
"learning_rate": 2.2048997772828508e-05,
|
| 604 |
+
"loss": 0.4311,
|
| 605 |
+
"step": 850
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"epoch": 1.8182899431743094,
|
| 609 |
+
"grad_norm": 0.39276352524757385,
|
| 610 |
+
"learning_rate": 1.982182628062361e-05,
|
| 611 |
+
"loss": 0.4341,
|
| 612 |
+
"step": 860
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"epoch": 1.83943438615039,
|
| 616 |
+
"grad_norm": 0.38558751344680786,
|
| 617 |
+
"learning_rate": 1.759465478841871e-05,
|
| 618 |
+
"loss": 0.4207,
|
| 619 |
+
"step": 870
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"epoch": 1.8605788291264702,
|
| 623 |
+
"grad_norm": 0.4052915573120117,
|
| 624 |
+
"learning_rate": 1.5367483296213807e-05,
|
| 625 |
+
"loss": 0.4254,
|
| 626 |
+
"step": 880
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 1.8817232721025505,
|
| 630 |
+
"grad_norm": 0.3884909749031067,
|
| 631 |
+
"learning_rate": 1.3140311804008909e-05,
|
| 632 |
+
"loss": 0.4198,
|
| 633 |
+
"step": 890
|
| 634 |
+
},
|
| 635 |
+
{
|
| 636 |
+
"epoch": 1.902867715078631,
|
| 637 |
+
"grad_norm": 0.39251548051834106,
|
| 638 |
+
"learning_rate": 1.091314031180401e-05,
|
| 639 |
+
"loss": 0.4312,
|
| 640 |
+
"step": 900
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"epoch": 1.9240121580547114,
|
| 644 |
+
"grad_norm": 0.382098525762558,
|
| 645 |
+
"learning_rate": 8.685968819599109e-06,
|
| 646 |
+
"loss": 0.4257,
|
| 647 |
+
"step": 910
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"epoch": 1.9451566010307917,
|
| 651 |
+
"grad_norm": 0.3773449957370758,
|
| 652 |
+
"learning_rate": 6.45879732739421e-06,
|
| 653 |
+
"loss": 0.4215,
|
| 654 |
+
"step": 920
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"epoch": 1.966301044006872,
|
| 658 |
+
"grad_norm": 0.39837542176246643,
|
| 659 |
+
"learning_rate": 4.231625835189309e-06,
|
| 660 |
+
"loss": 0.4232,
|
| 661 |
+
"step": 930
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"epoch": 1.9874454869829523,
|
| 665 |
+
"grad_norm": 0.38558995723724365,
|
| 666 |
+
"learning_rate": 2.00445434298441e-06,
|
| 667 |
+
"loss": 0.4254,
|
| 668 |
+
"step": 940
|
| 669 |
+
}
|
| 670 |
+
],
|
| 671 |
+
"logging_steps": 10,
|
| 672 |
+
"max_steps": 946,
|
| 673 |
+
"num_input_tokens_seen": 0,
|
| 674 |
+
"num_train_epochs": 2,
|
| 675 |
+
"save_steps": 200,
|
| 676 |
+
"stateful_callbacks": {
|
| 677 |
+
"TrainerControl": {
|
| 678 |
+
"args": {
|
| 679 |
+
"should_epoch_stop": false,
|
| 680 |
+
"should_evaluate": false,
|
| 681 |
+
"should_log": false,
|
| 682 |
+
"should_save": true,
|
| 683 |
+
"should_training_stop": true
|
| 684 |
+
},
|
| 685 |
+
"attributes": {}
|
| 686 |
+
}
|
| 687 |
+
},
|
| 688 |
+
"total_flos": 6.876005077664924e+18,
|
| 689 |
+
"train_batch_size": 1,
|
| 690 |
+
"trial_name": null,
|
| 691 |
+
"trial_params": null
|
| 692 |
+
}
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/checkpoint-946/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:277bbf113ecf76ec5b62586e2b4fa91501b2571b1380f4721de69ef68675511f
|
| 3 |
+
size 5432
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/special_tokens_map.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"unk_token": {
|
| 24 |
+
"content": "<unk>",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
}
|
| 30 |
+
}
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/tokenizer_config.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/adapter/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:277bbf113ecf76ec5b62586e2b4fa91501b2571b1380f4721de69ef68675511f
|
| 3 |
+
size 5432
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/TSAD_test_metrics.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"split": "TSAD_test",
|
| 3 |
+
"num_dataset_samples": 6034,
|
| 4 |
+
"num_prediction_samples": 6034,
|
| 5 |
+
"num_valid_samples": 5980,
|
| 6 |
+
"num_missing_predictions": 0,
|
| 7 |
+
"num_invalid_predictions": 54,
|
| 8 |
+
"type_accuracy": 0.19581939799331102,
|
| 9 |
+
"type_precision_macro": 0.14972735341824286,
|
| 10 |
+
"type_recall_macro": 0.2321495128168519,
|
| 11 |
+
"type_f1_macro": 0.11749066864105973,
|
| 12 |
+
"binary_accuracy": 0.8377926421404682,
|
| 13 |
+
"binary_precision_macro": 0.7210006797954495,
|
| 14 |
+
"binary_recall_macro": 0.7288534724234521,
|
| 15 |
+
"binary_f1_macro": 0.7247792958267192
|
| 16 |
+
}
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank0.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank1.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank2.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/sft/eval/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.rank3.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/TSAD_test_metrics.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"split": "TSAD_test",
|
| 3 |
+
"num_dataset_samples": 6034,
|
| 4 |
+
"num_prediction_samples": 6034,
|
| 5 |
+
"num_valid_samples": 6034,
|
| 6 |
+
"num_missing_predictions": 0,
|
| 7 |
+
"num_invalid_predictions": 0,
|
| 8 |
+
"type_accuracy": 0.14053695724229368,
|
| 9 |
+
"type_precision_macro": 0.016068805185920968,
|
| 10 |
+
"type_recall_macro": 0.0665676819309319,
|
| 11 |
+
"type_f1_macro": 0.025105704354246135,
|
| 12 |
+
"binary_accuracy": 0.3667550546900895,
|
| 13 |
+
"binary_precision_macro": 0.519821130521865,
|
| 14 |
+
"binary_recall_macro": 0.5270888471072895,
|
| 15 |
+
"binary_f1_macro": 0.36091116180192573
|
| 16 |
+
}
|
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank0.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank1.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank2.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/outputs/pipeline_20260608_175250/zeroshot/shards/RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.rank3.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rats40k_adapter/rats40k_common.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib.util
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Directory one level above the folder that contains this file.
CHAT_TIME_DIR = Path(__file__).resolve().parents[1]
# Prompt module expected in a "Time-RA" directory that sits next to
# CHAT_TIME_DIR; load_time_ra_prompt_template() treats it as optional.
TIME_RA_PROMPT_PATH = (
    CHAT_TIME_DIR.parent / "Time-RA" / "prompts" / "prompt_llama_anoclf_reason.py"
)
# Cache for load_time_ra_prompt_template(): the template value and a flag
# recording whether a load has already been attempted.
_TIME_RA_PROMPT_TEMPLATE = None
_TIME_RA_PROMPT_TEMPLATE_LOADED = False
# Put CHAT_TIME_DIR on sys.path (once) — presumably so the `utils.*` imports
# that follow resolve when this file is run from another directory.
if str(CHAT_TIME_DIR) not in sys.path:
    sys.path.insert(0, str(CHAT_TIME_DIR))
|
| 19 |
+
|
| 20 |
+
from utils.prompt import getPrompt # noqa: E402
|
| 21 |
+
from utils.tools import Discretizer, Serializer # noqa: E402
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Display name for every ActionID (0..14). ID 0 is the only non-anomalous
# class; build_prediction() maps it to binary Label 0 and every other ID to 1.
ACTION_ID_TO_NAME = {
    0: "Normal Sequence",
    1: "Point Anomaly",
    2: "Periodic Change Anomaly",
    3: "Trend Change Anomaly",
    4: "Change Point Anomaly",
    5: "Distributional Change Anomaly",
    6: "Amplitude Anomaly",
    7: "Pattern Change Anomaly",
    8: "Sparse Anomaly",
    9: "Repeated Value Anomaly",
    10: "Sudden Flatline Anomaly",
    11: "Drift Anomaly",
    12: "Sudden Spike Anomaly",
    13: "Continuous Segment Anomaly",
    14: "Nonlinear Pattern Anomaly",
}
|
| 41 |
+
|
| 42 |
+
# One-sentence definition for each ActionID; keys mirror ACTION_ID_TO_NAME.
# action_mapping_text() joins both tables into the fallback instruction text.
ACTION_DESCRIPTIONS = {
    0: "There are no abnormal situations in this time series.",
    1: "A single data point significantly deviates from the local or global pattern.",
    2: "The original periodic pattern is disrupted.",
    3: "A sudden change appears in the long-term trend.",
    4: "Statistical properties such as mean or variance change abruptly.",
    5: "The statistical distribution changes significantly.",
    6: "The amplitude exceeds normal upper or lower bounds.",
    7: "The pattern suddenly changes from one form to another.",
    8: "Isolated anomalous patterns occasionally appear in a long series.",
    9: "Continuous or intermittent repeated values disrupt normal fluctuations.",
    10: "The series suddenly becomes a flat line with no normal fluctuations.",
    11: "The data gradually drifts away from the normal level.",
    12: "The data suddenly spikes or drops briefly and then returns to normal.",
    13: "A continuous segment deviates from the normal pattern.",
    14: "Nonlinear changes break the original linear rule.",
}
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def action_mapping_text():
    """Render the ActionID catalogue as text.

    Returns:
        One ``"<id>. <name>: <description>"`` line per ActionID, in ascending
        ID order, joined with newlines.
    """
    return "\n".join(
        f"{key}. {ACTION_ID_TO_NAME[key]}: {ACTION_DESCRIPTIONS[key]}"
        for key in sorted(ACTION_ID_TO_NAME)
    )
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def build_instruction(source):
    """Build the instruction text for one sample.

    When the optional Time-RA prompt template can be loaded it is formatted
    with the sample's source and a fixed pointer to the ``### Input`` section;
    otherwise a built-in instruction that lists every ActionID is returned.

    Args:
        source: Name of the domain the series comes from. Any falsy value is
            rendered as ``"unknown"``.

    Returns:
        The instruction string.
    """
    # Computed once; both the template branch and the fallback use it.
    source_text = source or "unknown"
    time_ra_template = load_time_ra_prompt_template()
    if time_ra_template:
        return time_ra_template.format(
            our_source=source_text,
            our_observation="the serialized time series provided in the ### Input section",
        )

    return (
        "Classify the provided univariate time series for anomaly detection. "
        "The sequence is from the domain of "
        f"{source_text}.\n\n"
        "Use exactly one ActionID from the following mapping:\n"
        f"{action_mapping_text()}\n\n"
        "Return exactly two fields: Thought and ActionID. "
        "Do not return a category name instead of ActionID."
    )
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def load_time_ra_prompt_template():
    """Return the Time-RA ``USER_DETECTION_PROMPT`` template, loading it once.

    The prompt module at ``TIME_RA_PROMPT_PATH`` is optional. The outcome of
    the first completed load attempt (the template, or ``None`` when the file,
    its import spec, or the attribute is missing) is cached in module globals
    and returned by every later call.

    Returns:
        The value of ``USER_DETECTION_PROMPT`` from the prompt module, or
        ``None`` when it is not available.

    Raises:
        Exception: Whatever executing the prompt module raises. The failure is
            not cached, so later calls raise again instead of silently
            switching to the fallback prompt in the middle of a run.
    """
    global _TIME_RA_PROMPT_TEMPLATE, _TIME_RA_PROMPT_TEMPLATE_LOADED
    if _TIME_RA_PROMPT_TEMPLATE_LOADED:
        return _TIME_RA_PROMPT_TEMPLATE

    template = None
    if TIME_RA_PROMPT_PATH.exists():
        spec = importlib.util.spec_from_file_location(
            "time_ra_prompt_llama_anoclf_reason",
            TIME_RA_PROMPT_PATH,
        )
        if spec is not None and spec.loader is not None:
            module = importlib.util.module_from_spec(spec)
            # May raise; the cache flag is deliberately still unset here.
            spec.loader.exec_module(module)
            template = getattr(module, "USER_DETECTION_PROMPT", None)

    # Only a completed attempt is remembered.
    _TIME_RA_PROMPT_TEMPLATE = template
    _TIME_RA_PROMPT_TEMPLATE_LOADED = True
    return template
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def _to_float_array(series):
|
| 112 |
+
if isinstance(series, np.ndarray):
|
| 113 |
+
arr = series.astype(float, copy=False)
|
| 114 |
+
else:
|
| 115 |
+
arr = np.asarray(series, dtype=float)
|
| 116 |
+
if arr.ndim != 1:
|
| 117 |
+
arr = arr.reshape(-1)
|
| 118 |
+
return arr
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def serialize_observation(series):
    """Turn a raw series into the serialized form used as prompt input.

    The series is flattened to a float array, passed through a fresh
    ``Discretizer`` and the result through a fresh ``Serializer``.
    """
    flat = _to_float_array(series)
    discretizer, serializer = Discretizer(), Serializer()
    discretized = discretizer.discretize(flat)
    return serializer.serialize(discretized)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def build_prompt(series, source, response=None):
    """Assemble the full prompt for one sample via ``getPrompt``.

    Args:
        series: Observation values; serialized with ``serialize_observation``.
        source: Domain name forwarded to ``build_instruction``.
        response: Optional response text; ``None`` is passed on as ``""``.

    Returns:
        Whatever ``getPrompt`` returns for the ``"analysis"`` flag.
    """
    prompt_fields = {
        "flag": "analysis",
        "instruction": build_instruction(source),
        "input": serialize_observation(series),
        "response": response if response is not None else "",
    }
    return getPrompt(**prompt_fields)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def build_response(thought, action_id):
    """Format a target response as ``Thought: ...`` / ``ActionID: ...`` lines.

    Args:
        thought: Reasoning text; falsy values become an empty string and
            surrounding whitespace is stripped.
        action_id: Label to emit; values ``int()`` cannot convert are written
            as ``-1``.

    Returns:
        The two-line response string.
    """
    cleaned = (thought or "").strip()
    try:
        label = int(action_id)
    except (TypeError, ValueError):
        label = -1
    return f"Thought: {cleaned}\nActionID: {label}"
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def load_dataset_json(path):
    """Read the UTF-8 JSON file at *path* and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def valid_split_items(data, split):
    """List the usable samples of one dataset split.

    A sample is kept when it is a dict whose ``"Observation"`` value is a
    list. Keys are converted to ``str`` and the result is ordered with
    ``_sort_key``.

    Args:
        data: Mapping from split name to a mapping of sample id to sample.
        split: Name of the split to read.

    Returns:
        A list of ``(id, sample)`` pairs; empty when the split is missing or
        is not a dict.
    """
    split_data = data.get(split, {})
    if not isinstance(split_data, dict):
        return []

    kept = []
    for idx, item in split_data.items():
        if not isinstance(item, dict):
            continue
        if not isinstance(item.get("Observation"), list):
            continue
        kept.append((str(idx), item))

    kept.sort(key=lambda pair: _sort_key(pair[0]))
    return kept
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _sort_key(value):
|
| 166 |
+
try:
|
| 167 |
+
return (0, int(value))
|
| 168 |
+
except (TypeError, ValueError):
|
| 169 |
+
return (1, str(value))
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def _json_candidates(text):
|
| 173 |
+
text = "" if text is None else str(text).strip()
|
| 174 |
+
yield text
|
| 175 |
+
fenced = re.findall(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.S | re.I)
|
| 176 |
+
for snippet in fenced:
|
| 177 |
+
yield snippet
|
| 178 |
+
match = re.search(r"\{.*\}", text, flags=re.S)
|
| 179 |
+
if match:
|
| 180 |
+
yield match.group(0)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def parse_model_response(response):
    """Extract ``(thought, action_id)`` from raw model output.

    Strategies are tried in this order and the first valid ActionID wins:

    1. A JSON object (whole text, fenced block, or outermost braces) with a
       thought key and an action key.
    2. Labelled fields in free text such as ``ActionID: 3`` or ``Label: 3``
       (ASCII or full-width colon).
    3. The first standalone one- or two-digit number in the text.
    4. A category name from ``ACTION_ID_TO_NAME`` appearing in the text.

    Args:
        response: Raw model output; ``None`` is treated as an empty string.

    Returns:
        ``(thought, action_id)`` where ``thought`` is the extracted reasoning
        (possibly ``""``) and ``action_id`` is an int accepted by
        ``_parse_action_id`` or ``None`` when nothing could be recognized.
    """
    text = "" if response is None else str(response)
    thought = ""
    action_id = None

    for snippet in _json_candidates(text):
        try:
            obj = json.loads(snippet)
        except (TypeError, ValueError):
            continue
        if not isinstance(obj, dict):
            continue
        for key in ("Thought", "thought", "Reason", "reason"):
            if key in obj:
                thought = str(obj[key]).strip()
                break
        for key in ("ActionID", "action_id", "actionId", "Action", "Label", "label"):
            if key in obj:
                action_id = _parse_action_id(obj[key])
                break
        if action_id is not None:
            return thought, action_id

    # \uFF1A is the full-width colon; written as an escape so the pattern
    # survives files being re-encoded.
    thought_match = re.search(
        r"Thought\s*[:\uFF1A]\s*(.*?)"
        r"(?=\n\s*(?:ActionID|Action\s*ID|Action|Label)\s*[:\uFF1A]|$)",
        text,
        flags=re.I | re.S,
    )
    if thought_match:
        thought = thought_match.group(1).strip()

    # (?!\d) stops a labelled pattern from accepting the first two digits of
    # a longer number ("ActionID: 123" must not be read as 12).
    patterns = [
        r"(?:ActionID|Action\s*ID)\s*[:\uFF1A]\s*(-?\d{1,2})(?!\d)",
        r'"ActionID"\s*:\s*(-?\d{1,2})(?!\d)',
        r"\bAction\s*[:\uFF1A]\s*(-?\d{1,2})(?!\d)",
        r"\bLabel\s*[:\uFF1A]\s*(-?\d{1,2})(?!\d)",
        r"\b(-?\d{1,2})\b",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, flags=re.I)
        if not match:
            continue
        action_id = _parse_action_id(match.group(1))
        if action_id is not None:
            return thought, action_id

    lowered = re.sub(r"[^a-z0-9]+", " ", text.lower())
    # Longest names first: "point anomaly" is a substring of
    # "change point anomaly", so id order would mislabel the latter.
    names_longest_first = sorted(
        ACTION_ID_TO_NAME.items(), key=lambda pair: len(pair[1]), reverse=True
    )
    for candidate_id, name in names_longest_first:
        normalized_name = re.sub(r"[^a-z0-9]+", " ", name.lower())
        if normalized_name in lowered:
            return thought, candidate_id

    return thought, None
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def _parse_action_id(value):
|
| 239 |
+
if isinstance(value, bool):
|
| 240 |
+
return None
|
| 241 |
+
if isinstance(value, int):
|
| 242 |
+
return value if 0 <= value <= 14 else None
|
| 243 |
+
match = re.search(r"-?\d{1,2}", str(value))
|
| 244 |
+
if not match:
|
| 245 |
+
return None
|
| 246 |
+
action_id = int(match.group(0))
|
| 247 |
+
return action_id if 0 <= action_id <= 14 else None
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def build_prediction(response):
    """Turn raw model output into a prediction record.

    Returns:
        On success a dict with ``Thought``, ``ActionID``, ``Action`` (the
        category name), ``Label`` (0 for ActionID 0, otherwise 1) and
        ``RawResponse``. When no ActionID is recognized, a dict with
        ``Thought``, ``RawResponse`` and ``ParseError`` instead.
    """
    thought, action_id = parse_model_response(response)
    raw = "" if response is None else str(response)

    if action_id is None:
        return {
            "Thought": thought,
            "RawResponse": raw,
            "ParseError": "unrecognized_action_id",
        }

    return {
        "Thought": thought,
        "ActionID": action_id,
        "Action": ACTION_ID_TO_NAME[action_id],
        "Label": int(action_id != 0),
        "RawResponse": raw,
    }
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def compute_metrics(data, predictions, split):
    """Score predictions for one split at type level and binary level.

    A sample is scored only when it has a dict prediction and both the
    predicted and the reference ``ActionID`` parse to a valid ID. Binary
    labels collapse ActionID 0 to 0 and every other ID to 1.

    Args:
        data: Dataset mapping as accepted by ``valid_split_items``.
        predictions: Mapping from sample id to prediction dict.
        split: Name of the split to score.

    Returns:
        A dict with the sample counts and, when at least one sample was
        scored, accuracy plus macro precision/recall/F1 for both label sets.
        If scikit-learn cannot be imported only the two accuracies are
        included, together with a ``metric_warning`` entry.
    """
    items = dict(valid_split_items(data, split))
    y_true = []
    y_pred = []
    for idx, item in items.items():
        pred = predictions.get(idx)
        if not isinstance(pred, dict):
            continue
        pred_id = _parse_action_id(pred.get("ActionID"))
        true_id = _parse_action_id(item.get("ActionID"))
        if pred_id is None or true_id is None:
            continue
        y_true.append(true_id)
        y_pred.append(pred_id)

    num_answered = len(set(items) & set(predictions))
    metrics = {
        "split": split,
        "num_dataset_samples": len(items),
        "num_prediction_samples": len(predictions),
        "num_valid_samples": len(y_true),
        "num_missing_predictions": len(items) - num_answered,
        # Also counts samples whose reference ActionID could not be parsed.
        "num_invalid_predictions": num_answered - len(y_true),
    }
    if not y_true:
        return metrics

    y_true_binary = [0 if x == 0 else 1 for x in y_true]
    y_pred_binary = [0 if x == 0 else 1 for x in y_pred]

    # Only the import is guarded, so an error while computing metrics is no
    # longer misreported as "sklearn unavailable". The catch stays broad
    # because a broken binary install can fail with more than ImportError.
    try:
        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    except Exception:
        correct = sum(int(a == b) for a, b in zip(y_true, y_pred))
        metrics["type_accuracy"] = correct / len(y_true)
        metrics["binary_accuracy"] = sum(
            int(a == b) for a, b in zip(y_true_binary, y_pred_binary)
        ) / len(y_true)
        metrics["metric_warning"] = "sklearn unavailable; only accuracy was computed."
        return metrics

    metrics.update(
        {
            "type_accuracy": float(accuracy_score(y_true, y_pred)),
            "type_precision_macro": float(
                precision_score(y_true, y_pred, average="macro", zero_division=0)
            ),
            "type_recall_macro": float(
                recall_score(y_true, y_pred, average="macro", zero_division=0)
            ),
            "type_f1_macro": float(
                f1_score(y_true, y_pred, average="macro", zero_division=0)
            ),
            "binary_accuracy": float(accuracy_score(y_true_binary, y_pred_binary)),
            "binary_precision_macro": float(
                precision_score(
                    y_true_binary, y_pred_binary, average="macro", zero_division=0
                )
            ),
            "binary_recall_macro": float(
                recall_score(
                    y_true_binary, y_pred_binary, average="macro", zero_division=0
                )
            ),
            "binary_f1_macro": float(
                f1_score(y_true_binary, y_pred_binary, average="macro", zero_division=0)
            ),
        }
    )
    return metrics
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def atomic_write_json(obj, path):
    """Write *obj* as JSON to *path* via a temporary sibling file.

    The data is serialized into ``<path><suffix>.tmp.<pid>`` first and then
    moved over *path* with ``os.replace``, so readers never observe a
    half-written target file. Missing parent directories are created.

    Args:
        obj: JSON-serializable object to dump.
        path: Destination file (``str`` or ``Path``).

    Raises:
        TypeError, ValueError, OSError: Propagated unchanged from
            ``json.dump`` / file operations. In that case the temporary
            file is removed and an existing *path* is left untouched.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    # The PID in the name keeps concurrent writer processes from clobbering
    # each other's temporary file.
    tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(obj, f, indent=4, ensure_ascii=False)
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a stale temp file behind when serialization or the
        # final rename fails; then let the original error propagate.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise
|
rats40k_adapter/run_sft_4gpu.sh
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
set -Eeuo pipefail
|
| 3 |
+
|
| 4 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 5 |
+
PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
| 6 |
+
cd "$PROJECT_DIR"
|
| 7 |
+
|
| 8 |
+
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
|
| 9 |
+
PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
|
| 10 |
+
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-/mnt/share01/sqk/ITFormer/accelerate_config.yaml}"
|
| 11 |
+
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
|
| 12 |
+
MODEL_PATH="${MODEL_PATH:-/mnt/share01/sqk/models/ChatTime-1-7B-Chat}"
|
| 13 |
+
ALLOW_HF_DOWNLOAD="${ALLOW_HF_DOWNLOAD:-0}"
|
| 14 |
+
DATA_PATH="${DATA_PATH:-/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"
|
| 15 |
+
TRAIN_SPLIT="${TRAIN_SPLIT:-TSAD_train}"
|
| 16 |
+
EVAL_SPLIT="${EVAL_SPLIT:-TSAD_test}"
|
| 17 |
+
OUTPUT_ROOT="${OUTPUT_ROOT:-${PROJECT_DIR}/rats40k_adapter/outputs/sft_${RUN_ID}}"
|
| 18 |
+
ADAPTER_OUTPUT_DIR="${ADAPTER_OUTPUT_DIR:-${OUTPUT_ROOT}/adapter}"
|
| 19 |
+
EVAL_OUTPUT_DIR="${EVAL_OUTPUT_DIR:-${OUTPUT_ROOT}/eval}"
|
| 20 |
+
RUN_EVAL_AFTER_SFT="${RUN_EVAL_AFTER_SFT:-1}"
|
| 21 |
+
RESULT_NAME="${RESULT_NAME:-RATs-Uni-TSImage_Reason_Reason_by_chattime_sft.json}"
|
| 22 |
+
MAX_TRAIN_SAMPLES="${MAX_TRAIN_SAMPLES:-}"
|
| 23 |
+
MAX_EVAL_SAMPLES="${MAX_EVAL_SAMPLES:-}"
|
| 24 |
+
EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
|
| 25 |
+
MAX_SEQ_LENGTH="${MAX_SEQ_LENGTH:-4096}"
|
| 26 |
+
PER_DEVICE_TRAIN_BATCH_SIZE="${PER_DEVICE_TRAIN_BATCH_SIZE:-1}"
|
| 27 |
+
GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-16}"
|
| 28 |
+
NUM_TRAIN_EPOCHS="${NUM_TRAIN_EPOCHS:-2}"
|
| 29 |
+
LEARNING_RATE="${LEARNING_RATE:-2e-4}"
|
| 30 |
+
LORA_RANK="${LORA_RANK:-16}"
|
| 31 |
+
LORA_ALPHA="${LORA_ALPHA:-32}"
|
| 32 |
+
LORA_DROPOUT="${LORA_DROPOUT:-0.05}"
|
| 33 |
+
SAVE_STEPS="${SAVE_STEPS:-200}"
|
| 34 |
+
LOGGING_STEPS="${LOGGING_STEPS:-10}"
|
| 35 |
+
SAVE_TOTAL_LIMIT="${SAVE_TOTAL_LIMIT:-2}"
|
| 36 |
+
DATALOADER_NUM_WORKERS="${DATALOADER_NUM_WORKERS:-4}"
|
| 37 |
+
LOAD_IN_4BIT="${LOAD_IN_4BIT:-0}"
|
| 38 |
+
GRADIENT_CHECKPOINTING="${GRADIENT_CHECKPOINTING:-1}"
|
| 39 |
+
TORCH_DTYPE="${TORCH_DTYPE:-fp16}"
|
| 40 |
+
MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-160}"
|
| 41 |
+
MAX_INPUT_TOKENS="${MAX_INPUT_TOKENS:-3936}"
|
| 42 |
+
LOG_DIR="${LOG_DIR:-${PROJECT_DIR}/rats40k_adapter/logs}"
|
| 43 |
+
LOG_FILE="${LOG_FILE:-${LOG_DIR}/sft_4gpu_${RUN_ID}.log}"
|
| 44 |
+
|
| 45 |
+
mkdir -p "$LOG_DIR" "$OUTPUT_ROOT"
|
| 46 |
+
|
| 47 |
+
# Report the given message on stderr and abort the script with status 1.
fail() {
  >&2 echo "$*"
  exit 1
}
|
| 51 |
+
|
| 52 |
+
exec > >(tee -a "$LOG_FILE") 2>&1
|
| 53 |
+
|
| 54 |
+
export CUDA_VISIBLE_DEVICES
|
| 55 |
+
export PYTHONPATH="${PROJECT_DIR}:${PYTHONPATH:-}"
|
| 56 |
+
export TOKENIZERS_PARALLELISM=false
|
| 57 |
+
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
|
| 58 |
+
export WANDB_MODE=offline
|
| 59 |
+
|
| 60 |
+
[ -x "$PYTHON_BIN" ] || fail "Python executable not found: $PYTHON_BIN. Set PYTHON_BIN=/path/to/bin/python."
|
| 61 |
+
[ -f "$ACCELERATE_CONFIG" ] || fail "Accelerate config not found: $ACCELERATE_CONFIG"
|
| 62 |
+
[ -f "$DATA_PATH" ] || fail "RATs40K data file not found: $DATA_PATH"
|
| 63 |
+
[ -n "$MODEL_PATH" ] || fail "MODEL_PATH is required. Use a local ChatTime model path, or set ALLOW_HF_DOWNLOAD=1 with a HuggingFace model id."
|
| 64 |
+
|
| 65 |
+
if [ ! -d "$MODEL_PATH" ] && [ "$ALLOW_HF_DOWNLOAD" != "1" ]; then
|
| 66 |
+
fail "MODEL_PATH is not a local directory: $MODEL_PATH. Set ALLOW_HF_DOWNLOAD=1 if you intentionally want HuggingFace downloads."
|
| 67 |
+
fi
|
| 68 |
+
|
| 69 |
+
"$PYTHON_BIN" -c "import accelerate; print('accelerate:', accelerate.__version__)" || \
|
| 70 |
+
fail "The selected Python cannot import accelerate: $PYTHON_BIN"
|
| 71 |
+
ACCELERATE_CMD=("$PYTHON_BIN" -m accelerate.commands.accelerate_cli)
|
| 72 |
+
|
| 73 |
+
if [ "$LOAD_IN_4BIT" = "1" ]; then
|
| 74 |
+
"$PYTHON_BIN" -c "import importlib.metadata as m; print('bitsandbytes:', m.version('bitsandbytes'))" || \
|
| 75 |
+
fail "LOAD_IN_4BIT=1 requires bitsandbytes in $PYTHON_BIN. Install it with: $PYTHON_BIN -m pip install bitsandbytes. To run without downloading it, set LOAD_IN_4BIT=0 PER_DEVICE_TRAIN_BATCH_SIZE=1 GRADIENT_ACCUMULATION_STEPS=16."
|
| 76 |
+
fi
|
| 77 |
+
|
| 78 |
+
TRAIN_EXTRA_ARGS=()
|
| 79 |
+
if [ -n "$MAX_TRAIN_SAMPLES" ]; then
|
| 80 |
+
TRAIN_EXTRA_ARGS+=(--max_train_samples "$MAX_TRAIN_SAMPLES")
|
| 81 |
+
fi
|
| 82 |
+
if [ "$ALLOW_HF_DOWNLOAD" = "1" ]; then
|
| 83 |
+
TRAIN_EXTRA_ARGS+=(--allow_hf_download)
|
| 84 |
+
fi
|
| 85 |
+
if [ "$LOAD_IN_4BIT" = "1" ]; then
|
| 86 |
+
TRAIN_EXTRA_ARGS+=(--load_in_4bit)
|
| 87 |
+
fi
|
| 88 |
+
if [ "$GRADIENT_CHECKPOINTING" = "1" ]; then
|
| 89 |
+
TRAIN_EXTRA_ARGS+=(--gradient_checkpointing)
|
| 90 |
+
fi
|
| 91 |
+
|
| 92 |
+
# Summarize the effective configuration (this also lands in the log file,
# because stdout/stderr are tee'd there earlier in the script).
echo "Run id: $RUN_ID"
echo "Python: $PYTHON_BIN"
echo "Accelerate: ${ACCELERATE_CMD[*]}"
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "Model path: $MODEL_PATH"
echo "Data path: $DATA_PATH"
echo "Adapter output dir: $ADAPTER_OUTPUT_DIR"
echo "Eval output dir: $EVAL_OUTPUT_DIR"
echo "Log file: $LOG_FILE"

# Launch LoRA fine-tuning through accelerate.
# TRAIN_EXTRA_ARGS can be empty (e.g. GRADIENT_CHECKPOINTING=0 and no other
# optional flag set). Under `set -u`, bash older than 4.4 treats
# "${arr[@]}" on an empty array as an unbound variable and aborts, so the
# array is expanded through the ${arr[@]+...} guard instead.
"${ACCELERATE_CMD[@]}" launch --config_file "$ACCELERATE_CONFIG" \
  rats40k_adapter/finetune_rats40k_lora.py \
  --model_path "$MODEL_PATH" \
  --data_path "$DATA_PATH" \
  --train_split "$TRAIN_SPLIT" \
  --output_dir "$ADAPTER_OUTPUT_DIR" \
  --max_seq_length "$MAX_SEQ_LENGTH" \
  --per_device_train_batch_size "$PER_DEVICE_TRAIN_BATCH_SIZE" \
  --gradient_accumulation_steps "$GRADIENT_ACCUMULATION_STEPS" \
  --num_train_epochs "$NUM_TRAIN_EPOCHS" \
  --learning_rate "$LEARNING_RATE" \
  --lora_rank "$LORA_RANK" \
  --lora_alpha "$LORA_ALPHA" \
  --lora_dropout "$LORA_DROPOUT" \
  --save_steps "$SAVE_STEPS" \
  --logging_steps "$LOGGING_STEPS" \
  --save_total_limit "$SAVE_TOTAL_LIMIT" \
  --dataloader_num_workers "$DATALOADER_NUM_WORKERS" \
  --torch_dtype "$TORCH_DTYPE" \
  ${TRAIN_EXTRA_ARGS[@]+"${TRAIN_EXTRA_ARGS[@]}"}
|
| 122 |
+
|
| 123 |
+
# Optionally evaluate the freshly trained adapter on the eval split.
if [ "$RUN_EVAL_AFTER_SFT" = "1" ]; then
  # Optional flags, only passed when the matching variable is set.
  EVAL_EXTRA_ARGS=()
  if [ -n "$MAX_EVAL_SAMPLES" ]; then
    EVAL_EXTRA_ARGS+=(--max_eval_samples "$MAX_EVAL_SAMPLES")
  fi
  if [ "$ALLOW_HF_DOWNLOAD" = "1" ]; then
    EVAL_EXTRA_ARGS+=(--allow_hf_download)
  fi

  # EVAL_EXTRA_ARGS is empty with the default settings. Under `set -u`,
  # bash older than 4.4 aborts on "${arr[@]}" for an empty array, so expand
  # it through the ${arr[@]+...} guard.
  "${ACCELERATE_CMD[@]}" launch --config_file "$ACCELERATE_CONFIG" \
    rats40k_adapter/eval_rats40k.py \
    --model_path "$MODEL_PATH" \
    --adapter_path "$ADAPTER_OUTPUT_DIR" \
    --data_path "$DATA_PATH" \
    --split "$EVAL_SPLIT" \
    --output_dir "$EVAL_OUTPUT_DIR" \
    --result_name "$RESULT_NAME" \
    --eval_batch_size "$EVAL_BATCH_SIZE" \
    --max_new_tokens "$MAX_NEW_TOKENS" \
    --max_input_tokens "$MAX_INPUT_TOKENS" \
    --torch_dtype "$TORCH_DTYPE" \
    ${EVAL_EXTRA_ARGS[@]+"${EVAL_EXTRA_ARGS[@]}"}
fi
|
rats40k_adapter/run_zeroshot_4gpu.sh
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
set -Eeuo pipefail
|
| 3 |
+
|
| 4 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 5 |
+
PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
| 6 |
+
cd "$PROJECT_DIR"
|
| 7 |
+
|
| 8 |
+
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
|
| 9 |
+
PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
|
| 10 |
+
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-/mnt/share01/sqk/ITFormer/accelerate_config.yaml}"
|
| 11 |
+
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
|
| 12 |
+
MODEL_PATH="${MODEL_PATH:-/mnt/share01/sqk/models/ChatTime-1-7B-Chat}"
|
| 13 |
+
ALLOW_HF_DOWNLOAD="${ALLOW_HF_DOWNLOAD:-0}"
|
| 14 |
+
DATA_PATH="${DATA_PATH:-/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"
|
| 15 |
+
SPLIT="${SPLIT:-TSAD_test}"
|
| 16 |
+
OUTPUT_DIR="${OUTPUT_DIR:-${PROJECT_DIR}/rats40k_adapter/outputs/zeroshot_${RUN_ID}}"
|
| 17 |
+
RESULT_NAME="${RESULT_NAME:-RATs-Uni-TSImage_Reason_Reason_by_chattime_zeroshot.json}"
|
| 18 |
+
MAX_EVAL_SAMPLES="${MAX_EVAL_SAMPLES:-}"
|
| 19 |
+
EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
|
| 20 |
+
MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-160}"
|
| 21 |
+
MAX_INPUT_TOKENS="${MAX_INPUT_TOKENS:-3936}"
|
| 22 |
+
TORCH_DTYPE="${TORCH_DTYPE:-fp16}"
|
| 23 |
+
LOG_DIR="${LOG_DIR:-${PROJECT_DIR}/rats40k_adapter/logs}"
|
| 24 |
+
LOG_FILE="${LOG_FILE:-${LOG_DIR}/zeroshot_4gpu_${RUN_ID}.log}"
|
| 25 |
+
|
| 26 |
+
mkdir -p "$LOG_DIR" "$OUTPUT_DIR"
|
| 27 |
+
|
| 28 |
+
# Report the given message on stderr and abort the script with status 1.
fail() {
  >&2 echo "$*"
  exit 1
}
|
| 32 |
+
|
| 33 |
+
exec > >(tee -a "$LOG_FILE") 2>&1
|
| 34 |
+
|
| 35 |
+
export CUDA_VISIBLE_DEVICES
|
| 36 |
+
export PYTHONPATH="${PROJECT_DIR}:${PYTHONPATH:-}"
|
| 37 |
+
export TOKENIZERS_PARALLELISM=false
|
| 38 |
+
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
|
| 39 |
+
|
| 40 |
+
[ -x "$PYTHON_BIN" ] || fail "Python executable not found: $PYTHON_BIN. Set PYTHON_BIN=/path/to/bin/python."
|
| 41 |
+
[ -f "$ACCELERATE_CONFIG" ] || fail "Accelerate config not found: $ACCELERATE_CONFIG"
|
| 42 |
+
[ -f "$DATA_PATH" ] || fail "RATs40K data file not found: $DATA_PATH"
|
| 43 |
+
[ -n "$MODEL_PATH" ] || fail "MODEL_PATH is required. Use a local ChatTime model path, or set ALLOW_HF_DOWNLOAD=1 with a HuggingFace model id."
|
| 44 |
+
|
| 45 |
+
if [ ! -d "$MODEL_PATH" ] && [ "$ALLOW_HF_DOWNLOAD" != "1" ]; then
|
| 46 |
+
fail "MODEL_PATH is not a local directory: $MODEL_PATH. Set ALLOW_HF_DOWNLOAD=1 if you intentionally want HuggingFace downloads."
|
| 47 |
+
fi
|
| 48 |
+
|
| 49 |
+
"$PYTHON_BIN" -c "import accelerate; print('accelerate:', accelerate.__version__)" || \
|
| 50 |
+
fail "The selected Python cannot import accelerate: $PYTHON_BIN"
|
| 51 |
+
ACCELERATE_CMD=("$PYTHON_BIN" -m accelerate.commands.accelerate_cli)
|
| 52 |
+
|
| 53 |
+
EXTRA_ARGS=()
|
| 54 |
+
if [ -n "$MAX_EVAL_SAMPLES" ]; then
|
| 55 |
+
EXTRA_ARGS+=(--max_eval_samples "$MAX_EVAL_SAMPLES")
|
| 56 |
+
fi
|
| 57 |
+
if [ "$ALLOW_HF_DOWNLOAD" = "1" ]; then
|
| 58 |
+
EXTRA_ARGS+=(--allow_hf_download)
|
| 59 |
+
fi
|
| 60 |
+
|
| 61 |
+
# Summarize the effective configuration (this also lands in the log file,
# because stdout/stderr are tee'd there earlier in the script).
echo "Run id: $RUN_ID"
echo "Python: $PYTHON_BIN"
echo "Accelerate: ${ACCELERATE_CMD[*]}"
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "Model path: $MODEL_PATH"
echo "Data path: $DATA_PATH"
echo "Output dir: $OUTPUT_DIR"
echo "Log file: $LOG_FILE"

# Launch zero-shot evaluation (no adapter) through accelerate.
# EXTRA_ARGS is empty with the default settings. Under `set -u`, bash older
# than 4.4 aborts on "${arr[@]}" for an empty array, so expand it through
# the ${arr[@]+...} guard.
"${ACCELERATE_CMD[@]}" launch --config_file "$ACCELERATE_CONFIG" \
  rats40k_adapter/eval_rats40k.py \
  --model_path "$MODEL_PATH" \
  --data_path "$DATA_PATH" \
  --split "$SPLIT" \
  --output_dir "$OUTPUT_DIR" \
  --result_name "$RESULT_NAME" \
  --eval_batch_size "$EVAL_BATCH_SIZE" \
  --max_new_tokens "$MAX_NEW_TOKENS" \
  --max_input_tokens "$MAX_INPUT_TOKENS" \
  --torch_dtype "$TORCH_DTYPE" \
  ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}
|
rats40k_adapter/run_zeroshot_then_sft_4gpu.sh
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
set -Eeuo pipefail
|
| 3 |
+
|
| 4 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 5 |
+
PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
| 6 |
+
cd "$PROJECT_DIR"
|
| 7 |
+
|
| 8 |
+
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
|
| 9 |
+
PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
|
| 10 |
+
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-/mnt/share01/sqk/ITFormer/accelerate_config.yaml}"
|
| 11 |
+
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
|
| 12 |
+
MODEL_PATH="${MODEL_PATH:-/mnt/share01/sqk/models/ChatTime-1-7B-Chat}"
|
| 13 |
+
DATA_PATH="${DATA_PATH:-/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"
|
| 14 |
+
ALLOW_HF_DOWNLOAD="${ALLOW_HF_DOWNLOAD:-0}"
|
| 15 |
+
LOAD_IN_4BIT="${LOAD_IN_4BIT:-0}"
|
| 16 |
+
PER_DEVICE_TRAIN_BATCH_SIZE="${PER_DEVICE_TRAIN_BATCH_SIZE:-1}"
|
| 17 |
+
GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-16}"
|
| 18 |
+
|
| 19 |
+
OUTPUT_BASE="${OUTPUT_BASE:-${PROJECT_DIR}/rats40k_adapter/outputs/pipeline_${RUN_ID}}"
|
| 20 |
+
LOG_DIR="${LOG_DIR:-${PROJECT_DIR}/rats40k_adapter/logs}"
|
| 21 |
+
|
| 22 |
+
ZERO_SHOT_OUTPUT_DIR="${ZERO_SHOT_OUTPUT_DIR:-${OUTPUT_BASE}/zeroshot}"
|
| 23 |
+
SFT_OUTPUT_ROOT="${SFT_OUTPUT_ROOT:-${OUTPUT_BASE}/sft}"
|
| 24 |
+
|
| 25 |
+
ZERO_SHOT_LOG_FILE="${ZERO_SHOT_LOG_FILE:-${LOG_DIR}/pipeline_${RUN_ID}_zeroshot.log}"
|
| 26 |
+
SFT_LOG_FILE="${SFT_LOG_FILE:-${LOG_DIR}/pipeline_${RUN_ID}_sft.log}"
|
| 27 |
+
|
| 28 |
+
mkdir -p "$OUTPUT_BASE" "$LOG_DIR"
|
| 29 |
+
|
| 30 |
+
echo "Pipeline run id: $RUN_ID"
|
| 31 |
+
echo "Project dir: $PROJECT_DIR"
|
| 32 |
+
echo "Python: $PYTHON_BIN"
|
| 33 |
+
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
|
| 34 |
+
echo "Model path: $MODEL_PATH"
|
| 35 |
+
echo "Data path: $DATA_PATH"
|
| 36 |
+
echo "LOAD_IN_4BIT: $LOAD_IN_4BIT"
|
| 37 |
+
echo "Per-device train batch size: $PER_DEVICE_TRAIN_BATCH_SIZE"
|
| 38 |
+
echo "Gradient accumulation steps: $GRADIENT_ACCUMULATION_STEPS"
|
| 39 |
+
echo "Zero-shot output dir: $ZERO_SHOT_OUTPUT_DIR"
|
| 40 |
+
echo "SFT output root: $SFT_OUTPUT_ROOT"
|
| 41 |
+
|
| 42 |
+
echo ""
|
| 43 |
+
echo "========== Stage 1/2: Zero-shot eval =========="
|
| 44 |
+
RUN_ID="$RUN_ID" \
|
| 45 |
+
PYTHON_BIN="$PYTHON_BIN" \
|
| 46 |
+
ACCELERATE_CONFIG="$ACCELERATE_CONFIG" \
|
| 47 |
+
CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" \
|
| 48 |
+
MODEL_PATH="$MODEL_PATH" \
|
| 49 |
+
DATA_PATH="$DATA_PATH" \
|
| 50 |
+
ALLOW_HF_DOWNLOAD="$ALLOW_HF_DOWNLOAD" \
|
| 51 |
+
OUTPUT_DIR="$ZERO_SHOT_OUTPUT_DIR" \
|
| 52 |
+
LOG_FILE="$ZERO_SHOT_LOG_FILE" \
|
| 53 |
+
bash rats40k_adapter/run_zeroshot_4gpu.sh
|
| 54 |
+
|
| 55 |
+
echo ""
|
| 56 |
+
echo "========== Stage 2/2: SFT + eval =========="
|
| 57 |
+
RUN_ID="$RUN_ID" \
|
| 58 |
+
PYTHON_BIN="$PYTHON_BIN" \
|
| 59 |
+
ACCELERATE_CONFIG="$ACCELERATE_CONFIG" \
|
| 60 |
+
CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" \
|
| 61 |
+
MODEL_PATH="$MODEL_PATH" \
|
| 62 |
+
DATA_PATH="$DATA_PATH" \
|
| 63 |
+
ALLOW_HF_DOWNLOAD="$ALLOW_HF_DOWNLOAD" \
|
| 64 |
+
LOAD_IN_4BIT="$LOAD_IN_4BIT" \
|
| 65 |
+
PER_DEVICE_TRAIN_BATCH_SIZE="$PER_DEVICE_TRAIN_BATCH_SIZE" \
|
| 66 |
+
GRADIENT_ACCUMULATION_STEPS="$GRADIENT_ACCUMULATION_STEPS" \
|
| 67 |
+
OUTPUT_ROOT="$SFT_OUTPUT_ROOT" \
|
| 68 |
+
LOG_FILE="$SFT_LOG_FILE" \
|
| 69 |
+
RUN_EVAL_AFTER_SFT="${RUN_EVAL_AFTER_SFT:-1}" \
|
| 70 |
+
bash rats40k_adapter/run_sft_4gpu.sh
|
| 71 |
+
|
| 72 |
+
echo ""
|
| 73 |
+
echo "Pipeline finished."
|
| 74 |
+
echo "Zero-shot outputs: $ZERO_SHOT_OUTPUT_DIR"
|
| 75 |
+
echo "SFT outputs: $SFT_OUTPUT_ROOT"
|
training/finetune.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from transformers import TrainingArguments, LlamaTokenizer
|
| 7 |
+
from trl import SFTTrainer
|
| 8 |
+
from unsloth import FastLanguageModel, is_bfloat16_supported
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Command-line interface
    # ------------------------------------------------------------------
    parser = argparse.ArgumentParser()
    # Required locations (``required=True`` already forces a value, so no
    # default is declared).
    parser.add_argument("--code_path", type=str, required=True)
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--dataset_path", type=str, required=True)
    parser.add_argument("--log_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, required=True)

    # Model loading
    parser.add_argument("--max_seq_length", type=int, default=2048)
    parser.add_argument("--load_in_4bit", action="store_true", default=False)

    # LoRA adapter
    parser.add_argument("--lora_rank", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--lora_dropout", type=float, default=0.00)
    parser.add_argument("--random_seed", type=int, default=3407)

    # Training schedule
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument("--per_device_train_batch_size", type=int, default=64)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=2)
    parser.add_argument("--save_steps", type=int, default=2)
    parser.add_argument("--logging_steps", type=int, default=2)
    parser.add_argument("--max_steps", type=int, default=-1)
    # These two used to be hard-coded below; the defaults reproduce the
    # previous values, so existing invocations behave the same.
    parser.add_argument("--learning_rate", type=float, default=2e-4)
    parser.add_argument("--dataset_num_proc", type=int, default=64)

    args = parser.parse_args()

    # Make the directory given by --code_path importable.
    sys.path.append(args.code_path)

    # ------------------------------------------------------------------
    # Tokenizer
    # ------------------------------------------------------------------
    tokenizer = LlamaTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    # The EOS token doubles as the padding token; padding goes on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print(f"\nVocabulary number: {len(tokenizer.get_vocab())}\n")

    EOS_TOKEN = tokenizer.eos_token

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    # The tokenizer returned by Unsloth is discarded in favour of the
    # LlamaTokenizer configured above.
    model, _ = FastLanguageModel.from_pretrained(
        model_name=args.model_path,
        max_seq_length=args.max_seq_length,
        dtype=None,
        load_in_4bit=args.load_in_4bit,
    )

    # Attach LoRA adapters to the attention and MLP projection layers.
    model = FastLanguageModel.get_peft_model(
        model,
        r=args.lora_rank,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        # modules_to_save=["embed_tokens", "lm_head", ],
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=args.random_seed,
        max_seq_length=args.max_seq_length,
    )

    # ------------------------------------------------------------------
    # Dataset
    # ------------------------------------------------------------------
    def formatting_func(example):
        """Return the example's ``text`` field with the EOS token appended."""
        return example["text"] + EOS_TOKEN

    print(f"\nLoading dataset in {args.dataset_path}")
    dataset = load_dataset(args.dataset_path, split="train")
    print(f"Dataset example: \n{dataset[0]['text']}\n")

    # ------------------------------------------------------------------
    # Trainer
    # ------------------------------------------------------------------
    # Use bf16 when the hardware supports it, otherwise fall back to fp16.
    use_bf16 = is_bfloat16_supported()
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=args.max_seq_length,
        dataset_num_proc=args.dataset_num_proc,
        packing=False,
        formatting_func=formatting_func,
        args=TrainingArguments(
            per_device_train_batch_size=args.per_device_train_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            num_train_epochs=args.num_train_epochs,
            weight_decay=0.01,
            warmup_ratio=0.05,
            max_grad_norm=1.0,
            learning_rate=args.learning_rate,
            logging_strategy="steps",
            logging_steps=args.logging_steps,
            save_strategy="steps",
            save_steps=args.save_steps,
            max_steps=args.max_steps,
            save_total_limit=1,
            logging_first_step=True,
            optim="adamw_8bit",
            lr_scheduler_type="cosine",
            seed=args.random_seed,
            output_dir=args.log_path,
            fp16=not use_bf16,
            bf16=use_bf16,
        ),
    )

    # ------------------------------------------------------------------
    # Memory statistics before training (CUDA device 0)
    # ------------------------------------------------------------------
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"\nGPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.\n")

    trainer_stats = trainer.train()

    # ------------------------------------------------------------------
    # Memory and time statistics after training
    # ------------------------------------------------------------------
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    # Peak reserved memory minus what was already reserved before training.
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"\n{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.\n")

    # Save the model together with the tokenizer to --output_path.
    model.save_pretrained_merged(args.output_path, tokenizer)
|
training/finetune.sh
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Launch training/finetune.py to fine-tune ChatTime-1-7B-Base into
# ChatTime-1-7B-Chat with LoRA on the ChatTime-1-Finetune-100K dataset.
#
# Fill in the three root paths below, or export them before calling the
# script (exported values take precedence over the empty placeholders).
set -eu

DATA_PATH="${DATA_PATH:-}"
CODE_PATH="${CODE_PATH:-}"
MODEL_PATH="${MODEL_PATH:-}"

# An empty root would silently turn into an absolute path such as
# "/ChatTime-1-7B-Base/", so stop early with a clear message instead.
[ -n "$DATA_PATH" ] || { echo "DATA_PATH is not set." >&2; exit 1; }
[ -n "$CODE_PATH" ] || { echo "CODE_PATH is not set." >&2; exit 1; }
[ -n "$MODEL_PATH" ] || { echo "MODEL_PATH is not set." >&2; exit 1; }

code_path=$CODE_PATH
model_path=$MODEL_PATH/ChatTime-1-7B-Base/
dataset_path=$DATA_PATH/ChatTime-1-Finetune-100K/
log_path=$MODEL_PATH/log_finetune/
output_path=$MODEL_PATH/ChatTime-1-7B-Chat/

# LoRA adapter settings
lora_rank=8
lora_alpha=16
lora_dropout=0.00

# Training schedule
num_train_epochs=4
per_device_train_batch_size=8
gradient_accumulation_steps=32
save_steps=40
logging_steps=4
max_steps=-1

# NOTE(review): this commit lists the script as training/finetune.py, while
# the path below points at training/source/finetune.py -- confirm which
# layout CODE_PATH is expected to have.
python "$code_path/training/source/finetune.py" \
  --code_path "$code_path" \
  --model_path "$model_path" \
  --dataset_path "$dataset_path" \
  --log_path "$log_path" \
  --output_path "$output_path" \
  --lora_rank "$lora_rank" \
  --lora_alpha "$lora_alpha" \
  --lora_dropout "$lora_dropout" \
  --num_train_epochs "$num_train_epochs" \
  --per_device_train_batch_size "$per_device_train_batch_size" \
  --gradient_accumulation_steps "$gradient_accumulation_steps" \
  --save_steps "$save_steps" \
  --logging_steps "$logging_steps" \
  --max_steps "$max_steps" \
  --load_in_4bit
|
training/pretrain.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from datasets import load_dataset
|
| 7 |
+
from transformers import TrainingArguments, LlamaTokenizer
|
| 8 |
+
from trl import SFTTrainer
|
| 9 |
+
from unsloth import FastLanguageModel, is_bfloat16_supported
|
| 10 |
+
|
| 11 |
+
if __name__ == "__main__":
|
| 12 |
+
parser = argparse.ArgumentParser()
|
| 13 |
+
parser.add_argument("--code_path", type=str, required=True, default=None)
|
| 14 |
+
parser.add_argument("--model_path", type=str, required=True, default=None)
|
| 15 |
+
parser.add_argument("--dataset_path", type=str, required=True, default=None)
|
| 16 |
+
parser.add_argument("--log_path", type=str, required=True, default=None)
|
| 17 |
+
parser.add_argument("--output_path", type=str, required=True, default=None)
|
| 18 |
+
|
| 19 |
+
parser.add_argument("--max_seq_length", type=int, default=2048)
|
| 20 |
+
parser.add_argument("--load_in_4bit", action="store_true", default=False)
|
| 21 |
+
|
| 22 |
+
parser.add_argument("--lora_rank", type=int, default=16)
|
| 23 |
+
parser.add_argument("--lora_alpha", type=int, default=16)
|
| 24 |
+
parser.add_argument("--lora_dropout", type=float, default=0.00)
|
| 25 |
+
parser.add_argument("--random_seed", type=int, default=3407)
|
| 26 |
+
|
| 27 |
+
parser.add_argument("--num_train_epochs", type=int, default=1)
|
| 28 |
+
parser.add_argument("--per_device_train_batch_size", type=int, default=64)
|
| 29 |
+
parser.add_argument("--gradient_accumulation_steps", type=int, default=2)
|
| 30 |
+
parser.add_argument("--save_steps", type=int, default=2)
|
| 31 |
+
parser.add_argument("--logging_steps", type=int, default=2)
|
| 32 |
+
parser.add_argument("--max_steps", type=int, default=-1)
|
| 33 |
+
|
| 34 |
+
parser.add_argument("--low_limit", type=float, default=-1)
|
| 35 |
+
parser.add_argument("--high_limit", type=float, default=1)
|
| 36 |
+
parser.add_argument("--n_tokens", type=int, default=10002)
|
| 37 |
+
parser.add_argument("--prec", type=int, default=4)
|
| 38 |
+
parser.add_argument("--time_sep", type=str, default=" ")
|
| 39 |
+
parser.add_argument("--time_flag", type=str, default="###")
|
| 40 |
+
parser.add_argument("--nan_flag", type=str, default="Nan")
|
| 41 |
+
|
| 42 |
+
args = parser.parse_args()
|
| 43 |
+
|
| 44 |
+
sys.path.append(args.code_path)
|
| 45 |
+
from utils.tools import Discretizer, Serializer
|
| 46 |
+
|
| 47 |
+
# construct vocabulary
|
| 48 |
+
discretizer = Discretizer(low_limit=args.low_limit, high_limit=args.high_limit, n_tokens=args.n_tokens)
|
| 49 |
+
serializer = Serializer(prec=args.prec, time_sep=args.time_sep, time_flag=args.time_flag, nan_flag=args.nan_flag)
|
| 50 |
+
|
| 51 |
+
vocabulary = np.concatenate((discretizer.centers[1:-1], [np.NaN])).reshape(-1, 1)
|
| 52 |
+
vocabulary = np.array([serializer.serialize(i) for i in vocabulary])
|
| 53 |
+
print(f"\nVocabulary: \n{vocabulary}\n")
|
| 54 |
+
|
| 55 |
+
# add token to llama tokenizer
|
| 56 |
+
tokenizer = LlamaTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
|
| 57 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 58 |
+
tokenizer.padding_side = "right"
|
| 59 |
+
print(f"Old model pieces: {len(tokenizer.get_vocab())}")
|
| 60 |
+
tokenizer.add_tokens(vocabulary.tolist())
|
| 61 |
+
print(f"New model pieces: {len(tokenizer.get_vocab())}")
|
| 62 |
+
|
| 63 |
+
EOS_TOKEN = tokenizer.eos_token
|
| 64 |
+
|
| 65 |
+
# load model
|
| 66 |
+
model, _ = FastLanguageModel.from_pretrained(
|
| 67 |
+
model_name=args.model_path,
|
| 68 |
+
max_seq_length=args.max_seq_length,
|
| 69 |
+
dtype=None,
|
| 70 |
+
load_in_4bit=args.load_in_4bit,
|
| 71 |
+
resize_model_vocab=len(tokenizer.get_vocab()),
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
# add lora to llama model
|
| 75 |
+
model = FastLanguageModel.get_peft_model(
|
| 76 |
+
model,
|
| 77 |
+
r=args.lora_rank,
|
| 78 |
+
lora_alpha=args.lora_alpha,
|
| 79 |
+
lora_dropout=args.lora_dropout,
|
| 80 |
+
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", ],
|
| 81 |
+
modules_to_save=["embed_tokens", "lm_head", ],
|
| 82 |
+
bias="none",
|
| 83 |
+
use_gradient_checkpointing="unsloth",
|
| 84 |
+
random_state=args.random_seed,
|
| 85 |
+
max_seq_length=args.max_seq_length,
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# load dataset
|
| 90 |
+
def formatting_func(example):
    """Return the example's ``text`` field with ``EOS_TOKEN`` appended.

    ``EOS_TOKEN`` is the module-level constant taken from the tokenizer's
    ``eos_token``.
    """
    sample_text = example["text"]
    return sample_text + EOS_TOKEN
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
print(f"\nLoading dataset in {args.dataset_path}")
|
| 95 |
+
dataset = load_dataset(args.dataset_path, split="train")
|
| 96 |
+
print(f"Dataset example: \n{dataset[0]['text']}\n")
|
| 97 |
+
|
| 98 |
+
# train model
|
| 99 |
+
trainer = SFTTrainer(
|
| 100 |
+
model=model,
|
| 101 |
+
tokenizer=tokenizer,
|
| 102 |
+
train_dataset=dataset,
|
| 103 |
+
dataset_text_field="text",
|
| 104 |
+
max_seq_length=args.max_seq_length,
|
| 105 |
+
dataset_num_proc=64,
|
| 106 |
+
packing=False,
|
| 107 |
+
formatting_func=formatting_func,
|
| 108 |
+
args=TrainingArguments(
|
| 109 |
+
per_device_train_batch_size=args.per_device_train_batch_size,
|
| 110 |
+
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
| 111 |
+
num_train_epochs=args.num_train_epochs,
|
| 112 |
+
weight_decay=0.01,
|
| 113 |
+
warmup_ratio=0.05,
|
| 114 |
+
max_grad_norm=1.0,
|
| 115 |
+
learning_rate=2e-4,
|
| 116 |
+
logging_strategy="steps",
|
| 117 |
+
logging_steps=args.logging_steps,
|
| 118 |
+
save_strategy="steps",
|
| 119 |
+
save_steps=args.save_steps,
|
| 120 |
+
max_steps=args.max_steps,
|
| 121 |
+
save_total_limit=1,
|
| 122 |
+
logging_first_step=True,
|
| 123 |
+
optim="adamw_8bit",
|
| 124 |
+
lr_scheduler_type="cosine",
|
| 125 |
+
seed=args.random_seed,
|
| 126 |
+
output_dir=args.log_path,
|
| 127 |
+
fp16=not is_bfloat16_supported(),
|
| 128 |
+
bf16=is_bfloat16_supported(),
|
| 129 |
+
),
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
# title Show current memory stats
|
| 133 |
+
gpu_stats = torch.cuda.get_device_properties(0)
|
| 134 |
+
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
|
| 135 |
+
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
|
| 136 |
+
print(f"\nGPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
|
| 137 |
+
print(f"{start_gpu_memory} GB of memory reserved.\n")
|
| 138 |
+
|
| 139 |
+
trainer_stats = trainer.train()
|
| 140 |
+
|
| 141 |
+
# title Show final memory and time stats
|
| 142 |
+
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
|
| 143 |
+
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
|
| 144 |
+
used_percentage = round(used_memory / max_memory * 100, 3)
|
| 145 |
+
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
|
| 146 |
+
print(f"\n{trainer_stats.metrics['train_runtime']} seconds used for training.")
|
| 147 |
+
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
|
| 148 |
+
print(f"Peak reserved memory = {used_memory} GB.")
|
| 149 |
+
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
|
| 150 |
+
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
|
| 151 |
+
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.\n")
|
| 152 |
+
|
| 153 |
+
# save model and tokenizer
|
| 154 |
+
model.save_pretrained_merged(args.output_path, tokenizer)
|
training/pretrain.sh
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Launcher for training/source/pretrain.py (LoRA run, base model loaded in 4-bit).
# Fill in the three root directories below before running.
DATA_PATH=""    # parent directory of ChatTime-1-Pretrain-1M/
CODE_PATH=""    # directory that contains training/source/pretrain.py
MODEL_PATH=""   # parent directory for log_pretrain/ and ChatTime-1-7B-Base/

# Paths derived from the roots above.
code_path="${CODE_PATH}"
model_path="meta-llama/Llama-2-7b-hf"
dataset_path="${DATA_PATH}/ChatTime-1-Pretrain-1M/"
log_path="${MODEL_PATH}/log_pretrain/"
output_path="${MODEL_PATH}/ChatTime-1-7B-Base/"

# LoRA adapter settings.
lora_rank=8
lora_alpha=16
lora_dropout=0.00

# Training schedule; max_steps=-1 is passed through to the trainer unchanged.
num_train_epochs=2
per_device_train_batch_size=8
gradient_accumulation_steps=32
save_steps=200
logging_steps=20
max_steps=-1

python "${code_path}/training/source/pretrain.py" \
    --code_path "${code_path}" \
    --model_path "${model_path}" \
    --dataset_path "${dataset_path}" \
    --log_path "${log_path}" \
    --output_path "${output_path}" \
    --lora_rank "${lora_rank}" \
    --lora_alpha "${lora_alpha}" \
    --lora_dropout "${lora_dropout}" \
    --num_train_epochs "${num_train_epochs}" \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --save_steps "${save_steps}" \
    --logging_steps "${logging_steps}" \
    --max_steps "${max_steps}" \
    --load_in_4bit
|
tsqa_adapter/logs/sft_4gpu_20260615_140322.log
ADDED
|
@@ -0,0 +1,875 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/250 [00:00<?, ?it/s]Traceback (most recent call last):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate: 0.34.2
|
| 2 |
+
Run id: 20260615_140322
|
| 3 |
+
Python: /dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python
|
| 4 |
+
Accelerate: /dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python -m accelerate.commands.accelerate_cli
|
| 5 |
+
CUDA_VISIBLE_DEVICES: 0,1,2,3
|
| 6 |
+
Model path: /mnt/share01/sqk/models/ChatTime-1-7B-Chat
|
| 7 |
+
Data root: /mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp (train=train.jsonl eval=eval.jsonl)
|
| 8 |
+
Adapter output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_140322/adapter
|
| 9 |
+
Eval output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_140322/eval
|
| 10 |
+
Log file: /mnt/share01/sqk/ChatTime/tsqa_adapter/logs/sft_4gpu_20260615_140322.log
|
| 11 |
+
βοΈ Running in WANDB offline modeβοΈ Running in WANDB offline mode
|
| 12 |
+
|
| 13 |
+
βοΈ Running in WANDB offline mode
|
| 14 |
+
βοΈ Running in WANDB offline mode
|
| 15 |
+
Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
|
| 16 |
+
Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
|
| 17 |
+
Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
|
| 18 |
+
Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
|
| 19 |
+
SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
|
| 20 |
+
SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
|
| 21 |
+
SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
|
| 22 |
+
SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
|
| 28 |
+
trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
|
| 29 |
+
/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
|
| 30 |
+
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
| 31 |
+
/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
|
| 32 |
+
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
| 33 |
+
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
|
| 34 |
+
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
|
| 35 |
+
trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
|
| 36 |
+
/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
|
| 37 |
+
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
| 38 |
+
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
|
| 39 |
+
trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
|
| 40 |
+
/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
|
| 41 |
+
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
| 42 |
+
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
|
| 43 |
+
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
|
| 44 |
+
|
| 45 |
0%| | 0/250 [00:00<?, ?it/s]Traceback (most recent call last):
|
| 46 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 47 |
+
obj = _ForkingPickler.dumps(obj)
|
| 48 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 49 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 50 |
+
cls(buf, protocol).dump(obj)
|
| 51 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 52 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 53 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 54 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 55 |
+
return resource_sharer.DupFd(fd)
|
| 56 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 57 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 58 |
+
self._id = _resource_sharer.register(send, close)
|
| 59 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 60 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 61 |
+
self._start()
|
| 62 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 63 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 64 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 65 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 66 |
+
self._listener = SocketListener(address, family, backlog)
|
| 67 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 68 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 69 |
+
self._socket.bind(address)
|
| 70 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 71 |
+
Traceback (most recent call last):
|
| 72 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 73 |
+
obj = _ForkingPickler.dumps(obj)
|
| 74 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 75 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 76 |
+
cls(buf, protocol).dump(obj)
|
| 77 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 78 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 79 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 80 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 81 |
+
return resource_sharer.DupFd(fd)
|
| 82 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 83 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 84 |
+
self._id = _resource_sharer.register(send, close)
|
| 85 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 86 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 87 |
+
self._start()
|
| 88 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 89 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 90 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 91 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 92 |
+
self._listener = SocketListener(address, family, backlog)
|
| 93 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 94 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 95 |
+
self._socket.bind(address)
|
| 96 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 97 |
+
Traceback (most recent call last):
|
| 98 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 99 |
+
obj = _ForkingPickler.dumps(obj)
|
| 100 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 101 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 102 |
+
cls(buf, protocol).dump(obj)
|
| 103 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 104 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 105 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 106 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 107 |
+
return resource_sharer.DupFd(fd)
|
| 108 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 109 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 110 |
+
self._id = _resource_sharer.register(send, close)
|
| 111 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 112 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 113 |
+
self._start()
|
| 114 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 115 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 116 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 117 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 118 |
+
self._listener = SocketListener(address, family, backlog)
|
| 119 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 120 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 121 |
+
self._socket.bind(address)
|
| 122 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 123 |
+
Traceback (most recent call last):
|
| 124 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 125 |
+
obj = _ForkingPickler.dumps(obj)
|
| 126 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 127 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 128 |
+
cls(buf, protocol).dump(obj)
|
| 129 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 130 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 131 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 132 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 133 |
+
return resource_sharer.DupFd(fd)
|
| 134 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 135 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 136 |
+
self._id = _resource_sharer.register(send, close)
|
| 137 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 138 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 139 |
+
self._start()
|
| 140 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 141 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 142 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 143 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 144 |
+
self._listener = SocketListener(address, family, backlog)
|
| 145 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 146 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 147 |
+
self._socket.bind(address)
|
| 148 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 149 |
+
Traceback (most recent call last):
|
| 150 |
+
Traceback (most recent call last):
|
| 151 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 152 |
+
obj = _ForkingPickler.dumps(obj)
|
| 153 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 154 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 155 |
+
cls(buf, protocol).dump(obj)
|
| 156 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 157 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 158 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 159 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 160 |
+
return resource_sharer.DupFd(fd)
|
| 161 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 162 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 163 |
+
self._id = _resource_sharer.register(send, close)
|
| 164 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 165 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 166 |
+
self._start()
|
| 167 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 168 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 169 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 170 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 171 |
+
self._listener = SocketListener(address, family, backlog)
|
| 172 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 173 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 174 |
+
self._socket.bind(address)
|
| 175 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 176 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 177 |
+
obj = _ForkingPickler.dumps(obj)
|
| 178 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 179 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 180 |
+
cls(buf, protocol).dump(obj)
|
| 181 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 182 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 183 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 184 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 185 |
+
return resource_sharer.DupFd(fd)
|
| 186 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 187 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 188 |
+
self._id = _resource_sharer.register(send, close)
|
| 189 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 190 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 191 |
+
self._start()
|
| 192 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 193 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 194 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 195 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 196 |
+
self._listener = SocketListener(address, family, backlog)
|
| 197 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 198 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 199 |
+
self._socket.bind(address)
|
| 200 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 201 |
+
Traceback (most recent call last):
|
| 202 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 203 |
+
obj = _ForkingPickler.dumps(obj)
|
| 204 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 205 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 206 |
+
cls(buf, protocol).dump(obj)
|
| 207 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 208 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 209 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 210 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 211 |
+
return resource_sharer.DupFd(fd)
|
| 212 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 213 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 214 |
+
self._id = _resource_sharer.register(send, close)
|
| 215 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 216 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 217 |
+
self._start()
|
| 218 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 219 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 220 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 221 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 222 |
+
self._listener = SocketListener(address, family, backlog)
|
| 223 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 224 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 225 |
+
self._socket.bind(address)
|
| 226 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 227 |
+
Traceback (most recent call last):
|
| 228 |
+
Traceback (most recent call last):
|
| 229 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 230 |
+
obj = _ForkingPickler.dumps(obj)
|
| 231 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 232 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 233 |
+
cls(buf, protocol).dump(obj)
|
| 234 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 235 |
+
obj = _ForkingPickler.dumps(obj)
|
| 236 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 237 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 238 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 239 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 240 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 241 |
+
cls(buf, protocol).dump(obj)
|
| 242 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 243 |
+
return resource_sharer.DupFd(fd)
|
| 244 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 245 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 246 |
+
self._id = _resource_sharer.register(send, close)
|
| 247 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 248 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 249 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 250 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 251 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 252 |
+
self._start()
|
| 253 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 254 |
+
return resource_sharer.DupFd(fd)
|
| 255 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 256 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 257 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 258 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 259 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 260 |
+
self._id = _resource_sharer.register(send, close)
|
| 261 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 262 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 263 |
+
self._listener = SocketListener(address, family, backlog)
|
| 264 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 265 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 266 |
+
self._start()
|
| 267 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 268 |
+
self._socket.bind(address)
|
| 269 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 270 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 271 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 272 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 273 |
+
self._listener = SocketListener(address, family, backlog)
|
| 274 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 275 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 276 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 277 |
+
self._socket.bind(address)
|
| 278 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 279 |
+
Traceback (most recent call last):
|
| 280 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 281 |
+
obj = _ForkingPickler.dumps(obj)
|
| 282 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 283 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 284 |
+
cls(buf, protocol).dump(obj)
|
| 285 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 286 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 287 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 288 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 289 |
+
return resource_sharer.DupFd(fd)
|
| 290 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 291 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 292 |
+
self._id = _resource_sharer.register(send, close)
|
| 293 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 294 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 295 |
+
self._start()
|
| 296 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 297 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 298 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 299 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 300 |
+
self._listener = SocketListener(address, family, backlog)
|
| 301 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 302 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 303 |
+
self._socket.bind(address)
|
| 304 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 305 |
+
Traceback (most recent call last):
|
| 306 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 307 |
+
obj = _ForkingPickler.dumps(obj)
|
| 308 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 309 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 310 |
+
cls(buf, protocol).dump(obj)
|
| 311 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 312 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 313 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 314 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 315 |
+
return resource_sharer.DupFd(fd)
|
| 316 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 317 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 318 |
+
self._id = _resource_sharer.register(send, close)
|
| 319 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 320 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 321 |
+
self._start()
|
| 322 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 323 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 324 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 325 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 326 |
+
self._listener = SocketListener(address, family, backlog)
|
| 327 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 328 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 329 |
+
self._socket.bind(address)
|
| 330 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 331 |
+
Traceback (most recent call last):
|
| 332 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 333 |
+
obj = _ForkingPickler.dumps(obj)
|
| 334 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 335 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 336 |
+
cls(buf, protocol).dump(obj)
|
| 337 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 338 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 339 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 340 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 341 |
+
return resource_sharer.DupFd(fd)
|
| 342 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 343 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 344 |
+
self._id = _resource_sharer.register(send, close)
|
| 345 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 346 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 347 |
+
self._start()
|
| 348 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 349 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 350 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 351 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 352 |
+
self._listener = SocketListener(address, family, backlog)
|
| 353 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 354 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 355 |
+
self._socket.bind(address)
|
| 356 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 357 |
+
Traceback (most recent call last):
|
| 358 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 359 |
+
obj = _ForkingPickler.dumps(obj)
|
| 360 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 361 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 362 |
+
cls(buf, protocol).dump(obj)
|
| 363 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 364 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 365 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 366 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 367 |
+
return resource_sharer.DupFd(fd)
|
| 368 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 369 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 370 |
+
self._id = _resource_sharer.register(send, close)
|
| 371 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 372 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 373 |
+
self._start()
|
| 374 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 375 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 376 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 377 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 378 |
+
self._listener = SocketListener(address, family, backlog)
|
| 379 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 380 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 381 |
+
self._socket.bind(address)
|
| 382 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 383 |
+
Traceback (most recent call last):
|
| 384 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 385 |
+
obj = _ForkingPickler.dumps(obj)
|
| 386 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 387 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 388 |
+
cls(buf, protocol).dump(obj)
|
| 389 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 390 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 391 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 392 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 393 |
+
return resource_sharer.DupFd(fd)
|
| 394 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 395 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 396 |
+
self._id = _resource_sharer.register(send, close)
|
| 397 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 398 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 399 |
+
self._start()
|
| 400 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 401 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 402 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 403 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 404 |
+
self._listener = SocketListener(address, family, backlog)
|
| 405 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 406 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 407 |
+
self._socket.bind(address)
|
| 408 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 409 |
+
Traceback (most recent call last):
|
| 410 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 411 |
+
obj = _ForkingPickler.dumps(obj)
|
| 412 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 413 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 414 |
+
cls(buf, protocol).dump(obj)
|
| 415 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 416 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 417 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 418 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 419 |
+
return resource_sharer.DupFd(fd)
|
| 420 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 421 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 422 |
+
self._id = _resource_sharer.register(send, close)
|
| 423 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 424 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 425 |
+
self._start()
|
| 426 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 427 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 428 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 429 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 430 |
+
self._listener = SocketListener(address, family, backlog)
|
| 431 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 432 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 433 |
+
self._socket.bind(address)
|
| 434 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 435 |
+
Traceback (most recent call last):
|
| 436 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 437 |
+
obj = _ForkingPickler.dumps(obj)
|
| 438 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 439 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 440 |
+
cls(buf, protocol).dump(obj)
|
| 441 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 442 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 443 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 444 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 445 |
+
return resource_sharer.DupFd(fd)
|
| 446 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 447 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 448 |
+
self._id = _resource_sharer.register(send, close)
|
| 449 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 450 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 451 |
+
self._start()
|
| 452 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 453 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 454 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 455 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 456 |
+
self._listener = SocketListener(address, family, backlog)
|
| 457 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 458 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 459 |
+
self._socket.bind(address)
|
| 460 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 461 |
+
Traceback (most recent call last):
|
| 462 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 463 |
+
obj = _ForkingPickler.dumps(obj)
|
| 464 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 465 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 466 |
+
cls(buf, protocol).dump(obj)
|
| 467 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 468 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 469 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 470 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 471 |
+
return resource_sharer.DupFd(fd)
|
| 472 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 473 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 474 |
+
self._id = _resource_sharer.register(send, close)
|
| 475 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 476 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 477 |
+
self._start()
|
| 478 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 479 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 480 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 481 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 482 |
+
self._listener = SocketListener(address, family, backlog)
|
| 483 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 484 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 485 |
+
self._socket.bind(address)
|
| 486 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 487 |
+
Traceback (most recent call last):
|
| 488 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 489 |
+
obj = _ForkingPickler.dumps(obj)
|
| 490 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 491 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 492 |
+
cls(buf, protocol).dump(obj)
|
| 493 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 494 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 495 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 496 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 497 |
+
return resource_sharer.DupFd(fd)
|
| 498 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 499 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 500 |
+
self._id = _resource_sharer.register(send, close)
|
| 501 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 502 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 503 |
+
self._start()
|
| 504 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 505 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 506 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 507 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 508 |
+
self._listener = SocketListener(address, family, backlog)
|
| 509 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 510 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 511 |
+
self._socket.bind(address)
|
| 512 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 513 |
+
Traceback (most recent call last):
|
| 514 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 515 |
+
obj = _ForkingPickler.dumps(obj)
|
| 516 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 517 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 518 |
+
cls(buf, protocol).dump(obj)
|
| 519 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 520 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 521 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 522 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 523 |
+
return resource_sharer.DupFd(fd)
|
| 524 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 525 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 526 |
+
self._id = _resource_sharer.register(send, close)
|
| 527 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 528 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 529 |
+
self._start()
|
| 530 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 531 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 532 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 533 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 534 |
+
self._listener = SocketListener(address, family, backlog)
|
| 535 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 536 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 537 |
+
self._socket.bind(address)
|
| 538 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 539 |
+
Traceback (most recent call last):
|
| 540 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 541 |
+
obj = _ForkingPickler.dumps(obj)
|
| 542 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 543 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 544 |
+
cls(buf, protocol).dump(obj)
|
| 545 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 546 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 547 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 548 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 549 |
+
return resource_sharer.DupFd(fd)
|
| 550 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 551 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 552 |
+
self._id = _resource_sharer.register(send, close)
|
| 553 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 554 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 555 |
+
self._start()
|
| 556 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 557 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 558 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 559 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 560 |
+
self._listener = SocketListener(address, family, backlog)
|
| 561 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 562 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 563 |
+
self._socket.bind(address)
|
| 564 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 565 |
+
Traceback (most recent call last):
|
| 566 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 567 |
+
obj = _ForkingPickler.dumps(obj)
|
| 568 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 569 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 570 |
+
cls(buf, protocol).dump(obj)
|
| 571 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 572 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 573 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 574 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 575 |
+
return resource_sharer.DupFd(fd)
|
| 576 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 577 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 578 |
+
self._id = _resource_sharer.register(send, close)
|
| 579 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 580 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 581 |
+
self._start()
|
| 582 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 583 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 584 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 585 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 586 |
+
self._listener = SocketListener(address, family, backlog)
|
| 587 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 588 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 589 |
+
self._socket.bind(address)
|
| 590 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 591 |
+
Traceback (most recent call last):
|
| 592 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 593 |
+
obj = _ForkingPickler.dumps(obj)
|
| 594 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 595 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 596 |
+
cls(buf, protocol).dump(obj)
|
| 597 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 598 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 599 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 600 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 601 |
+
return resource_sharer.DupFd(fd)
|
| 602 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 603 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 604 |
+
self._id = _resource_sharer.register(send, close)
|
| 605 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 606 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 607 |
+
self._start()
|
| 608 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 609 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 610 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 611 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 612 |
+
self._listener = SocketListener(address, family, backlog)
|
| 613 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 614 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 615 |
+
self._socket.bind(address)
|
| 616 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 617 |
+
Traceback (most recent call last):
|
| 618 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 619 |
+
obj = _ForkingPickler.dumps(obj)
|
| 620 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 621 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 622 |
+
cls(buf, protocol).dump(obj)
|
| 623 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 624 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 625 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 626 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 627 |
+
return resource_sharer.DupFd(fd)
|
| 628 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 629 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 630 |
+
self._id = _resource_sharer.register(send, close)
|
| 631 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 632 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 633 |
+
self._start()
|
| 634 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 635 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 636 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 637 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 638 |
+
self._listener = SocketListener(address, family, backlog)
|
| 639 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 640 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 641 |
+
self._socket.bind(address)
|
| 642 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 643 |
+
Traceback (most recent call last):
|
| 644 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 645 |
+
obj = _ForkingPickler.dumps(obj)
|
| 646 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 647 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 648 |
+
cls(buf, protocol).dump(obj)
|
| 649 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 650 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 651 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 652 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 653 |
+
return resource_sharer.DupFd(fd)
|
| 654 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 655 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 656 |
+
self._id = _resource_sharer.register(send, close)
|
| 657 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 658 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 659 |
+
self._start()
|
| 660 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 661 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 662 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 663 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 664 |
+
self._listener = SocketListener(address, family, backlog)
|
| 665 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 666 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 667 |
+
self._socket.bind(address)
|
| 668 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 669 |
+
Traceback (most recent call last):
|
| 670 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 671 |
+
obj = _ForkingPickler.dumps(obj)
|
| 672 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 673 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 674 |
+
cls(buf, protocol).dump(obj)
|
| 675 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 676 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 677 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 678 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 679 |
+
return resource_sharer.DupFd(fd)
|
| 680 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 681 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 682 |
+
self._id = _resource_sharer.register(send, close)
|
| 683 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 684 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 685 |
+
self._start()
|
| 686 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 687 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 688 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 689 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 690 |
+
self._listener = SocketListener(address, family, backlog)
|
| 691 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 692 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 693 |
+
self._socket.bind(address)
|
| 694 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 695 |
+
Traceback (most recent call last):
|
| 696 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 697 |
+
obj = _ForkingPickler.dumps(obj)
|
| 698 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 699 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 700 |
+
cls(buf, protocol).dump(obj)
|
| 701 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 702 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 703 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 704 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 705 |
+
return resource_sharer.DupFd(fd)
|
| 706 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 707 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 708 |
+
self._id = _resource_sharer.register(send, close)
|
| 709 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 710 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 711 |
+
self._start()
|
| 712 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 713 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 714 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 715 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 716 |
+
self._listener = SocketListener(address, family, backlog)
|
| 717 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 718 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 719 |
+
self._socket.bind(address)
|
| 720 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 721 |
+
Traceback (most recent call last):
|
| 722 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 723 |
+
obj = _ForkingPickler.dumps(obj)
|
| 724 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 725 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 726 |
+
cls(buf, protocol).dump(obj)
|
| 727 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 728 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 729 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 730 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 731 |
+
return resource_sharer.DupFd(fd)
|
| 732 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 733 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 734 |
+
self._id = _resource_sharer.register(send, close)
|
| 735 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 736 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 737 |
+
self._start()
|
| 738 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 739 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 740 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 741 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 742 |
+
self._listener = SocketListener(address, family, backlog)
|
| 743 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 744 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 745 |
+
self._socket.bind(address)
|
| 746 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 747 |
+
Traceback (most recent call last):
|
| 748 |
+
Traceback (most recent call last):
|
| 749 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 750 |
+
obj = _ForkingPickler.dumps(obj)
|
| 751 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 752 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 753 |
+
cls(buf, protocol).dump(obj)
|
| 754 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 755 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 756 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 757 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 758 |
+
return resource_sharer.DupFd(fd)
|
| 759 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 760 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 761 |
+
self._id = _resource_sharer.register(send, close)
|
| 762 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 763 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 764 |
+
self._start()
|
| 765 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 766 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 767 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 768 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 769 |
+
self._listener = SocketListener(address, family, backlog)
|
| 770 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 771 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 772 |
+
self._socket.bind(address)
|
| 773 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 774 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 775 |
+
obj = _ForkingPickler.dumps(obj)
|
| 776 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 777 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 778 |
+
cls(buf, protocol).dump(obj)
|
| 779 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 780 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 781 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 782 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 783 |
+
return resource_sharer.DupFd(fd)
|
| 784 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 785 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 786 |
+
self._id = _resource_sharer.register(send, close)
|
| 787 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 788 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 789 |
+
self._start()
|
| 790 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 791 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 792 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 793 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 794 |
+
self._listener = SocketListener(address, family, backlog)
|
| 795 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 796 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 797 |
+
self._socket.bind(address)
|
| 798 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 799 |
+
Traceback (most recent call last):
|
| 800 |
+
Traceback (most recent call last):
|
| 801 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 802 |
+
obj = _ForkingPickler.dumps(obj)
|
| 803 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 804 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 805 |
+
cls(buf, protocol).dump(obj)
|
| 806 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 807 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 808 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 809 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 810 |
+
return resource_sharer.DupFd(fd)
|
| 811 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 812 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 813 |
+
self._id = _resource_sharer.register(send, close)
|
| 814 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 815 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 816 |
+
self._start()
|
| 817 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 818 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 819 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 820 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 821 |
+
self._listener = SocketListener(address, family, backlog)
|
| 822 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 823 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 824 |
+
self._socket.bind(address)
|
| 825 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 826 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 827 |
+
obj = _ForkingPickler.dumps(obj)
|
| 828 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 829 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 830 |
+
cls(buf, protocol).dump(obj)
|
| 831 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 832 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 833 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 834 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 835 |
+
return resource_sharer.DupFd(fd)
|
| 836 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 837 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 838 |
+
self._id = _resource_sharer.register(send, close)
|
| 839 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 840 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 841 |
+
self._start()
|
| 842 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 843 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 844 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 845 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 846 |
+
self._listener = SocketListener(address, family, backlog)
|
| 847 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 848 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 849 |
+
self._socket.bind(address)
|
| 850 |
+
PermissionError: [Errno 1] Operation not permitted
|
| 851 |
+
Traceback (most recent call last):
|
| 852 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/queues.py", line 244, in _feed
|
| 853 |
+
obj = _ForkingPickler.dumps(obj)
|
| 854 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 855 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 51, in dumps
|
| 856 |
+
cls(buf, protocol).dump(obj)
|
| 857 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/torch/multiprocessing/reductions.py", line 619, in reduce_storage
|
| 858 |
+
df = multiprocessing.reduction.DupFd(fd)
|
| 859 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 860 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/reduction.py", line 198, in DupFd
|
| 861 |
+
return resource_sharer.DupFd(fd)
|
| 862 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 863 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 53, in __init__
|
| 864 |
+
self._id = _resource_sharer.register(send, close)
|
| 865 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 866 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 76, in register
|
| 867 |
+
self._start()
|
| 868 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/resource_sharer.py", line 126, in _start
|
| 869 |
+
self._listener = Listener(authkey=process.current_process().authkey, backlog=128)
|
| 870 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 871 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 464, in __init__
|
| 872 |
+
self._listener = SocketListener(address, family, backlog)
|
| 873 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 874 |
+
File "/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/multiprocessing/connection.py", line 608, in __init__
|
| 875 |
+
self._socket.bind(address)
|
| 876 |
+
PermissionError: [Errno 1] Operation not permitted
|
tsqa_adapter/logs/sft_4gpu_20260615_141604.log
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/250 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
0%| | 1/250 [00:14<58:06, 14.00s/it]
|
| 2 |
1%| | 2/250 [00:24<50:27, 12.21s/it]
|
| 3 |
1%| | 3/250 [00:37<50:29, 12.27s/it]
|
| 4 |
2%|β | 4/250 [00:49<49:31, 12.08s/it]
|
| 5 |
2%|β | 5/250 [01:00<48:41, 11.92s/it]
|
| 6 |
2%|β | 6/250 [01:14<51:41, 12.71s/it]
|
| 7 |
3%|β | 7/250 [01:27<51:10, 12.64s/it]
|
| 8 |
3%|β | 8/250 [01:41<53:20, 13.22s/it]
|
| 9 |
4%|β | 9/250 [01:52<50:23, 12.55s/it]
|
| 10 |
4%|β | 10/250 [02:08<53:48, 13.45s/it]
|
| 11 |
|
|
|
|
| 12 |
4%|β | 10/250 [02:08<53:48, 13.45s/it]
|
| 13 |
4%|β | 11/250 [02:19<50:52, 12.77s/it]
|
| 14 |
5%|β | 12/250 [02:32<51:15, 12.92s/it]
|
| 15 |
5%|β | 13/250 [02:46<51:17, 12.99s/it]
|
| 16 |
6%|β | 14/250 [02:59<51:03, 12.98s/it]
|
| 17 |
6%|β | 15/250 [03:11<50:07, 12.80s/it]
|
| 18 |
6%|β | 16/250 [03:22<48:26, 12.42s/it]
|
| 19 |
7%|β | 17/250 [03:36<50:04, 12.89s/it]
|
| 20 |
7%|β | 18/250 [03:48<48:20, 12.50s/it]
|
| 21 |
8%|β | 19/250 [03:58<45:26, 11.80s/it]
|
| 22 |
8%|β | 20/250 [04:09<44:36, 11.64s/it]
|
| 23 |
|
|
|
|
| 24 |
8%|β | 20/250 [04:09<44:36, 11.64s/it]
|
| 25 |
8%|β | 21/250 [04:24<48:03, 12.59s/it]
|
| 26 |
9%|β | 22/250 [04:36<46:53, 12.34s/it]
|
| 27 |
9%|β | 23/250 [04:50<49:03, 12.97s/it]
|
| 28 |
10%|β | 24/250 [05:07<52:25, 13.92s/it]
|
| 29 |
10%|β | 25/250 [05:20<52:07, 13.90s/it]
|
| 30 |
10%|β | 26/250 [05:36<53:41, 14.38s/it]
|
| 31 |
11%|β | 27/250 [05:52<54:58, 14.79s/it]
|
| 32 |
11%|β | 28/250 [06:09<57:57, 15.66s/it]
|
| 33 |
12%|ββ | 29/250 [06:23<55:16, 15.01s/it]
|
| 34 |
12%|ββ | 30/250 [06:36<53:20, 14.55s/it]
|
| 35 |
|
|
|
|
| 36 |
12%|ββ | 30/250 [06:36<53:20, 14.55s/it]
|
| 37 |
12%|ββ | 31/250 [06:51<52:42, 14.44s/it]
|
| 38 |
13%|ββ | 32/250 [07:02<49:31, 13.63s/it]
|
| 39 |
13%|ββ | 33/250 [07:14<47:28, 13.13s/it]
|
| 40 |
14%|ββ | 34/250 [07:25<44:37, 12.39s/it]
|
| 41 |
14%|ββ | 35/250 [07:42<48:57, 13.66s/it]
|
| 42 |
14%|ββ | 36/250 [07:56<49:01, 13.74s/it]
|
| 43 |
15%|ββ | 37/250 [08:06<45:37, 12.85s/it]
|
| 44 |
15%|ββ | 38/250 [08:22<48:08, 13.62s/it]
|
| 45 |
16%|ββ | 39/250 [08:33<45:37, 12.97s/it]
|
| 46 |
16%|ββ | 40/250 [08:46<44:47, 12.80s/it]
|
| 47 |
|
|
|
|
| 48 |
16%|ββ | 40/250 [08:46<44:47, 12.80s/it]
|
| 49 |
16%|ββ | 41/250 [08:55<41:16, 11.85s/it]
|
| 50 |
17%|ββ | 42/250 [09:06<40:00, 11.54s/it]
|
| 51 |
17%|ββ | 43/250 [09:24<46:47, 13.56s/it]
|
| 52 |
18%|ββ | 44/250 [09:36<44:43, 13.03s/it]
|
| 53 |
18%|ββ | 45/250 [09:50<45:32, 13.33s/it]
|
| 54 |
18%|ββ | 46/250 [10:03<44:21, 13.05s/it]
|
| 55 |
19%|ββ | 47/250 [10:19<47:26, 14.02s/it]
|
| 56 |
19%|ββ | 48/250 [10:36<50:17, 14.94s/it]
|
| 57 |
20%|ββ | 49/250 [10:54<53:33, 15.99s/it]
|
| 58 |
20%|ββ | 50/250 [11:06<49:05, 14.73s/it]
|
| 59 |
|
|
|
|
| 60 |
20%|ββ | 50/250 [11:06<49:05, 14.73s/it]
|
| 61 |
20%|ββ | 51/250 [11:20<47:34, 14.34s/it]
|
| 62 |
21%|ββ | 52/250 [11:32<45:08, 13.68s/it]
|
| 63 |
21%|ββ | 53/250 [11:44<43:53, 13.37s/it]
|
| 64 |
22%|βββ | 54/250 [11:58<43:35, 13.34s/it]
|
| 65 |
22%|βββ | 55/250 [12:11<43:18, 13.32s/it]
|
| 66 |
22%|βββ | 56/250 [12:21<40:23, 12.49s/it]
|
| 67 |
23%|βββ | 57/250 [12:35<41:07, 12.78s/it]
|
| 68 |
23%|βββ | 58/250 [12:50<43:07, 13.48s/it]
|
| 69 |
24%|βββ | 59/250 [13:05<44:15, 13.90s/it]
|
| 70 |
24%|βββ | 60/250 [13:18<42:56, 13.56s/it]
|
| 71 |
|
|
|
|
| 72 |
24%|βββ | 60/250 [13:18<42:56, 13.56s/it]
|
| 73 |
24%|βββ | 61/250 [13:30<41:38, 13.22s/it]
|
| 74 |
25%|βββ | 62/250 [13:43<40:45, 13.01s/it]
|
| 75 |
25%|βββ | 63/250 [13:57<41:43, 13.39s/it]
|
| 76 |
26%|βββ | 64/250 [14:09<40:29, 13.06s/it]
|
| 77 |
26%|βββ | 65/250 [14:24<41:52, 13.58s/it]
|
| 78 |
26%|βββ | 66/250 [14:38<42:00, 13.70s/it]
|
| 79 |
27%|βββ | 67/250 [14:53<43:03, 14.12s/it]
|
| 80 |
27%|βββ | 68/250 [15:08<43:21, 14.29s/it]
|
| 81 |
28%|βββ | 69/250 [15:21<42:37, 14.13s/it]
|
| 82 |
28%|βββ | 70/250 [15:34<41:12, 13.74s/it]
|
| 83 |
|
|
|
|
| 84 |
28%|βββ | 70/250 [15:34<41:12, 13.74s/it]
|
| 85 |
28%|βββ | 71/250 [15:51<43:27, 14.57s/it]
|
| 86 |
29%|βββ | 72/250 [16:03<41:16, 13.91s/it]
|
| 87 |
29%|βββ | 73/250 [16:15<39:02, 13.24s/it]
|
| 88 |
30%|βββ | 74/250 [16:25<35:48, 12.21s/it]
|
| 89 |
30%|βββ | 75/250 [16:36<34:30, 11.83s/it]
|
| 90 |
30%|βββ | 76/250 [16:49<35:39, 12.29s/it]
|
| 91 |
31%|βββ | 77/250 [17:00<34:42, 12.04s/it]
|
| 92 |
31%|βββ | 78/250 [17:14<35:30, 12.38s/it]
|
| 93 |
32%|ββββ | 79/250 [17:25<34:33, 12.12s/it]
|
| 94 |
32%|ββββ | 80/250 [17:41<37:08, 13.11s/it]
|
| 95 |
|
|
|
|
| 96 |
32%|ββββ | 80/250 [17:41<37:08, 13.11s/it]
|
| 97 |
32%|ββββ | 81/250 [17:53<36:08, 12.83s/it]
|
| 98 |
33%|ββββ | 82/250 [18:08<37:48, 13.50s/it]
|
| 99 |
33%|ββββ | 83/250 [18:20<36:12, 13.01s/it]
|
| 100 |
34%|ββββ | 84/250 [18:38<40:30, 14.64s/it]
|
| 101 |
34%|ββββ | 85/250 [18:52<39:39, 14.42s/it]
|
| 102 |
34%|ββββ | 86/250 [19:10<42:03, 15.39s/it]
|
| 103 |
35%|ββββ | 87/250 [19:24<41:21, 15.23s/it]
|
| 104 |
35%|ββββ | 88/250 [19:36<38:19, 14.19s/it]
|
| 105 |
36%|ββββ | 89/250 [19:53<40:26, 15.07s/it]
|
| 106 |
36%|ββββ | 90/250 [20:07<38:51, 14.57s/it]
|
| 107 |
|
|
|
|
| 108 |
36%|ββββ | 90/250 [20:07<38:51, 14.57s/it]
|
| 109 |
36%|ββββ | 91/250 [20:19<36:35, 13.81s/it]
|
| 110 |
37%|ββββ | 92/250 [20:31<34:47, 13.21s/it]
|
| 111 |
37%|ββββ | 93/250 [20:44<34:36, 13.22s/it]
|
| 112 |
38%|ββββ | 94/250 [20:56<33:31, 12.90s/it]
|
| 113 |
38%|ββββ | 95/250 [21:07<31:39, 12.25s/it]
|
| 114 |
38%|ββββ | 96/250 [21:21<33:07, 12.91s/it]
|
| 115 |
39%|ββββ | 97/250 [21:35<33:24, 13.10s/it]
|
| 116 |
39%|ββββ | 98/250 [21:47<32:27, 12.81s/it]
|
| 117 |
40%|ββββ | 99/250 [21:59<32:04, 12.74s/it]
|
| 118 |
40%|ββββ | 100/250 [22:15<33:41, 13.47s/it]
|
| 119 |
|
|
|
|
| 120 |
40%|ββββ | 100/250 [22:15<33:41, 13.47s/it]
|
| 121 |
40%|ββββ | 101/250 [22:26<32:09, 12.95s/it]
|
| 122 |
41%|ββββ | 102/250 [22:41<32:55, 13.35s/it]
|
| 123 |
41%|ββββ | 103/250 [22:54<32:21, 13.21s/it]
|
| 124 |
42%|βββββ | 104/250 [23:05<30:35, 12.57s/it]
|
| 125 |
42%|βββββ | 105/250 [23:15<29:06, 12.05s/it]
|
| 126 |
42%|βββββ | 106/250 [23:28<29:07, 12.14s/it]
|
| 127 |
43%|βββββ | 107/250 [23:39<28:36, 12.00s/it]
|
| 128 |
43%|βββββ | 108/250 [23:55<30:41, 12.97s/it]
|
| 129 |
44%|βββββ | 109/250 [24:08<30:29, 12.97s/it]
|
| 130 |
44%|βββββ | 110/250 [24:19<28:46, 12.33s/it]
|
| 131 |
|
|
|
|
| 132 |
44%|βββββ | 110/250 [24:19<28:46, 12.33s/it]
|
| 133 |
44%|βββββ | 111/250 [24:31<28:51, 12.46s/it]
|
| 134 |
45%|βββββ | 112/250 [24:45<29:12, 12.70s/it]
|
| 135 |
45%|βββββ | 113/250 [24:59<30:04, 13.17s/it]
|
| 136 |
46%|βββββ | 114/250 [25:13<30:19, 13.38s/it]
|
| 137 |
46%|βββββ | 115/250 [25:26<30:21, 13.49s/it]
|
| 138 |
46%|βββββ | 116/250 [25:39<29:41, 13.30s/it]
|
| 139 |
47%|βββββ | 117/250 [25:54<30:07, 13.59s/it]
|
| 140 |
47%|βββββ | 118/250 [26:10<31:36, 14.37s/it]
|
| 141 |
48%|βββββ | 119/250 [26:24<31:30, 14.43s/it]
|
| 142 |
48%|βββββ | 120/250 [26:40<32:11, 14.86s/it]
|
| 143 |
|
|
|
|
| 144 |
48%|βββββ | 120/250 [26:40<32:11, 14.86s/it]
|
| 145 |
48%|βββββ | 121/250 [26:54<31:23, 14.60s/it]
|
| 146 |
49%|βββββ | 122/250 [27:06<29:25, 13.79s/it]
|
| 147 |
49%|βββββ | 123/250 [27:19<28:45, 13.58s/it]
|
| 148 |
50%|βββββ | 124/250 [27:33<28:21, 13.50s/it]
|
| 149 |
50%|βββββ | 125/250 [27:45<27:38, 13.27s/it]
|
| 150 |
50%|βββββ | 126/250 [27:58<27:08, 13.13s/it]
|
| 151 |
51%|βββββ | 127/250 [28:11<27:02, 13.19s/it]
|
| 152 |
51%|βββββ | 128/250 [28:22<25:27, 12.52s/it]
|
| 153 |
52%|ββββββ | 129/250 [28:41<29:07, 14.44s/it]
|
| 154 |
52%|ββββββ | 130/250 [28:55<28:43, 14.36s/it]
|
| 155 |
|
|
|
|
| 156 |
52%|ββββββ | 130/250 [28:55<28:43, 14.36s/it]
|
| 157 |
52%|ββββββ | 131/250 [29:08<27:21, 13.79s/it]
|
| 158 |
53%|ββββββ | 132/250 [29:20<26:09, 13.30s/it]
|
| 159 |
53%|ββββββ | 133/250 [29:33<25:54, 13.29s/it]
|
| 160 |
54%|ββββββ | 134/250 [29:46<25:12, 13.04s/it]
|
| 161 |
54%|ββββββ | 135/250 [30:00<25:46, 13.44s/it]
|
| 162 |
54%|ββββββ | 136/250 [30:11<24:14, 12.76s/it]
|
| 163 |
55%|ββββββ | 137/250 [30:24<23:43, 12.60s/it]
|
| 164 |
55%|ββββββ | 138/250 [30:40<25:54, 13.88s/it]
|
| 165 |
56%|ββββββ | 139/250 [30:54<25:16, 13.66s/it]
|
| 166 |
56%|ββββββ | 140/250 [31:07<24:42, 13.48s/it]
|
| 167 |
|
|
|
|
| 168 |
56%|ββββββ | 140/250 [31:07<24:42, 13.48s/it]
|
| 169 |
56%|ββββββ | 141/250 [31:24<26:26, 14.55s/it]
|
| 170 |
57%|ββββββ | 142/250 [31:38<26:20, 14.64s/it]
|
| 171 |
57%|ββββββ | 143/250 [31:50<24:34, 13.78s/it]
|
| 172 |
58%|ββββββ | 144/250 [32:03<23:47, 13.46s/it]
|
| 173 |
58%|ββββββ | 145/250 [32:13<21:46, 12.45s/it]
|
| 174 |
58%|ββββββ | 146/250 [32:26<21:41, 12.51s/it]
|
| 175 |
59%|ββββββ | 147/250 [32:39<21:56, 12.78s/it]
|
| 176 |
59%|ββββββ | 148/250 [32:55<23:31, 13.84s/it]
|
| 177 |
60%|ββββββ | 149/250 [33:09<23:13, 13.80s/it]
|
| 178 |
60%|ββββββ | 150/250 [33:22<22:24, 13.45s/it]
|
| 179 |
|
|
|
|
| 180 |
60%|ββββββ | 150/250 [33:22<22:24, 13.45s/it]
|
| 181 |
60%|ββββββ | 151/250 [33:37<23:04, 13.98s/it]
|
| 182 |
61%|ββββββ | 152/250 [33:49<21:46, 13.34s/it]
|
| 183 |
61%|ββββββ | 153/250 [33:59<20:08, 12.46s/it]
|
| 184 |
62%|βββββββ | 154/250 [34:15<21:29, 13.43s/it]
|
| 185 |
62%|βββββββ | 155/250 [34:28<21:07, 13.34s/it]
|
| 186 |
62%|βββββββ | 156/250 [34:38<19:31, 12.46s/it]
|
| 187 |
63%|βββββββ | 157/250 [34:51<19:25, 12.53s/it]
|
| 188 |
63%|βββββββ | 158/250 [35:05<19:40, 12.83s/it]
|
| 189 |
64%|βββββββ | 159/250 [35:20<20:21, 13.43s/it]
|
| 190 |
64%|βββββββ | 160/250 [35:33<20:09, 13.44s/it]
|
| 191 |
|
|
|
|
| 192 |
64%|βββββββ | 160/250 [35:33<20:09, 13.44s/it]
|
| 193 |
64%|βββββββ | 161/250 [35:44<18:52, 12.72s/it]
|
| 194 |
65%|βββββββ | 162/250 [35:57<18:36, 12.69s/it]
|
| 195 |
65%|βββββββ | 163/250 [36:08<17:39, 12.18s/it]
|
| 196 |
66%|βββββββ | 164/250 [36:21<18:00, 12.56s/it]
|
| 197 |
66%|βββββββ | 165/250 [36:35<18:22, 12.97s/it]
|
| 198 |
66%|βββββββ | 166/250 [36:48<18:19, 13.09s/it]
|
| 199 |
67%|βββββββ | 167/250 [37:01<18:04, 13.06s/it]
|
| 200 |
67%|βββββββ | 168/250 [37:12<16:44, 12.25s/it]
|
| 201 |
68%|βββββββ | 169/250 [37:25<16:57, 12.56s/it]
|
| 202 |
68%|βββββββ | 170/250 [37:38<16:46, 12.58s/it]
|
| 203 |
|
|
|
|
| 204 |
68%|βββββββ | 170/250 [37:38<16:46, 12.58s/it]
|
| 205 |
68%|βββββββ | 171/250 [37:50<16:35, 12.60s/it]
|
| 206 |
69%|βββββββ | 172/250 [38:05<17:17, 13.30s/it]
|
| 207 |
69%|βββββββ | 173/250 [38:21<17:52, 13.93s/it]
|
| 208 |
70%|βββββββ | 174/250 [38:37<18:29, 14.60s/it]
|
| 209 |
70%|βββββββ | 175/250 [38:50<17:50, 14.27s/it]
|
| 210 |
70%|βββββββ | 176/250 [39:04<17:30, 14.20s/it]
|
| 211 |
71%|βββββββ | 177/250 [39:16<16:22, 13.46s/it]
|
| 212 |
71%|βββββββ | 178/250 [39:28<15:42, 13.09s/it]
|
| 213 |
72%|ββββββββ | 179/250 [39:40<15:06, 12.77s/it]
|
| 214 |
72%|ββββββββ | 180/250 [39:54<15:17, 13.10s/it]
|
| 215 |
|
|
|
|
| 216 |
72%|ββββββββ | 180/250 [39:54<15:17, 13.10s/it]
|
| 217 |
72%|ββββββββ | 181/250 [40:04<14:04, 12.24s/it]
|
| 218 |
73%|ββββββββ | 182/250 [40:18<14:27, 12.75s/it]
|
| 219 |
73%|ββββββββ | 183/250 [40:32<14:31, 13.01s/it]
|
| 220 |
74%|ββββββββ | 184/250 [40:47<14:53, 13.53s/it]
|
| 221 |
74%|ββββββββ | 185/250 [41:02<15:19, 14.15s/it]
|
| 222 |
74%|ββββββββ | 186/250 [41:16<14:48, 13.88s/it]
|
| 223 |
75%|ββββββββ | 187/250 [41:31<14:55, 14.22s/it]
|
| 224 |
75%|ββββββββ | 188/250 [41:42<13:57, 13.51s/it]
|
| 225 |
76%|ββββββββ | 189/250 [41:54<13:05, 12.87s/it]
|
| 226 |
76%|ββββββββ | 190/250 [42:07<12:50, 12.83s/it]
|
| 227 |
|
|
|
|
| 228 |
76%|ββββββββ | 190/250 [42:07<12:50, 12.83s/it]
|
| 229 |
76%|ββββββββ | 191/250 [42:21<13:14, 13.46s/it]
|
| 230 |
77%|ββββββββ | 192/250 [42:37<13:29, 13.96s/it]
|
| 231 |
77%|ββββββββ | 193/250 [42:50<13:01, 13.71s/it]
|
| 232 |
78%|ββββββββ | 194/250 [43:02<12:17, 13.16s/it]
|
| 233 |
78%|ββββββββ | 195/250 [43:15<12:14, 13.35s/it]
|
| 234 |
78%|ββββββββ | 196/250 [43:29<12:06, 13.45s/it]
|
| 235 |
79%|ββββββββ | 197/250 [43:42<11:45, 13.31s/it]
|
| 236 |
79%|ββββββββ | 198/250 [43:53<10:47, 12.45s/it]
|
| 237 |
80%|ββββββββ | 199/250 [44:09<11:29, 13.52s/it]
|
| 238 |
80%|ββββββββ | 200/250 [44:21<10:54, 13.08s/it]
|
| 239 |
|
|
|
|
| 240 |
80%|ββββββββ | 200/250 [44:21<10:54, 13.08s/it]
|
| 241 |
80%|ββββββββ | 201/250 [44:39<12:06, 14.83s/it]
|
| 242 |
81%|ββββββββ | 202/250 [44:55<11:56, 14.92s/it]
|
| 243 |
81%|ββββββββ | 203/250 [45:07<11:07, 14.20s/it]
|
| 244 |
82%|βββββββββ | 204/250 [45:22<11:06, 14.49s/it]
|
| 245 |
82%|βββββββββ | 205/250 [45:35<10:33, 14.08s/it]
|
| 246 |
82%|βββββββββ | 206/250 [45:48<10:04, 13.75s/it]
|
| 247 |
83%|βββββββββ | 207/250 [45:59<09:16, 12.95s/it]
|
| 248 |
83%|βββββββββ | 208/250 [46:16<09:42, 13.88s/it]
|
| 249 |
84%|βββββββββ | 209/250 [46:29<09:23, 13.75s/it]
|
| 250 |
84%|βββββββββ | 210/250 [46:41<08:47, 13.18s/it]
|
| 251 |
|
|
|
|
| 252 |
84%|βββββββββ | 210/250 [46:41<08:47, 13.18s/it]
|
| 253 |
84%|βββββββββ | 211/250 [46:53<08:27, 13.02s/it]
|
| 254 |
85%|βββββββββ | 212/250 [47:05<08:01, 12.66s/it]
|
| 255 |
85%|βββββββββ | 213/250 [47:19<07:58, 12.93s/it]
|
| 256 |
86%|βββββββββ | 214/250 [47:31<07:38, 12.74s/it]
|
| 257 |
86%|βββββββββ | 215/250 [47:46<07:45, 13.30s/it]
|
| 258 |
86%|βββββββββ | 216/250 [47:59<07:28, 13.18s/it]
|
| 259 |
87%|βββββββββ | 217/250 [48:12<07:15, 13.18s/it]
|
| 260 |
87%|βββββββββ | 218/250 [48:29<07:38, 14.32s/it]
|
| 261 |
88%|βββββββββ | 219/250 [48:43<07:26, 14.41s/it]
|
| 262 |
88%|βββββββββ | 220/250 [48:57<07:06, 14.23s/it]
|
| 263 |
|
|
|
|
| 264 |
88%|βββββββββ | 220/250 [48:57<07:06, 14.23s/it]
|
| 265 |
88%|βββββββββ | 221/250 [49:10<06:37, 13.71s/it]
|
| 266 |
89%|βββββββββ | 222/250 [49:29<07:08, 15.32s/it]
|
| 267 |
89%|βββββββββ | 223/250 [49:44<06:55, 15.38s/it]
|
| 268 |
90%|βββββββββ | 224/250 [49:55<06:03, 14.00s/it]
|
| 269 |
90%|βββββββββ | 225/250 [50:09<05:51, 14.04s/it]
|
| 270 |
90%|βββββββββ | 226/250 [50:26<05:56, 14.84s/it]
|
| 271 |
91%|βββββββββ | 227/250 [50:36<05:06, 13.34s/it]
|
| 272 |
91%|βββββββββ | 228/250 [50:52<05:14, 14.31s/it]
|
| 273 |
92%|ββββββββββ| 229/250 [51:04<04:44, 13.54s/it]
|
| 274 |
92%|ββββββββββ| 230/250 [51:17<04:24, 13.22s/it]
|
| 275 |
|
|
|
|
| 276 |
92%|ββββββββββ| 230/250 [51:17<04:24, 13.22s/it]
|
| 277 |
92%|ββββββββββ| 231/250 [51:29<04:08, 13.09s/it]
|
| 278 |
93%|ββββββββββ| 232/250 [51:41<03:48, 12.71s/it]
|
| 279 |
93%|ββββββββββ| 233/250 [51:55<03:39, 12.90s/it]
|
| 280 |
94%|ββββββββββ| 234/250 [52:06<03:21, 12.58s/it]
|
| 281 |
94%|ββββββββββ| 235/250 [52:22<03:23, 13.54s/it]
|
| 282 |
94%|ββββββββββ| 236/250 [52:35<03:06, 13.35s/it]
|
| 283 |
95%|ββββββββββ| 237/250 [52:49<02:56, 13.58s/it]
|
| 284 |
95%|ββββββββββ| 238/250 [53:03<02:44, 13.67s/it]
|
| 285 |
96%|ββββββββββ| 239/250 [53:16<02:26, 13.31s/it]
|
| 286 |
96%|ββββββββββ| 240/250 [53:29<02:12, 13.29s/it]
|
| 287 |
|
|
|
|
| 288 |
96%|ββββββββββ| 240/250 [53:29<02:12, 13.29s/it]
|
| 289 |
96%|ββββββββββ| 241/250 [53:40<01:54, 12.74s/it]
|
| 290 |
97%|ββββββββββ| 242/250 [53:54<01:44, 13.02s/it]
|
| 291 |
97%|ββββββββββ| 243/250 [54:09<01:34, 13.53s/it]
|
| 292 |
98%|ββββββββββ| 244/250 [54:22<01:21, 13.58s/it]
|
| 293 |
98%|ββββββββββ| 245/250 [54:34<01:04, 12.89s/it]
|
| 294 |
98%|ββββββββββ| 246/250 [54:46<00:51, 12.81s/it]
|
| 295 |
99%|ββββββββββ| 247/250 [54:59<00:38, 12.85s/it]
|
| 296 |
99%|ββββββββββ| 248/250 [55:15<00:27, 13.65s/it]
|
| 297 |
|
|
|
|
| 298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate: 0.34.2
|
| 2 |
+
Run id: 20260615_141604
|
| 3 |
+
Python: /dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python
|
| 4 |
+
Accelerate: /dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python -m accelerate.commands.accelerate_cli
|
| 5 |
+
CUDA_VISIBLE_DEVICES: 0,1,2,3
|
| 6 |
+
Model path: /mnt/share01/sqk/models/ChatTime-1-7B-Chat
|
| 7 |
+
Data root: /mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp (train=train.jsonl eval=eval.jsonl)
|
| 8 |
+
Adapter output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/adapter
|
| 9 |
+
Eval output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/eval
|
| 10 |
+
Log file: /mnt/share01/sqk/ChatTime/tsqa_adapter/logs/sft_4gpu_20260615_141604.log
|
| 11 |
+
βοΈ Running in WANDB offline modeβοΈ Running in WANDB offline mode
|
| 12 |
+
|
| 13 |
+
βοΈ Running in WANDB offline mode
|
| 14 |
+
βοΈ Running in WANDB offline mode
|
| 15 |
+
Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
|
| 16 |
+
Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
|
| 17 |
+
Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
|
| 18 |
+
Applied accelerate compatibility patch: Accelerator.unwrap_model accepts keep_torch_compile.
|
| 19 |
+
SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
|
| 20 |
+
SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
|
| 21 |
+
SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
|
| 22 |
+
SFT token length check: input_rows=8000, kept_rows=7987, skipped_overlong=13, left_truncated_rows=0, max_prompt_len=3818, max_total_len=3842, max_seq_length=4096, skip_overlong=True
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
|
| 28 |
+
/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
|
| 29 |
+
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
| 30 |
+
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
|
| 31 |
+
trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
|
| 32 |
+
trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
|
| 33 |
+
trainable params: 39,976,960 || all params: 6,860,320,768 || trainable%: 0.5827
|
| 34 |
+
/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
|
| 35 |
+
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
| 36 |
+
/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
|
| 37 |
+
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
| 38 |
+
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
|
| 39 |
+
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
|
| 40 |
+
/dev/shm/suiqk/conda_envs/scalerag-ts-v4/lib/python3.11/site-packages/accelerate/accelerator.py:494: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
|
| 41 |
+
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
| 42 |
+
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
|
| 43 |
+
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
|
| 44 |
+
|
| 45 |
0%| | 0/250 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
| 46 |
+
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
| 47 |
+
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
| 48 |
+
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
| 49 |
+
|
| 50 |
0%| | 1/250 [00:14<58:06, 14.00s/it]
|
| 51 |
1%| | 2/250 [00:24<50:27, 12.21s/it]
|
| 52 |
1%| | 3/250 [00:37<50:29, 12.27s/it]
|
| 53 |
2%|β | 4/250 [00:49<49:31, 12.08s/it]
|
| 54 |
2%|β | 5/250 [01:00<48:41, 11.92s/it]
|
| 55 |
2%|β | 6/250 [01:14<51:41, 12.71s/it]
|
| 56 |
3%|β | 7/250 [01:27<51:10, 12.64s/it]
|
| 57 |
3%|β | 8/250 [01:41<53:20, 13.22s/it]
|
| 58 |
4%|β | 9/250 [01:52<50:23, 12.55s/it]
|
| 59 |
4%|β | 10/250 [02:08<53:48, 13.45s/it]
|
| 60 |
|
| 61 |
+
|
| 62 |
4%|β | 10/250 [02:08<53:48, 13.45s/it]
|
| 63 |
4%|β | 11/250 [02:19<50:52, 12.77s/it]
|
| 64 |
5%|β | 12/250 [02:32<51:15, 12.92s/it]
|
| 65 |
5%|β | 13/250 [02:46<51:17, 12.99s/it]
|
| 66 |
6%|β | 14/250 [02:59<51:03, 12.98s/it]
|
| 67 |
6%|β | 15/250 [03:11<50:07, 12.80s/it]
|
| 68 |
6%|β | 16/250 [03:22<48:26, 12.42s/it]
|
| 69 |
7%|β | 17/250 [03:36<50:04, 12.89s/it]
|
| 70 |
7%|β | 18/250 [03:48<48:20, 12.50s/it]
|
| 71 |
8%|β | 19/250 [03:58<45:26, 11.80s/it]
|
| 72 |
8%|β | 20/250 [04:09<44:36, 11.64s/it]
|
| 73 |
|
| 74 |
+
|
| 75 |
8%|β | 20/250 [04:09<44:36, 11.64s/it]
|
| 76 |
8%|β | 21/250 [04:24<48:03, 12.59s/it]
|
| 77 |
9%|β | 22/250 [04:36<46:53, 12.34s/it]
|
| 78 |
9%|β | 23/250 [04:50<49:03, 12.97s/it]
|
| 79 |
10%|β | 24/250 [05:07<52:25, 13.92s/it]
|
| 80 |
10%|β | 25/250 [05:20<52:07, 13.90s/it]
|
| 81 |
10%|β | 26/250 [05:36<53:41, 14.38s/it]
|
| 82 |
11%|β | 27/250 [05:52<54:58, 14.79s/it]
|
| 83 |
11%|β | 28/250 [06:09<57:57, 15.66s/it]
|
| 84 |
12%|ββ | 29/250 [06:23<55:16, 15.01s/it]
|
| 85 |
12%|ββ | 30/250 [06:36<53:20, 14.55s/it]
|
| 86 |
|
| 87 |
+
|
| 88 |
12%|ββ | 30/250 [06:36<53:20, 14.55s/it]
|
| 89 |
12%|ββ | 31/250 [06:51<52:42, 14.44s/it]
|
| 90 |
13%|ββ | 32/250 [07:02<49:31, 13.63s/it]
|
| 91 |
13%|ββ | 33/250 [07:14<47:28, 13.13s/it]
|
| 92 |
14%|ββ | 34/250 [07:25<44:37, 12.39s/it]
|
| 93 |
14%|ββ | 35/250 [07:42<48:57, 13.66s/it]
|
| 94 |
14%|ββ | 36/250 [07:56<49:01, 13.74s/it]
|
| 95 |
15%|ββ | 37/250 [08:06<45:37, 12.85s/it]
|
| 96 |
15%|ββ | 38/250 [08:22<48:08, 13.62s/it]
|
| 97 |
16%|ββ | 39/250 [08:33<45:37, 12.97s/it]
|
| 98 |
16%|ββ | 40/250 [08:46<44:47, 12.80s/it]
|
| 99 |
|
| 100 |
+
|
| 101 |
16%|ββ | 40/250 [08:46<44:47, 12.80s/it]
|
| 102 |
16%|ββ | 41/250 [08:55<41:16, 11.85s/it]
|
| 103 |
17%|ββ | 42/250 [09:06<40:00, 11.54s/it]
|
| 104 |
17%|ββ | 43/250 [09:24<46:47, 13.56s/it]
|
| 105 |
18%|ββ | 44/250 [09:36<44:43, 13.03s/it]
|
| 106 |
18%|ββ | 45/250 [09:50<45:32, 13.33s/it]
|
| 107 |
18%|ββ | 46/250 [10:03<44:21, 13.05s/it]
|
| 108 |
19%|ββ | 47/250 [10:19<47:26, 14.02s/it]
|
| 109 |
19%|ββ | 48/250 [10:36<50:17, 14.94s/it]
|
| 110 |
20%|ββ | 49/250 [10:54<53:33, 15.99s/it]
|
| 111 |
20%|ββ | 50/250 [11:06<49:05, 14.73s/it]
|
| 112 |
|
| 113 |
+
|
| 114 |
20%|ββ | 50/250 [11:06<49:05, 14.73s/it]
|
| 115 |
20%|ββ | 51/250 [11:20<47:34, 14.34s/it]
|
| 116 |
21%|ββ | 52/250 [11:32<45:08, 13.68s/it]
|
| 117 |
21%|ββ | 53/250 [11:44<43:53, 13.37s/it]
|
| 118 |
22%|βββ | 54/250 [11:58<43:35, 13.34s/it]
|
| 119 |
22%|βββ | 55/250 [12:11<43:18, 13.32s/it]
|
| 120 |
22%|βββ | 56/250 [12:21<40:23, 12.49s/it]
|
| 121 |
23%|βββ | 57/250 [12:35<41:07, 12.78s/it]
|
| 122 |
23%|βββ | 58/250 [12:50<43:07, 13.48s/it]
|
| 123 |
24%|βββ | 59/250 [13:05<44:15, 13.90s/it]
|
| 124 |
24%|βββ | 60/250 [13:18<42:56, 13.56s/it]
|
| 125 |
|
| 126 |
+
|
| 127 |
24%|βββ | 60/250 [13:18<42:56, 13.56s/it]
|
| 128 |
24%|βββ | 61/250 [13:30<41:38, 13.22s/it]
|
| 129 |
25%|βββ | 62/250 [13:43<40:45, 13.01s/it]
|
| 130 |
25%|βββ | 63/250 [13:57<41:43, 13.39s/it]
|
| 131 |
26%|βββ | 64/250 [14:09<40:29, 13.06s/it]
|
| 132 |
26%|βββ | 65/250 [14:24<41:52, 13.58s/it]
|
| 133 |
26%|βββ | 66/250 [14:38<42:00, 13.70s/it]
|
| 134 |
27%|βββ | 67/250 [14:53<43:03, 14.12s/it]
|
| 135 |
27%|βββ | 68/250 [15:08<43:21, 14.29s/it]
|
| 136 |
28%|βββ | 69/250 [15:21<42:37, 14.13s/it]
|
| 137 |
28%|βββ | 70/250 [15:34<41:12, 13.74s/it]
|
| 138 |
|
| 139 |
+
|
| 140 |
28%|βββ | 70/250 [15:34<41:12, 13.74s/it]
|
| 141 |
28%|βββ | 71/250 [15:51<43:27, 14.57s/it]
|
| 142 |
29%|βββ | 72/250 [16:03<41:16, 13.91s/it]
|
| 143 |
29%|βββ | 73/250 [16:15<39:02, 13.24s/it]
|
| 144 |
30%|βββ | 74/250 [16:25<35:48, 12.21s/it]
|
| 145 |
30%|βββ | 75/250 [16:36<34:30, 11.83s/it]
|
| 146 |
30%|βββ | 76/250 [16:49<35:39, 12.29s/it]
|
| 147 |
31%|βββ | 77/250 [17:00<34:42, 12.04s/it]
|
| 148 |
31%|βββ | 78/250 [17:14<35:30, 12.38s/it]
|
| 149 |
32%|ββββ | 79/250 [17:25<34:33, 12.12s/it]
|
| 150 |
32%|ββββ | 80/250 [17:41<37:08, 13.11s/it]
|
| 151 |
|
| 152 |
+
|
| 153 |
32%|ββββ | 80/250 [17:41<37:08, 13.11s/it]
|
| 154 |
32%|ββββ | 81/250 [17:53<36:08, 12.83s/it]
|
| 155 |
33%|ββββ | 82/250 [18:08<37:48, 13.50s/it]
|
| 156 |
33%|ββββ | 83/250 [18:20<36:12, 13.01s/it]
|
| 157 |
34%|ββββ | 84/250 [18:38<40:30, 14.64s/it]
|
| 158 |
34%|ββββ | 85/250 [18:52<39:39, 14.42s/it]
|
| 159 |
34%|ββββ | 86/250 [19:10<42:03, 15.39s/it]
|
| 160 |
35%|ββββ | 87/250 [19:24<41:21, 15.23s/it]
|
| 161 |
35%|ββββ | 88/250 [19:36<38:19, 14.19s/it]
|
| 162 |
36%|ββββ | 89/250 [19:53<40:26, 15.07s/it]
|
| 163 |
36%|ββββ | 90/250 [20:07<38:51, 14.57s/it]
|
| 164 |
|
| 165 |
+
|
| 166 |
36%|ββββ | 90/250 [20:07<38:51, 14.57s/it]
|
| 167 |
36%|ββββ | 91/250 [20:19<36:35, 13.81s/it]
|
| 168 |
37%|ββββ | 92/250 [20:31<34:47, 13.21s/it]
|
| 169 |
37%|ββββ | 93/250 [20:44<34:36, 13.22s/it]
|
| 170 |
38%|ββββ | 94/250 [20:56<33:31, 12.90s/it]
|
| 171 |
38%|ββββ | 95/250 [21:07<31:39, 12.25s/it]
|
| 172 |
38%|ββββ | 96/250 [21:21<33:07, 12.91s/it]
|
| 173 |
39%|ββββ | 97/250 [21:35<33:24, 13.10s/it]
|
| 174 |
39%|ββββ | 98/250 [21:47<32:27, 12.81s/it]
|
| 175 |
40%|ββββ | 99/250 [21:59<32:04, 12.74s/it]
|
| 176 |
40%|ββββ | 100/250 [22:15<33:41, 13.47s/it]
|
| 177 |
|
| 178 |
+
|
| 179 |
40%|ββββ | 100/250 [22:15<33:41, 13.47s/it]
|
| 180 |
40%|ββββ | 101/250 [22:26<32:09, 12.95s/it]
|
| 181 |
41%|ββββ | 102/250 [22:41<32:55, 13.35s/it]
|
| 182 |
41%|ββββ | 103/250 [22:54<32:21, 13.21s/it]
|
| 183 |
42%|βββββ | 104/250 [23:05<30:35, 12.57s/it]
|
| 184 |
42%|βββββ | 105/250 [23:15<29:06, 12.05s/it]
|
| 185 |
42%|βββββ | 106/250 [23:28<29:07, 12.14s/it]
|
| 186 |
43%|βββββ | 107/250 [23:39<28:36, 12.00s/it]
|
| 187 |
43%|βββββ | 108/250 [23:55<30:41, 12.97s/it]
|
| 188 |
44%|βββββ | 109/250 [24:08<30:29, 12.97s/it]
|
| 189 |
44%|βββββ | 110/250 [24:19<28:46, 12.33s/it]
|
| 190 |
|
| 191 |
+
|
| 192 |
44%|βββββ | 110/250 [24:19<28:46, 12.33s/it]
|
| 193 |
44%|βββββ | 111/250 [24:31<28:51, 12.46s/it]
|
| 194 |
45%|βββββ | 112/250 [24:45<29:12, 12.70s/it]
|
| 195 |
45%|βββββ | 113/250 [24:59<30:04, 13.17s/it]
|
| 196 |
46%|βββββ | 114/250 [25:13<30:19, 13.38s/it]
|
| 197 |
46%|βββββ | 115/250 [25:26<30:21, 13.49s/it]
|
| 198 |
46%|βββββ | 116/250 [25:39<29:41, 13.30s/it]
|
| 199 |
47%|βββββ | 117/250 [25:54<30:07, 13.59s/it]
|
| 200 |
47%|βββββ | 118/250 [26:10<31:36, 14.37s/it]
|
| 201 |
48%|βββββ | 119/250 [26:24<31:30, 14.43s/it]
|
| 202 |
48%|βββββ | 120/250 [26:40<32:11, 14.86s/it]
|
| 203 |
|
| 204 |
+
|
| 205 |
48%|βββββ | 120/250 [26:40<32:11, 14.86s/it]
|
| 206 |
48%|βββββ | 121/250 [26:54<31:23, 14.60s/it]
|
| 207 |
49%|βββββ | 122/250 [27:06<29:25, 13.79s/it]
|
| 208 |
49%|βββββ | 123/250 [27:19<28:45, 13.58s/it]
|
| 209 |
50%|βββββ | 124/250 [27:33<28:21, 13.50s/it]
|
| 210 |
50%|βββββ | 125/250 [27:45<27:38, 13.27s/it]
|
| 211 |
50%|βββββ | 126/250 [27:58<27:08, 13.13s/it]
|
| 212 |
51%|βββββ | 127/250 [28:11<27:02, 13.19s/it]
|
| 213 |
51%|βββββ | 128/250 [28:22<25:27, 12.52s/it]
|
| 214 |
52%|ββββββ | 129/250 [28:41<29:07, 14.44s/it]
|
| 215 |
52%|ββββββ | 130/250 [28:55<28:43, 14.36s/it]
|
| 216 |
|
| 217 |
+
|
| 218 |
52%|ββββββ | 130/250 [28:55<28:43, 14.36s/it]
|
| 219 |
52%|ββββββ | 131/250 [29:08<27:21, 13.79s/it]
|
| 220 |
53%|ββββββ | 132/250 [29:20<26:09, 13.30s/it]
|
| 221 |
53%|ββββββ | 133/250 [29:33<25:54, 13.29s/it]
|
| 222 |
54%|ββββββ | 134/250 [29:46<25:12, 13.04s/it]
|
| 223 |
54%|ββββββ | 135/250 [30:00<25:46, 13.44s/it]
|
| 224 |
54%|ββββββ | 136/250 [30:11<24:14, 12.76s/it]
|
| 225 |
55%|ββββββ | 137/250 [30:24<23:43, 12.60s/it]
|
| 226 |
55%|ββββββ | 138/250 [30:40<25:54, 13.88s/it]
|
| 227 |
56%|ββββββ | 139/250 [30:54<25:16, 13.66s/it]
|
| 228 |
56%|ββββββ | 140/250 [31:07<24:42, 13.48s/it]
|
| 229 |
|
| 230 |
+
|
| 231 |
56%|ββββββ | 140/250 [31:07<24:42, 13.48s/it]
|
| 232 |
56%|ββββββ | 141/250 [31:24<26:26, 14.55s/it]
|
| 233 |
57%|ββββββ | 142/250 [31:38<26:20, 14.64s/it]
|
| 234 |
57%|ββββββ | 143/250 [31:50<24:34, 13.78s/it]
|
| 235 |
58%|ββββββ | 144/250 [32:03<23:47, 13.46s/it]
|
| 236 |
58%|ββββββ | 145/250 [32:13<21:46, 12.45s/it]
|
| 237 |
58%|ββββββ | 146/250 [32:26<21:41, 12.51s/it]
|
| 238 |
59%|ββββββ | 147/250 [32:39<21:56, 12.78s/it]
|
| 239 |
59%|ββββββ | 148/250 [32:55<23:31, 13.84s/it]
|
| 240 |
60%|ββββββ | 149/250 [33:09<23:13, 13.80s/it]
|
| 241 |
60%|ββββββ | 150/250 [33:22<22:24, 13.45s/it]
|
| 242 |
|
| 243 |
+
|
| 244 |
60%|ββββββ | 150/250 [33:22<22:24, 13.45s/it]
|
| 245 |
60%|ββββββ | 151/250 [33:37<23:04, 13.98s/it]
|
| 246 |
61%|ββββββ | 152/250 [33:49<21:46, 13.34s/it]
|
| 247 |
61%|ββββββ | 153/250 [33:59<20:08, 12.46s/it]
|
| 248 |
62%|βββββββ | 154/250 [34:15<21:29, 13.43s/it]
|
| 249 |
62%|βββββββ | 155/250 [34:28<21:07, 13.34s/it]
|
| 250 |
62%|βββββββ | 156/250 [34:38<19:31, 12.46s/it]
|
| 251 |
63%|βββββββ | 157/250 [34:51<19:25, 12.53s/it]
|
| 252 |
63%|βββββββ | 158/250 [35:05<19:40, 12.83s/it]
|
| 253 |
64%|βββββββ | 159/250 [35:20<20:21, 13.43s/it]
|
| 254 |
64%|βββββββ | 160/250 [35:33<20:09, 13.44s/it]
|
| 255 |
|
| 256 |
+
|
| 257 |
64%|βββββββ | 160/250 [35:33<20:09, 13.44s/it]
|
| 258 |
64%|βββββββ | 161/250 [35:44<18:52, 12.72s/it]
|
| 259 |
65%|βββββββ | 162/250 [35:57<18:36, 12.69s/it]
|
| 260 |
65%|βββββββ | 163/250 [36:08<17:39, 12.18s/it]
|
| 261 |
66%|βββββββ | 164/250 [36:21<18:00, 12.56s/it]
|
| 262 |
66%|βββββββ | 165/250 [36:35<18:22, 12.97s/it]
|
| 263 |
66%|βββββββ | 166/250 [36:48<18:19, 13.09s/it]
|
| 264 |
67%|βββββββ | 167/250 [37:01<18:04, 13.06s/it]
|
| 265 |
67%|βββββββ | 168/250 [37:12<16:44, 12.25s/it]
|
| 266 |
68%|βββββββ | 169/250 [37:25<16:57, 12.56s/it]
|
| 267 |
68%|βββββββ | 170/250 [37:38<16:46, 12.58s/it]
|
| 268 |
|
| 269 |
+
|
| 270 |
68%|βββββββ | 170/250 [37:38<16:46, 12.58s/it]
|
| 271 |
68%|βββββββ | 171/250 [37:50<16:35, 12.60s/it]
|
| 272 |
69%|βββββββ | 172/250 [38:05<17:17, 13.30s/it]
|
| 273 |
69%|βββββββ | 173/250 [38:21<17:52, 13.93s/it]
|
| 274 |
70%|βββββββ | 174/250 [38:37<18:29, 14.60s/it]
|
| 275 |
70%|βββββββ | 175/250 [38:50<17:50, 14.27s/it]
|
| 276 |
70%|βββββββ | 176/250 [39:04<17:30, 14.20s/it]
|
| 277 |
71%|βββββββ | 177/250 [39:16<16:22, 13.46s/it]
|
| 278 |
71%|βββββββ | 178/250 [39:28<15:42, 13.09s/it]
|
| 279 |
72%|ββββββββ | 179/250 [39:40<15:06, 12.77s/it]
|
| 280 |
72%|ββββββββ | 180/250 [39:54<15:17, 13.10s/it]
|
| 281 |
|
| 282 |
+
|
| 283 |
72%|ββββββββ | 180/250 [39:54<15:17, 13.10s/it]
|
| 284 |
72%|ββββββββ | 181/250 [40:04<14:04, 12.24s/it]
|
| 285 |
73%|ββββββββ | 182/250 [40:18<14:27, 12.75s/it]
|
| 286 |
73%|ββββββββ | 183/250 [40:32<14:31, 13.01s/it]
|
| 287 |
74%|ββββββββ | 184/250 [40:47<14:53, 13.53s/it]
|
| 288 |
74%|ββββββββ | 185/250 [41:02<15:19, 14.15s/it]
|
| 289 |
74%|ββββββββ | 186/250 [41:16<14:48, 13.88s/it]
|
| 290 |
75%|ββββββββ | 187/250 [41:31<14:55, 14.22s/it]
|
| 291 |
75%|ββββββββ | 188/250 [41:42<13:57, 13.51s/it]
|
| 292 |
76%|ββββββββ | 189/250 [41:54<13:05, 12.87s/it]
|
| 293 |
76%|ββββββββ | 190/250 [42:07<12:50, 12.83s/it]
|
| 294 |
|
| 295 |
+
|
| 296 |
76%|ββββββββ | 190/250 [42:07<12:50, 12.83s/it]
|
| 297 |
76%|ββββββββ | 191/250 [42:21<13:14, 13.46s/it]
|
| 298 |
77%|ββββββββ | 192/250 [42:37<13:29, 13.96s/it]
|
| 299 |
77%|ββββββββ | 193/250 [42:50<13:01, 13.71s/it]
|
| 300 |
78%|ββββββββ | 194/250 [43:02<12:17, 13.16s/it]
|
| 301 |
78%|ββββββββ | 195/250 [43:15<12:14, 13.35s/it]
|
| 302 |
78%|ββββββββ | 196/250 [43:29<12:06, 13.45s/it]
|
| 303 |
79%|ββββββββ | 197/250 [43:42<11:45, 13.31s/it]
|
| 304 |
79%|ββββββββ | 198/250 [43:53<10:47, 12.45s/it]
|
| 305 |
80%|ββββββββ | 199/250 [44:09<11:29, 13.52s/it]
|
| 306 |
80%|ββββββββ | 200/250 [44:21<10:54, 13.08s/it]
|
| 307 |
|
| 308 |
+
|
| 309 |
80%|ββββββββ | 200/250 [44:21<10:54, 13.08s/it]
|
| 310 |
80%|ββββββββ | 201/250 [44:39<12:06, 14.83s/it]
|
| 311 |
81%|ββββββββ | 202/250 [44:55<11:56, 14.92s/it]
|
| 312 |
81%|ββββββββ | 203/250 [45:07<11:07, 14.20s/it]
|
| 313 |
82%|βββββββββ | 204/250 [45:22<11:06, 14.49s/it]
|
| 314 |
82%|βββββββββ | 205/250 [45:35<10:33, 14.08s/it]
|
| 315 |
82%|βββββββββ | 206/250 [45:48<10:04, 13.75s/it]
|
| 316 |
83%|βββββββββ | 207/250 [45:59<09:16, 12.95s/it]
|
| 317 |
83%|βββββββββ | 208/250 [46:16<09:42, 13.88s/it]
|
| 318 |
84%|βββββββββ | 209/250 [46:29<09:23, 13.75s/it]
|
| 319 |
84%|βββββββββ | 210/250 [46:41<08:47, 13.18s/it]
|
| 320 |
|
| 321 |
+
|
| 322 |
84%|βββββββββ | 210/250 [46:41<08:47, 13.18s/it]
|
| 323 |
84%|βββββββββ | 211/250 [46:53<08:27, 13.02s/it]
|
| 324 |
85%|βββββββββ | 212/250 [47:05<08:01, 12.66s/it]
|
| 325 |
85%|βββββββββ | 213/250 [47:19<07:58, 12.93s/it]
|
| 326 |
86%|βββββββββ | 214/250 [47:31<07:38, 12.74s/it]
|
| 327 |
86%|βββββββββ | 215/250 [47:46<07:45, 13.30s/it]
|
| 328 |
86%|βββββββββ | 216/250 [47:59<07:28, 13.18s/it]
|
| 329 |
87%|βββββββββ | 217/250 [48:12<07:15, 13.18s/it]
|
| 330 |
87%|βββββββββ | 218/250 [48:29<07:38, 14.32s/it]
|
| 331 |
88%|βββββββββ | 219/250 [48:43<07:26, 14.41s/it]
|
| 332 |
88%|βββββββββ | 220/250 [48:57<07:06, 14.23s/it]
|
| 333 |
|
| 334 |
+
|
| 335 |
88%|βββββββββ | 220/250 [48:57<07:06, 14.23s/it]
|
| 336 |
88%|βββββββββ | 221/250 [49:10<06:37, 13.71s/it]
|
| 337 |
89%|βββββββββ | 222/250 [49:29<07:08, 15.32s/it]
|
| 338 |
89%|βββββββββ | 223/250 [49:44<06:55, 15.38s/it]
|
| 339 |
90%|βββββββββ | 224/250 [49:55<06:03, 14.00s/it]
|
| 340 |
90%|βββββββββ | 225/250 [50:09<05:51, 14.04s/it]
|
| 341 |
90%|βββββββββ | 226/250 [50:26<05:56, 14.84s/it]
|
| 342 |
91%|βββββββββ | 227/250 [50:36<05:06, 13.34s/it]
|
| 343 |
91%|βββββββββ | 228/250 [50:52<05:14, 14.31s/it]
|
| 344 |
92%|ββββββββββ| 229/250 [51:04<04:44, 13.54s/it]
|
| 345 |
92%|ββββββββββ| 230/250 [51:17<04:24, 13.22s/it]
|
| 346 |
|
| 347 |
+
|
| 348 |
92%|ββββββββββ| 230/250 [51:17<04:24, 13.22s/it]
|
| 349 |
92%|ββββββββββ| 231/250 [51:29<04:08, 13.09s/it]
|
| 350 |
93%|ββββββββββ| 232/250 [51:41<03:48, 12.71s/it]
|
| 351 |
93%|ββββββββββ| 233/250 [51:55<03:39, 12.90s/it]
|
| 352 |
94%|ββββββββββ| 234/250 [52:06<03:21, 12.58s/it]
|
| 353 |
94%|ββββββββββ| 235/250 [52:22<03:23, 13.54s/it]
|
| 354 |
94%|ββββββββββ| 236/250 [52:35<03:06, 13.35s/it]
|
| 355 |
95%|ββββββββββ| 237/250 [52:49<02:56, 13.58s/it]
|
| 356 |
95%|ββββββββββ| 238/250 [53:03<02:44, 13.67s/it]
|
| 357 |
96%|ββββββββββ| 239/250 [53:16<02:26, 13.31s/it]
|
| 358 |
96%|ββββββββββ| 240/250 [53:29<02:12, 13.29s/it]
|
| 359 |
|
| 360 |
+
|
| 361 |
96%|ββββββββββ| 240/250 [53:29<02:12, 13.29s/it]
|
| 362 |
96%|ββββββββββ| 241/250 [53:40<01:54, 12.74s/it]
|
| 363 |
97%|ββββββββββ| 242/250 [53:54<01:44, 13.02s/it]
|
| 364 |
97%|ββββββββββ| 243/250 [54:09<01:34, 13.53s/it]
|
| 365 |
98%|ββββββββββ| 244/250 [54:22<01:21, 13.58s/it]
|
| 366 |
98%|ββββββββββ| 245/250 [54:34<01:04, 12.89s/it]
|
| 367 |
98%|ββββββββββ| 246/250 [54:46<00:51, 12.81s/it]
|
| 368 |
99%|ββββββββββ| 247/250 [54:59<00:38, 12.85s/it]
|
| 369 |
99%|ββββββββββ| 248/250 [55:15<00:27, 13.65s/it]
|
| 370 |
|
| 371 |
+
|
| 372 |
|
| 373 |
+
|
| 374 |
+
[rank0]:[W615 15:17:28.247429157 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
| 375 |
+
Dataset: /mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp/eval.jsonl
|
| 376 |
+
Total samples: 800
|
| 377 |
+
World size: 4
|
| 378 |
+
Per-device eval batch size: 4
|
| 379 |
+
Maximum global eval batch size: 16
|
| 380 |
+
Output dir: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/eval
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
[rank0]:[W615 15:26:42.394095285 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
|
| 387 |
+
[rank2]:[W615 15:26:46.615147024 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
|
| 388 |
+
[rank3]:[W615 15:27:02.342169152 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
|
| 389 |
+
[rank1]:[W615 15:27:07.925204166 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
|
| 390 |
+
{
|
| 391 |
+
"by_group": {
|
| 392 |
+
"anomaly_detection": {
|
| 393 |
+
"count": 200,
|
| 394 |
+
"accuracy": 0.82,
|
| 395 |
+
"correct": 164,
|
| 396 |
+
"parsed": 200
|
| 397 |
+
},
|
| 398 |
+
"classification": {
|
| 399 |
+
"count": 200,
|
| 400 |
+
"accuracy": 0.74,
|
| 401 |
+
"correct": 148,
|
| 402 |
+
"parsed": 200
|
| 403 |
+
},
|
| 404 |
+
"forecasting": {
|
| 405 |
+
"count": 200,
|
| 406 |
+
"valid_samples": 170,
|
| 407 |
+
"valid_points": 3164,
|
| 408 |
+
"mse": 825899.547396054,
|
| 409 |
+
"mae": 151.69107746555417
|
| 410 |
+
},
|
| 411 |
+
"open_ended": {
|
| 412 |
+
"count": 200,
|
| 413 |
+
"accuracy": 0.45,
|
| 414 |
+
"parsed_accuracy": 0.45685279187817257,
|
| 415 |
+
"parse_rate": 0.985,
|
| 416 |
+
"correct": 90,
|
| 417 |
+
"parsed": 197,
|
| 418 |
+
"unparsed": 3,
|
| 419 |
+
"by_format": {
|
| 420 |
+
"multiple_choice": {
|
| 421 |
+
"count": 67,
|
| 422 |
+
"accuracy": 0.2835820895522388,
|
| 423 |
+
"parsed_accuracy": 0.2878787878787879,
|
| 424 |
+
"correct": 19,
|
| 425 |
+
"parsed": 66,
|
| 426 |
+
"unparsed": 1
|
| 427 |
+
},
|
| 428 |
+
"open_ended_question": {
|
| 429 |
+
"count": 67,
|
| 430 |
+
"accuracy": 0.417910447761194,
|
| 431 |
+
"parsed_accuracy": 0.4307692307692308,
|
| 432 |
+
"correct": 28,
|
| 433 |
+
"parsed": 65,
|
| 434 |
+
"unparsed": 2
|
| 435 |
+
},
|
| 436 |
+
"true_false": {
|
| 437 |
+
"count": 66,
|
| 438 |
+
"accuracy": 0.6515151515151515,
|
| 439 |
+
"parsed_accuracy": 0.6515151515151515,
|
| 440 |
+
"correct": 43,
|
| 441 |
+
"parsed": 66,
|
| 442 |
+
"unparsed": 0
|
| 443 |
+
}
|
| 444 |
+
},
|
| 445 |
+
"by_method": {
|
| 446 |
+
"anomaly": {
|
| 447 |
+
"count": 1,
|
| 448 |
+
"accuracy": 1.0,
|
| 449 |
+
"correct": 1
|
| 450 |
+
},
|
| 451 |
+
"cyclical": {
|
| 452 |
+
"count": 4,
|
| 453 |
+
"accuracy": 0.75,
|
| 454 |
+
"correct": 3
|
| 455 |
+
},
|
| 456 |
+
"multiple_choice": {
|
| 457 |
+
"count": 64,
|
| 458 |
+
"accuracy": 0.296875,
|
| 459 |
+
"correct": 19
|
| 460 |
+
},
|
| 461 |
+
"numeric_scalar": {
|
| 462 |
+
"count": 31,
|
| 463 |
+
"accuracy": 0.25806451612903225,
|
| 464 |
+
"correct": 8
|
| 465 |
+
},
|
| 466 |
+
"numeric_sequence": {
|
| 467 |
+
"count": 1,
|
| 468 |
+
"accuracy": 0.0,
|
| 469 |
+
"correct": 0
|
| 470 |
+
},
|
| 471 |
+
"seasonality": {
|
| 472 |
+
"count": 3,
|
| 473 |
+
"accuracy": 1.0,
|
| 474 |
+
"correct": 3
|
| 475 |
+
},
|
| 476 |
+
"trend": {
|
| 477 |
+
"count": 20,
|
| 478 |
+
"accuracy": 0.45,
|
| 479 |
+
"correct": 9
|
| 480 |
+
},
|
| 481 |
+
"true_false": {
|
| 482 |
+
"count": 65,
|
| 483 |
+
"accuracy": 0.6461538461538462,
|
| 484 |
+
"correct": 42
|
| 485 |
+
},
|
| 486 |
+
"volatility": {
|
| 487 |
+
"count": 8,
|
| 488 |
+
"accuracy": 0.625,
|
| 489 |
+
"correct": 5
|
| 490 |
+
}
|
| 491 |
+
}
|
| 492 |
+
}
|
| 493 |
+
},
|
| 494 |
+
"text_overall": {
|
| 495 |
+
"count": 800,
|
| 496 |
+
"exact_match": 0.39,
|
| 497 |
+
"normalized_exact_match": 0.39,
|
| 498 |
+
"token_f1": 0.6366794082162608
|
| 499 |
+
},
|
| 500 |
+
"num_samples": 800,
|
| 501 |
+
"counts_by_group": {
|
| 502 |
+
"anomaly_detection": 200,
|
| 503 |
+
"classification": 200,
|
| 504 |
+
"forecasting": 200,
|
| 505 |
+
"open_ended": 200
|
| 506 |
+
}
|
| 507 |
+
}
|
| 508 |
+
Saved predictions: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/eval/predictions.jsonl
|
| 509 |
+
Saved metrics: /mnt/share01/sqk/ChatTime/tsqa_adapter/outputs/sft_20260615_141604/eval/metrics.json
|