Zaynes committed on Oct 25, 2025

Commit

156cd4d

verified ·

1 Parent(s): 4c02321

Upload folder using huggingface_hub

Browse files

Files changed (17) hide show

.gitattributes +8 -35
Modelfile +16 -0
added_tokens.json +24 -0
chat_template.jinja +54 -0
config.json +58 -0
generation_config.json +13 -0
merges.txt +0 -0
model.safetensors +3 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +208 -0
training_artifacts/README.md +16 -0
training_artifacts/hydra_config.yaml +197 -0
training_artifacts/logs/pipeline_cleaned.txt +532 -0
training_artifacts/merge_config.yaml +4 -0
training_artifacts/train_config.yaml +32 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,8 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+# Treat log-like files (*.log, *.txt, *.out, *.err) and everything under training_artifacts/logs/ as text so Git does not handle them as binary
+*.log text
+*.txt text
+*.out text
+*.err text
+training_artifacts/logs/* text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

Modelfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# ollama modelfile auto-generated by llamafactory
+FROM .
+TEMPLATE """{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ range .Messages }}{{ if eq .Role "user" }}<|im_start|>user
+{{ .Content }}<|im_end|>
+<|im_start|>assistant
+{{ else if eq .Role "assistant" }}{{ .Content }}<|im_end|>
+{{ end }}{{ end }}"""
+SYSTEM """You are Qwen, created by Alibaba Cloud. You are a helpful assistant."""
+PARAMETER stop "<|im_end|>"
+PARAMETER num_ctx 4096

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "float16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.57.1"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae93c3f1fdb1fde9b64a6952d89905d3254cec875368c652d418df991d3b4e1a
+size 3087466808

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

training_artifacts/README.md ADDED Viewed

	@@ -0,0 +1,16 @@

+# Training Artifacts
+This directory contains the training configuration and logs for this model.
+## Contents
+- **hydra_config.yaml**: Complete Hydra configuration used for training
+- **train_config.yaml**: LlamaFactory training configuration
+- **merge_config.yaml**: LlamaFactory merge/export configuration
+- **logs/**: Training logs from the job (cleaned for text format)
+## Job Information
+- Job Name: testing__pvv2_resume
+- Timestamp: 2025-10-25 03:02:47 UTC
+- Execution Mode: Local

training_artifacts/hydra_config.yaml ADDED Viewed

	@@ -0,0 +1,197 @@

+? ''
+: ? ''
+  : ? ''
+    : hydra:
+        run:
+          dir: .
+        output_subdir: null
+        job:
+          chdir: false
+      _target_: null
+      job:
+        name: ???
+        mode: slurm
+        dry_run: false
+      slurm:
+        time_limit: ???
+        constraint:
+        - h200
+        memory: 200
+        cpus_per_task: 16
+        partition: null
+        mail_user: user@example.com
+      execution:
+        nodes: null
+        gpus_per_node: null
+        num_gpus: null
+        hostfile: null
+        secrets_file: null
+      model:
+        name_or_path: ???
+        finetuning_type: lora
+      dataset:
+        name: null
+        dir: null
+        info_json: null
+        template: default
+        cutoff_len: 1024
+        val_size: 0.1
+        tokenized_path: null
+        hf_hub_url: null
+        formatting: alpaca
+        ranking: false
+        subset: null
+        split: train
+        folder: null
+        num_samples: null
+        columns:
+          prompt: null
+          query: null
+          response: null
+          history: null
+          messages: null
+          system: null
+          tools: null
+          images: null
+          videos: null
+          audios: null
+          chosen: null
+          rejected: null
+          kto_tag: null
+        tags:
+          role: null
+          content: null
+          user: null
+          assistant: null
+          observation: null
+          function: null
+          system: null
+      output:
+        experiment_dir: ./experiments
+      wandb:
+        project: null
+        run_name: null
+        entity: null
+      hf:
+        repo_id: null
+        private: false
+        upload_artifacts: true
+      cleanup:
+        checkpoints: false
+        merged: false
+      finetuning:
+        training:
+          stage: sft
+          do_train: true
+          finetuning_type: lora
+          lora_rank: 8
+          lora_alpha: 16
+          lora_dropout: 0.05
+          lora_target: all
+          overwrite_cache: true
+          preprocessing_num_workers: 16
+          dataloader_num_workers: 4
+          logging_steps: 10
+          save_steps: 500
+          plot_loss: true
+          overwrite_output_dir: true
+          save_only_model: false
+          report_to: none
+          per_device_train_batch_size: 1
+          gradient_accumulation_steps: 8
+          learning_rate: 0.0001
+          num_train_epochs: 3.0
+          lr_scheduler_type: cosine
+          warmup_ratio: 0.1
+          bf16: true
+          ddp_timeout: 180000000
+          resume_from_checkpoint: null
+          val_size: 0.1
+          per_device_eval_batch_size: 1
+          eval_strategy: steps
+          eval_steps: 500
+          do_eval: true
+        merge:
+          export_dir: null
+          export_size: 5
+          export_device: cpu
+          export_legacy_format: false
+job:
+  name: testing__pvv2_resume
+  mode: local
+  work_dir: null
+  dry_run: false
+slurm:
+  time_limit: null
+  constraint: null
+  memory: null
+  partition: null
+  mail_user: null
+execution:
+  nodes: 1
+  gpus_per_node: 2
+  num_gpus: null
+  hostfile: null
+  secrets_file: ./secrets.env
+model:
+  name_or_path: Qwen/Qwen2.5-1.5B-Instruct
+  finetuning_type: full
+dataset:
+  name: TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
+  dir: null
+  info_json: null
+  template: qwen
+  cutoff_len: 16192
+  val_size: 0.0
+  hf_hub_url: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
+  formatting: sharegpt
+  ranking: false
+  subset: null
+  split: train
+  folder: null
+  num_samples: null
+  columns:
+    messages: conversations
+  tags:
+    role: role
+    content: content
+    user: user
+    assistant: assistant
+  tokenized_path: /scratch/zrs2020/.cache/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
+output:
+  experiment_dir: ./experiments
+wandb:
+  project: null
+  run_name: testing__pvv2_resume
+  entity: null
+hf:
+  repo_id: TAUR-dev/testing__lf_pvv2_resume
+  private: false
+cleanup:
+  checkpoints: false
+  merged: false
+training:
+  stage: sft
+  do_train: true
+  max_samples: 100000
+  do_eval: false
+  save_strategy: steps
+  save_steps: 5
+  logging_steps: 10
+  fp16: false
+  bf16: true
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  overwrite_output_dir: true
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 1
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  num_train_epochs: 2
+  warmup_ratio: 0.05
+  weight_decay: 0.0001
+  template: qwen
+  max_steps: 10
+  preprocessing_num_workers: 16
+  overwrite_cache: true

training_artifacts/logs/pipeline_cleaned.txt ADDED Viewed

	@@ -0,0 +1,532 @@

+[2025-10-24 23:00:49] ========================================
+[2025-10-24 23:00:49] Job Name: testing__pvv2_resume
+[2025-10-24 23:00:49] Hostname: gl001.hpc.nyu.edu
+[2025-10-24 23:00:49] Number of nodes: 1
+[2025-10-24 23:00:49] GPUs per node: 2
+[2025-10-24 23:00:49] Start Time: Fri Oct 24 11:00:49 PM EDT 2025
+[2025-10-24 23:00:49] Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/logs/pipeline.log
+[2025-10-24 23:00:49] ========================================
+[2025-10-24 23:00:49] Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
+[2025-10-24 23:00:52]
+[2025-10-24 23:00:52] ========================================
+[2025-10-24 23:00:52] Configuration Paths
+[2025-10-24 23:00:52] ========================================
+[2025-10-24 23:00:52] Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/configs/train_config.yaml
+[2025-10-24 23:00:52] Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/configs/merge_config.yaml
+[2025-10-24 23:00:52] Dataset Info: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/data/dataset_info.json
+[2025-10-24 23:00:52] Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
+[2025-10-24 23:00:52] Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged
+[2025-10-24 23:00:52] HF Repo ID: TAUR-dev/testing__lf_pvv2_resume
+[2025-10-24 23:00:52]
+[make-effective-cfg] tokenized_path: /scratch/zrs2020/.cache/hf_cache/home/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full_fb94f2a3
+[make-effective-cfg] wrote: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/logs/train_config.effective.yaml
+[2025-10-24 23:00:52]
+[2025-10-24 23:00:52] ========================================
+[2025-10-24 23:00:52] STAGE 0: Downloading Dataset
+[2025-10-24 23:00:52] Dataset: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
+[2025-10-24 23:00:52] Start Time: Fri Oct 24 11:00:52 PM EDT 2025
+[2025-10-24 23:00:52] ========================================
+[dataset-download] Loading dataset from: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
+[dataset-download] Dataset loaded successfully
+[dataset-download] Dataset info: DatasetDict({
+    train: Dataset({
+        features: ['conversations', 'sft_template_type_idx'],
+        num_rows: 29130
+    })
+})
+[2025-10-24 23:00:54]
+[2025-10-24 23:00:54] ========================================
+[2025-10-24 23:00:54] Dataset download completed
+[2025-10-24 23:00:54] End Time: Fri Oct 24 11:00:54 PM EDT 2025
+[2025-10-24 23:00:54] ========================================
+[2025-10-24 23:00:54]
+[2025-10-24 23:00:54] ========================================
+[2025-10-24 23:00:54] STAGE 1: Training Model
+[2025-10-24 23:00:54] Start Time: Fri Oct 24 11:00:54 PM EDT 2025
+[2025-10-24 23:00:54] ========================================
+[2025-10-24 23:00:54] Job: testing__pvv2_resume
+[2025-10-24 23:00:54] Nodes: 1  |  GPUs/node: 2
+[2025-10-24 23:00:54] Master: 127.0.0.1:29500
+[2025-10-24 23:00:54] LLaMA-Factory: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
+[2025-10-24 23:00:54] Train cfg (effective): /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/logs/train_config.effective.yaml
+[2025-10-24 23:00:54] HF cache: /scratch/zrs2020/.cache/hf_cache/home/datasets
+[2025-10-24 23:00:54] Launcher: torchrun
+[2025-10-24 23:00:54]
+[2025-10-24 23:00:54] Single-node training (2 GPU(s))
+[2025-10-24 23:00:54] Executing command: llamafactory-cli train /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/logs/train_config.effective.yaml
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+[INFO|2025-10-24 23:01:02] llamafactory.launcher:143 >> Initializing 2 distributed tasks at: 127.0.0.1:29500
+W1024 23:01:03.865000 658301 site-packages/torch/distributed/run.py:803]
+W1024 23:01:03.865000 658301 site-packages/torch/distributed/run.py:803] *****************************************
+W1024 23:01:03.865000 658301 site-packages/torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+W1024 23:01:03.865000 658301 site-packages/torch/distributed/run.py:803] *****************************************
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+  import pkg_resources
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+  import pkg_resources
+[W1024 23:01:11.583869947 ProcessGroupNCCL.cpp:924] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+[W1024 23:01:11.583875563 ProcessGroupNCCL.cpp:924] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+[INFO|2025-10-24 23:01:12] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 2, device: cuda:1, distributed training: True, compute dtype: torch.bfloat16
+[INFO|2025-10-24 23:01:12] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 2, device: cuda:0, distributed training: True, compute dtype: torch.bfloat16
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-24 23:01:12,402 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:765] 2025-10-24 23:01:12,622 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
+[INFO|configuration_utils.py:839] 2025-10-24 23:01:12,624 >> Model config Qwen2Config {
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-24 23:01:12,895 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[WARNING|2025-10-24 23:01:12] llamafactory.data.loader:148 >> Loading dataset from disk will ignore other data arguments.
+[INFO|2025-10-24 23:01:12] llamafactory.data.loader:143 >> Loaded tokenized dataset from /scratch/zrs2020/.cache/hf_cache/home/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full_fb94f2a3.
+[INFO|configuration_utils.py:765] 2025-10-24 23:01:12,971 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
+[INFO|configuration_utils.py:839] 2025-10-24 23:01:12,972 >> Model config Qwen2Config {
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+[INFO|2025-10-24 23:01:12] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
+`torch_dtype` is deprecated! Use `dtype` instead!
+[WARNING|logging.py:328] 2025-10-24 23:01:13,309 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|modeling_utils.py:1172] 2025-10-24 23:01:13,309 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/model.safetensors
+[INFO|modeling_utils.py:2341] 2025-10-24 23:01:13,310 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
+[INFO|configuration_utils.py:986] 2025-10-24 23:01:13,311 >> Generate config GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "use_cache": false
+}
+[INFO|configuration_utils.py:941] 2025-10-24 23:01:13,896 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/generation_config.json
+[INFO|configuration_utils.py:986] 2025-10-24 23:01:13,896 >> Generate config GenerationConfig {
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8
+}
+[INFO|dynamic_module_utils.py:423] 2025-10-24 23:01:13,938 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-1.5B-Instruct.
+[INFO|2025-10-24 23:01:13] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
+[INFO|2025-10-24 23:01:13] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
+[INFO|2025-10-24 23:01:13] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
+[INFO|2025-10-24 23:01:13] llamafactory.model.adapter:143 >> Fine-tuning method: Full
+[INFO|2025-10-24 23:01:13] llamafactory.model.loader:143 >> trainable params: 1,543,714,304 || all params: 1,543,714,304 || trainable%: 100.0000
+[WARNING|trainer.py:906] 2025-10-24 23:01:13,975 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
+The model is already on multiple devices. Skipping the move to device specified in `args`.
+[INFO|trainer.py:699] 2025-10-24 23:01:13,977 >> max_steps is given, it will override any value given in num_train_epochs
+[INFO|trainer.py:749] 2025-10-24 23:01:13,977 >> Using auto half precision backend
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
+[WARNING|trainer.py:982] 2025-10-24 23:01:13,979 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
+NCCL version 2.27.5+cuda12.9
+[INFO|trainer.py:2519] 2025-10-24 23:01:14,677 >> ***** Running training *****
+[INFO|trainer.py:2520] 2025-10-24 23:01:14,677 >>   Num examples = 29,130
+[INFO|trainer.py:2521] 2025-10-24 23:01:14,677 >>   Num Epochs = 1
+[INFO|trainer.py:2522] 2025-10-24 23:01:14,677 >>   Instantaneous batch size per device = 1
+[INFO|trainer.py:2525] 2025-10-24 23:01:14,677 >>   Total train batch size (w. parallel, distributed & accumulation) = 2
+[INFO|trainer.py:2526] 2025-10-24 23:01:14,677 >>   Gradient Accumulation steps = 1
+[INFO|trainer.py:2527] 2025-10-24 23:01:14,677 >>   Total optimization steps = 10
+[INFO|trainer.py:2528] 2025-10-24 23:01:14,678 >>   Number of trainable parameters = 1,543,714,304
+[INFO|integration_utils.py:867] 2025-10-24 23:01:14,871 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
+wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Tracking run with wandb version 0.22.2
+wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251024_230115-mlpoab58
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run testing__pvv2_resume
+wandb:  View project at https://wandb.ai/ut_nlp_deduce/llamafactory
+wandb:  View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/mlpoab58
+  0%|          | 0/10 [00:00<?, ?it/s] 10%|         | 1/10 [00:01<00:13,  1.55s/it] 20%|        | 2/10 [00:02<00:08,  1.09s/it] 30%|       | 3/10 [00:03<00:06,  1.09it/s] 40%|      | 4/10 [00:04<00:05,  1.03it/s] 50%|     | 5/10 [00:04<00:04,  1.10it/s][INFO|trainer.py:4309] 2025-10-24 23:01:20,972 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5
+[INFO|configuration_utils.py:491] 2025-10-24 23:01:20,978 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/config.json
+[INFO|configuration_utils.py:757] 2025-10-24 23:01:20,983 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/generation_config.json
+[INFO|modeling_utils.py:4189] 2025-10-24 23:01:29,871 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/model.safetensors.index.json.
+[INFO|tokenization_utils_base.py:2421] 2025-10-24 23:01:29,892 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-24 23:01:29,897 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-24 23:01:29,916 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/special_tokens_map.json
+ 60%|    | 6/10 [00:30<00:36,  9.15s/it] 70%|   | 7/10 [00:30<00:19,  6.39s/it] 80%|  | 8/10 [00:31<00:09,  4.62s/it] 90%| | 9/10 [00:32<00:03,  3.41s/it]100%|| 10/10 [00:33<00:00,  2.75s/it]                                               {'loss': 0.7146, 'grad_norm': 3.4265639781951904, 'learning_rate': 3.015368960704584e-08, 'epoch': 0.0}
+100%|| 10/10 [00:33<00:00,  2.75s/it][INFO|trainer.py:4309] 2025-10-24 23:01:49,645 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10
+[INFO|configuration_utils.py:491] 2025-10-24 23:01:49,698 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/config.json
+[INFO|configuration_utils.py:757] 2025-10-24 23:01:49,736 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/generation_config.json
+[INFO|modeling_utils.py:4189] 2025-10-24 23:01:58,152 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/model.safetensors.index.json.
+[INFO|tokenization_utils_base.py:2421] 2025-10-24 23:01:58,172 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-24 23:01:58,177 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-24 23:01:58,181 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/special_tokens_map.json
+[INFO|trainer.py:2810] 2025-10-24 23:02:13,817 >>
+Training completed. Do not forget to share your model on huggingface.co/models =)
+                                               {'train_runtime': 59.1398, 'train_samples_per_second': 0.338, 'train_steps_per_second': 0.169, 'train_loss': 0.7145515441894531, 'epoch': 0.0}
+100%|| 10/10 [00:57<00:00,  2.75s/it]100%|| 10/10 [00:57<00:00,  5.79s/it]
+[INFO|trainer.py:4309] 2025-10-24 23:02:13,845 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
+[INFO|configuration_utils.py:491] 2025-10-24 23:02:13,901 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/config.json
+[INFO|configuration_utils.py:757] 2025-10-24 23:02:13,906 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/generation_config.json
+[INFO|modeling_utils.py:4189] 2025-10-24 23:02:25,043 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/model.safetensors.index.json.
+[INFO|tokenization_utils_base.py:2421] 2025-10-24 23:02:25,076 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-24 23:02:25,081 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-24 23:02:25,102 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/special_tokens_map.json
+***** train metrics *****
+  epoch                    =     0.0007
+  total_flos               =   411619GF
+  train_loss               =     0.7146
+  train_runtime            = 0:00:59.13
+  train_samples_per_second =      0.338
+  train_steps_per_second   =      0.169
+[INFO|modelcard.py:456] 2025-10-24 23:02:25,366 >> Dropping the following result as it does not have all the necessary fields:
+{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
+[W1024 23:02:25.664082565 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
+[1;34mwandb[0m:
+[1;34mwandb[0m:  View run [33mtesting__pvv2_resume[0m at: [34m[0m
+[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20251024_230115-mlpoab58/logs[0m
+[W1024 23:02:26.279498684 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
+[W1024 23:02:27.811812041 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
+[W1024 23:02:27.258510656 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
+[2025-10-24 23:02:27]
+[2025-10-24 23:02:27] ========================================
+[2025-10-24 23:02:27] Training completed successfully
+[2025-10-24 23:02:27] End Time: Fri Oct 24 11:02:27 PM EDT 2025
+[2025-10-24 23:02:27] ========================================
+[2025-10-24 23:02:27]
+[2025-10-24 23:02:27] ========================================
+[2025-10-24 23:02:27] STAGE 2: Merging/Exporting Model
+[2025-10-24 23:02:27] Start Time: Fri Oct 24 11:02:27 PM EDT 2025
+[2025-10-24 23:02:27] ========================================
+[2025-10-24 23:02:27] Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
+[2025-10-24 23:02:27] Analyzing checkpoints to find the one from current training run...
+[2025-10-24 23:02:27]   - checkpoint-10: trainer_state.json modified at Fri Oct 24 11:02:13 PM EDT 2025
+[2025-10-24 23:02:27]   - checkpoint-5: trainer_state.json modified at Fri Oct 24 11:01:44 PM EDT 2025
+[2025-10-24 23:02:27]
+[2025-10-24 23:02:27] Selected checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10
+[2025-10-24 23:02:27] This checkpoint has the most recently updated trainer_state.json
+[2025-10-24 23:02:27] Checkpoint details:
+[2025-10-24 23:02:27]   Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10
+[2025-10-24 23:02:27]   Last modified: 2025-10-24 23:02:13.814457753 -0400
+[2025-10-24 23:02:27]   Training step: 10
+[2025-10-24 23:02:27] Updating merge config to point to checkpoint...
+Successfully updated merge config
+[2025-10-24 23:02:28] Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10
+[2025-10-24 23:02:28]
+[2025-10-24 23:02:28] Merge config contents:
+[2025-10-24 23:02:28]   template: qwen
+[2025-10-24 23:02:28]   trust_remote_code: true
+[2025-10-24 23:02:28]   export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged
+[2025-10-24 23:02:28]   model_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
+[2025-10-24 23:02:28]
+[2025-10-24 23:02:28] Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/configs/merge_config.yaml
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+  import pkg_resources
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file vocab.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file merges.txt
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file tokenizer.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file added_tokens.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file special_tokens_map.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file tokenizer_config.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file chat_template.jinja
+[INFO|tokenization_utils_base.py:2364] 2025-10-24 23:02:38,733 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:763] 2025-10-24 23:02:38,735 >> loading configuration file /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/config.json
+[INFO|configuration_utils.py:839] 2025-10-24 23:02:38,737 >> Model config Qwen2Config {
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file vocab.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file merges.txt
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file tokenizer.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file added_tokens.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file special_tokens_map.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file tokenizer_config.json
+[INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file chat_template.jinja
+[INFO|tokenization_utils_base.py:2364] 2025-10-24 23:02:38,961 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:763] 2025-10-24 23:02:38,979 >> loading configuration file /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/config.json
+[INFO|configuration_utils.py:839] 2025-10-24 23:02:38,979 >> Model config Qwen2Config {
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+[WARNING|logging.py:328] 2025-10-24 23:02:38,979 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|2025-10-24 23:02:38] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
+[WARNING|logging.py:328] 2025-10-24 23:02:39,312 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|modeling_utils.py:1169] 2025-10-24 23:02:39,313 >> loading weights file /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/model.safetensors.index.json
+[INFO|modeling_utils.py:2341] 2025-10-24 23:02:39,314 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
+[INFO|configuration_utils.py:986] 2025-10-24 23:02:39,314 >> Generate config GenerationConfig {
+  "eos_token_id": 151645,
+  "pad_token_id": 151643
+}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|     | 1/2 [00:02<00:02,  2.68s/it]Loading checkpoint shards: 100%|| 2/2 [00:03<00:00,  1.44s/it]Loading checkpoint shards: 100%|| 2/2 [00:03<00:00,  1.62s/it]
+[INFO|configuration_utils.py:939] 2025-10-24 23:02:42,581 >> loading configuration file /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/generation_config.json
+[INFO|configuration_utils.py:986] 2025-10-24 23:02:42,581 >> Generate config GenerationConfig {
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8
+}
+[INFO|dynamic_module_utils.py:423] 2025-10-24 23:02:42,582 >> Could not locate the custom_generate/generate.py inside /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints.
+[INFO|2025-10-24 23:02:42] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
+[INFO|2025-10-24 23:02:42] llamafactory.model.loader:143 >> all params: 1,543,714,304
+[INFO|2025-10-24 23:02:42] llamafactory.train.tuner:143 >> Convert model dtype to: torch.float16.
+[INFO|configuration_utils.py:491] 2025-10-24 23:02:42,596 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/config.json
+[INFO|configuration_utils.py:757] 2025-10-24 23:02:42,601 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/generation_config.json
+[INFO|modeling_utils.py:4181] 2025-10-24 23:02:46,185 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/model.safetensors
+[INFO|tokenization_utils_base.py:2421] 2025-10-24 23:02:46,205 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-24 23:02:46,224 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-24 23:02:46,243 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/special_tokens_map.json
+[INFO|2025-10-24 23:02:46] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/Modelfile
+[2025-10-24 23:02:47]
+[2025-10-24 23:02:47] ========================================
+[2025-10-24 23:02:47] Merge/Export completed successfully
+[2025-10-24 23:02:47] End Time: Fri Oct 24 11:02:47 PM EDT 2025
+[2025-10-24 23:02:47] ========================================
+[2025-10-24 23:02:47]
+[2025-10-24 23:02:47] ========================================
+[2025-10-24 23:02:47] Preparing Training Artifacts
+[2025-10-24 23:02:47] ========================================
+[2025-10-24 23:02:47] Copying configuration files...
+[2025-10-24 23:02:47] Copying and cleaning training logs...

training_artifacts/merge_config.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+template: qwen
+trust_remote_code: true
+export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged
+model_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints

training_artifacts/train_config.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+stage: sft
+do_train: true
+max_samples: 100000
+do_eval: false
+save_strategy: steps
+save_steps: 5
+logging_steps: 10
+fp16: false
+bf16: true
+adam_beta1: 0.9
+adam_beta2: 0.95
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+learning_rate: 1.0e-06
+lr_scheduler_type: cosine
+num_train_epochs: 2
+warmup_ratio: 0.05
+weight_decay: 0.0001
+template: qwen
+max_steps: 10
+preprocessing_num_workers: 16
+overwrite_cache: true
+model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
+finetuning_type: full
+trust_remote_code: true
+dataset: TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
+dataset_dir: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/data
+cutoff_len: 16192
+tokenized_path: /scratch/zrs2020/.cache/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
+output_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff