GLM-4.5-Air-HS/checkpoints/logs/error_log.jsonl
{"timestamp": 1762329556.5648425, "datetime": "2025-11-05T07:59:16.564844", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 851, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 515, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762329557.7520196, "datetime": "2025-11-05T07:59:17.752022", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 851, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 515, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762329558.7568176, "datetime": "2025-11-05T07:59:18.756820", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 851, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 515, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762329558.8107224, "datetime": "2025-11-05T07:59:18.810724", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 851, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 515, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762329750.7416234, "datetime": "2025-11-05T08:02:30.741625", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 851, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 515, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762329753.7109349, "datetime": "2025-11-05T08:02:33.710937", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 851, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 515, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762329753.7214928, "datetime": "2025-11-05T08:02:33.721494", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 851, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 515, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762330338.3560638, "datetime": "2025-11-05T08:12:18.356068", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 854, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 561, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762330339.5585098, "datetime": "2025-11-05T08:12:19.558514", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 854, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 561, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762330339.6687417, "datetime": "2025-11-05T08:12:19.668748", "error_type": "ValueError", "error_message": "optimizer got an empty parameter list", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 854, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 561, in main\n optimizer = torch.optim.AdamW(\n ^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adamw.py\", line 37, in __init__\n super().__init__(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/adam.py\", line 101, in __init__\n super().__init__(params, defaults)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/optim/optimizer.py\", line 396, in __init__\n raise ValueError(\"optimizer got an empty parameter list\")\nValueError: optimizer got an empty parameter list\n"}
{"timestamp": 1762331402.635015, "datetime": "2025-11-05T08:30:02.635020", "error_type": "UnboundLocalError", "error_message": "cannot access local variable 'eval_metrics' where it is not associated with a value", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 862, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 810, in main\n eval_loss = eval_metrics[\"eval_loss\"]\n ^^^^^^^^^^^^\nUnboundLocalError: cannot access local variable 'eval_metrics' where it is not associated with a value\n"}
{"timestamp": 1762415055.2916036, "datetime": "2025-11-06T07:44:15.291605", "error_type": "OutOfMemoryError", "error_message": "CUDA out of memory. Tried to allocate 1.16 GiB. GPU 2 has a total capacity of 139.81 GiB of which 412.12 MiB is free. Process 911021 has 137.82 GiB memory in use. Including non-PyTorch memory, this process has 1.47 GiB memory in use. Of the allocated memory 512 bytes is allocated by PyTorch, and 2.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 883, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 464, in main\n model = AutoModelForCausalLM.from_pretrained(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py\", line 604, in from_pretrained\n return model_class.from_pretrained(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 277, in _wrapper\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 5048, in from_pretrained\n ) = cls._load_pretrained_model(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 5468, in _load_pretrained_model\n _error_msgs, disk_offload_index = load_shard_file(args)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 843, in load_shard_file\n disk_offload_index = _load_state_dict_into_meta_model(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/utils/_contextlib.py\", line 120, in decorate_context\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 748, in _load_state_dict_into_meta_model\n param = param[...]\n ~~~~~^^^^^\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.16 GiB. GPU 2 has a total capacity of 139.81 GiB of which 412.12 MiB is free. Process 911021 has 137.82 GiB memory in use. Including non-PyTorch memory, this process has 1.47 GiB memory in use. Of the allocated memory 512 bytes is allocated by PyTorch, and 2.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n"}
{"timestamp": 1762415055.359307, "datetime": "2025-11-06T07:44:15.359308", "error_type": "OutOfMemoryError", "error_message": "CUDA out of memory. Tried to allocate 1.16 GiB. GPU 1 has a total capacity of 139.81 GiB of which 496.12 MiB is free. Process 911020 has 137.85 GiB memory in use. Including non-PyTorch memory, this process has 1.47 GiB memory in use. Of the allocated memory 512 bytes is allocated by PyTorch, and 2.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 883, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 464, in main\n model = AutoModelForCausalLM.from_pretrained(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py\", line 604, in from_pretrained\n return model_class.from_pretrained(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 277, in _wrapper\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 5048, in from_pretrained\n ) = cls._load_pretrained_model(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 5468, in _load_pretrained_model\n _error_msgs, disk_offload_index = load_shard_file(args)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 843, in load_shard_file\n disk_offload_index = _load_state_dict_into_meta_model(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/utils/_contextlib.py\", line 120, in decorate_context\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 748, in _load_state_dict_into_meta_model\n param = param[...]\n ~~~~~^^^^^\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.16 GiB. GPU 1 has a total capacity of 139.81 GiB of which 496.12 MiB is free. Process 911020 has 137.85 GiB memory in use. Including non-PyTorch memory, this process has 1.47 GiB memory in use. Of the allocated memory 512 bytes is allocated by PyTorch, and 2.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n"}
{"timestamp": 1762415055.6543758, "datetime": "2025-11-06T07:44:15.654377", "error_type": "OutOfMemoryError", "error_message": "CUDA out of memory. Tried to allocate 1.16 GiB. GPU 3 has a total capacity of 139.81 GiB of which 20.12 MiB is free. Process 911022 has 138.21 GiB memory in use. Including non-PyTorch memory, this process has 1.47 GiB memory in use. Of the allocated memory 512 bytes is allocated by PyTorch, and 2.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 883, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 464, in main\n model = AutoModelForCausalLM.from_pretrained(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py\", line 604, in from_pretrained\n return model_class.from_pretrained(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 277, in _wrapper\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 5048, in from_pretrained\n ) = cls._load_pretrained_model(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 5468, in _load_pretrained_model\n _error_msgs, disk_offload_index = load_shard_file(args)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 843, in load_shard_file\n disk_offload_index = _load_state_dict_into_meta_model(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/utils/_contextlib.py\", line 120, in decorate_context\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 748, in _load_state_dict_into_meta_model\n param = param[...]\n ~~~~~^^^^^\ntorch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.16 GiB. GPU 3 has a total capacity of 139.81 GiB of which 20.12 MiB is free. Process 911022 has 138.21 GiB memory in use. Including non-PyTorch memory, this process has 1.47 GiB memory in use. Of the allocated memory 512 bytes is allocated by PyTorch, and 2.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n"}
{"timestamp": 1762415415.5231204, "datetime": "2025-11-06T07:50:15.523123", "error_type": "ValueError", "error_message": "Target module Glm4MoeTopkRouter() is not supported. Currently, only the following modules are supported: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv1d`, `torch.nn.Conv2d`, `torch.nn.Conv3d`, `transformers.pytorch_utils.Conv1D`, `torch.nn.MultiheadAttention.`.", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 883, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 547, in main\n model = get_peft_model(model, lora_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/peft/mapping_func.py\", line 125, in get_peft_model\n return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/peft/peft_model.py\", line 1815, in __init__\n super().__init__(model, peft_config, adapter_name, **kwargs)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/peft/peft_model.py\", line 130, in __init__\n self.base_model = cls(model, {adapter_name: peft_config}, adapter_name)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/peft/tuners/tuners_utils.py\", line 209, in __init__\n self.inject_adapter(self.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage, state_dict=state_dict)\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/peft/tuners/tuners_utils.py\", line 578, in inject_adapter\n self._create_and_replace(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/peft/tuners/lora/model.py\", line 259, in _create_and_replace\n new_module = self._create_new_module(lora_config, adapter_name, target, device_map=device_map, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/peft/tuners/lora/model.py\", line 371, in _create_new_module\n raise ValueError(\nValueError: Target module Glm4MoeTopkRouter() is not supported. Currently, only the following modules are supported: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv1d`, `torch.nn.Conv2d`, `torch.nn.Conv3d`, `transformers.pytorch_utils.Conv1D`, `torch.nn.MultiheadAttention.`.\n"}
{"timestamp": 1762415794.2165732, "datetime": "2025-11-06T07:56:34.216576", "error_type": "ValueError", "error_message": "Inconsistent compute device and `device_id` on rank 2: cuda:0 vs cuda:2", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 883, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 563, in main\n model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/accelerate/accelerator.py\", line 1559, in prepare\n result = tuple(\n ^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/accelerate/accelerator.py\", line 1560, in <genexpr>\n self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/accelerate/accelerator.py\", line 1402, in _prepare_one\n return self.prepare_model(obj, device_placement=device_placement)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/accelerate/accelerator.py\", line 1895, in prepare_model\n model = FSDP(model, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py\", line 468, in __init__\n _auto_wrap(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/_wrap_utils.py\", line 101, in _auto_wrap\n _recursive_wrap(**recursive_wrap_kwargs, **root_kwargs) # type: ignore[arg-type]\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 533, in _recursive_wrap\n wrapped_child, num_wrapped_params = _recursive_wrap(\n ^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 533, in _recursive_wrap\n wrapped_child, num_wrapped_params = _recursive_wrap(\n ^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 533, in _recursive_wrap\n wrapped_child, num_wrapped_params = _recursive_wrap(\n ^^^^^^^^^^^^^^^^\n [Previous line repeated 2 more times]\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 551, in _recursive_wrap\n return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 480, in _wrap\n return wrapper_cls(module, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py\", line 494, in __init__\n _init_param_handle_from_module(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/_init_utils.py\", line 604, in _init_param_handle_from_module\n state.compute_device = _get_compute_device(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/_init_utils.py\", line 1068, in _get_compute_device\n raise ValueError(\nValueError: Inconsistent compute device and `device_id` on rank 2: cuda:0 vs cuda:2\n"}
{"timestamp": 1762415797.4179983, "datetime": "2025-11-06T07:56:37.418001", "error_type": "ValueError", "error_message": "Inconsistent compute device and `device_id` on rank 3: cuda:0 vs cuda:3", "step": null, "epoch": null, "traceback": "Traceback (most recent call last):\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 883, in <module>\n main()\n File \"/workspace/Avinash/CPT_GLM/train_qlora_improved.py\", line 563, in main\n model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/accelerate/accelerator.py\", line 1559, in prepare\n result = tuple(\n ^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/accelerate/accelerator.py\", line 1560, in <genexpr>\n self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/accelerate/accelerator.py\", line 1402, in _prepare_one\n return self.prepare_model(obj, device_placement=device_placement)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/accelerate/accelerator.py\", line 1895, in prepare_model\n model = FSDP(model, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py\", line 468, in __init__\n _auto_wrap(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/_wrap_utils.py\", line 101, in _auto_wrap\n _recursive_wrap(**recursive_wrap_kwargs, **root_kwargs) # type: ignore[arg-type]\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 533, in _recursive_wrap\n wrapped_child, num_wrapped_params = _recursive_wrap(\n ^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 533, in _recursive_wrap\n wrapped_child, num_wrapped_params = _recursive_wrap(\n ^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 533, in _recursive_wrap\n wrapped_child, num_wrapped_params = _recursive_wrap(\n ^^^^^^^^^^^^^^^^\n [Previous line repeated 2 more times]\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 551, in _recursive_wrap\n return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/wrap.py\", line 480, in _wrap\n return wrapper_cls(module, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py\", line 494, in __init__\n _init_param_handle_from_module(\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/_init_utils.py\", line 604, in _init_param_handle_from_module\n state.compute_device = _get_compute_device(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/workspace/Avinash/ash_env/lib/python3.12/site-packages/torch/distributed/fsdp/_init_utils.py\", line 1068, in _get_compute_device\n raise ValueError(\nValueError: Inconsistent compute device and `device_id` on rank 3: cuda:0 vs cuda:3\n"}