cornuHGF committed on
Commit
f86062e
·
verified ·
1 Parent(s): b8946c7

Upload folder using huggingface_hub

Browse files
LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-310.pyc CHANGED
Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-310.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-310.pyc differ
 
LLaMA-Factory/src/llamafactory/data/converter.py CHANGED
@@ -50,7 +50,6 @@ class DatasetConverter:
50
  return None
51
  else:
52
  medias = medias[:]
53
- logger.warning_rank0_once(f"dataset_attr.load_from: {self.dataset_attr.load_from}")
54
  if self.dataset_attr.load_from in ["script", "file"]:
55
  if isinstance(medias[0], str):
56
  for i in range(len(medias)):
 
50
  return None
51
  else:
52
  medias = medias[:]
 
53
  if self.dataset_attr.load_from in ["script", "file"]:
54
  if isinstance(medias[0], str):
55
  for i in range(len(medias)):
checkpoints/cold_start/qwen2_5_vl-7b_vilasr_cold_start_8gpu_lora/runs/Oct21_17-55-59_c31/events.out.tfevents.1761084094.c31.1268522.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:885f043ce17c41b93f61f3c56e3b425910f35d8b5f10c50dd790e5ef2591bc16
3
- size 6796
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcd413fe9ba7f4f1d735b6fed20a120364ad7686c5096fc45b84e72352acab45
3
+ size 8038
checkpoints/cold_start/qwen2_5_vl-7b_vilasr_cold_start_8gpu_lora/trainer_log.jsonl CHANGED
@@ -1,2 +1,8 @@
1
  {"current_steps": 10, "total_steps": 3147, "loss": 0.422, "lr": 2.8571428571428575e-07, "epoch": 0.009529481834425254, "percentage": 0.32, "elapsed_time": "0:23:07", "remaining_time": "5 days, 0:53:15"}
2
  {"current_steps": 20, "total_steps": 3147, "loss": 0.411, "lr": 6.031746031746032e-07, "epoch": 0.019058963668850508, "percentage": 0.64, "elapsed_time": "0:46:53", "remaining_time": "5 days, 2:11:25"}
 
 
 
 
 
 
 
1
  {"current_steps": 10, "total_steps": 3147, "loss": 0.422, "lr": 2.8571428571428575e-07, "epoch": 0.009529481834425254, "percentage": 0.32, "elapsed_time": "0:23:07", "remaining_time": "5 days, 0:53:15"}
2
  {"current_steps": 20, "total_steps": 3147, "loss": 0.411, "lr": 6.031746031746032e-07, "epoch": 0.019058963668850508, "percentage": 0.64, "elapsed_time": "0:46:53", "remaining_time": "5 days, 2:11:25"}
3
+ {"current_steps": 30, "total_steps": 3147, "loss": 0.4276, "lr": 9.206349206349208e-07, "epoch": 0.02858844550327576, "percentage": 0.95, "elapsed_time": "1:09:35", "remaining_time": "5 days, 0:31:07"}
4
+ {"current_steps": 40, "total_steps": 3147, "loss": 0.4214, "lr": 1.2380952380952382e-06, "epoch": 0.038117927337701016, "percentage": 1.27, "elapsed_time": "1:34:10", "remaining_time": "5 days, 1:54:57"}
5
+ {"current_steps": 50, "total_steps": 3147, "loss": 0.414, "lr": 1.5555555555555558e-06, "epoch": 0.047647409172126266, "percentage": 1.59, "elapsed_time": "1:57:56", "remaining_time": "5 days, 1:44:55"}
6
+ {"current_steps": 60, "total_steps": 3147, "loss": 0.4039, "lr": 1.8730158730158732e-06, "epoch": 0.05717689100655152, "percentage": 1.91, "elapsed_time": "2:21:36", "remaining_time": "5 days, 1:25:25"}
7
+ {"current_steps": 70, "total_steps": 3147, "loss": 0.418, "lr": 2.1904761904761908e-06, "epoch": 0.06670637284097677, "percentage": 2.22, "elapsed_time": "2:46:27", "remaining_time": "5 days, 1:56:44"}
8
+ {"current_steps": 80, "total_steps": 3147, "loss": 0.4186, "lr": 2.507936507936508e-06, "epoch": 0.07623585467540203, "percentage": 2.54, "elapsed_time": "3:09:39", "remaining_time": "5 days, 1:11:03"}
checkpoints/cold_start/qwen2_5_vl-7b_vilasr_cold_start_8gpu_lora_z2/runs/Oct21_22-00-59_c31/events.out.tfevents.1761098789.c31.1297219.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faf2f36464713ee200d079fb397bbd02e8345417cca5094ce37d19747d64d499
3
+ size 8256
checkpoints/cold_start/qwen2_5_vl-7b_vilasr_cold_start_8gpu_lora_z2/trainer_log.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 3147, "loss": 0.4221, "lr": 2.8571428571428575e-07, "epoch": 0.009529481834425254, "percentage": 0.32, "elapsed_time": "0:09:20", "remaining_time": "2 days, 0:51:03"}
2
+ {"current_steps": 20, "total_steps": 3147, "loss": 0.4111, "lr": 6.031746031746032e-07, "epoch": 0.019058963668850508, "percentage": 0.64, "elapsed_time": "0:18:42", "remaining_time": "2 days, 0:44:50"}
3
+ {"current_steps": 30, "total_steps": 3147, "loss": 0.4277, "lr": 9.206349206349208e-07, "epoch": 0.02858844550327576, "percentage": 0.95, "elapsed_time": "0:27:39", "remaining_time": "1 day, 23:54:15"}
4
+ {"current_steps": 40, "total_steps": 3147, "loss": 0.4215, "lr": 1.2380952380952382e-06, "epoch": 0.038117927337701016, "percentage": 1.27, "elapsed_time": "0:37:14", "remaining_time": "2 days, 0:12:06"}
5
+ {"current_steps": 50, "total_steps": 3147, "loss": 0.4139, "lr": 1.5555555555555558e-06, "epoch": 0.047647409172126266, "percentage": 1.59, "elapsed_time": "0:46:37", "remaining_time": "2 days, 0:08:23"}
6
+ {"current_steps": 60, "total_steps": 3147, "loss": 0.404, "lr": 1.8730158730158732e-06, "epoch": 0.05717689100655152, "percentage": 1.91, "elapsed_time": "0:55:56", "remaining_time": "1 day, 23:58:30"}
7
+ {"current_steps": 70, "total_steps": 3147, "loss": 0.4182, "lr": 2.1904761904761908e-06, "epoch": 0.06670637284097677, "percentage": 2.22, "elapsed_time": "1:05:33", "remaining_time": "2 days, 0:01:49"}
8
+ {"current_steps": 80, "total_steps": 3147, "loss": 0.4186, "lr": 2.507936507936508e-06, "epoch": 0.07623585467540203, "percentage": 2.54, "elapsed_time": "1:14:38", "remaining_time": "1 day, 23:41:52"}
9
+ {"current_steps": 90, "total_steps": 3147, "loss": 0.4138, "lr": 2.8253968253968255e-06, "epoch": 0.08576533650982728, "percentage": 2.86, "elapsed_time": "1:23:56", "remaining_time": "1 day, 23:31:18"}
env.yaml CHANGED
@@ -1,4 +1,3 @@
1
- name: vilasr
2
  channels:
3
  - defaults
4
  - https://repo.anaconda.com/pkgs/main
 
 
1
  channels:
2
  - defaults
3
  - https://repo.anaconda.com/pkgs/main
full_z3_offload.log ADDED
The diff for this file is too large to render. See raw diff
 
lora_z2.log ADDED
The diff for this file is too large to render. See raw diff
 
lora_z3.log ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.3.1
2
+ accelerate==1.6.0
3
+ aiofiles==24.1.0
4
+ aiohappyeyeballs==2.6.1
5
+ aiohttp==3.13.1
6
+ aiosignal==1.4.0
7
+ airportsdata==20250909
8
+ annotated-types==0.7.0
9
+ antlr4-python3-runtime==4.9.3
10
+ anyio==4.11.0
11
+ astor==0.8.1
12
+ async-timeout==5.0.1
13
+ attrs==25.4.0
14
+ audioread==3.0.1
15
+ av==15.1.0
16
+ blake3==1.0.8
17
+ Brotli==1.1.0
18
+ cachetools==6.2.1
19
+ certifi==2025.8.3
20
+ cffi==2.0.0
21
+ charset-normalizer==3.4.3
22
+ click==8.3.0
23
+ cloudpickle==3.1.1
24
+ codetiming==1.4.0
25
+ compressed-tensors==0.9.1
26
+ contourpy==1.3.2
27
+ cupy-cuda12x==13.6.0
28
+ cycler==0.12.1
29
+ datasets==3.5.0
30
+ decorator==5.2.1
31
+ deepspeed==0.16.9
32
+ depyf==0.18.0
33
+ dill==0.3.8
34
+ diskcache==5.6.3
35
+ distro==1.9.0
36
+ dnspython==2.8.0
37
+ docker-pycreds==0.4.0
38
+ docstring_parser==0.17.0
39
+ einops==0.8.1
40
+ email-validator==2.3.0
41
+ exceptiongroup==1.3.0
42
+ fastapi==0.118.0
43
+ fastapi-cli==0.0.14
44
+ fastapi-cloud-cli==0.3.1
45
+ fastrlock==0.8.3
46
+ ffmpy==0.6.1
47
+ filelock==3.19.1
48
+ fire==0.7.1
49
+ flash_attn==2.5.3
50
+ fonttools==4.60.0
51
+ frozenlist==1.8.0
52
+ fsspec==2024.12.0
53
+ gguf==0.10.0
54
+ gitdb==4.0.12
55
+ GitPython==3.1.45
56
+ google-auth==2.41.1
57
+ google-auth-oauthlib==1.0.0
58
+ gradio==5.45.0
59
+ gradio_client==1.13.0
60
+ groovy==0.1.2
61
+ grpcio==1.75.1
62
+ h11==0.16.0
63
+ hf-xet==1.1.10
64
+ hf_transfer==0.1.9
65
+ hjson==3.1.0
66
+ httpcore==1.0.9
67
+ httptools==0.7.1
68
+ httpx==0.28.1
69
+ huggingface-hub==0.35.3
70
+ idna==3.10
71
+ imageio==2.37.0
72
+ importlib_metadata==8.7.0
73
+ iniconfig==2.3.0
74
+ interegular==0.3.3
75
+ jieba==0.42.1
76
+ Jinja2==3.1.6
77
+ jiter==0.11.0
78
+ joblib==1.5.2
79
+ jsonschema==4.25.1
80
+ jsonschema-specifications==2025.9.1
81
+ kiwisolver==1.4.9
82
+ lark==1.2.2
83
+ lazy_loader==0.4
84
+ librosa==0.11.0
85
+ -e git+https://github.com/hiyouga/LLaMA-Factory.git@2c6aded5d4f4ff23aa1887d16972afb3c2543ac3#egg=llamafactory
86
+ llvmlite==0.43.0
87
+ lm-format-enforcer==0.10.12
88
+ loguru==0.7.3
89
+ Markdown==3.9
90
+ markdown-it-py==4.0.0
91
+ MarkupSafe==3.0.3
92
+ mathruler==0.1.0
93
+ matplotlib==3.10.6
94
+ mdurl==0.1.2
95
+ mistral_common==1.8.5
96
+ modelscope==1.31.0
97
+ mpmath==1.3.0
98
+ msgpack==1.1.2
99
+ msgspec==0.19.0
100
+ multidict==6.7.0
101
+ multiprocess==0.70.16
102
+ nest-asyncio==1.6.0
103
+ networkx==3.4.2
104
+ ninja==1.13.0
105
+ nltk==3.9.1
106
+ numba==0.60.0
107
+ numpy==1.26.4
108
+ nvidia-cublas-cu12==12.6.4.1
109
+ nvidia-cuda-cupti-cu12==12.6.80
110
+ nvidia-cuda-nvrtc-cu12==12.6.77
111
+ nvidia-cuda-runtime-cu12==12.6.77
112
+ nvidia-cudnn-cu12==9.5.1.17
113
+ nvidia-cufft-cu12==11.3.0.4
114
+ nvidia-cufile-cu12==1.13.1.3
115
+ nvidia-curand-cu12==10.3.7.77
116
+ nvidia-cusolver-cu12==11.7.1.2
117
+ nvidia-cusparse-cu12==12.5.4.2
118
+ nvidia-cusparselt-cu12==0.6.3
119
+ nvidia-ml-py==13.580.82
120
+ nvidia-nccl-cu12==2.21.5
121
+ nvidia-nvjitlink-cu12==12.6.85
122
+ nvidia-nvshmem-cu12==3.3.20
123
+ nvidia-nvtx-cu12==12.6.77
124
+ oauthlib==3.3.1
125
+ omegaconf==2.3.0
126
+ openai==1.109.1
127
+ opencv-python-headless==4.11.0.86
128
+ orjson==3.11.3
129
+ outlines==0.1.11
130
+ outlines_core==0.1.26
131
+ packaging==25.0
132
+ pandas==2.3.2
133
+ partial-json-parser==0.2.1.1.post6
134
+ Pebble==5.1.3
135
+ peft==0.17.1
136
+ pillow==11.3.0
137
+ platformdirs==4.5.0
138
+ pluggy==1.6.0
139
+ pooch==1.8.2
140
+ prometheus-fastapi-instrumentator==7.1.0
141
+ prometheus_client==0.23.1
142
+ propcache==0.4.1
143
+ protobuf==3.19.0
144
+ psutil==7.1.0
145
+ py-cpuinfo==9.0.0
146
+ pyarrow==21.0.0
147
+ pyasn1==0.6.1
148
+ pyasn1_modules==0.4.2
149
+ pybind11==3.0.1
150
+ pycountry==24.6.1
151
+ pycparser==2.23
152
+ pydantic==2.10.6
153
+ pydantic-extra-types==2.10.6
154
+ pydantic_core==2.27.2
155
+ pydub==0.25.1
156
+ Pygments==2.19.2
157
+ pylatexenc==2.10
158
+ pyparsing==3.2.5
159
+ pytesseract==0.3.13
160
+ pytest==8.4.2
161
+ python-dateutil==2.9.0.post0
162
+ python-dotenv==1.1.1
163
+ python-multipart==0.0.20
164
+ pytz==2025.2
165
+ PyYAML==6.0.3
166
+ pyzmq==27.1.0
167
+ qwen-vl-utils==0.0.14
168
+ ray==2.47.1
169
+ referencing==0.37.0
170
+ regex==2025.9.18
171
+ requests==2.32.5
172
+ requests-oauthlib==2.0.0
173
+ rich==14.1.0
174
+ rich-toolkit==0.15.1
175
+ rignore==0.7.1
176
+ rouge-chinese==1.0.3
177
+ rpds-py==0.27.1
178
+ rsa==4.9.1
179
+ ruff==0.13.2
180
+ safehttpx==0.1.6
181
+ safetensors==0.5.3
182
+ scikit-image==0.25.2
183
+ scikit-learn==1.7.2
184
+ scipy==1.15.3
185
+ semantic-version==2.10.0
186
+ sentencepiece==0.2.1
187
+ sentry-sdk==2.42.1
188
+ setproctitle==1.3.7
189
+ shellingham==1.5.4
190
+ shtab==1.7.2
191
+ six==1.17.0
192
+ smmap==5.0.2
193
+ sniffio==1.3.1
194
+ soundfile==0.13.1
195
+ soxr==1.0.0
196
+ sse-starlette==3.0.2
197
+ starlette==0.48.0
198
+ sympy==1.13.1
199
+ tabulate==0.9.0
200
+ tensorboard==2.14.0
201
+ tensorboard-data-server==0.7.2
202
+ termcolor==3.1.0
203
+ threadpoolctl==3.6.0
204
+ tifffile==2025.5.10
205
+ tiktoken==0.12.0
206
+ timeout-decorator==0.5.0
207
+ tokenizers==0.21.4
208
+ tomli==2.3.0
209
+ tomlkit==0.13.3
210
+ torch==2.6.0+cu126
211
+ torchaudio==2.6.0+cu126
212
+ torchdata==0.11.0
213
+ torchvision==0.21.0+cu126
214
+ tqdm==4.67.1
215
+ transformers==4.51.1
216
+ triton==3.2.0
217
+ trl==0.9.6
218
+ typer==0.19.2
219
+ typing-inspection==0.4.2
220
+ typing_extensions==4.15.0
221
+ tyro==0.8.14
222
+ tzdata==2025.2
223
+ urllib3==2.5.0
224
+ uvicorn==0.37.0
225
+ uvloop==0.22.1
226
+ vllm==0.7.3
227
+ wandb==0.19.11
228
+ watchfiles==1.1.1
229
+ websockets==15.0.1
230
+ Werkzeug==3.1.3
231
+ xformers==0.0.28.post3
232
+ xgrammar==0.1.11
233
+ xxhash==3.6.0
234
+ yarl==1.22.0
235
+ zipp==3.23.0
train/cold_start/vilasr_full_qwen2.5_vl_7b_8gpu/lora.yaml CHANGED
@@ -4,6 +4,7 @@ model_name_or_path: ../Qwen2.5-VL-7B-Instruct
4
  image_max_pixels: 1003520 # 1280*28*28
5
  video_max_pixels: 1003520
6
  trust_remote_code: true
 
7
  # infer_backend: vllm
8
 
9
  ### method
@@ -13,7 +14,7 @@ finetuning_type: lora
13
  freeze_vision_tower: true
14
  freeze_multi_modal_projector: true
15
  freeze_language_model: false
16
- deepspeed: train/examples/deepspeed/ds_z3_config.json
17
 
18
  ### dataset
19
  dataset: vqa_cold_start,maze_cold_start,GPT4Scene_cold_start,SR_91k_cold_start
@@ -27,7 +28,7 @@ dataloader_num_workers: 8 # 4
27
  # mask_history 是否仅使用当前对话轮次进行训练。default: False
28
 
29
  ### output
30
- output_dir: checkpoints/cold_start/qwen2_5_vl-7b_vilasr_cold_start_8gpu_lora
31
  logging_steps: 10
32
  save_steps: 200
33
  save_total_limit: 1 # new
 
4
  image_max_pixels: 1003520 # 1280*28*28
5
  video_max_pixels: 1003520
6
  trust_remote_code: true
7
+ flash_attn: fa2
8
  # infer_backend: vllm
9
 
10
  ### method
 
14
  freeze_vision_tower: true
15
  freeze_multi_modal_projector: true
16
  freeze_language_model: false
17
+ deepspeed: train/examples/deepspeed/ds_z2_config.json
18
 
19
  ### dataset
20
  dataset: vqa_cold_start,maze_cold_start,GPT4Scene_cold_start,SR_91k_cold_start
 
28
  # mask_history 是否仅使用当前对话轮次进行训练。default: False
29
 
30
  ### output
31
+ output_dir: checkpoints/cold_start/qwen2_5_vl-7b_vilasr_cold_start_8gpu_lora_z2
32
  logging_steps: 10
33
  save_steps: 200
34
  save_total_limit: 1 # new
train/cold_start/vilasr_full_qwen2.5_vl_7b_8gpu/step2_lora.sh.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --nodes=1
4
+ #SBATCH --ntasks=1
5
+ #SBATCH --wait-all-nodes=1
6
+ #SBATCH --partition=a6000
7
+ #SBATCH --cpus-per-task=64
8
+ #SBATCH -w c31
9
+ export HF_HOME=/mnt/beegfs/dzhu6/.cache
10
+
11
+ eval "$(/home/dzhu6/miniconda3/bin/conda shell.bash hook)" # init conda
12
+
13
+ export MASTER_PORT=12763
14
+
15
+ master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
16
+ export MASTER_ADDR=$master_addr
17
+ echo "MASTER_ADDR: $MASTER_ADDR"
18
+ #cpu per task
19
+ echo "SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
20
+
21
+ source env/bin/activate
22
+ conda env list
23
+ nvidia-smi
24
+
25
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
26
+ export TOKENIZERS_PARALLELISM=false
27
+ FORCE_TORCHRUN=1 llamafactory-cli train train/cold_start/vilasr_full_qwen2.5_vl_7b_8gpu/lora.yaml