wyhwhy committed on
Commit
369e662
·
verified ·
1 Parent(s): dac4dc6

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ .metadata filter=lfs diff=lfs merge=lfs -text
37
+ __0_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ __1_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ __2_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ __3_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ __4_0.distcp filter=lfs diff=lfs merge=lfs -text
42
+ __5_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ __6_0.distcp filter=lfs diff=lfs merge=lfs -text
44
+ __7_0.distcp filter=lfs diff=lfs merge=lfs -text
.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:576a8eed9f1307b7510f408efba3d944cb8ec4dabc504c859040cf01559e8998
3
+ size 1778515
__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dafff61930e08895d7c3537a7c1fbc0b7cd95889d1df110f10688a1e81ba4a2
3
+ size 11424624828
__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7292e8a6ae03b0ce711235d6a69378948a5a290b2cd5eed8e988c898ac8986f7
3
+ size 11424682648
__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d1629bcc1342cff4789b75cb184fc423a570f7063e8ea117174c688c30e0205
3
+ size 11424682648
__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af25bbcde17f9ae338e6d4da1c179eed9424316cdfba0fea226b1caeecc3a20
3
+ size 11424682648
__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02ffe6082cd6dd2da5c091c924548359444a4e8642ebd36c6a3b5563274ea385
3
+ size 11424684060
__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50923fc5d1cd15c46eb850592b0a18377132891a54d83d0013c17c420d19c026
3
+ size 11424684060
__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99c757ccf27fd77854f16fcbc5a02d687be621e6115d41fd03277cd739974507
3
+ size 11424684060
__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451f0d09dc654f2f8ae3ea6966bd66d5be691026d4522f08d2b78ab91d26ced2
3
+ size 11424699420
params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "dream", "dump_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/checkpoint/", "seed": 777, "grad_acc_steps": 1, "gc_collect_freq": 1000, "probe_freq": null, "steps": 2500, "eval_steps": 10, "data": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "val_sources": {}, "batch_size": 1, "seq_len": 4096, "val_seq_len": 2048, "val_batch_size": 10, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}}, "optim": {"lr": 1e-05, "weight_decay": 0.033, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "wsd", "warmup": 5000, "lr_min_ratio": 0.0, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.2, "exp_factor": 0.5}, "distributed": {"dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 500, "keep": 20}, "eval": {"every": 1000, "keep": 0}, "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/checkpoint/checkpoints", "init_ckpt_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp", "continue_training_from_init": false}, "reset_lr": false, "log_start_step": 0, "profiling": {"run": false, "trace_folder": "profiling", "mem_warmup": 100, "mem_steps": 2, "profile_warmup": 102, "profile_steps": 2}, "logging": {"freq": 
1, "acc_freq": null, "wandb": {"job_type": null, "dir": null, "project": "diffllm-2", "entity": null, "tags": null, "group": null, "name": "dream", "notes": null, "config_exclude_keys": null, "config_include_keys": null, "anonymous": null, "mode": null, "allow_val_change": null, "resume": null, "force": null, "tensorboard": null, "sync_tensorboard": null, "monitor_gym": null, "save_code": null, "id": null, "fork_from": null, "resume_from": null}}, "async_eval_gpus": null, "eval": null, "weighting": "cart", "cart_p": 0.1}
train_state_00000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 2500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 9, "it_state": {"it_state": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "source_to_state": {"data1": {"file_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset/data1/data1.chunk.00.jsonl", "position": 39540750, "block_size": 8, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 226543099903625816227601880187755888755, "inc": 252101603063402394885084957393789173453}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 452, "rng_state": {"bit_generator": "PCG64", "state": {"state": 134993764224185824407197694329903048938, "inc": 257317082376085721142933171929815648017}, "has_uint32": 1, "uinteger": 2952168449}, "batch_size": 1, "prefetch_size": 1024}, "scheduler": {"base_lrs": [1e-05], "last_epoch": 2500, "verbose": false, "_step_count": 2501, "_get_lr_called_within_step": false, "_last_lr": [5e-06], "lr_lambdas": [{}]}}
train_state_00001.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 2500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 23988, "it_state": {"it_state": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "source_to_state": {"data1": {"file_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset/data1/data1.chunk.00.jsonl", "position": 34485101, "block_size": 8, "offset": 1, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 282594048343705998936993135951521156951, "inc": 246509925186285949978196491240064802315}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 452, "rng_state": {"bit_generator": "PCG64", "state": {"state": 280079809912542376044034366709682031621, "inc": 173555323965545256606922338259303677603}, "has_uint32": 1, "uinteger": 322963895}, "batch_size": 1, "prefetch_size": 1024}, "scheduler": {"base_lrs": [1e-05], "last_epoch": 2500, "verbose": false, "_step_count": 2501, "_get_lr_called_within_step": false, "_last_lr": [5e-06], "lr_lambdas": [{}]}}
train_state_00002.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 2500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1574, "it_state": {"it_state": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "source_to_state": {"data1": {"file_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset/data1/data1.chunk.00.jsonl", "position": 16091345, "block_size": 8, "offset": 2, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 187667763942101143982928526630677040802, "inc": 234358335530849485425064040311006256713}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 452, "rng_state": {"bit_generator": "PCG64", "state": {"state": 180695400501975883782990138575842928657, "inc": 319170006889470250209362588441616495209}, "has_uint32": 0, "uinteger": 2273197488}, "batch_size": 1, "prefetch_size": 1024}, "scheduler": {"base_lrs": [1e-05], "last_epoch": 2500, "verbose": false, "_step_count": 2501, "_get_lr_called_within_step": false, "_last_lr": [5e-06], "lr_lambdas": [{}]}}
train_state_00003.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 2500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 197, "it_state": {"it_state": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "source_to_state": {"data1": {"file_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset/data1/data1.chunk.00.jsonl", "position": 47492926, "block_size": 8, "offset": 3, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 12986492799214244244791808130043976593, "inc": 148211758571781046255077612135386035203}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 452, "rng_state": {"bit_generator": "PCG64", "state": {"state": 130957549404782263888846267584236103538, "inc": 115810872492597857501795428972873905393}, "has_uint32": 0, "uinteger": 1628108682}, "batch_size": 1, "prefetch_size": 1024}, "scheduler": {"base_lrs": [1e-05], "last_epoch": 2500, "verbose": false, "_step_count": 2501, "_get_lr_called_within_step": false, "_last_lr": [5e-06], "lr_lambdas": [{}]}}
train_state_00004.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 2500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1251, "it_state": {"it_state": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "source_to_state": {"data1": {"file_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset/data1/data1.chunk.00.jsonl", "position": 40873452, "block_size": 8, "offset": 4, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 227135332697807534426190885043870638037, "inc": 186633262021180533256729114674950595327}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 452, "rng_state": {"bit_generator": "PCG64", "state": {"state": 172201970092670839766122039582155308497, "inc": 303111205818808944921858206842105131807}, "has_uint32": 0, "uinteger": 287498280}, "batch_size": 1, "prefetch_size": 1024}, "scheduler": {"base_lrs": [1e-05], "last_epoch": 2500, "verbose": false, "_step_count": 2501, "_get_lr_called_within_step": false, "_last_lr": [5e-06], "lr_lambdas": [{}]}}
train_state_00005.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 2500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 196, "it_state": {"it_state": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "source_to_state": {"data1": {"file_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset/data1/data1.chunk.00.jsonl", "position": 29012505, "block_size": 8, "offset": 5, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 85870468604602557174443864053038101054, "inc": 329233669073478483697346584247981015037}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 452, "rng_state": {"bit_generator": "PCG64", "state": {"state": 194536430070890552104514061335431366772, "inc": 47382953940698287647753879262736142901}, "has_uint32": 1, "uinteger": 370246442}, "batch_size": 1, "prefetch_size": 1024}, "scheduler": {"base_lrs": [1e-05], "last_epoch": 2500, "verbose": false, "_step_count": 2501, "_get_lr_called_within_step": false, "_last_lr": [5e-06], "lr_lambdas": [{}]}}
train_state_00006.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 2500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 3855, "it_state": {"it_state": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "source_to_state": {"data1": {"file_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset/data1/data1.chunk.00.jsonl", "position": 21731360, "block_size": 8, "offset": 6, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 77219623706622878032514777022459078586, "inc": 95963489890761403814531195999220475639}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 452, "rng_state": {"bit_generator": "PCG64", "state": {"state": 89754956132571653168928923625265475545, "inc": 72545526324180839152750112646078969085}, "has_uint32": 1, "uinteger": 279914116}, "batch_size": 1, "prefetch_size": 1024}, "scheduler": {"base_lrs": [1e-05], "last_epoch": 2500, "verbose": false, "_step_count": 2501, "_get_lr_called_within_step": false, "_last_lr": [5e-06], "lr_lambdas": [{}]}}
train_state_00007.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 2500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 4371, "it_state": {"it_state": {"root_dir": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset", "sources": {"data1": 1.0}, "source_to_state": {"data1": {"file_path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dataset/Pretraining_Dataset/data1/data1.chunk.00.jsonl", "position": 44372642, "block_size": 8, "offset": 7, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 98717493748280879239644659893389689105, "inc": 53245743019587277358203950863334653629}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "hf", "path": "/scratch/dyvm6xra/dyvm6xrauseryuhao/dream-training/Qwen2.5-7B-dcp"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 452, "rng_state": {"bit_generator": "PCG64", "state": {"state": 73407772541118432696768880670808056645, "inc": 19761753544780285878460645500694854795}, "has_uint32": 1, "uinteger": 1182720522}, "batch_size": 1, "prefetch_size": 1024}, "scheduler": {"base_lrs": [1e-05], "last_epoch": 2500, "verbose": false, "_step_count": 2501, "_get_lr_called_within_step": false, "_last_lr": [5e-06], "lr_lambdas": [{}]}}