alwinjin8 committed · verified
Commit 0dc38ea · Parent(s): 9e6aacd

Training in progress, step 1000

config.json ADDED
@@ -0,0 +1,105 @@
+{
+  "architectures": [
+    "Gemma3ForConditionalGeneration"
+  ],
+  "boi_token_index": 255999,
+  "cosmos_model_name": "nvidia/Cosmos-Predict2-2B-Video2World",
+  "dtype": "bfloat16",
+  "enable_world": true,
+  "eoi_token_index": 256000,
+  "eos_token_id": [
+    1,
+    106
+  ],
+  "freeze_cosmos_vae": true,
+  "freeze_gemma_language": true,
+  "freeze_gemma_vision": true,
+  "gemma_model_name": "google/gemma-3-4b-it",
+  "image_token_index": 262144,
+  "initializer_range": 0.02,
+  "mm_tokens_per_image": 256,
+  "model_type": "the_world",
+  "projection_architecture": "mlp",
+  "text_config": {
+    "_sliding_window_pattern": 6,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_logit_softcapping": null,
+    "dtype": "bfloat16",
+    "final_logit_softcapping": null,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 10240,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gemma3_text",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 34,
+    "num_key_value_heads": 4,
+    "query_pre_attn_scalar": 256,
+    "rms_norm_eps": 1e-06,
+    "rope_local_base_freq": 10000.0,
+    "rope_scaling": {
+      "factor": 8.0,
+      "rope_type": "linear"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": 1024,
+    "use_cache": true,
+    "vocab_size": 262147
+  },
+  "transformers_version": "4.56.2",
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 896,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14,
+    "vision_use_head": false
+  },
+  "world_projection_mode": "channel"
+}
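
For quick sanity checks, here is a minimal sketch of inspecting this config locally. The `config.json` path is an assumption, and since `model_type` is the custom `the_world`, loading the full model presumably requires this repo's own modeling code rather than a stock `AutoModel` class.

```python
import json
from collections import Counter

# Minimal sketch, assuming config.json has been downloaded to the working directory.
with open("config.json") as f:
    cfg = json.load(f)

# Custom model_type: a stock AutoModel will not resolve "the_world" by itself.
print(cfg["model_type"])         # the_world
print(cfg["gemma_model_name"])   # google/gemma-3-4b-it
print(cfg["cosmos_model_name"])  # nvidia/Cosmos-Predict2-2B-Video2World

# Gemma-3 text stack: every 6th layer uses full attention, the rest a
# 1024-token sliding window (_sliding_window_pattern: 6, sliding_window: 1024).
print(Counter(cfg["text_config"]["layer_types"]))
# Counter({'sliding_attention': 29, 'full_attention': 5})
```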
config_used.json ADDED
@@ -0,0 +1,47 @@
+{
+  "_comment": "Projection-only alignment on local CC3M align dataset (full set)",
+  "model_name": "google/gemma-3-4b-it",
+  "cosmos_model_name": "nvidia/Cosmos-Predict2-2B-Video2World",
+  "enable_world": true,
+  "num_world_steps": 0,
+  "world_projection_mode": "channel",
+  "projection_architecture": "mlp",
+  "freeze_gemma_vision": true,
+  "freeze_gemma_language": true,
+  "freeze_cosmos_vae": true,
+  "learning_rate": 0.0001,
+  "batch_size": 2,
+  "gradient_accumulation_steps": 1,
+  "num_epochs": 1,
+  "warmup_steps": 100,
+  "weight_decay": 0.01,
+  "max_grad_norm": 1.0,
+  "use_gradient_checkpointing": false,
+  "mixed_precision": "bf16",
+  "output_dir": "/storage/ice1/8/2/ajin37/checkpoints/theworld-cc3m-align-full",
+  "save_steps": 1000,
+  "save_total_limit": 3,
+  "resume_from_checkpoint": null,
+  "eval_steps": 2000,
+  "eval_batch_size": 8,
+  "do_eval": false,
+  "logging_steps": 100,
+  "log_to_wandb": false,
+  "wandb_project": "theworld-cc3m",
+  "wandb_run_name": "cc3m-align-full",
+  "log_to_tensorboard": true,
+  "max_seq_length": 2048,
+  "num_workers": 4,
+  "dataset_name": "cc3m_align",
+  "train_dataset_path": "/storage/ice1/8/2/ajin37/data/cc3m-align/cc3m_full.jsonl",
+  "eval_dataset_path": null,
+  "num_samples": null,
+  "streaming": false,
+  "image_folder": "/storage/ice1/8/2/ajin37/data/cc3m-align/images",
+  "draw_bboxes": false,
+  "hf_token": null,
+  "push_to_hub": true,
+  "hub_model_id": "alwinjin8/theworld-cc3m-align-full",
+  "hub_strategy": "every_save",
+  "hub_private_repo": false
+}
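
As a hedged illustration of how a training script might consume this file (the `TrainConfig` dataclass below is hypothetical, not this repo's actual loader; every field name is taken verbatim from `config_used.json`):

```python
import json
from dataclasses import dataclass, fields

# Hypothetical loader sketch; TrainConfig is not the repo's real class, but
# each field name comes directly from config_used.json.
@dataclass
class TrainConfig:
    model_name: str
    learning_rate: float
    batch_size: int
    gradient_accumulation_steps: int
    save_steps: int
    hub_strategy: str
    hub_model_id: str

with open("config_used.json") as f:
    raw = json.load(f)

cfg = TrainConfig(**{f.name: raw[f.name] for f in fields(TrainConfig)})

# save_steps=1000 with hub_strategy="every_save" means a checkpoint is pushed
# every 1000 optimizer steps, which is exactly what this "step 1000" commit is.
print(cfg.save_steps, cfg.hub_strategy)
```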
logs/events.out.tfevents.1763876359.atl1-1-03-013-8-0.pace.gatech.edu.62208.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d07b1bfb6d3ca7a6d0844bcc6f8c0c166da84989f4084d25e6bb3611a2d17eb4
+size 9325
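
With `log_to_tensorboard` set to true in `config_used.json`, this event file carries the run's training scalars. A minimal reading sketch, assuming the file has been fetched from LFS into a local `logs/` directory (the exact scalar tag names are an assumption):

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Point the accumulator at the directory containing the tfevents file.
ea = EventAccumulator("logs")
ea.Reload()

# List whatever scalar tags the trainer logged (tag names are an assumption here).
print(ea.Tags()["scalars"])
```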
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5717941fecf8dff14197472099feeeb1e439a112e97b31dfdb247f89ebca1912
+size 34089384
training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58c33431e836c7f0b241ae8f5b5880e8a42e68b4f0bf9bbb98c2f98808658d50
+size 5496
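
The three binary entries above are Git LFS pointers (spec version, oid, size), not the bytes themselves. A sketch of resolving one to the real blob with `huggingface_hub`; the `repo_id` comes from `hub_model_id` in `config_used.json`:

```python
from huggingface_hub import hf_hub_download

# Downloads the real blob behind the model.safetensors pointer. Its 34089384-byte
# size is consistent with a projection-only checkpoint (Gemma, Cosmos VAE frozen).
path = hf_hub_download(
    repo_id="alwinjin8/theworld-cc3m-align-full",  # hub_model_id from config_used.json
    filename="model.safetensors",
)
print(path)  # local cache path to the downloaded file
```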