model version 1.0

Files changed (15) hide show

README.md CHANGED Viewed

@@ -1,3 +1,14 @@
----
-license: apache-2.0
----

+## Libra-Base
+This model was trained on image-text pairs to acquire basic multimodal understanding capabilities.
+### !!! NOTE !!!
+In addition to the pretrained weights in this repo, please download the pretrained CLIP model from Hugging Face and place it inside the model directory, so that the layout looks like this:
+```
+libra-base/
+├── ...
+└── openai-clip-vit-large-patch14-336/
+    └── ...
+```
+The CLIP model can be downloaded [here](https://huggingface.co/openai/clip-vit-large-patch14-336).

config.json ADDED Viewed

+{
+  "_name_or_path": "/home/yfxu/libra/CHECKPOINTS/Libra/pretrain",
+  "addition_mode": false,
+  "architectures": [
+    "LibraForCausalLM"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 1,
+  "bridge_rank": 8,
+  "concat_signals": true,
+  "contiguous_signal_size": 4096,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_feature_resolution": 24,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 2048,
+  "max_vision_token_length": 578,
+  "model_type": "libra",
+  "newline_token_id": 13,
+  "norm_signals": true,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "pad_token_id": 0,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "unified_head": false,
+  "use_2d_rope": false,
+  "use_bridge": true,
+  "use_cache": true,
+  "use_vision_position_embedding": false,
+  "vision_codebook_num": 2,
+  "vision_down_ratio": 4,
+  "vision_embd_pdrop": 0.0,
+  "vision_prediction_mode": "1d",
+  "vision_resid_pdrop": 0.0,
+  "vision_vocab_size": 514,
+  "vocab_size": 32000
+}

generation_config.json ADDED Viewed

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.38.2"
+}

model-00001-of-00005.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:066568e9a2b531909fd3d97278e9c6e48e66c18e98010fd0a1ce00b0ff90abaf
+size 4983174656

model-00002-of-00005.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:402139c96fc3f1e6514460c6230ff56294e2f88ee735f8eff8cbe11a8e122e42
+size 4945806000

model-00003-of-00005.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:f6703a8363e088675c8088e9c7894d22effc7f5de2f726853f65efa91e39bf1a
+size 4981750512

model-00004-of-00005.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a3bd20da82baaf95163b15620a4652c2040525190304c8bf4e5b848d543171
+size 4972348888

model-00005-of-00005.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:3447f64db98077279a2d2aae50995c7e34ac0cd6a3e6e055ddeca1806c08adb9
+size 2148920376

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer_config.json ADDED Viewed

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 2048,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

vision_tokenizer_config.yaml ADDED Viewed

+freeze: True
+max_vision_token_length: 578 # 24*24 (resolution) + 2 (<img> and </img>); must stay consistent with model_config.max_vision_token_length and dataset_config.image_size
+params:
+  embed_dim: 1024 # debug
+  ckpt_path: vqgan.ckpt
+  codebook_size: 512
+  num_codebook: 2
+  ddconfig:
+    # only_auto_encoder: True
+    encoder_name: openai-clip-vit-large-patch14-336
+    select_layer: [2,10,18,22]
+    double_z: False
+    z_channels: 1024
+    resolution: 336 # input image size in pixels; 336 / 14 (encoder patch size, per encoder_name) = 24 = initial_resolution
+    in_channels: 3
+    out_ch: 3
+    ch: 128
+    ch_mult: [ 1,1,2,4,8]  # num_down = len(ch_mult)-1
+    num_res_blocks: 2
+    attn_resolutions: [24]
+    dropout: 0.0
+    initial_resolution: 24
+    num_attn_head: 8

vqgan.ckpt ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:d01a38fadd81dec3557120ec6e8d36d51758ac1a8a8afe58102f404d03e47a08
+size 3247360961