DruryXu committed on
Commit
b0dab8e
·
verified ·
1 Parent(s): 2101e8e

Update README.md

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. README.md +28 -0
  3. assets/model.png +3 -0
  4. config.json +29 -0
  5. model.safetensors.index.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/model.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TBAC-UniImage-3B
2
+
3
+ ## Overview
4
+ This repository contains the official model checkpoints of **TBAC-UniImage-3B**, a unified understanding and generation model developed by the Basic Algorithm Center, Platform and Content Group, Tencent.
5
+
6
+ Our model is composed of two components: [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) serves as the understanding module, while [SANA-1600M](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_diffusers) acts as the generation module. The conditions for generation originate from the representations of different Qwen2.5-VL-3B-Instruct layers.
7
+
8
+ ![Model](./assets/model.png)
9
+
10
+ ## Performance
11
+ | Method | Base (M)LLM | GenEval | DPG-Bench |
12
+ | :--- | :--- | :--- | :--- |
13
+ | MetaQuery | Qwen2.5-VL-3B-Instruct | 0.78 | 81.10 |
14
+ | | Qwen2.5-VL-7B-Instruct | 0.80 | 82.05 |
15
+ | BLIP3-o | Qwen2.5-VL-3B-Instruct | 0.81 | 79.36 |
16
+ | | Qwen2.5-VL-7B-Instruct | 0.84 | 81.60 |
17
+ | BAGEL | MoT-7B | 0.82 | - |
18
+ | Show-o2 | Qwen2.5-1.5B-Instruct | 0.73 | 85.02 |
19
+ | | Qwen2.5-7B-Instruct | 0.76 | 86.14 |
20
+ | **Ours** | **Qwen2.5-VL-3B-Instruct** | **0.87** | 81.00 |
21
+
22
+ ## Acknowledgements
23
+
24
+ The training and inference code is adapted from [MetaQuery](https://github.com/facebookresearch/metaquery). We thank the authors for their contribution!
25
+
26
+ ## About
27
+
28
+ Created by the Tencent PCG Basic Algorithm Center. All rights reserved.
assets/model.png ADDED

Git LFS Details

  • SHA256: 952a91f681d13c437ce35ca0f3d58f6184a341f58d36e733825af34d112f15da
  • Pointer size: 131 Bytes
  • Size of remote file: 456 kB
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_gradient_checkpointing": true,
3
+ "architectures": [
4
+ "TBACUniImage"
5
+ ],
6
+ "attn_implementation": null,
7
+ "diffusion_model_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
8
+ "in_channels": 32,
9
+ "input_size": 16,
10
+ "loss_type": "flow",
11
+ "max_input_text_tokens": 256,
12
+ "mllm_id": "Qwen/Qwen2.5-VL-3B-Instruct",
13
+ "model_type": "metaquery",
14
+ "modules_to_freeze": [
15
+ "vae",
16
+ "model.mllm_backbone"
17
+ ],
18
+ "modules_to_unfreeze": [
19
+ "model.mllm_backbone.model.embed_tokens"
20
+ ],
21
+ "noise_scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
22
+ "num_metaqueries": 64,
23
+ "scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
24
+ "system_prompt": "You will be given an image or its caption. Please describe the content of the image in detail in your own words.",
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.49.0",
27
+ "vae_downsample_f": 32,
28
+ "vae_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers"
29
+ }
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff