---
language:
- en
- zh
library_name: transformers
license: apache-2.0
pipeline_tag: zero-shot-image-classification
tags:
- clip
---
# FG-CLIP 2: A Bilingual Fine-grained Vision-language Alignment Model
Code: https://github.com/360CVGroup/FG-CLIP
Project page: https://360cvgroup.github.io/FG-CLIP
FG-CLIP 2 is a foundation model for fine-grained vision-language understanding in both English and Chinese.
Across 29 datasets and 8 diverse tasks, it consistently surpasses recent strong baselines such as SigLIP 2 and MetaCLIP 2, achieving the best reported performance to date in both languages.
**[FG-CLIP 2: A Bilingual Fine-grained Vision-language Alignment Model](https://arxiv.org/abs/2510.10921)**
<br>
Chunyu Xie*, Bin Wang*, Fanjing Kong, Jincheng Li, Dawei Liang, Ji Ao, Dawei Leng†, Yuhui Yin (*Equal Contribution, †Corresponding Author)
<br>
[Paper (arXiv:2510.10921)](https://arxiv.org/abs/2510.10921)
[Hugging Face Collection](https://huggingface.co/collections/qihoo360/fg-clip-2-68ecbf9c548623bb78bc7913)
[research.360.cn](https://research.360.cn/sass/index)
**[FG-CLIP: Fine-Grained Visual and Textual Alignment](https://arxiv.org/abs/2505.05071)** ([code branch: v1.0](https://github.com/360CVGroup/FG-CLIP/tree/v1.0))
<br>
Chunyu Xie*, Bin Wang*, Fanjing Kong, Jincheng Li, Dawei Liang, Gengshen Zhang, Dawei Leng†, Yuhui Yin (*Equal Contribution, †Corresponding Author)
<br>
[Paper (arXiv:2505.05071)](https://arxiv.org/abs/2505.05071)
[ICML 2025](https://icml.cc/Conferences/2025)
[Hugging Face Collection](https://huggingface.co/collections/qihoo360/fg-clip-681da45d4acfb65c240a6d08)
[FineHARD Dataset](https://huggingface.co/datasets/qihoo360/FineHARD)
[DeepWiki](https://deepwiki.com/360CVGroup/FG-CLIP)
<p align="center">
<img src="https://huggingface.co/qihoo360/fg-clip2-base/resolve/main/use_imgs/FGCLIP2_compare_all_n.png" width="500" height="440"/>
</p>
## Quick Start 🤗
### Load Model
```python
import torch
from PIL import Image
from transformers import (
    AutoImageProcessor,
    AutoTokenizer,
    AutoModelForCausalLM,
)
model_root = "qihoo360/fg-clip2-base"
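# trust_remote_code=True is needed because the checkpoint ships custom FG-CLIP 2 modeling code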
model = AutoModelForCausalLM.from_pretrained(model_root, trust_remote_code=True).cuda()
device = model.device
tokenizer = AutoTokenizer.from_pretrained(model_root)
image_processor = AutoImageProcessor.from_pretrained(model_root)
```
### Retrieval
```python
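# Choose a patch-budget bucket for the image processor based on the image's count of 16x16 patches.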
def determine_max_value(image):
    w, h = image.size
    max_val = (w // 16) * (h // 16)
    if max_val > 784:
        return 1024
    elif max_val > 576:
        return 784
    elif max_val > 256:
        return 576
    elif max_val > 128:
        return 256
    else:
        return 128

img_root = "cat_dfclor.jpg"
image = Image.open(img_root).convert("RGB")
image_input = image_processor(images=image, max_num_patches=determine_max_value(image), return_tensors="pt").to(device)
# NOTE Short captions: max_length=64, walk_type="short" (default)
# NOTE Long captions:  max_length=196, walk_type="long"
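# The Chinese captions below describe a minimalist bedroom corner: the first three differ only in
# fine-grained details (clothing colors, shoe type, type of plant); the fourth is an unrelated street-market scene.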
captions = [
    "一个简约风格的卧室角落,黑色金属衣架上挂着多件米色和白色的衣物,下方架子放着两双浅色鞋子,旁边是一盆绿植,左侧可见一张铺有白色床单和灰色枕头的床。",
    "一个简约风格的卧室角落,黑色金属衣架上挂着多件红色和蓝色的衣物,下方架子放着两双黑色高跟鞋,旁边是一盆绿植,左侧可见一张铺有白色床单和灰色枕头的床。",
    "一个简约风格的卧室角落,黑色金属衣架上挂着多件米色和白色的衣物,下方架子放着两双运动鞋,旁边是一盆仙人掌,左侧可见一张铺有白色床单和灰色枕头的床。",
    "一个繁忙的街头市场,摊位上摆满水果,背景是高楼大厦,人们在喧闹中购物。"
]
captions = [caption.lower() for caption in captions]
caption_input = tokenizer(captions, padding="max_length", max_length=196, truncation=True, return_tensors="pt").to(device)
with torch.no_grad():
    image_feature = model.get_image_features(**image_input)
    text_feature = model.get_text_features(**caption_input, walk_type="long")
    image_feature = image_feature / image_feature.norm(p=2, dim=-1, keepdim=True)
    text_feature = text_feature / text_feature.norm(p=2, dim=-1, keepdim=True)
    logits_per_image = image_feature @ text_feature.T
    logit_scale, logit_bias = model.logit_scale.to(text_feature.device), model.logit_bias.to(text_feature.device)
    logits_per_image = logits_per_image * logit_scale.exp() + logit_bias
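# Optional: with SigLIP-style logit_scale / logit_bias, a sigmoid converts the logits into per-caption matching scores.
print(torch.sigmoid(logits_per_image))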
```
<p align="left">
<img src="https://huggingface.co/qihoo360/fg-clip2-base/resolve/main/use_imgs/cn_re_demo.png" width=100%/>
</p>
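The example above uses long Chinese captions. For short captions the same pipeline applies with `max_length=64` and the default `walk_type="short"`; the sketch below is illustrative only (the English caption strings are placeholders, and it reuses `model`, `tokenizer`, `image_input`, and `device` from the code above):
```python
short_captions = ["a black cat lying on a sofa", "a white cat lying on a sofa", "a laptop on a desk"]
short_captions = [c.lower() for c in short_captions]
short_input = tokenizer(short_captions, padding="max_length", max_length=64,
                        truncation=True, return_tensors="pt").to(device)

with torch.no_grad():
    image_feature = model.get_image_features(**image_input)
    text_feature = model.get_text_features(**short_input)  # walk_type="short" is the default
    image_feature = image_feature / image_feature.norm(p=2, dim=-1, keepdim=True)
    text_feature = text_feature / text_feature.norm(p=2, dim=-1, keepdim=True)
    logits_per_image = image_feature @ text_feature.T * model.logit_scale.exp() + model.logit_bias

print(torch.sigmoid(logits_per_image))
```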
### Dense feature effect display
```python
import math
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
img_root = "cat_dfclor.jpg"
image = Image.open(img_root).convert("RGB")
# The repository's example upscales the image before dense-feature extraction via a
# resize_short_edge helper that is not shown in this snippet; a minimal stand-in
# (assumption: it scales the image so its shorter edge equals target_size) is:
def resize_short_edge(image, target_size=2048):
    w, h = image.size
    scale = target_size / min(w, h)
    return image.resize((round(w * scale), round(h * scale)), Image.BICUBIC)

image = resize_short_edge(image, target_size=2048)
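# max_num_patches=16384 (128 * 128) gives the processor a large patch budget, so the dense feature map stays high-resolution.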
image_input = image_processor(images=image, max_num_patches=16384, return_tensors="pt").to(device)
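# Queries mix Chinese and English: 电脑 = computer, 黑猫 = black cat, 窗户 = window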
captions = ["电脑","黑猫","窗户","window","white cat","book"]
with torch.no_grad():
    dense_image_feature = model.get_image_dense_feature(**image_input)
    # spatial_shapes gives the (height, width) of the patch grid actually used for this image;
    # only the first real_h * real_w tokens correspond to real patches, the rest (padding up to max_num_patches) is dropped.
    spatial_values = image_input["spatial_shapes"][0]
    real_h = spatial_values[0].item()
    real_w = spatial_values[1].item()
    real_pixel_tokens_num = real_w * real_h
    dense_image_feature = dense_image_feature[0][:real_pixel_tokens_num]

    captions = [caption.lower() for caption in captions]
    caption_input = tokenizer(captions, padding="max_length", max_length=64, truncation=True, return_tensors="pt").to(device)
    # here walk_type="box" encodes the captions for matching against dense per-patch image features
    text_feature = model.get_text_features(**caption_input, walk_type="box")

    text_feature = text_feature / text_feature.norm(p=2, dim=-1, keepdim=True)
    dense_image_feature = dense_image_feature / dense_image_feature.norm(p=2, dim=-1, keepdim=True)
    similarity = dense_image_feature @ text_feature.T
    similarity = similarity.cpu()
num_classes = len(captions)
cols = 3
rows = (num_classes + cols - 1) // cols
aspect_ratio = real_w / real_h
fig_width_inch = 3 * cols
fig_height_inch = fig_width_inch / aspect_ratio * rows / cols
fig, axes = plt.subplots(rows, cols, figsize=(fig_width_inch, fig_height_inch))
fig.subplots_adjust(wspace=0.01, hspace=0.01)
# cols is fixed at 3, so plt.subplots always returns an array of Axes; flatten it for indexing.
axes = axes.flatten()
for cls_index in range(num_classes):
    similarity_map = similarity[:, cls_index].numpy()
    show_image = similarity_map.reshape((real_h, real_w))
    ax = axes[cls_index]
    ax.imshow(show_image, cmap='viridis', aspect='equal')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')

for idx in range(num_classes, len(axes)):
    axes[idx].axis('off')
savename = "FGCLIP2_dfcolor_cat_all_2K.png"
plt.savefig(savename, dpi=150, bbox_inches='tight', pad_inches=0.05)
plt.close()
```
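Each panel in the figure below is the similarity map between one caption's text embedding and every image patch, reshaped to the (height, width) patch grid; brighter regions indicate stronger alignment with that caption.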
<p align="left">
<img src="https://huggingface.co/qihoo360/fg-clip2-base/resolve/main/use_imgs/FGCLIP2_dfcolor_cat_all_2K.png" width=100%/>
</p>
## Citation
If you find FG-CLIP 2 useful for your research and applications, please cite using this BibTeX:
```
@article{xie2025fg2,
title={FG-CLIP 2: A Bilingual Fine-grained Vision-language Alignment Model},
author={Xie, Chunyu and Wang, Bin and Kong, Fanjing and Li, Jincheng and Liang, Dawei and Ao, Ji and Leng, Dawei and Yin, Yuhui},
journal={arXiv preprint arXiv:2510.10921},
year={2025}
}
```
```
@article{xie2025fg,
title={FG-CLIP: Fine-Grained Visual and Textual Alignment},
author={Xie, Chunyu and Wang, Bin and Kong, Fanjing and Li, Jincheng and Liang, Dawei and Zhang, Gengshen and Leng, Dawei and Yin, Yuhui},
journal={arXiv preprint arXiv:2505.05071},
year={2025}
}
```
## License
This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses.
The content of this project itself is licensed under the [Apache license 2.0](./LICENSE). |