Add metadata

#1
by nielsr HF Staff - opened
Files changed (1) hide show
  1. README.md +119 -115
README.md CHANGED
@@ -1,116 +1,120 @@
1
- ---
2
- license: apache-2.0
3
- tags:
4
- - vision-language
5
- - abnormality-grounding
6
- - medical-imaging
7
- - knowledge-distillation
8
- - multimodal
9
- model-index:
10
- - name: AG-KD
11
- results:
12
- - task:
13
- type: Abnormality Grounding
14
- name: Grounding
15
- metrics:
16
- - name: none
17
- type: none
18
- value: null
19
- ---
20
-
21
-
22
- # 🚀 Enhancing Abnormality Grounding for Vision-Language Models with Knowledge Descriptions
23
-
24
- This repository provides the code and model weights for our paper:
25
- **[Enhancing Abnormality Grounding for Vision-Language Models with Knowledge Descriptions](https://arxiv.org/abs/2503.03278)**
26
-
27
- 🧪 Explore our live demo on [Hugging Face Spaces](https://huggingface.co/spaces/Anonymous-AC/AG-KD-anonymous-Demo) to see the model in action!
28
-
29
-
30
- ## 📌 Overview
31
-
32
- **AG-KD (Abnormality Grounding with Knowledge Descriptions)** is a compact 0.23B vision-language model designed for abnormality grounding in medical images. Despite its small size, it delivers performance **comparable to 7B state-of-the-art medical VLMs**. Our approach integrates **structured knowledge descriptions** into prompts, enhancing the model’s ability to localize medical abnormalities in images.
33
-
34
-
35
- ## 💻 How to Use
36
-
37
- ### Simple Example
38
-
39
- For detailed examples, visit: [AG-KD GitHub Repository](https://github.com/LijunRio/AG-KD)
40
-
41
- ```python
42
-
43
- import torch
44
- import requests
45
- from io import BytesIO
46
- from PIL import Image
47
- import numpy as np
48
- import albumentations as A
49
- from transformers import AutoModelForCausalLM, AutoProcessor
50
-
51
-
52
- def apply_transform(image, size=512):
53
- transform = A.Compose([
54
- A.LongestMaxSize(max_size=size),
55
- A.PadIfNeeded(min_height=size, min_width=size, border_mode=0, value=(0,0,0)),
56
- A.Resize(height=size, width=size)
57
- ])
58
- return transform(image=np.array(image))["image"]
59
-
60
- def run_simple(image_url, target, definition, model, processor, device):
61
- prompt = f"<CAPTION_TO_PHRASE_GROUNDING>Locate the phrases in the caption: {target} means {definition}."
62
- response = requests.get(image_url)
63
- image = Image.open(BytesIO(response.content)).convert("RGB")
64
- np_image = apply_transform(image)
65
-
66
- inputs = processor(text=[prompt], images=[np_image], return_tensors="pt", padding=True).to(device)
67
-
68
- outputs = model.generate(
69
- input_ids=inputs["input_ids"],
70
- pixel_values=inputs["pixel_values"],
71
- max_new_tokens=1024,
72
- num_beams=3,
73
- output_scores=True,
74
- return_dict_in_generate=True
75
- )
76
-
77
- transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False)
78
- generated_text = processor.batch_decode(outputs.sequences, skip_special_tokens=False)[0]
79
-
80
- output_len = np.sum(transition_scores.cpu().numpy() < 0, axis=1)
81
- length_penalty = model.generation_config.length_penalty
82
- score = transition_scores.cpu().sum(axis=1) / (output_len**length_penalty)
83
- prob = np.exp(score.cpu().numpy())
84
-
85
- print(f"\n[IMAGE URL] {image_url}")
86
- print(f"[TARGET] {target}")
87
- print(f"[PROBABILITY] {prob[0] * 100:.2f}%")
88
- print(f"[GENERATED TEXT]\n{generated_text}")
89
-
90
- if __name__ == "__main__":
91
- image_url = "https://huggingface.co/spaces/RioJune/AG-KD/resolve/main/examples/f1eb2216d773ced6330b1f31e18f04f8.png"
92
- target = "pulmonary fibrosis"
93
- definition = "Scarring of the lung tissue creating a dense fibrous appearance."
94
-
95
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
96
- model_name = "RioJune/AG-KD"
97
-
98
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device)
99
- processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
100
-
101
- run_simple(image_url, target, definition, model, processor, device)
102
- ```
103
-
104
-
105
- ## 📖 Citation
106
-
107
- If you use our work, please cite:
108
-
109
- ```
110
- @article{li2025enhancing,
111
- title={Enhancing Abnormality Grounding for Vision Language Models with Knowledge Descriptions},
112
- author={Li, J. and Liu, C. and Bai, W. and Arcucci, R. and Bercea, C. I. and Schnabel, J. A.},
113
- journal={arXiv preprint arXiv:2503.03278},
114
- year={2025}
115
- }
 
 
 
 
116
  ```
 
1
+ ---
2
+ library_name: transformers
3
+ pipeline_tag: image-text-to-text
4
+ base_model:
5
+ - microsoft/Florence-2-base-ft
6
+ license: apache-2.0
7
+ tags:
8
+ - vision-language
9
+ - abnormality-grounding
10
+ - medical-imaging
11
+ - knowledge-distillation
12
+ - multimodal
13
+ model-index:
14
+ - name: AG-KD
15
+ results:
16
+ - task:
17
+ type: Abnormality Grounding
18
+ name: Grounding
19
+ metrics:
20
+ - name: none
21
+ type: none
22
+ value: null
23
+ ---
24
+
25
+
26
+ # 🚀 Enhancing Abnormality Grounding for Vision-Language Models with Knowledge Descriptions
27
+
28
+ This repository provides the code and model weights for our paper:
29
+ **[Enhancing Abnormality Grounding for Vision-Language Models with Knowledge Descriptions](https://arxiv.org/abs/2503.03278)**
30
+
31
+ 🧪 Explore our live demo on [Hugging Face Spaces](https://huggingface.co/spaces/Anonymous-AC/AG-KD-anonymous-Demo) to see the model in action!
32
+
33
+
34
+ ## 📌 Overview
35
+
36
+ **AG-KD (Abnormality Grounding with Knowledge Descriptions)** is a compact 0.23B vision-language model designed for abnormality grounding in medical images. Despite its small size, it delivers performance **comparable to 7B state-of-the-art medical VLMs**. Our approach integrates **structured knowledge descriptions** into prompts, enhancing the model’s ability to localize medical abnormalities in images.
37
+
38
+
39
+ ## 💻 How to Use
40
+
41
+ ### Simple Example
42
+
43
+ For detailed examples, visit: [AG-KD GitHub Repository](https://github.com/LijunRio/AG-KD)
44
+
45
+ ```python
46
+
47
+ import torch
48
+ import requests
49
+ from io import BytesIO
50
+ from PIL import Image
51
+ import numpy as np
52
+ import albumentations as A
53
+ from transformers import AutoModelForCausalLM, AutoProcessor
54
+
55
+
56
+ def apply_transform(image, size=512):
57
+ transform = A.Compose([
58
+ A.LongestMaxSize(max_size=size),
59
+ A.PadIfNeeded(min_height=size, min_width=size, border_mode=0, value=(0,0,0)),
60
+ A.Resize(height=size, width=size)
61
+ ])
62
+ return transform(image=np.array(image))["image"]
63
+
64
+ def run_simple(image_url, target, definition, model, processor, device):
65
+ prompt = f"<CAPTION_TO_PHRASE_GROUNDING>Locate the phrases in the caption: {target} means {definition}."
66
+ response = requests.get(image_url)
67
+ image = Image.open(BytesIO(response.content)).convert("RGB")
68
+ np_image = apply_transform(image)
69
+
70
+ inputs = processor(text=[prompt], images=[np_image], return_tensors="pt", padding=True).to(device)
71
+
72
+ outputs = model.generate(
73
+ input_ids=inputs["input_ids"],
74
+ pixel_values=inputs["pixel_values"],
75
+ max_new_tokens=1024,
76
+ num_beams=3,
77
+ output_scores=True,
78
+ return_dict_in_generate=True
79
+ )
80
+
81
+ transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False)
82
+ generated_text = processor.batch_decode(outputs.sequences, skip_special_tokens=False)[0]
83
+
84
+ output_len = np.sum(transition_scores.cpu().numpy() < 0, axis=1)
85
+ length_penalty = model.generation_config.length_penalty
86
+ score = transition_scores.cpu().sum(axis=1) / (output_len**length_penalty)
87
+ prob = np.exp(score.cpu().numpy())
88
+
89
+ print(f"\n[IMAGE URL] {image_url}")
90
+ print(f"[TARGET] {target}")
91
+ print(f"[PROBABILITY] {prob[0] * 100:.2f}%")
92
+ print(f"[GENERATED TEXT]\n{generated_text}")
93
+
94
+ if __name__ == "__main__":
95
+ image_url = "https://huggingface.co/spaces/RioJune/AG-KD/resolve/main/examples/f1eb2216d773ced6330b1f31e18f04f8.png"
96
+ target = "pulmonary fibrosis"
97
+ definition = "Scarring of the lung tissue creating a dense fibrous appearance."
98
+
99
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
100
+ model_name = "RioJune/AG-KD"
101
+
102
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device)
103
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
104
+
105
+ run_simple(image_url, target, definition, model, processor, device)
106
+ ```
107
+
108
+
109
+ ## 📖 Citation
110
+
111
+ If you use our work, please cite:
112
+
113
+ ```
114
+ @article{li2025enhancing,
115
+ title={Enhancing Abnormality Grounding for Vision Language Models with Knowledge Descriptions},
116
+ author={Li, J. and Liu, C. and Bai, W. and Arcucci, R. and Bercea, C. I. and Schnabel, J. A.},
117
+ journal={arXiv preprint arXiv:2503.03278},
118
+ year={2025}
119
+ }
120
  ```