spicy03 committed on
Commit
1ab8ca0
·
verified ·
1 Parent(s): 6a5a75d

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +55 -0
  2. config.json +48 -0
  3. model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ tags:
4
+ - clip
5
+ - medical-imaging
6
+ - radiology
7
+ - roco
8
+ - vision-language
9
+ base_model: openai/clip-vit-base-patch32
10
+ metrics:
11
+ - recall
12
+ license: mit
13
+ ---
14
+
15
+ # ROCO-Radiology-CLIP (ViT-B/32)
16
+
17
+ > **A specialized vision-language model for radiology, fine-tuned on the ROCO dataset.**
18
+
19
+ This model aligns medical images (X-rays, CTs, MRIs) with their textual descriptions, enabling **zero-shot classification** and **semantic search** for radiology concepts.
20
+
21
+ ## Performance (Test Set)
22
+
23
+ | Metric | Score | Description |
24
+ | :--- | :--- | :--- |
25
+ | **Batch-wise R@1** | **70.8%** | Accuracy in classifying the correct image out of 32 candidates. |
26
+ | **Batch-wise R@5** | **97.0%** | Accuracy that the correct image is in the top 5 candidates. |
27
+ | **Global R@5** | **16.18%** | Retrieval recall across the full test set (8,000+ images). |
28
+
29
+ ## 🚀 Usage
30
+
31
+ ```python
32
+ from transformers import CLIPProcessor, CLIPModel
33
+ from PIL import Image
34
+
35
+ model_id = "spicy03/CLIP-ROCO-v1"
36
+ model = CLIPModel.from_pretrained(model_id)
37
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
38
+
39
+ image = Image.open("chest_xray.jpg")
40
+ labels = ["Pneumonia", "Normal Chest X-ray", "Brain MRI"]
41
+
42
+ inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
43
+ outputs = model(**inputs)
44
+ probs = outputs.logits_per_image.softmax(dim=1)
45
+
46
+ for label, prob in zip(labels, probs[0]):
47
+ print(f"{label}: {prob:.2f}")
+ ```
+
+ ## Training Details
+
+ - **Dataset:** ROCO (Radiology Objects in COntext)
+ - **Base Model:** openai/clip-vit-base-patch32
+ - **Hardware:** Fine-tuned on a single NVIDIA T4 GPU using mixed precision and gradient accumulation.
+ - **Epochs:** 5 (selected best checkpoint based on validation loss).
config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "dtype": "float32",
6
+ "initializer_factor": 1.0,
7
+ "logit_scale_init_value": 2.6592,
8
+ "model_type": "clip",
9
+ "projection_dim": 512,
10
+ "text_config": {
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "dropout": 0.0,
14
+ "dtype": "float32",
15
+ "eos_token_id": 2,
16
+ "hidden_act": "quick_gelu",
17
+ "hidden_size": 512,
18
+ "initializer_factor": 1.0,
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 2048,
21
+ "layer_norm_eps": 1e-05,
22
+ "max_position_embeddings": 77,
23
+ "model_type": "clip_text_model",
24
+ "num_attention_heads": 8,
25
+ "num_hidden_layers": 12,
26
+ "projection_dim": 512,
27
+ "vocab_size": 49408
28
+ },
29
+ "transformers_version": "4.57.3",
30
+ "vision_config": {
31
+ "attention_dropout": 0.0,
32
+ "dropout": 0.0,
33
+ "dtype": "float32",
34
+ "hidden_act": "quick_gelu",
35
+ "hidden_size": 768,
36
+ "image_size": 224,
37
+ "initializer_factor": 1.0,
38
+ "initializer_range": 0.02,
39
+ "intermediate_size": 3072,
40
+ "layer_norm_eps": 1e-05,
41
+ "model_type": "clip_vision_model",
42
+ "num_attention_heads": 12,
43
+ "num_channels": 3,
44
+ "num_hidden_layers": 12,
45
+ "patch_size": 32,
46
+ "projection_dim": 512
47
+ }
48
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34fe0f873d5b2cbdacde13af95df85f1c90c3bdab978c58d4493551019029905
3
+ size 605156676