---
library_name: transformers
tags: []
---

# yujiepan/clip-vit-tiny-random-patch14-336

This model is a tiny, randomly initialized CLIP model intended for debugging. Its weights are not trained, so its outputs are meaningless; use it to exercise code paths quickly, not to produce real predictions.
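
Because it exists only for debugging, the checkpoint is deliberately small. A quick sanity check of its size (a minimal sketch; the exact count follows from the shrunken config in the creation code below and is dominated by the text token embeddings):

```python
from transformers import CLIPModel

model = CLIPModel.from_pretrained("yujiepan/clip-vit-tiny-random-patch14-336")
num_params = sum(p.numel() for p in model.parameters())
print(f"{num_params:,} parameters")  # expect a few hundred thousand, orders of magnitude below the full model
```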

## Usage

```python
import requests
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model_id = "yujiepan/clip-vit-tiny-random-patch14-336"
model = CLIPModel.from_pretrained(model_id).cuda()
processor = CLIPProcessor.from_pretrained(model_id)

url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
image = Image.open(requests.get(url, stream=True).raw)
text = "A description of the image"
inputs = processor(text=[text], images=image, return_tensors="pt", padding=True).to("cuda")

with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image  # shape: [num_images, num_texts]
logits_per_text = outputs.logits_per_text    # shape: [num_texts, num_images]
probs = logits_per_image.softmax(dim=1)      # text probabilities per image
print(probs)
```
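
The snippet above assumes a CUDA device is available. A device-agnostic variant of the loading step (a minimal sketch using `torch.cuda.is_available`):

```python
import torch
from transformers import CLIPModel, CLIPProcessor

model_id = "yujiepan/clip-vit-tiny-random-patch14-336"
device = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when no GPU is present
model = CLIPModel.from_pretrained(model_id).to(device)
processor = CLIPProcessor.from_pretrained(model_id)
# ...then build `inputs` as above and move them with `.to(device)` instead of `.to("cuda")`.
```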

## Creation code

```python
import requests
import torch
from PIL import Image
from transformers import CLIPConfig, CLIPModel, CLIPProcessor

# Start from the full-size CLIP config and shrink every dimension.
model_name = "openai/clip-vit-large-patch14-336"
config = CLIPConfig.from_pretrained(model_name).to_dict()
config["projection_dim"] = 8
config["text_config"]["hidden_size"] = 8
config["text_config"]["projection_dim"] = 8
config["text_config"]["intermediate_size"] = 16
config["text_config"]["num_hidden_layers"] = 2
config["text_config"]["num_attention_heads"] = 2
config["vision_config"]["hidden_size"] = 8
config["vision_config"]["projection_dim"] = 8
config["vision_config"]["intermediate_size"] = 16
config["vision_config"]["num_hidden_layers"] = 2
config["vision_config"]["num_attention_heads"] = 2
config = CLIPConfig.from_dict(config)

# Randomly initialized weights, stored in float16.
model = CLIPModel(config).half().cuda()
processor = CLIPProcessor.from_pretrained(model_name)

# Smoke test: run one image/text pair through the tiny model.
url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
image = Image.open(requests.get(url, stream=True).raw)
text = "A description of the image"
inputs = processor(text=[text], images=image, return_tensors="pt", padding=True).to("cuda")
inputs["pixel_values"] = inputs["pixel_values"].half()  # cast inputs to match the fp16 weights

with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image  # shape: [num_images, num_texts]
logits_per_text = outputs.logits_per_text    # shape: [num_texts, num_images]
probs = logits_per_image.softmax(dim=1)
print(probs)

model.push_to_hub("yujiepan/clip-vit-tiny-random-patch14-336")
processor.push_to_hub("yujiepan/clip-vit-tiny-random-patch14-336")
```
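
Note that `push_to_hub` requires an authenticated Hugging Face account with write access to the target repository. One way to log in from Python (using `huggingface_hub`, a dependency of `transformers`):

```python
from huggingface_hub import login

login()  # prompts for an access token; alternatively run `huggingface-cli login` in a shell
```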