# dewiri's picture
# Upload app.py
# e36e780 verified
# raw
# history blame
# 9.02 kB
from transformers import pipeline

# Zero-shot baseline: CLIP ranks each image against every candidate breed name.
checkpoint = "openai/clip-vit-large-patch14"
detector = pipeline(model=checkpoint, task="zero-shot-image-classification")
# Alternative checkpoint tried during experimentation:
# checkpoint = "google/siglip-so400m-patch14-384"
# detector = pipeline(task="zero-shot-image-classification", model="google/siglip-so400m-patch14-384")

from datasets import load_dataset

# Oxford-IIIT Pets re-hosted with raw image bytes under example['image']['bytes'].
dataset = load_dataset('pcuenq/oxford-pets')

from PIL import Image
import io
from tqdm import tqdm

# The 37 breed names used as CLIP candidate labels; presumably these match the
# dataset's 'label' strings exactly (cat breeds capitalized, dog breeds
# lower-case) — TODO confirm against dataset['train'].unique('label').
labels_oxford_pets = [
    'Siamese', 'Birman', 'shiba inu', 'staffordshire bull terrier', 'basset hound', 'Bombay', 'japanese chin',
    'chihuahua', 'german shorthaired', 'pomeranian', 'beagle', 'english cocker spaniel', 'american pit bull terrier',
    'Ragdoll', 'Persian', 'Egyptian Mau', 'miniature pinscher', 'Sphynx', 'Maine Coon', 'keeshond', 'yorkshire terrier',
    'havanese', 'leonberger', 'wheaten terrier', 'american bulldog', 'english setter', 'boxer', 'newfoundland', 'Bengal',
    'samoyed', 'British Shorthair', 'great pyrenees', 'Abyssinian', 'pug', 'saint bernard', 'Russian Blue', 'scottish terrier'
]

# Collect ground-truth and predicted labels over the full training split.
true_labels = []
predicted_labels = []
for i in tqdm(range(len(dataset['train']))):
    # Hoist the row lookup: the original indexed dataset['train'][i] twice per step.
    example = dataset['train'][i]
    # Decode the raw bytes into a PIL image.
    image = Image.open(io.BytesIO(example['image']['bytes']))
    # Score the image against every candidate label.
    results = detector(image, candidate_labels=labels_oxford_pets)
    # A full sort just to take the first element is wasteful; max() finds the
    # top-scoring label in a single O(n) pass.
    predicted_labels.append(max(results, key=lambda r: r['score'])['label'])
    true_labels.append(example['label'])

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
# Calculate weighted precision/recall over the fixed label set.
# zero_division=0 silences the undefined-metric warning for breeds that are
# never predicted (the reported value for those classes is 0 either way).
precision = precision_score(true_labels, predicted_labels, average='weighted',
                            labels=labels_oxford_pets, zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='weighted',
                      labels=labels_oxford_pets, zero_division=0)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
import gradio as gr
from transformers import pipeline  # NOTE(review): duplicate import; harmless, this file is a notebook export
# Load models
# The Gradio app compares a supervised fine-tuned ViT against zero-shot CLIP.
vit_classifier = pipeline("image-classification", model="kuhs/vit-base-oxford-iiit-pets")
clip_detector = pipeline(model="openai/clip-vit-large-patch14", task="zero-shot-image-classification")
# Candidate labels handed to CLIP: the 37 Oxford-IIIT Pets breed names.
labels_oxford_pets = [
    'Siamese', 'Birman', 'shiba inu', 'staffordshire bull terrier', 'basset hound', 'Bombay', 'japanese chin',
    'chihuahua', 'german shorthaired', 'pomeranian', 'beagle', 'english cocker spaniel', 'american pit bull terrier',
    'Ragdoll', 'Persian', 'Egyptian Mau', 'miniature pinscher', 'Sphynx', 'Maine Coon', 'keeshond', 'yorkshire terrier',
    'havanese', 'leonberger', 'wheaten terrier', 'american bulldog', 'english setter', 'boxer', 'newfoundland', 'Bengal',
    'samoyed', 'British Shorthair', 'great pyrenees', 'Abyssinian', 'pug', 'saint bernard', 'Russian Blue', 'scottish terrier'
]
def classify_pet(image):
    """Run both classifiers on one image and return their label->score maps.

    `image` is a file path (the Gradio input uses type="filepath"); both
    pipelines accept it directly.
    """
    supervised_scores = {entry['label']: entry['score'] for entry in vit_classifier(image)}
    zero_shot_scores = {
        entry['label']: entry['score']
        for entry in clip_detector(image, candidate_labels=labels_oxford_pets)
    }
    return {"ViT Classification": supervised_scores, "CLIP Zero-Shot Classification": zero_shot_scores}
# Sample images offered in the UI; paths are relative to the app's working directory.
example_images = [
    ["example_images/dog1.jpeg"],
    ["example_images/dog2.jpeg"],
    ["example_images/leonberger.jpg"],
    ["example_images/snow_leopard.jpeg"],
    ["example_images/cat.jpg"]
]
# Gradio UI: one image input, JSON output with both models' score dictionaries.
iface = gr.Interface(
    fn=classify_pet,
    inputs=gr.Image(type="filepath"),  # classify_pet receives the uploaded file's path
    outputs=gr.JSON(),
    title="Pet Classification Comparison",
    description="Upload an image of a pet, and compare results from a trained ViT model and a zero-shot CLIP model.",
    examples=example_images
)
iface.launch()  # blocks and serves the app
import io
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from huggingface_hub import notebook_login
from datasets import load_dataset, DatasetDict
from transformers import AutoImageProcessor, ViTForImageClassification
from transformers import Trainer, TrainingArguments
import evaluate
# Login onto Hugging Face hub to load any private dataset/model.
# We need to login as we'll also upload our model to the hub
notebook_login()
dataset = load_dataset('pcuenq/oxford-pets')
# The next two bare expressions are notebook-display residue; no-ops in a script.
dataset
dataset['train'][0]
# Distinct breed names found in the 'label' column (used to size the model head).
labels = dataset['train'].unique('label')
print(len(labels),labels)
def show_samples(ds, rows, cols):
    """Display a rows x cols grid of randomly chosen dataset images with their labels."""
    count = rows * cols
    picks = ds.shuffle().select(np.arange(count))  # random subset of size rows*cols
    fig = plt.figure(figsize=(cols * 4, rows * 4))
    # plot each picked image in its own subplot
    for idx in range(count):
        record = picks[idx]
        pil_img = Image.open(io.BytesIO(record['image']['bytes']))
        fig.add_subplot(rows, cols, idx + 1)
        plt.imshow(pil_img)
        plt.title(record['label'])
        plt.axis('off')
# Quick visual sanity check of the raw data.
show_samples(dataset['train'],rows=3,cols=5)
split_dataset = dataset['train'].train_test_split(test_size=0.2) # 80% train, 20% evaluation
eval_dataset = split_dataset['test'].train_test_split(test_size=0.5) # 50% validation, 50% test
# recombining the splits using a DatasetDict
our_dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': eval_dataset['train'],
    'test': eval_dataset['test']
})
# Bare expression: notebook-display residue, no-op in a script.
our_dataset
# String label <-> integer id lookups, shared by the model config and the transform.
label2id = {c:idx for idx,c in enumerate(labels)}
id2label = {idx:c for idx,c in enumerate(labels)}
# Image preprocessor paired with the ViT checkpoint.
processor = AutoImageProcessor.from_pretrained('google/vit-base-patch16-224')
# Bare expression: notebook-display residue, no-op in a script.
processor
def transforms(batch):
    """Decode raw image bytes, run the ViT processor, and attach integer label ids.

    Applied lazily via Dataset.with_transform; returns the processor's encoding
    (pixel_values tensor) plus a 'labels' list of ids.
    """
    decoded = [Image.open(io.BytesIO(entry['bytes'])).convert('RGB') for entry in batch['image']]
    batch['image'] = decoded
    encoded = processor(decoded, return_tensors='pt')
    encoded['labels'] = [label2id[name] for name in batch['label']]
    return encoded
# Apply the transform lazily on access; nothing is precomputed or cached.
processed_dataset = our_dataset.with_transform(transforms)
# Bare expression: notebook-display residue, no-op in a script.
processed_dataset
def collate_fn(batch):
    """Stack per-example tensors into the batched dict the Trainer expects."""
    stacked_pixels = torch.stack([example['pixel_values'] for example in batch])
    label_ids = torch.tensor([example['labels'] for example in batch])
    return {'pixel_values': stacked_pixels, 'labels': label_ids}
# HF evaluate metric object; its compute() returns {'accuracy': float}.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_preds):
    """Turn the Trainer's (logits, labels) pair into an accuracy dict."""
    logits, references = eval_preds
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)
# ViT backbone with a classification head sized for our label set.
# ignore_mismatched_sizes lets the checkpoint's original head (presumably the
# 1000-class ImageNet head — confirm) be replaced by a fresh one.
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels = len(labels),
    id2label = id2label,
    label2id = label2id,
    ignore_mismatched_sizes = True
)
# Bare expression: notebook-display residue, no-op in a script.
model
# Freeze the backbone: only parameters under the 'classifier' head stay trainable.
for name,p in model.named_parameters():
    if not name.startswith('classifier'):
        p.requires_grad = False
# Sanity check: report total vs. trainable parameter counts.
num_params = sum([p.numel() for p in model.parameters()])
trainable_params = sum([p.numel() for p in model.parameters() if p.requires_grad])
print(f"{num_params = :,} | {trainable_params = :,}")
# Hyper-parameters for the linear-probe run: 5 epochs, batch 16, lr 3e-4,
# evaluate/save once per epoch, keep the best checkpoint at the end.
training_args = TrainingArguments(
    output_dir="./vit-base-oxford-iiit-pets",
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",  # NOTE(review): renamed to `eval_strategy` in newer transformers — confirm installed version
    save_strategy="epoch",
    logging_steps=100,
    num_train_epochs=5,
    learning_rate=3e-4,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    remove_unused_columns=False,  # keep raw 'image'/'label' columns so the transform can see them
    push_to_hub=True,  # uploads checkpoints to the Hub (requires the notebook_login above)
    report_to='tensorboard',
    load_best_model_at_end=True,
    run_name="transferlearning-google_vit-base-patch16-224 no data augmentation"
)
# Wire model, data, collator, and metrics together; then fine-tune and score
# the held-out test split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    tokenizer=processor  # NOTE(review): newer transformers deprecate `tokenizer` in favor of `processing_class` — confirm version
)
trainer.train()
# Final metrics on the untouched test split (return value is discarded here).
trainer.evaluate(processed_dataset['test'])
def show_predictions(rows, cols):
    """Plot a rows x cols grid of random test images with true and predicted labels."""
    count = rows * cols
    picks = our_dataset['test'].shuffle().select(np.arange(count))
    # Predicted class ids come from the argmax over the trainer's logits.
    logits = trainer.predict(picks.with_transform(transforms)).predictions
    predicted_ids = logits.argmax(axis=1)
    fig = plt.figure(figsize=(cols * 4, rows * 4))
    for idx in range(count):
        record = picks[idx]
        pil_img = Image.open(io.BytesIO(record['image']['bytes']))
        caption = f"label: {record['label']}\npredicted: {id2label[predicted_ids[idx]]}"
        fig.add_subplot(rows, cols, idx + 1)
        plt.imshow(pil_img)
        plt.title(caption)
        plt.axis('off')
# Visual spot-check of the fine-tuned model on the test split.
show_predictions(rows=5,cols=5)
# Model-card metadata attached to the Hub upload.
kwargs = {
    "finetuned_from": model.config._name_or_path,
    "dataset": 'pcuenq/oxford-pets',
    "tasks": "image-classification",
    "tags": ['image-classification'],
}
trainer.save_model()  # write the final model to output_dir
trainer.push_to_hub('🐕️🐈️', **kwargs)  # the emoji string is the commit message