"""Oxford-IIIT Pets: zero-shot CLIP evaluation, a Gradio comparison demo,
and ViT transfer learning, combined in one script.

Sections (originally separate notebook cells):
  1. Evaluate CLIP zero-shot classification on the full train split.
  2. Launch a Gradio app comparing a fine-tuned ViT with zero-shot CLIP.
  3. Fine-tune google/vit-base-patch16-224 on the dataset and push it to the Hub.

NOTE(review): this script performs network I/O (model/dataset downloads,
Hub push) and blocks at ``iface.launch()`` until the Gradio app is closed.
"""

import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # kept from the original imports (unused in this file)
import torch
import torch.nn as nn  # kept from the original imports (unused in this file)
from PIL import Image
from tqdm import tqdm

import evaluate
import gradio as gr
from datasets import DatasetDict, load_dataset
from huggingface_hub import notebook_login
from sklearn.metrics import accuracy_score, precision_score, recall_score
from transformers import (
    AutoImageProcessor,
    Trainer,
    TrainingArguments,
    ViTForImageClassification,
    pipeline,
)

# ---------------------------------------------------------------------------
# Shared constants
# ---------------------------------------------------------------------------

# The 37 Oxford-IIIT Pets breeds. The dataset capitalizes cat breeds and
# lower-cases dog breeds; these strings must match the dataset's `label`
# column exactly for the metrics below to line up.
labels_oxford_pets = [
    'Siamese', 'Birman', 'shiba inu', 'staffordshire bull terrier',
    'basset hound', 'Bombay', 'japanese chin', 'chihuahua',
    'german shorthaired', 'pomeranian', 'beagle', 'english cocker spaniel',
    'american pit bull terrier', 'Ragdoll', 'Persian', 'Egyptian Mau',
    'miniature pinscher', 'Sphynx', 'Maine Coon', 'keeshond',
    'yorkshire terrier', 'havanese', 'leonberger', 'wheaten terrier',
    'american bulldog', 'english setter', 'boxer', 'newfoundland', 'Bengal',
    'samoyed', 'British Shorthair', 'great pyrenees', 'Abyssinian', 'pug',
    'saint bernard', 'Russian Blue', 'scottish terrier',
]

# ---------------------------------------------------------------------------
# 1. Zero-shot CLIP evaluation
# ---------------------------------------------------------------------------

checkpoint = "openai/clip-vit-large-patch14"
detector = pipeline(model=checkpoint, task="zero-shot-image-classification")
# Alternative checkpoint kept for experimentation:
# checkpoint = "google/siglip-so400m-patch14-384"
# detector = pipeline(task="zero-shot-image-classification",
#                     model="google/siglip-so400m-patch14-384")

dataset = load_dataset('pcuenq/oxford-pets')

true_labels = []
predicted_labels = []
for row in tqdm(dataset['train']):
    # Images are stored as raw bytes; decode them with PIL.
    image = Image.open(io.BytesIO(row['image']['bytes']))
    results = detector(image, candidate_labels=labels_oxford_pets)
    # The pipeline returns results sorted by score; take the top one
    # defensively via max() rather than relying on ordering.
    best = max(results, key=lambda r: r['score'])
    true_labels.append(row['label'])
    predicted_labels.append(best['label'])

clip_accuracy = accuracy_score(true_labels, predicted_labels)
# Weighted averaging accounts for (mild) class imbalance across breeds.
precision = precision_score(true_labels, predicted_labels,
                            average='weighted', labels=labels_oxford_pets)
recall = recall_score(true_labels, predicted_labels,
                      average='weighted', labels=labels_oxford_pets)
print(f"Accuracy: {clip_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# ---------------------------------------------------------------------------
# 2. Gradio demo: fine-tuned ViT vs. zero-shot CLIP
# ---------------------------------------------------------------------------

vit_classifier = pipeline("image-classification",
                          model="kuhs/vit-base-oxford-iiit-pets")
# Reuse the CLIP pipeline loaded above instead of downloading/loading the
# same "openai/clip-vit-large-patch14" checkpoint a second time.
clip_detector = detector


def classify_pet(image):
    """Classify *image* with both models and return their label->score dicts."""
    vit_output = {r['label']: r['score'] for r in vit_classifier(image)}
    clip_results = clip_detector(image, candidate_labels=labels_oxford_pets)
    clip_output = {r['label']: r['score'] for r in clip_results}
    return {"ViT Classification": vit_output,
            "CLIP Zero-Shot Classification": clip_output}


example_images = [
    ["example_images/dog1.jpeg"],
    ["example_images/dog2.jpeg"],
    ["example_images/leonberger.jpg"],
    ["example_images/snow_leopard.jpeg"],
    ["example_images/cat.jpg"],
]

iface = gr.Interface(
    fn=classify_pet,
    inputs=gr.Image(type="filepath"),
    outputs=gr.JSON(),
    title="Pet Classification Comparison",
    description="Upload an image of a pet, and compare results from a trained ViT model and a zero-shot CLIP model.",
    examples=example_images,
)
iface.launch()  # blocks until the Gradio app is closed

# ---------------------------------------------------------------------------
# 3. Transfer learning: fine-tune ViT on Oxford-IIIT Pets
# ---------------------------------------------------------------------------

# Login to the Hub so the fine-tuned model can be pushed at the end.
notebook_login()

labels = dataset['train'].unique('label')
print(len(labels), labels)


def show_samples(ds, rows, cols):
    """Plot a rows x cols grid of random samples from *ds* with their labels."""
    samples = ds.shuffle().select(np.arange(rows * cols))
    fig = plt.figure(figsize=(cols * 4, rows * 4))
    for i in range(rows * cols):
        img = Image.open(io.BytesIO(samples[i]['image']['bytes']))
        fig.add_subplot(rows, cols, i + 1)
        plt.imshow(img)
        plt.title(samples[i]['label'])
        plt.axis('off')


show_samples(dataset['train'], rows=3, cols=5)

# 80% train, then split the held-out 20% in half: 10% validation / 10% test.
split_dataset = dataset['train'].train_test_split(test_size=0.2)
eval_dataset = split_dataset['test'].train_test_split(test_size=0.5)
our_dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': eval_dataset['train'],
    'test': eval_dataset['test'],
})

label2id = {c: idx for idx, c in enumerate(labels)}
id2label = {idx: c for idx, c in enumerate(labels)}

processor = AutoImageProcessor.from_pretrained('google/vit-base-patch16-224')


def transforms(batch):
    """Decode image bytes, run the ViT processor, and attach integer labels."""
    batch['image'] = [Image.open(io.BytesIO(x['bytes'])).convert('RGB')
                      for x in batch['image']]
    inputs = processor(batch['image'], return_tensors='pt')
    inputs['labels'] = [label2id[y] for y in batch['label']]
    return inputs


processed_dataset = our_dataset.with_transform(transforms)


def collate_fn(batch):
    """Stack per-sample tensors into a single batch for the Trainer."""
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch]),
    }


# Named `accuracy_metric` (not `accuracy`) so it does not collide with the
# zero-shot accuracy computed in section 1.
accuracy_metric = evaluate.load('accuracy')


def compute_metrics(eval_preds):
    """Compute classification accuracy from (logits, labels) tuples."""
    logits, refs = eval_preds
    predictions = np.argmax(logits, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=refs)


model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # swap the 1000-class ImageNet head for ours
)

# Freeze the backbone; train only the classification head.
for name, p in model.named_parameters():
    if not name.startswith('classifier'):
        p.requires_grad = False

num_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{num_params = :,} | {trainable_params = :,}")

training_args = TrainingArguments(
    output_dir="./vit-base-oxford-iiit-pets",
    per_device_train_batch_size=16,
    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy`
    # in transformers >= 4.41 — update if upgrading.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    num_train_epochs=5,
    learning_rate=3e-4,
    save_total_limit=2,
    remove_unused_columns=False,  # keep 'image'/'label' for the transform
    push_to_hub=True,
    report_to='tensorboard',
    load_best_model_at_end=True,
    run_name="transferlearning-google_vit-base-patch16-224 no data augmentation",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    tokenizer=processor,  # saved alongside the model on push_to_hub
)
trainer.train()
trainer.evaluate(processed_dataset['test'])


def show_predictions(rows, cols):
    """Plot random test images annotated with true and predicted labels."""
    samples = our_dataset['test'].shuffle().select(np.arange(rows * cols))
    processed_samples = samples.with_transform(transforms)
    # Predicted class ids from the logits.
    predictions = trainer.predict(processed_samples).predictions.argmax(axis=1)
    fig = plt.figure(figsize=(cols * 4, rows * 4))
    for i in range(rows * cols):
        img = Image.open(io.BytesIO(samples[i]['image']['bytes']))
        caption = f"label: {samples[i]['label']}\npredicted: {id2label[predictions[i]]}"
        fig.add_subplot(rows, cols, i + 1)
        plt.imshow(img)
        plt.title(caption)
        plt.axis('off')


show_predictions(rows=5, cols=5)

# Model-card metadata for the Hub push.
kwargs = {
    "finetuned_from": model.config._name_or_path,
    "dataset": 'pcuenq/oxford-pets',
    "tasks": "image-classification",
    "tags": ['image-classification'],
}
trainer.save_model()
trainer.push_to_hub('🐕️🐈️', **kwargs)