In [1]:
from datasets import load_dataset

# Load every available split of the dataset identified by the path/name "dataset".
# NOTE(review): presumably a local image-folder directory next to the notebook — confirm.
data = load_dataset("dataset")

  from .autonotebook import tqdm as notebook_tqdm
Resolving data files: 100%|██████████| 25/25 [00:00<00:00, 150874.24it/s]
Resolving data files: 100%|██████████| 26/26 [00:00<00:00, 154683.55it/s]


In [2]:
# Prerequisite (run once if Pillow is not installed): %pip install Pillow

In [3]:
# Peek at the first training record to check which fields each example carries.
data["train"][0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1024x2048>,
 'label': 0}

In [4]:
# Class names are read from the `names` attribute of the training split's "label" feature.
label_feature = data["train"].features["label"]
labels = label_feature.names
labels

['ai_gen', 'human']

In [5]:
# Two-way lookup between class names and their positions in `labels`.
# Ids are stored as strings in both directions.
id2label = {str(idx): name for idx, name in enumerate(labels)}
label2id = {name: idx for idx, name in id2label.items()}

In [6]:
# Display the class-name -> id mapping to verify it.
label2id

{'ai_gen': '0', 'human': '1'}

In [7]:
from transformers import AutoImageProcessor

# Load the preprocessing config from the same checkpoint that is fine-tuned
# later in this notebook, so the resize target and normalisation statistics
# match the pretrained weights instead of those of an unrelated ViT checkpoint.
# The processor is also handed to the Trainer, so it is what gets saved
# alongside the fine-tuned model.
# NOTE(review): assumes the "umm-maybe/AI-image-detector" repo ships a
# preprocessor config — confirm before re-running.
checkpoint = "umm-maybe/AI-image-detector"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [8]:
from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ToTensor

# Normalise with the per-channel mean/std stored on the image processor.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# The processor describes its target size either as a single "shortest_edge"
# value or as an explicit height/width pair; accept both layouts.
processor_size = image_processor.size
if "shortest_edge" in processor_size:
    size = processor_size["shortest_edge"]
else:
    size = (processor_size["height"], processor_size["width"])

# Pipeline: random crop resized to `size` -> tensor -> normalisation.
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [9]:
def transforms(examples):
    """Replace a batch's "image" entries with transformed "pixel_values".

    Each item under ``examples["image"]`` is converted to RGB and passed
    through the module-level ``_transforms`` pipeline. The results are stored
    under ``"pixel_values"`` and the ``"image"`` key is removed.

    Args:
        examples: Mapping with an ``"image"`` entry holding an iterable of
            objects exposing ``convert("RGB")``.

    Returns:
        The same ``examples`` object, modified in place.
    """
    pixel_values = []
    for image in examples["image"]:
        pixel_values.append(_transforms(image.convert("RGB")))
    examples["pixel_values"] = pixel_values
    # Drop the source images only after every one has been converted.
    del examples["image"]
    return examples

In [10]:
# Attach `transforms` to the dataset dict so every split is preprocessed with it.
# NOTE(review): `transforms` is built on RandomResizedCrop and is attached to
# every split here, so evaluation images are randomly cropped as well —
# consider a deterministic resize/crop pipeline for the non-training splits.
data = data.with_transform(transforms)

In [11]:
# Show the splits and their row counts after attaching the transform.
data

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 18000
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 20715
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 13354
    })
})

In [12]:
from datasets import DatasetDict, concatenate_datasets

# Fold the validation split into the training split; the test split is kept unchanged.
splits_to_merge = [data["train"], data["validation"]]
combined_train_validation = concatenate_datasets(splits_to_merge)

# Two-split layout (train + test) used for the rest of the notebook.
datasets = DatasetDict(
    {
        "train": combined_train_validation,
        "test": data["test"],
    }
)
datasets

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 38715
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 13354
    })
})

In [13]:
# From here on `data` refers to the merged train/test splits.
data = datasets

In [14]:
# Confirm that `data` now exposes only the "train" and "test" splits.
data

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 38715
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 13354
    })
})

In [15]:
from transformers import DefaultDataCollator

# Collator instance passed to the Trainer to assemble individual examples into batches.
data_collator = DefaultDataCollator()

In [16]:
from transformers import AutoModelForImageClassification, Trainer, TrainingArguments

# Label configuration for the classification head, derived from this dataset's classes.
label_config = {
    "num_labels": len(labels),
    "id2label": id2label,
    "label2id": label2id,
}

# Start from the pretrained detector checkpoint and apply the label configuration.
model = AutoModelForImageClassification.from_pretrained(
    "umm-maybe/AI-image-detector",
    **label_config,
)

  return self.fget.__get__(instance, owner)()


In [17]:
import evaluate
import numpy as np

# Accuracy metric object used by `compute_metrics` during evaluation.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Two-item iterable that unpacks into ``(predictions, labels)``.
            NOTE(review): assumes ``predictions`` holds per-class scores with
            classes along axis 1 — confirm against the Trainer output.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids
        compared against the reference labels.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class per row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [18]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="ai_detector_v2",
    # `transforms` reads the "image" column, so columns the model does not
    # consume directly must not be dropped.
    remove_unused_columns=False,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    # Batching: 16 samples x 4 accumulation steps per optimizer update on each device.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best accuracy when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=10,
    # push_to_hub=True,
)

# NOTE(review): the "test" split is used both for per-epoch evaluation and for
# best-checkpoint selection, so the reported accuracy is not measured on
# held-out data — consider keeping a separate validation split for selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    data_collator=data_collator,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0187,0.010242,0.997379
2,0.0079,0.008325,0.998053
3,0.0002,0.007271,0.998278


Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}


TrainOutput(global_step=1815, training_loss=0.025701069246692784, metrics={'train_runtime': 4592.2162, 'train_samples_per_second': 25.292, 'train_steps_per_second': 0.395, 'total_flos': 9.099444558417777e+18, 'train_loss': 0.025701069246692784, 'epoch': 3.0})