In [None]:
!pip install transformers
!pip install datasets
!pip install numpy
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-

In [None]:
import getpass
import os
import random

import numpy as np
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [None]:
def tokenize(batch):
    """Tokenize the ``'text'`` column of a batch for use with ``Dataset.map(batched=True)``.

    Every text is truncated or padded to exactly 256 tokens so all rows have
    the same length.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw strings of the batch.

    Returns:
        The tokenizer's encoding for the batch.

    Note:
        Relies on the module-level ``tokenizer``, which must be created before
        this function is first called.
    """
    # No `return_tensors='pt'`: the result is written back into the dataset's
    # columns, so building torch tensors here is only an extra conversion.
    return tokenizer(
        batch['text'],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

def compute_metrics(eval_pred):
    """Compute classification accuracy from raw model outputs.

    Self-contained: accuracy is computed directly with NumPy, so the function
    does not depend on any metric object existing in the notebook's global
    state at call time.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has one score per
            class along its last axis; ``labels`` holds the true class ids.

    Returns:
        ``{'accuracy': float}`` - the fraction of rows whose highest-scoring
        class equals the label.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predictions = np.argmax(logits, axis=-1)
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

def random_swap(sentence, n=5):
    """Augment a token list by exchanging randomly chosen pairs of positions.

    The list is modified **in place** and the same object is returned.
    Sequences with fewer than two items are returned untouched.

    Args:
        sentence: Mutable sequence of tokens to perturb.
        n: Requested number of swaps; capped at ``len(sentence) - 1``.

    Returns:
        The (mutated) ``sentence`` object.
    """
    size = len(sentence)
    if size > 1:
        # Cap the number of swaps relative to the sequence length.
        swaps = min(n, size - 1)
        positions = range(size)
        for _ in range(swaps):
            # Two distinct indices, drawn without replacement.
            i, j = random.sample(positions, 2)
            sentence[i], sentence[j] = sentence[j], sentence[i]
    return sentence

# --- Configuration: every value a reader might want to tune -----------------
SEED = 42                  # shared by shuffling, splitting, augmentation and training
SAMPLES_PER_CLASS = 5000   # rows drawn for each class (human-written / AI-generated)
TEST_FRACTION = 0.2        # share of the examples held out for evaluation
MODEL_NAME = "bert-base-uncased"

# Seed the random number generators up front so that Restart & Run All
# reproduces the same augmentation swaps, model initialisation and training order.
set_seed(SEED)

# Accuracy metric object, kept at module level for any code that refers to `metric`.
# `trust_remote_code=True` is passed explicitly, as the library's own warning requests.
metric = load_metric("accuracy", trust_remote_code=True)

# Load the dataset
dataset = load_dataset("dmitva/human_ai_generated_text", trust_remote_code=True)

# Cut the table down to the rows we need *before* reading the text columns;
# indexing a full column first would materialise every row of the split.
n_rows = min(SAMPLES_PER_CLASS, dataset['train'].num_rows)
subset = dataset['train'].select(range(n_rows))

# Combine the 'human_text' and 'ai_text' columns into one (label 0 = human, 1 = AI)
human_text = Dataset.from_dict({'text': subset['human_text'], 'labels': [0] * n_rows})
ai_text = Dataset.from_dict({'text': subset['ai_text'], 'labels': [1] * n_rows})
combined_dataset = concatenate_datasets([human_text, ai_text])

# Shuffle the data
shuffled_dataset = combined_dataset.shuffle(seed=SEED)

# Split into a training set and a validation set BEFORE augmenting, so the
# held-out examples stay unmodified and the reported accuracy reflects real text.
raw_splits = shuffled_dataset.train_test_split(test_size=TEST_FRACTION, seed=SEED)

# Apply data augmentation (training split only).
# `x['text']` is a single string: split it on whitespace into words, swap a few
# word positions, and join the words back together. (Calling ' '.join on the
# string itself would insert a space between every character.)
raw_splits['train'] = raw_splits['train'].map(
    lambda x: {'text': ' '.join(random_swap(x['text'].split())), 'labels': x['labels']}
)

# Tokenize the 'text' column of both splits
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizedDataset = raw_splits.map(tokenize, batched=True)

# Model: bert-base-uncased with a 2-class classification head
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Adjust the dropout rate
model.dropout.p = 0.3

BertTraining_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=3,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    weight_decay=0.01,
    seed=SEED,
)

# Creation of Bert Trainer Object
trainer = Trainer(
    model=model,
    args=BertTraining_args,
    train_dataset=tokenizedDataset['train'],
    eval_dataset=tokenizedDataset['test'],
    compute_metrics=compute_metrics,
    # Stop as soon as one evaluation fails to improve by at least the threshold.
    callbacks=[EarlyStoppingCallback(early_stopping_threshold=0.01, early_stopping_patience=1)],
)

# Fine-tune the Model
trainer.train()

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.93G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy
500,0.5274,0.418412,0.778
1000,0.3636,0.471251,0.8105


TrainOutput(global_step=1000, training_loss=0.4455198059082031, metrics={'train_runtime': 205.741, 'train_samples_per_second': 116.652, 'train_steps_per_second': 7.291, 'total_flos': 2104888442880000.0, 'train_loss': 0.4455198059082031, 'epoch': 2.0})

In [None]:
# Predictions on the held-out split
predictions = trainer.predict(tokenizedDataset['test'])
classPredictions = np.argmax(predictions.predictions, axis=-1)
evalMetrics = compute_metrics((predictions.predictions, predictions.label_ids))
print(evalMetrics)

# Never store credentials in the notebook: read the Hugging Face token from the
# HF_TOKEN environment variable, and fall back to an interactive prompt whose
# input is not echoed (and therefore not saved in the cell output).
apiToken = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")

# Single definition of the target repository so model and tokenizer cannot diverge
HUB_REPO_ID = "SkwarczynskiP/bert-base-uncased-finetuned-dmitva-AI-and-human-generated"

# Push the fine-tuned model to Huggingface
model.push_to_hub(HUB_REPO_ID, token=apiToken)

# Push the tokenizer to Huggingface
tokenizer.push_to_hub(HUB_REPO_ID, token=apiToken)

{'accuracy': 0.778}


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SkwarczynskiP/bert-base-uncased-finetuned-dmitva-AI-and-human-generated/commit/9cdee964ce84d6bdd25f980d895dfb8df16d37bd', commit_message='Upload tokenizer', commit_description='', oid='9cdee964ce84d6bdd25f980d895dfb8df16d37bd', pr_url=None, pr_revision=None, pr_num=None)