File size: 5,134 Bytes
edb51e6 b275ea2 edb51e6 4cfa7c5 edb51e6 94b758b 4cfa7c5 94b758b 4cfa7c5 94b758b 4cfa7c5 94b758b b275ea2 4cfa7c5 b275ea2 4cfa7c5 b275ea2 94b758b b275ea2 94b758b b275ea2 2c4eec9 b275ea2 94b758b b275ea2 94b758b b275ea2 94b758b b275ea2 94b758b 2c4eec9 94b758b b275ea2 94b758b b275ea2 94b758b b275ea2 94b758b b275ea2 94b758b b275ea2 94b758b b275ea2 2c4eec9 b275ea2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# Model Loading and Testing Instructions
This document provides step-by-step instructions on how to load our model from the Hugging Face Hub and evaluate it on a test dataset.
The following code loads and tests the model in a Colab notebook.
---
# Step 1: Prerequisites
1. Import the required Python packages:
```python
from huggingface_hub import login
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
from sklearn.metrics import accuracy_score
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import login
```
2. Log in with the shared account's access key (see our private Ed post and the email sent to the TAs, thanks!):
```python
# Authenticate this session with the Hugging Face Hub.
# Replace the placeholder string below with the access key before running.
login("Replace with the key")
```
# Step 2: Define the preprocessing and dataset class
1. Run the following class and function, which wrap the test data in a dataset and preprocess its text:
```python
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of class labels aligned with ``texts``;
            each label is converted to a ``torch.long`` tensor.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")``. It must return a mapping
            holding ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return model-ready tensors.

        Returns:
            dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # With return_tensors="pt" a single text comes back with a leading
        # batch axis of size 1. squeeze(0) removes only that axis; a bare
        # squeeze() would also collapse the sequence axis when max_len == 1
        # and break batching in the DataLoader.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
# Substring rewrites applied in this order by plain str.replace.
_CONTRACTIONS = {
    "n't": " not",
    "'s": " is",
    "'ll": " will",
    "'ve": " have",
}
# A dollar amount with an optional magnitude word, e.g. "$5.5 billion".
_MONEY_RE = re.compile(
    r'\$\d+\.?\d*\s*(million|billion|trillion)?', flags=re.IGNORECASE
)
# Anything starting with "http" up to the next whitespace.
_URL_RE = re.compile(r'http\S+')


def preprocess_text(text):
    """Clean and preprocess text.

    The input is converted with ``str()`` and then, in order: contraction
    suffixes are expanded, dollar amounts are reduced to ``"$ "`` plus the
    magnitude word (if any), URLs are removed, hyphens become spaces, the
    text is lower-cased, and runs of whitespace are collapsed.

    Args:
        text: Value to clean; any object is accepted.

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text)
    for contraction, expansion in _CONTRACTIONS.items():
        text = text.replace(contraction, expansion)
    # NOTE(review): the previous patterns used `\\d` / `\\S` inside raw
    # strings, which match a literal backslash followed by "d"/"S", so
    # amounts and URLs were never rewritten. Confirm the published model was
    # trained with the intended (fixed) cleaning.
    text = _MONEY_RE.sub(r'$ \1', text)  # unmatched group 1 expands to ""
    text = _URL_RE.sub('', text)
    text = text.replace('-', ' ')
    text = text.lower()
    # split()/join collapses whitespace runs and trims both ends.
    return ' '.join(text.split())
```
# Step 3: Load the model and tokenizer from Hugging Face Hub
This step loads the pre-trained model and tokenizer, which are hosted on the Hugging Face Hub.
```python
print("Loading model and tokenizer...")
# Hub repository the model was pushed to.
REPO_NAME = "CIS5190GoGo/CustomModel"
# Load the classification model and its tokenizer from the same repository.
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME)
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model and tokenizer loaded successfully!")
```
# Step 4: Load test dataset
```python
print("Loading test data...")
# Placeholder: replace this string with the path to your test-set CSV file.
test_data_path = "Replace wit your test set path"
# Read the CSV; later steps access its 'title' and 'labels' columns.
test_data = pd.read_csv(test_data_path)
```
# Step 5: Preprocess test data
```python
# Clean every entry of the 'title' column with preprocess_text.
X_test = test_data['title'].apply(preprocess_text).values
# Labels are taken from the 'labels' column unchanged.
# NOTE(review): NewsDataset converts each label to torch.long, so these are
# presumably integer class ids; confirm against your CSV.
y_test = test_data['labels'].values
```
# Step 6: Prepare the dataset and dataloader
```python
# Wrap the cleaned texts and labels; max_len is left at NewsDataset's default (128).
test_dataset = NewsDataset(X_test, y_test, tokenizer)
# Batches of 16 loaded by 2 worker processes; shuffle is not passed, so the
# DataLoader default applies.
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=2)
```
# Step 7: Evaluate the model and calculate accuracy
```python
print("Evaluating the model...")
model.eval()  # switch the model to evaluation mode

all_preds, all_labels = [], []
with torch.no_grad():  # gradients are not needed for evaluation
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit per example.
        preds = torch.argmax(outputs.logits, dim=-1)

        all_preds.extend(preds.cpu().tolist())
        # Labels are only compared on the host, so they are read straight
        # from the batch instead of being moved to the device and back.
        all_labels.extend(batch["labels"].tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")
```
# Expected output:
```text
Loading model and tokenizer...
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
config.json: 100%
 735/735 [00:00<00:00, 40.8kB/s]
model.safetensors: 100%
 499M/499M [00:11<00:00, 42.7MB/s]
tokenizer_config.json: 100%
 1.19k/1.19k [00:00<00:00, 69.8kB/s]
vocab.json: 100%
 999k/999k [00:00<00:00, 4.09MB/s]
merges.txt: 100%
 456k/456k [00:00<00:00, 2.61MB/s]
special_tokens_map.json: 100%
 958/958 [00:00<00:00, 57.4kB/s]
Model and tokenizer loaded successfully!
Loading test data...
Evaluating the model...
Test Accuracy: 0.8500
``` |