Update README.md
Browse files
README.md
CHANGED
|
@@ -5,7 +5,7 @@ The following code loads and tests the models in a Colab notebook.
|
|
| 5 |
|
| 6 |
---
|
| 7 |
|
| 8 |
-
|
| 9 |
|
| 10 |
1. Import the required Python packages:
|
| 11 |
|
|
@@ -28,10 +28,11 @@ from huggingface_hub import login
|
|
| 28 |
login("Replace with the key")
|
| 29 |
```
|
| 30 |
|
| 31 |
-
# Define the preprocessing and dataset class
|
| 32 |
|
| 33 |
-
1. Run the following
|
| 34 |
|
|
|
|
| 35 |
class NewsDataset(Dataset):
|
| 36 |
def __init__(self, texts, labels, tokenizer, max_len=128):
|
| 37 |
self.texts = texts
|
|
@@ -75,33 +76,41 @@ def preprocess_text(text):
|
|
| 75 |
text = text.lower()
|
| 76 |
text = ' '.join(text.split())
|
| 77 |
return text
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
print("Loading model and tokenizer...")
|
| 83 |
-
REPO_NAME = "CIS5190GoGo/CustomModel"
|
| 84 |
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME)
|
| 85 |
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)
|
| 86 |
|
| 87 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 88 |
model.to(device)
|
| 89 |
print("Model and tokenizer loaded successfully!")
|
|
|
|
| 90 |
|
| 91 |
-
# Step
|
|
|
|
| 92 |
print("Loading test data...")
|
| 93 |
test_data_path = "/content/drive/MyDrive/5190_project/test_data_random_subset.csv" # Replace with your test set path
|
| 94 |
test_data = pd.read_csv(test_data_path)
|
| 95 |
-
|
| 96 |
-
# Preprocess test data
|
|
|
|
| 97 |
X_test = test_data['title'].apply(preprocess_text).values
|
| 98 |
y_test = test_data['labels'].values
|
|
|
|
| 99 |
|
| 100 |
-
# Step
|
|
|
|
| 101 |
test_dataset = NewsDataset(X_test, y_test, tokenizer)
|
| 102 |
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=2)
|
|
|
|
| 103 |
|
| 104 |
-
# Step
|
|
|
|
| 105 |
print("Evaluating the model...")
|
| 106 |
model.eval()
|
| 107 |
all_preds, all_labels = [], []
|
|
@@ -118,6 +127,6 @@ with torch.no_grad():
|
|
| 118 |
all_preds.extend(preds.cpu().numpy())
|
| 119 |
all_labels.extend(labels.cpu().numpy())
|
| 120 |
|
| 121 |
-
# Step 5: Calculate accuracy
|
| 122 |
accuracy = accuracy_score(all_labels, all_preds)
|
| 123 |
-
print(f"Test Accuracy: {accuracy:.4f}")
|
|
|
|
|
|
| 5 |
|
| 6 |
---
|
| 7 |
|
| 8 |
+
# Step 1: Prerequisites
|
| 9 |
|
| 10 |
1. Import the required Python packages:
|
| 11 |
|
|
|
|
| 28 |
login("Replace with the key")
|
| 29 |
```
|
| 30 |
|
| 31 |
+
# Step 2: Define the preprocessing and dataset class
|
| 32 |
|
| 33 |
+
1. Run the following class and functions, which preprocess the test data:
|
| 34 |
|
| 35 |
+
```python
|
| 36 |
class NewsDataset(Dataset):
|
| 37 |
def __init__(self, texts, labels, tokenizer, max_len=128):
|
| 38 |
self.texts = texts
|
|
|
|
| 76 |
text = text.lower()
|
| 77 |
text = ' '.join(text.split())
|
| 78 |
return text
|
| 79 |
+
```
|
| 80 |
|
| 81 |
|
| 82 |
+
# Step 3: Load the model and tokenizer from Hugging Face Hub
|
| 83 |
+
```python
|
| 84 |
print("Loading model and tokenizer...")
|
| 85 |
+
REPO_NAME = "CIS5190GoGo/CustomModel"  # Hugging Face Hub repository the model was pushed to
|
| 86 |
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME)
|
| 87 |
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)
|
| 88 |
|
| 89 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 90 |
model.to(device)
|
| 91 |
print("Model and tokenizer loaded successfully!")
|
| 92 |
+
```
|
| 93 |
|
| 94 |
+
# Step 4: Load test dataset
|
| 95 |
+
```python
|
| 96 |
print("Loading test data...")
|
| 97 |
test_data_path = "/content/drive/MyDrive/5190_project/test_data_random_subset.csv" # Replace with your test set path
|
| 98 |
test_data = pd.read_csv(test_data_path)
|
| 99 |
+
```
|
| 100 |
+
# Step 5: Preprocess test data
|
| 101 |
+
```python
|
| 102 |
X_test = test_data['title'].apply(preprocess_text).values
|
| 103 |
y_test = test_data['labels'].values
|
| 104 |
+
```
|
| 105 |
|
| 106 |
+
# Step 6: Prepare the dataset and dataloader
|
| 107 |
+
```python
|
| 108 |
test_dataset = NewsDataset(X_test, y_test, tokenizer)
|
| 109 |
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=2)
|
| 110 |
+
```
|
| 111 |
|
| 112 |
+
# Step 7: Evaluate the model and calculate accuracy
|
| 113 |
+
```python
|
| 114 |
print("Evaluating the model...")
|
| 115 |
model.eval()
|
| 116 |
all_preds, all_labels = [], []
|
|
|
|
| 127 |
all_preds.extend(preds.cpu().numpy())
|
| 128 |
all_labels.extend(labels.cpu().numpy())
|
| 129 |
|
|
|
|
| 130 |
accuracy = accuracy_score(all_labels, all_preds)
|
| 131 |
+
print(f"Test Accuracy: {accuracy:.4f}")
|
| 132 |
+
```
|