---
pinned: true
sdk: static
---
## Evaluation Pipeline
Use `eval_pipeline.py`, or run the script below directly, to evaluate the model. Set `data_path` and `model_path` before running.
```python
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report

data_path = ""   # dataset repo ID or local directory containing the CSV files
model_path = ""  # fine-tuned model repo ID or local checkpoint directory

data_files = {"train": "train_data.csv", "validation": "val_data.csv", "test": "test_data.csv"}
dataset_train = load_dataset(data_path, data_files=data_files, split="train")
dataset_val = load_dataset(data_path, data_files=data_files, split="validation")  # loaded for completeness; not used below
dataset_test = load_dataset(data_path, data_files=data_files, split="test")

train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=16)

class CustomModel:
    def __init__(self, model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4, max_len=128):
        """
        Initialize the custom model with tokenizer, optimizer, scheduler, and training parameters.

        Args:
            model_name (str): Name of the pretrained BERT model.
            num_labels (int): Number of labels for the classification task.
            lr (float): Learning rate for the optimizer.
            epochs (int): Number of epochs for training.
            max_len (int): Maximum token length for sequences.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.epochs = epochs
        self.max_len = max_len

        # Load tokenizer and model
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        # Define optimizer
        self.optimizer = AdamW(self.model.parameters(), lr=lr)

        # Scheduler placeholder; created in setup_scheduler() once the loader size is known
        self.scheduler = None

        # Device setup
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def setup_scheduler(self, train_loader):
        """
        Set up a linear learning rate scheduler based on the training data.

        Args:
            train_loader (DataLoader): Training data loader.
        """
        num_training_steps = len(train_loader) * self.epochs
        self.scheduler = get_scheduler(
            "linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

    def tokenize_batch(self, texts):
        """
        Tokenize a batch of text inputs.

        Args:
            texts (list[str]): List of text strings to tokenize.

        Returns:
            dict: Tokenized inputs with input IDs and attention masks.
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

    def train(self, train_loader):
        """
        Train the model with raw text inputs and labels.
        Call setup_scheduler() before this method so self.scheduler is initialized.

        Args:
            train_loader (DataLoader): Training data loader containing text and labels.
        """
        self.model.train()
        for epoch in range(self.epochs):
            epoch_loss = 0
            for batch in train_loader:
                # Each batch is a dict with a "title" text column and a "labels" column
                texts, labels = batch["title"], batch["labels"]
                labels = labels.to(self.device)

                # Tokenize the batch and move tensors to the target device
                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
                tokenized_inputs["labels"] = labels

                # Forward pass and optimization
                outputs = self.model(**tokenized_inputs)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                epoch_loss += loss.item()
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

    def evaluate(self, test_loader):
        """
        Evaluate the model with raw text inputs and labels.

        Args:
            test_loader (DataLoader): Test data loader containing text and labels.

        Returns:
            tuple: True labels and predicted labels.
        """
        self.model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for batch in test_loader:
                # Each batch is a dict with a "title" text column and a "labels" column
                texts, labels = batch["title"], batch["labels"]
                labels = labels.to(self.device)

                # Tokenize the batch and move tensors to the target device
                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}

                # Forward pass; take the argmax over the logits as the predicted class
                outputs = self.model(**tokenized_inputs)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                y_true.extend(labels.tolist())
                y_pred.extend(predictions.tolist())
        return y_true, y_pred

    def save_model(self, save_path):
        """
        Save the model locally in Hugging Face format.

        Args:
            save_path (str): Path to save the model.
        """
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

    def push_model(self, repo_name):
        """
        Push the model and tokenizer to the Hugging Face Hub.

        Args:
            repo_name (str): Repository name on the Hugging Face Hub.
        """
        self.model.push_to_hub(repo_name)
        self.tokenizer.push_to_hub(repo_name)

# Load the fine-tuned model and evaluate it on the test split
custom_model = CustomModel(model_name=model_path, num_labels=2, lr=5e-5, epochs=4)
y_true, y_pred = custom_model.evaluate(test_loader)
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_true, y_pred))
```
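
The script above only runs evaluation. If you also want to fine-tune before evaluating, the class already provides the pieces; here is a minimal sketch, assuming the same column names, where `./bert-finetuned` and `your-username/your-repo` are hypothetical placeholders:

```python
# Minimal fine-tuning sketch using the CustomModel class above.
custom_model = CustomModel(model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4)
custom_model.setup_scheduler(train_loader)  # must run before train(): it creates self.scheduler
custom_model.train(train_loader)
custom_model.save_model("./bert-finetuned")  # hypothetical local output path
# custom_model.push_model("your-username/your-repo")  # optional: publish to the Hub (hypothetical repo)
```

Evaluation then proceeds exactly as in the script, via `custom_model.evaluate(test_loader)`.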