| | |
| | import numpy as np |
| | import torch |
| | from sklearn.datasets import fetch_20newsgroups |
| | from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix, |
| | ConfusionMatrixDisplay, classification_report) |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.pipeline import Pipeline |
| | from skops import card, hub_utils |
| | from skorch import NeuralNetClassifier |
| | from skorch.callbacks import LRScheduler, ProgressBar |
| | from skorch.hf import HuggingfacePretrainedTokenizer |
| | from torch import nn |
| | from torch.optim.lr_scheduler import LambdaLR |
| | from transformers import AutoModelForSequenceClassification |
| | from transformers import AutoTokenizer |
| | |
| | from pathlib import Path |
| | import transformers |
| | import skorch |
| | import sklearn |
| | import torch |
| |
|
| | |
| | |
# --- Pretrained checkpoint ---------------------------------------------------
# Hugging Face checkpoint names.  TOKENIZER is handed to the tokenizer step and
# PRETRAINED_MODEL to the classification module; both point at the same
# checkpoint here.
TOKENIZER = "distilbert-base-uncased"
PRETRAINED_MODEL = "distilbert-base-uncased"

# --- Training hyper-parameters -----------------------------------------------
OPTIMIZER = torch.optim.AdamW
# Backward-compatible alias: the constant was originally spelled ``OPTMIZER``
# and code elsewhere refers to it by that name.
OPTMIZER = OPTIMIZER
LR = 5e-5
MAX_EPOCHS = 3
CRITERION = nn.CrossEntropyLoss
BATCH_SIZE = 8

# --- Hardware ----------------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
| |
|
| | |
# Fetch the 20 newsgroups corpus.  No arguments are passed, so scikit-learn's
# defaults decide which subset is returned.
dataset = fetch_20newsgroups()

# Print the dataset description up to (not including) the first "Usage".
print(dataset.DESCR.partition("Usage")[0])

# Bare expression: shows the class names when run as a notebook cell.
dataset.target_names

X, y = dataset.data, dataset.target

# Stratified hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

# Step budget consumed by ``lr_schedule``: batches per epoch (integer division
# plus one, presumably for the final partial batch) times the epoch count.
num_training_steps = (len(X_train) // BATCH_SIZE + 1) * MAX_EPOCHS
| |
|
| | |
| | |
| |
|
def lr_schedule(current_step):
    """Return the learning-rate multiplier for ``current_step``.

    The multiplier decays linearly from 1.0 at step 0 towards 0.0 at
    ``num_training_steps`` (a module-level value read at call time).

    Raises
    ------
    AssertionError
        If the computed multiplier is not strictly positive, i.e. when
        ``current_step`` has reached or passed ``num_training_steps``.
    """
    steps_left = float(num_training_steps - current_step)
    # Guard the denominator so a zero-step budget cannot divide by zero.
    step_budget = float(max(1, num_training_steps))
    factor = steps_left / step_budget
    # NOTE: ``assert`` statements are skipped when Python runs with -O.
    assert factor > 0
    return factor
| |
|
class BertModule(nn.Module):
    """Thin ``nn.Module`` around a pretrained sequence-classification model.

    The wrapped model is loaded with
    ``AutoModelForSequenceClassification.from_pretrained`` and ``forward``
    returns only the ``logits`` attribute of its output.

    Parameters
    ----------
    name
        Checkpoint identifier passed to ``from_pretrained``.
    num_labels
        Forwarded to ``from_pretrained`` as ``num_labels``.
    """

    def __init__(self, name, num_labels):
        super().__init__()
        self.name = name
        self.num_labels = num_labels

        # Load the pretrained weights right away.
        self.reset_weights()

    def reset_weights(self):
        """(Re)load the pretrained model and store it as ``self.bert``."""
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            self.name,
            num_labels=self.num_labels,
        )

    def forward(self, **kwargs):
        """Call the wrapped model with ``kwargs`` and return its logits."""
        return self.bert(**kwargs).logits
| |
|
| | |
# Two-step scikit-learn pipeline: a pretrained tokenizer followed by a skorch
# classifier that wraps ``BertModule``.
pipeline = Pipeline(
    steps=[
        ("tokenizer", HuggingfacePretrainedTokenizer(TOKENIZER)),
        (
            "net",
            NeuralNetClassifier(
                BertModule,
                # ``module__``-prefixed arguments are the constructor
                # arguments of BertModule (``name`` and ``num_labels``).
                module__name=PRETRAINED_MODEL,
                module__num_labels=len(set(y_train)),
                criterion=CRITERION,
                optimizer=OPTMIZER,
                lr=LR,
                max_epochs=MAX_EPOCHS,
                batch_size=BATCH_SIZE,
                iterator_train__shuffle=True,
                device=DEVICE,
                callbacks=[
                    # Rescale the learning rate with ``lr_schedule`` after
                    # every batch rather than once per epoch.
                    LRScheduler(LambdaLR, lr_lambda=lr_schedule, step_every="batch"),
                    ProgressBar(),
                ],
            ),
        ),
    ]
)
| |
|
| | torch.manual_seed(0) |
| | torch.cuda.manual_seed(0) |
| | torch.cuda.manual_seed_all(0) |
| | np.random.seed(0) |
| |
|
| | |
| | %time pipeline.fit(X_train, y_train) |
| |
|
| | |
| | %%time |
| | with torch.inference_mode(): |
| | y_pred = pipeline.predict(X_test) |
| |
|
# Bare expression: shows the held-out accuracy when run as a notebook cell.
accuracy_score(y_true=y_test, y_pred=y_pred)
| |
|
| | |
import pickle

# Serialize the fitted pipeline to disk; this file is what gets uploaded.
with open("model.pkl", mode="wb") as f:
    pickle.dump(pipeline, f)

# Create the local repository folder for the Hub upload, recording the exact
# library versions used in this session as requirements.
local_repo = "model_repo"
hub_utils.init(
    model="model.pkl",
    requirements=[
        f"{package}={module.__version__}"
        for package, module in (
            ("scikit-learn", sklearn),
            ("transformers", transformers),
            ("torch", torch),
            ("skorch", skorch),
        )
    ],
    dst=local_repo,
    task="text-classification",
    data=X_test,
)
| |
|
| | |
# Create the model card, taking its metadata from the configuration stored in
# the local repository folder (``local_repo`` instead of a repeated literal).
model_card = card.Card(
    pipeline,
    metadata=card.metadata_from_config(Path(local_repo)),
)

# Free-text sections of the card.
model_description = (
    "This is a neural net classifier and distilbert model chained with sklearn Pipeline trained on 20 news groups dataset."
)
limitations = "This model is trained for a tutorial and is not ready to be used in production."
model_card.add(
    model_description=model_description,
    limitations=limitations,
)

# The evaluation data is the hold-out part of ``train_test_split`` applied to
# the data returned by ``fetch_20newsgroups()`` (default subset), not the
# dataset's own test subset, so the description says exactly that.
eval_descr = (
    "The model is evaluated on a held-out split of the 20 newsgroups training subset,"
    " using accuracy and F1-score with micro average."
)
model_card.add(eval_method=eval_descr)

# Metrics reported on the card, computed on the held-out predictions.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="micro")
model_card.add_metrics(**{"accuracy": accuracy, "f1 score": f1})
| |
|
| |
|
# Confusion matrix over the classes known to the fitted pipeline.
cm = confusion_matrix(y_test, y_pred, labels=pipeline.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=pipeline.classes_,
)
disp.plot()

# Save the figure inside the repository folder and reference it from the card
# by its file name.
disp.figure_.savefig(Path(local_repo) / "confusion_matrix.png")
model_card.add_plot(**{"Confusion matrix": "confusion_matrix.png"})
| |
|
import pandas as pd

# Per-class precision/recall/F1 as a dictionary, labelled with the newsgroup
# names.
clf_report = classification_report(
    y_test,
    y_pred,
    target_names=dataset.target_names,
    output_dict=True,
)

# Remove the "accuracy" entry before building the table.
clf_report.pop("accuracy")

# One row per report entry; the entry names become a regular column.
clf_report = pd.DataFrame(clf_report).transpose().reset_index()

# Add the table to the card as a collapsed (folded) section.
model_card.add_table(
    folded=True,
    **{"Classification Report": clf_report},
)
| |
|
| | |
# Write the model card into the repository folder as its README.
model_card.save(Path(local_repo) / "README.md")

# Add the training source to the repository.  ``__file__`` only exists when
# this code runs as a script; in a notebook kernel it is undefined, so the
# step is skipped there instead of aborting with NameError.
try:
    source_file = __file__
except NameError:
    print(
        "__file__ is not defined in this session (notebook kernel); "
        "add the notebook file to the repository manually if it should be uploaded."
    )
else:
    hub_utils.add_files(source_file, dst=local_repo)

# Authenticate against the Hugging Face Hub from within the notebook.
from huggingface_hub import notebook_login

notebook_login()

# Upload the local repository folder, creating the remote repository if it
# does not exist yet.
hub_utils.push(
    repo_id="scikit-learn/skorch-text-classification",
    source=local_repo,
    create_remote=True,
)