# File size: 4,905 Bytes

# fcbc617

import os 
import sklearn
import xgboost

import dill as pickle
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score, roc_auc_score, RocCurveDisplay, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay

from skops import hub_utils
from skops import card

# Load the Suicide Detection dataset, reading only the text and label columns.
# Available at https://www.kaggle.com/datasets/nikhileswarkomati/suicide-watch
df = pd.read_csv(
    "Suicide_Detection.csv",
    usecols=["text", "class"],
    dtype={"text": str, "class": str},
)

# Features are the raw posts; the target is 1 for the "suicide" class and 0
# for every other label.
X = df["text"].to_list()
y = [1 if label == "suicide" else 0 for label in df["class"]]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


def preprocessor(s):
    """Strip English stop words from a document before TF-IDF tokenisation.

    The document is split on whitespace and every word whose lower-cased form
    appears in scikit-learn's ``ENGLISH_STOP_WORDS`` is dropped. Surviving
    words keep their original casing and are re-joined with single spaces.
    Words with punctuation attached (e.g. ``"the,"``) are compared as-is.

    Args:
        s: The raw document. ``None`` is treated as an empty document; any
            other value is converted with ``str()`` before splitting.

    Returns:
        The document with stop words removed, or ``""`` when ``s`` is ``None``.
    """
    # Imported inside the function rather than at module level -- presumably
    # so the function stays self-contained when the pipeline is serialised
    # with dill and loaded elsewhere (TODO confirm).
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    if s is None:
        return ""

    # ENGLISH_STOP_WORDS is already a frozenset, so membership tests are cheap
    # and there is no need to copy it into a new set for every document.
    kept_words = (
        word for word in str(s).split() if word.lower() not in ENGLISH_STOP_WORDS
    )
    return " ".join(kept_words)

# Build the classification pipeline: TF-IDF features over 1- to 3-grams,
# keeping only terms that occur in at least 100 documents, followed by an
# XGBoost classifier with its default hyperparameters.
# NOTE(review): stop words are matched case-insensitively in `preprocessor`,
# but the text it returns keeps its original casing. Supplying a custom
# preprocessor appears to replace the vectorizer's own lower-casing step, so
# "Help" and "help" may end up as separate features -- confirm this is intended.
model = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                preprocessor=preprocessor,
                ngram_range=(1, 3),
                min_df=100,
            ),
        ),
        ("classifier", xgboost.XGBClassifier()),
    ],
    verbose=True,
)

# Train the whole pipeline on the training split.
model.fit(X_train, y_train)

# Score the held-out split: hard labels, plus the second column of the
# probability output (class 1, i.e. "suicide") for the ROC curve.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Serialise the fitted pipeline to disk with dill (imported as `pickle`).
model_filename = "model.pkl"
with open(model_filename, "wb") as model_file:
    pickle.dump(model, model_file)

# Local directory that holds the files destined for the Hugging Face repo.
local_repo = Path("suicide-detector")

# Initialise the Hugging Face repository contents from the saved model,
# pinning the library versions used for training and passing the held-out
# texts as the example data.
hub_utils.init(
    model=model_filename,
    requirements=[
        f"scikit-learn={sklearn.__version__}",
        f"xgboost={xgboost.__version__}",
    ],
    dst=str(local_repo),
    task="text-classification",
    data=X_test,
)

# Create the model card for the fitted pipeline, taking its metadata from the
# config found in the local repository directory.
model_card = card.Card(model, metadata=card.metadata_from_config(local_repo))

# Declare the license in the card metadata.
model_card.metadata.license = "mit"

# Free-text description of the model shown on the model page.
model_description = """
Suicide Detection text classification model.

Trained on the Suicide and Depression Detection dataset (https://www.kaggle.com/datasets/nikhileswarkomati/suicide-watch)

The model vectorises each text using a trained tfidf vectorizer and then classifies using xgboost.
"""

# Usage snippet shown to readers of the model page. It is stored as text and
# is not executed by this script.
# NOTE(review): the snippet unpickles a downloaded file; unpickling can run
# arbitrary code, so it should only be used with a trusted repository.
get_started_code = """
import sklearn 
import dill as pickle

from skops import hub_utils
from pathlib import Path

suicide_detector_repo = Path("./suicide-detector")

hub_utils.download(
    repo_id="AndyJamesTurner/suicideDetector",
    dst=suicide_detector_repo
)

with open(suicide_detector_repo / "model.pkl", 'rb') as file:
    clf = pickle.load(file)

classification = clf.predict(["I want to kill myself"])[0]
"""

# Add further sections to the card: the getting-started guide, the author
# and the model description.
model_card.add(
    get_started_code=get_started_code,
    model_card_authors="Andy Turner",
    model_description=model_description,
)

# Describe how the model was evaluated.
eval_descr = (
    "The model was evaluated on a 0.3 holdout split using f1 score, accuracy, confusion matrix and ROC curves."
)
model_card.add(eval_method=eval_descr)

# Compute the headline metrics on the held-out split and add them to the
# Hugging Face model card.
accuracy = accuracy_score(y_test, y_pred)
# Report F1 for the positive class (label 1, "suicide"). Micro-averaging over
# both classes of a single-label binary problem always equals accuracy, which
# would make this metric a duplicate of the one above.
f1 = f1_score(y_test, y_pred, average="binary")
model_card.add_metrics(**{"accuracy": accuracy, "f1 score": f1})

# Confusion matrix: the image is written into the local repo and referenced
# from the card by its file name.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
disp.figure_.savefig(local_repo / "confusion_matrix.png")
model_card.add_plot(**{"Confusion matrix": "confusion_matrix.png"})

# ROC curve, built from the predicted probability of the positive class.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
disp = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
disp.plot()
disp.figure_.savefig(local_repo / "roc_curve.png")
model_card.add_plot(**{"ROC Curve": "roc_curve.png"})

# Per-class precision / recall / F1 report as a dictionary.
clf_report = classification_report(
    y_test,
    y_pred,
    output_dict=True,
    target_names=["not suicide", "suicide"],
)

# The "accuracy" entry is a single number rather than a per-class row, so it
# is dropped before the report is turned into a table (one row per entry).
del clf_report["accuracy"]
clf_report = pd.DataFrame(clf_report).T.reset_index()
model_card.add_table(
    folded=True,
    **{
        "Classification Report": clf_report,
    },
)

# Render the model card as the repository README.
readme_path = local_repo / "README.md"
model_card.save(readme_path)

# Copy this script into the repository to document how the model was built.
script_path = os.path.realpath(__file__)
hub_utils.add_files(script_path, dst=str(local_repo))