{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from datasets.dataset_dict import DatasetDict\n",
"from datasets import Dataset, concatenate_datasets\n",
"import evaluate\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from datetime import datetime\n",
"from sklearn.model_selection import train_test_split \n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a6e35182bde84894bec8b46e06cc226d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='
= 0.5 else 0 for label in labels]\n",
"\n",
" \n",
" output_dict = dict()\n",
" output_dict[\"pred_labels\"] = pred_labels\n",
" output_dict[\"actual_labels\"] = actual_labels\n",
" output_dict[\"passage\"] = text['passage']\n",
" output_dict[\"ID\"] = text['ID']\n",
"\n",
"\n",
" # score[0][(\"actual_label\", 'passage')] = text['passage'], text['label']\n",
" dataOutput.append(output_dict)\n",
" return dataOutput\n",
"\n",
"# Get F1 scores\n",
"def score(dataOutput, labels):\n",
" from sklearn.metrics import f1_score, accuracy_score\n",
"\n",
" df_score = pd.DataFrame(index=['NLP'], columns= [label+\"_F1\" for label in labels] + [\"Micro_F1\", \"Macro_F1\"])\n",
" actual_labels = [x['actual_labels'] for x in dataOutput]\n",
" pred_labels = [x['pred_labels'] for x in dataOutput]\n",
" for index, label in enumerate(labels):\n",
" f1 = round(f1_score(y_true=np.array(actual_labels)[:,index], y_pred=np.array(pred_labels)[:,index]),3)\n",
" df_score.at['NLP', label+\"_F1\"] = f1\n",
" # print(f\"{label}: {(6 - len(label)) *' '}{f1}\")\n",
"\n",
" # print(\"\\n\")\n",
"\n",
" f1_micro = round(f1_score(y_true=actual_labels, y_pred=pred_labels, average='micro'),3)\n",
" f1_macro = round(f1_score(y_true=actual_labels, y_pred=pred_labels, average='macro'),3)\n",
" df_score.at['NLP', \"Micro_F1\"] = f1_micro\n",
" df_score.at['NLP', \"Macro_F1\"] = f1_macro\n",
" return df_score\n",
"def cor_score(dataOutput, labels):\n",
" from sklearn.metrics import matthews_corrcoef\n",
" df_score = pd.DataFrame(index=['NLP'], columns= [label+\"_Cor\" for label in labels])\n",
" actual_labels = [x['actual_labels'] for x in dataOutput]\n",
" pred_labels = [x['pred_labels'] for x in dataOutput]\n",
" for index, label in enumerate(labels):\n",
" corrcoef = round(matthews_corrcoef(y_true=np.array(actual_labels)[:,index], y_pred=np.array(pred_labels)[:,index]),3)\n",
" df_score.at['NLP', label+\"_Cor\"] = corrcoef \n",
" return df_score\n",
" # print(f'F1 score (micro) {f1_micro}\\nF1 score (macro) {f1_macro}')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load datset "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['ID', 'passage', 'EVENT', 'EVENT_Illness', 'EVENT_Accident', 'EVENT_Other', 'CAUSE', 'CAUSE_Just_Happens', 'CAUSE_Material_Physical', 'CAUSE_Spirits_Gods', 'CAUSE_Witchcraft_Sorcery', 'CAUSE_Rule_Violation_Taboo', 'CAUSE_Other', 'ACTION', 'ACTION_Physical_Material', 'ACTION_Technical_Specialist', 'ACTION_Divination', 'ACTION_Shaman_Medium_Healer', 'ACTION_Priest_High_Religion', 'ACTION_Other'],\n",
" num_rows: 2074\n",
"})"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import json\n",
"loc = \"\"\n",
"# loc = \"../HRAF_MultiLabel_ThreeLargeClasses/\" #load old threemain class (comment this out unless you specifically are using it)\n",
"\n",
"# dataset = load_dataset('csv', data_files={'train': 'train.txt', 'validation': 'val.txt', 'test': 'test.txt'}, sep=\";\", \n",
"# names=[\"text\", \"label\"])\n",
"\n",
"\n",
"f = open(loc+\"Datasets/test_dataset.json\")\n",
"# f = open(\"../HRAF_MultiLabel_ThreeLargeClasses/Datasets/test_dataset.json\") #load old threemain class (comment this out unless you specifically are using it)\n",
"data = json.load(f)\n",
"f.close()\n",
"Hraf = Dataset.from_dict(data)\n",
"Hraf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define Kwargs and Labels"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['EVENT',\n",
" 'EVENT_Illness',\n",
" 'EVENT_Accident',\n",
" 'EVENT_Other',\n",
" 'CAUSE',\n",
" 'CAUSE_Just_Happens',\n",
" 'CAUSE_Material_Physical',\n",
" 'CAUSE_Spirits_Gods',\n",
" 'CAUSE_Witchcraft_Sorcery',\n",
" 'CAUSE_Rule_Violation_Taboo',\n",
" 'CAUSE_Other',\n",
" 'ACTION',\n",
" 'ACTION_Physical_Material',\n",
" 'ACTION_Technical_Specialist',\n",
" 'ACTION_Divination',\n",
" 'ACTION_Shaman_Medium_Healer',\n",
" 'ACTION_Priest_High_Religion',\n",
" 'ACTION_Other']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Define tokenizer kwargs\n",
"tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512}\n",
"\n",
"classifier_kwargs = {'top_k':None, 'device':0} #Set device -1 for CPU, 0 or higher for GPU\n",
"\n",
"# get label names\n",
"labels = [label for label in Hraf.features.keys() if label not in ['ID', 'passage']]\n",
"labels"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Single Model Inference"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run this or the other model, not both"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<>:4: SyntaxWarning: invalid escape sequence '\\H'\n",
"<>:4: SyntaxWarning: invalid escape sequence '\\H'\n",
"C:\\Users\\Ericc\\AppData\\Local\\Temp\\ipykernel_12652\\2739061250.py:4: SyntaxWarning: invalid escape sequence '\\H'\n",
" model = \"Model_0_TEST_DELETE\\Hierarchy_test_fold_1\"\n",
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at d:\\Documents\\Dropbox\\MEM-DEV-LAB-Current\\2023-eHRAF-Misf\\HRAF-Misf-NaturalLanguageProcessing\\HRAF_NLP\\HRAF_MultiLabel_Hierarchical\\Model_0_TEST_DELETE\\Hierarchy_test_fold_1\\checkpoint-260 and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"data": {
"text/plain": [
"[[{'label': 'CAUSE_Spirits_Gods', 'score': 0.06896580755710602},\n",
" {'label': 'ACTION_Other', 'score': 0.06823690235614777},\n",
" {'label': 'CAUSE_Just_Happens', 'score': 0.06278162449598312},\n",
" {'label': 'ACTION', 'score': 0.060430508106946945},\n",
" {'label': 'ACTION_Divination', 'score': 0.05936731770634651},\n",
" {'label': 'CAUSE_Material_Physical', 'score': 0.05765993520617485},\n",
" {'label': 'EVENT_Illness', 'score': 0.05764380469918251},\n",
" {'label': 'CAUSE_Witchcraft_Sorcery', 'score': 0.05746110901236534},\n",
" {'label': 'CAUSE_Other', 'score': 0.05521957203745842},\n",
" {'label': 'ACTION_Priest_High_Religion', 'score': 0.053678661584854126},\n",
" {'label': 'ACTION_Technical_Specialist', 'score': 0.05367414280772209},\n",
" {'label': 'CAUSE', 'score': 0.05352191999554634},\n",
" {'label': 'EVENT', 'score': 0.05282969027757645},\n",
" {'label': 'EVENT_Other', 'score': 0.04889928549528122},\n",
" {'label': 'ACTION_Physical_Material', 'score': 0.0487285777926445},\n",
" {'label': 'CAUSE_Rule_Violation_Taboo', 'score': 0.0484745018184185},\n",
" {'label': 'ACTION_Shaman_Medium_Healer', 'score': 0.047018107026815414},\n",
" {'label': 'EVENT_Accident', 'score': 0.04540855437517166}]]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from transformers import pipeline, AutoTokenizer\n",
"\n",
"# CHANGE Model name\n",
"model = \"Model_0_TEST_DELETE/Hierarchy_test_fold_1\"  # forward slash avoids the invalid '\\H' escape warning\n",
"checkpoint_path = \"checkpoint-260\"\n",
"# set up the pipeline from local\n",
"import os\n",
"path =os.path.abspath(f\"{model}/{checkpoint_path}\")\n",
"classifier = pipeline(\"text-classification\", model=path, **classifier_kwargs)\n",
"\n",
"\n",
"# sample inference ENTER TEXT IN HERE.\n",
"text = '''\n",
"“Drinking-tubes made of the leg-bones of swans (Fig. 109) are 190 also used chiefly as a measure of precaution against diseases subject to shunning.....”\n",
"'''\n",
"# reveal sample classification\n",
"prediction = classifier(text, **tokenizer_kwargs)\n",
"prediction\n",
"\n",
"# # Demo other models (COMMENT THIS OUT UNLESS YOU REALLY WANT TO DEMO THIS)\n",
"# # Set up path from online hub (note, this is analogous but different model and is here because this is a demo)\n",
"# classifier = pipeline(\"text-classification\", top_k=None, model=\"Chantland/Hraf_MultiLabel\", use_auth_token=os.environ.get(\"HF_TOKEN\"), tokenizer=AutoTokenizer.from_pretrained(\"distilbert-base-uncased\"))  # SECURITY: never hardcode a Hugging Face token here; read it from the environment\n",
"# model = \"MultiLabel_ThreeLargeClasses\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict The Dataset"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"2074 passages Predicted\n"
]
}
],
"source": [
"# Predict dataset (may take about .25 seconds per passage when tested on lab mac, could differ depending on your system)\n",
"# Also note that this pipeline is sequential and may give a warning saying it is unoptimized. Currently, using a whole dataset does not seem to reap faster results so we are remaining with sequential\n",
"HrafOutput = predictor(Hraf, labels=labels, tokenizer_kwargs=tokenizer_kwargs, classifier=classifier)\n",
"print(len(HrafOutput), \"passages Predicted\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# HrafOutput = classifier(Hraf['passage'],**tokenizer_kwargs)\n",
"# print(len(HrafOutput), \"passages Predicted\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate \"Correctness\" Metrics"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" EVENT_F1 | \n",
" EVENT_Illness_F1 | \n",
" EVENT_Accident_F1 | \n",
" EVENT_Other_F1 | \n",
" CAUSE_F1 | \n",
" CAUSE_Just_Happens_F1 | \n",
" CAUSE_Material_Physical_F1 | \n",
" CAUSE_Spirits_Gods_F1 | \n",
" CAUSE_Witchcraft_Sorcery_F1 | \n",
" CAUSE_Rule_Violation_Taboo_F1 | \n",
" CAUSE_Other_F1 | \n",
" ACTION_F1 | \n",
" ACTION_Physical_Material_F1 | \n",
" ACTION_Technical_Specialist_F1 | \n",
" ACTION_Divination_F1 | \n",
" ACTION_Shaman_Medium_Healer_F1 | \n",
" ACTION_Priest_High_Religion_F1 | \n",
" ACTION_Other_F1 | \n",
" Micro_F1 | \n",
" Macro_F1 | \n",
"
\n",
" \n",
" \n",
" \n",
" | NLP | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EVENT_F1 EVENT_Illness_F1 EVENT_Accident_F1 EVENT_Other_F1 CAUSE_F1 \\\n",
"NLP 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" CAUSE_Just_Happens_F1 CAUSE_Material_Physical_F1 CAUSE_Spirits_Gods_F1 \\\n",
"NLP 0.0 0.0 0.0 \n",
"\n",
" CAUSE_Witchcraft_Sorcery_F1 CAUSE_Rule_Violation_Taboo_F1 CAUSE_Other_F1 \\\n",
"NLP 0.0 0.0 0.0 \n",
"\n",
" ACTION_F1 ACTION_Physical_Material_F1 ACTION_Technical_Specialist_F1 \\\n",
"NLP 0.0 0.0 0.0 \n",
"\n",
" ACTION_Divination_F1 ACTION_Shaman_Medium_Healer_F1 \\\n",
"NLP 0.0 0.0 \n",
"\n",
" ACTION_Priest_High_Religion_F1 ACTION_Other_F1 Micro_F1 Macro_F1 \n",
"NLP 0.0 0.0 0.0 0.0 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get F1 scores for labels (the correlational alternative, cor_score, is run in the next cell)\n",
"df_score = score(HrafOutput, labels)\n",
"df_score"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" EVENT_Cor | \n",
" EVENT_Illness_Cor | \n",
" EVENT_Accident_Cor | \n",
" EVENT_Other_Cor | \n",
" CAUSE_Cor | \n",
" CAUSE_Just_Happens_Cor | \n",
" CAUSE_Material_Physical_Cor | \n",
" CAUSE_Spirits_Gods_Cor | \n",
" CAUSE_Witchcraft_Sorcery_Cor | \n",
" CAUSE_Rule_Violation_Taboo_Cor | \n",
" CAUSE_Other_Cor | \n",
" ACTION_Cor | \n",
" ACTION_Physical_Material_Cor | \n",
" ACTION_Technical_Specialist_Cor | \n",
" ACTION_Divination_Cor | \n",
" ACTION_Shaman_Medium_Healer_Cor | \n",
" ACTION_Priest_High_Religion_Cor | \n",
" ACTION_Other_Cor | \n",
"
\n",
" \n",
" \n",
" \n",
" | NLP | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" EVENT_Cor EVENT_Illness_Cor EVENT_Accident_Cor EVENT_Other_Cor CAUSE_Cor \\\n",
"NLP 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" CAUSE_Just_Happens_Cor CAUSE_Material_Physical_Cor CAUSE_Spirits_Gods_Cor \\\n",
"NLP 0.0 0.0 0.0 \n",
"\n",
" CAUSE_Witchcraft_Sorcery_Cor CAUSE_Rule_Violation_Taboo_Cor \\\n",
"NLP 0.0 0.0 \n",
"\n",
" CAUSE_Other_Cor ACTION_Cor ACTION_Physical_Material_Cor \\\n",
"NLP 0.0 0.0 0.0 \n",
"\n",
" ACTION_Technical_Specialist_Cor ACTION_Divination_Cor \\\n",
"NLP 0.0 0.0 \n",
"\n",
" ACTION_Shaman_Medium_Healer_Cor ACTION_Priest_High_Religion_Cor \\\n",
"NLP 0.0 0.0 \n",
"\n",
" ACTION_Other_Cor \n",
"NLP 0.0 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Optional TEST, get correlational score instead)\n",
"df_score_cor = cor_score(HrafOutput, labels)\n",
"df_score_cor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Add correctness to file"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Date | \n",
" EVENT_Illness_F1 | \n",
" EVENT_Accident_F1 | \n",
" EVENT_Other_F1 | \n",
" CAUSE_Material_Physical_F1 | \n",
" CAUSE_Spirits_Gods_F1 | \n",
" CAUSE_Witchcraft_Sorcery_F1 | \n",
" CAUSE_Rule_Violation_Taboo_F1 | \n",
" ACTION_Physical_Material_F1 | \n",
" ACTION_Technical_Specialist_F1 | \n",
" ACTION_Divination_F1 | \n",
" ACTION_Shaman_Medium_Healer_F1 | \n",
" ACTION_Priest_High_Religion_F1 | \n",
" Micro_F1 | \n",
" Macro_F1 | \n",
" test_length | \n",
" train_length | \n",
" Notes | \n",
"
\n",
" \n",
" \n",
" \n",
" | NLP | \n",
" 2024-05-29 | \n",
" 0.856 | \n",
" 0.355 | \n",
" 0.574 | \n",
" 0.394 | \n",
" 0.681 | \n",
" 0.59 | \n",
" 0.493 | \n",
" 0.628 | \n",
" 0.396 | \n",
" 0.0 | \n",
" 0.479 | \n",
" 0.193 | \n",
" 0.622 | \n",
" 0.47 | \n",
" 2074 | \n",
" 8293 | \n",
" model: Model_3_LearningRates/Learning_Rate_1e-... | \n",
"
\n",
" \n",
" | NLP | \n",
" 2024-05-29 | \n",
" 0.856 | \n",
" 0 | \n",
" 0.473 | \n",
" 0.065 | \n",
" 0.446 | \n",
" 0 | \n",
" 0 | \n",
" 0.559 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.503 | \n",
" 0.2 | \n",
" 2074 | \n",
" 8293 | \n",
" model: Model_2_ReducedCols/Weight_Decay_.01_fo... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Date EVENT_Illness_F1 EVENT_Accident_F1 EVENT_Other_F1 \\\n",
"NLP 2024-05-29 0.856 0.355 0.574 \n",
"NLP 2024-05-29 0.856 0 0.473 \n",
"\n",
" CAUSE_Material_Physical_F1 CAUSE_Spirits_Gods_F1 \\\n",
"NLP 0.394 0.681 \n",
"NLP 0.065 0.446 \n",
"\n",
" CAUSE_Witchcraft_Sorcery_F1 CAUSE_Rule_Violation_Taboo_F1 \\\n",
"NLP 0.59 0.493 \n",
"NLP 0 0 \n",
"\n",
" ACTION_Physical_Material_F1 ACTION_Technical_Specialist_F1 \\\n",
"NLP 0.628 0.396 \n",
"NLP 0.559 0 \n",
"\n",
" ACTION_Divination_F1 ACTION_Shaman_Medium_Healer_F1 \\\n",
"NLP 0.0 0.479 \n",
"NLP 0 0 \n",
"\n",
" ACTION_Priest_High_Religion_F1 Micro_F1 Macro_F1 test_length \\\n",
"NLP 0.193 0.622 0.47 2074 \n",
"NLP 0 0.503 0.2 2074 \n",
"\n",
" train_length Notes \n",
"NLP 8293 model: Model_3_LearningRates/Learning_Rate_1e-... \n",
"NLP 8293 model: Model_2_ReducedCols/Weight_Decay_.01_fo... "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"# export F1 scores to excel\n",
"df_scoresSep = df_score.copy()\n",
"# first load train (and maybe add validation)\n",
"f = open(loc+\"Datasets/train_dataset.json\")\n",
"data = json.load(f)\n",
"train = Dataset.from_dict(data)\n",
"if os.path.isfile(loc+\"Datasets/validation_dataset.json\"):\n",
" f = open(loc+\"Datasets/validation_dataset.json\")\n",
" data = json.load(f)\n",
" valid = Dataset.from_dict(data)\n",
" train = concatenate_datasets([train, valid])\n",
"# add lengths of test and training set\n",
"df_scoresSep[[\"test_length\", \"train_length\"]] = (len(Hraf), len(train))\n",
"# add date\n",
"df_scoresSep.insert(0, \"Date\", [datetime.today().date()])\n",
"if loc == \"\":\n",
" df_scoresSep['Notes'] = f\"model: {model}/{checkpoint_path}, Dataset: {model}\"\n",
"else:\n",
" df_scoresSep['Notes'] = f\"model: {model}/{checkpoint_path}, Dataset: {loc}\"\n",
"# load model_performance.xlsx or else create it\n",
"if os.path.isfile(\"Model_Prediction_Performance.xlsx\"):\n",
" df_oldScores = pd.read_excel(\"Model_Prediction_Performance.xlsx\", index_col=0)\n",
" df_oldScores_merged = pd.concat([df_scoresSep, df_oldScores])\n",
"    nonDateCols = df_oldScores_merged.columns[df_oldScores_merged.columns != 'Date'] # exclude Date when checking for duplicate score rows; mask must come from the same frame being indexed\n",
" if any(df_oldScores_merged.duplicated(subset=nonDateCols)): # don't append the data unless it is new\n",
" print(\"Duplicated scores found, skipping new addition\")\n",
" df_scoresSep = df_oldScores.copy()\n",
" else:\n",
" df_scoresSep = df_oldScores_merged.copy()\n",
" df_scoresSep['Date'] = df_scoresSep['Date'].astype('datetime64[ns]')\n",
" df_scoresSep.to_excel(\"Model_Prediction_Performance.xlsx\")\n",
"else:\n",
" df_scoresSep['Date'] = df_scoresSep['Date'].astype('datetime64[ns]')\n",
" df_scoresSep.to_excel(f\"Model_Prediction_Performance.xlsx\")\n",
"df_scoresSep"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checkpoint Multi-model Inference"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is to run over MANY models and checkpoints to test and see which is the strongest. This is ran instead of the single model one above and should NOT be ran together with the single model (simply because they do different things)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# code for running through all checkpoints\n",
"import os\n",
"import pandas as pd\n",
"import re\n",
"import json\n",
"from transformers import pipeline, AutoTokenizer\n",
"def checkpointInfer(path, data, labels, tokenizer_kwargs, classifier_kwargs, folds=True, output_str=\"output_dir_\", modelDestinctifier:str= \"ModelDistinctifierUnknown\"):\n",
" # Initiate Dataframe overall\n",
" df = pd.DataFrame([])\n",
"\n",
" # Get all viable models\n",
" # Makes sure the model starts with the output string and is a directory\n",
" models = [name for name in os.listdir(path) if (name.startswith(output_str) and os.path.isdir(f\"{path}/{name}\"))]\n",
"\n",
" for model in models:\n",
" # Initiate Dataframe for each model\n",
" df_model = pd.DataFrame([])\n",
"\n",
" # Get checkpoint directory for a particular model and get the unit in which the model is distinguished (like learning rates)\n",
" checkpoints_dir = [checkpoint for checkpoint in os.listdir(f\"{path}/{model}\") if checkpoint.startswith(\"checkpoint\")]\n",
" modelDestinctifier_unit = re.findall(f\"{output_str}(.*?)_\",model)\n",
" try:\n",
" modelDestinctifier_unit = float(modelDestinctifier_unit[0])\n",
" except:\n",
" pass\n",
"\n",
"\n",
" # Predict for each checkpoint (within said model) and save results\n",
" for checkpoint in checkpoints_dir:\n",
" # Initiate Dataframe for each checkpoint\n",
" df_checkpoint = pd.DataFrame([])\n",
" # set up the pipeline from local\n",
" model_path =os.path.abspath(f\"{path}/{model}/{checkpoint}\")\n",
" classifier = pipeline(\"text-classification\", model=model_path, **classifier_kwargs)\n",
" # Get Predictions\n",
" dataOutput = predictor(data, labels=labels, tokenizer_kwargs=tokenizer_kwargs, classifier=classifier)\n",
" # Get scores\n",
" df_checkpoint = score(dataOutput, labels)\n",
" df_checkpoint = df_checkpoint.reset_index(drop=True) #remove the index here\n",
"\n",
"\n",
" df_checkpoint.insert(0,modelDestinctifier,modelDestinctifier_unit) #insert model distinctifier (like weight decay or learning rate)\n",
" #Extract and add Fold name if relevant\n",
" if folds: #if using folds\n",
" fold = re.findall(r\"fold_(\\d*)\",model)\n",
" fold = int(fold[0])\n",
" df_checkpoint.insert(1,\"Fold\",fold)\n",
" else:\n",
" fold = \"\"\n",
"\n",
" # get checkpoint\n",
" checkpoint_num = re.findall(r\"checkpoint-(\\d*)\",checkpoint)\n",
" assert len(checkpoint_num) == 1, f\"More or less than one checkpoint numbers found: {len(checkpoint_num)} checkpoints\"\n",
" checkpoint_num = int(checkpoint_num[0])\n",
"\n",
" df_checkpoint.insert(0,\"Model\",model) # Add model name\n",
" df_checkpoint.insert(0,\"Checkpoint\",checkpoint_num)\n",
" df_model = pd.concat([df_model,df_checkpoint])\n",
" print(model, checkpoint, \"Complete\")\n",
"\n",
" # concat model to overarching dataframe\n",
" df = pd.concat([df,df_model])\n",
" # save df for each model (as a checkpoint)\n",
" # import evaluation if it exists\n",
" if os.path.exists(f\"{path}/Inference_Test.xlsx\"):\n",
" old_df = pd.read_excel(f\"{path}/Inference_Test.xlsx\", sheet_name=\"Sheet1\", index_col=0)\n",
" df_model = pd.concat([old_df, df_model])\n",
"\n",
" df_model.to_excel(f\"{path}/Inference_Test.xlsx\", sheet_name=\"Sheet1\")\n",
" print(model, \"Successfully Saved\")\n",
"\n",
" return df\n",
"\n",
"\n",
" \n",
"\n",
"\n",
"\n",
"\n",
"# output_str=\"output_dir_\"\n",
"\n",
"# model = \"MultiLabel_ThreeLargeClasses_kfoldsDEMO_WeightInvestigation\"\n",
"# path =os.path.abspath(f\"HRAF_Model_{model}\")\n",
"# x = [name for name in os.listdir(path) if (name.startswith(\"output_dir_\") and os.path.isdir(f\"{path}/{name}\"))]\n",
"# # x\n",
"# modelDestinctifier_unit = re.findall(f\"{output_str}(.*?)_\",x[1])\n",
"# try:\n",
"# modelDestinctifier_unit = float(modelDestinctifier_unit)\n",
"# except:\n",
"# pass"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Weight_Decay_.01_fold_1 checkpoint-26430 Complete\n",
"Weight_Decay_.01_fold_1 Successfully Saved\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Checkpoint | \n",
" Weight_Decay | \n",
" Fold | \n",
" EVENT_Illness_F1 | \n",
" EVENT_Accident_F1 | \n",
" EVENT_Other_F1 | \n",
" CAUSE_Just_Happens_F1 | \n",
" CAUSE_Material_Physical_F1 | \n",
" CAUSE_Spirits_Gods_F1 | \n",
" CAUSE_Witchcraft_Sorcery_F1 | \n",
" CAUSE_Rule_Violation_Taboo_F1 | \n",
" CAUSE_Other_F1 | \n",
" ACTION_Physical_Material_F1 | \n",
" ACTION_Technical_Specialist_F1 | \n",
" ACTION_Divination_F1 | \n",
" ACTION_Shaman_Medium_Healer_F1 | \n",
" ACTION_Priest_High_Religion_F1 | \n",
" ACTION_Other_F1 | \n",
" Micro_F1 | \n",
" Macro_F1 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 26430 | \n",
" 0.01 | \n",
" 1 | \n",
" 0.865 | \n",
" 0.0 | \n",
" 0.494 | \n",
" 0.0 | \n",
" 0.059 | \n",
" 0.591 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.567 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.046 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.501 | \n",
" 0.175 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Checkpoint Weight_Decay Fold EVENT_Illness_F1 EVENT_Accident_F1 \\\n",
"0 26430 0.01 1 0.865 0.0 \n",
"\n",
" EVENT_Other_F1 CAUSE_Just_Happens_F1 CAUSE_Material_Physical_F1 \\\n",
"0 0.494 0.0 0.059 \n",
"\n",
" CAUSE_Spirits_Gods_F1 CAUSE_Witchcraft_Sorcery_F1 \\\n",
"0 0.591 0.0 \n",
"\n",
" CAUSE_Rule_Violation_Taboo_F1 CAUSE_Other_F1 ACTION_Physical_Material_F1 \\\n",
"0 0.0 0.0 0.567 \n",
"\n",
" ACTION_Technical_Specialist_F1 ACTION_Divination_F1 \\\n",
"0 0.0 0.0 \n",
"\n",
" ACTION_Shaman_Medium_Healer_F1 ACTION_Priest_High_Religion_F1 \\\n",
"0 0.046 0.0 \n",
"\n",
" ACTION_Other_F1 Micro_F1 Macro_F1 \n",
"0 0.0 0.501 0.175 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#This code will take a LONG time depending on how many models you have. It is recommended to use a GPU\n",
"path = loc+\"/Model_3_LearningRates\"\n",
"output_str = \"Learning_Rate_\"\n",
"modelDestinctifier = \"Learning_Rate\"\n",
"\n",
"df_allScores = checkpointInfer(path=path, data=Hraf, labels=labels, tokenizer_kwargs=tokenizer_kwargs, classifier_kwargs=classifier_kwargs, folds=True, output_str=output_str, modelDestinctifier= modelDestinctifier)\n",
"df_allScores"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Optional File save"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"# HrafOutput"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "468d823866d34e7f80f1e3dc120f38b1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/728 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# optionally save the file to json\n",
"from transformers import AutoTokenizer\n",
"import copy\n",
"\n",
"HrafOutput_dummy = copy.deepcopy(HrafOutput)\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
"\n",
"def preprocess_function(examples):\n",
" return tokenizer(examples[\"passage\"], truncation=True)\n",
"\n",
"tokenized_Hraf = Hraf.map(preprocess_function, batched=True)\n",
"\n",
"for index, passage in enumerate(HrafOutput_dummy):\n",
" assert passage['passage'] == tokenized_Hraf[index]['passage']\n",
" passage['pred_labels'] = {key:passage['pred_labels'][index] for index, key in enumerate(labels)}\n",
" passage['actual_labels'] = {key:passage['actual_labels'][index] for index, key in enumerate(labels)}\n",
" passage['input_ids'] = tokenized_Hraf[index]['input_ids']"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"# Save to unformatted json (uncomment)\n",
"with open(f\"Datasets/tokenized_inputs.json\", \"w\") as outfile:\n",
" json.dump(HrafOutput_dummy, outfile)\n",
"\n",
"\n",
"# # Save to Dataset (uncomment)\n",
"# HrafOutput_dummy_dataset = Dataset.from_list(HrafOutput_dummy)\n",
"# Dataset.to_json(HrafOutput_dummy_dataset, f\"Datasets/tokenized_Hraf\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## CHi Square"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1167, 351],\n",
" [ 49, 183]], dtype=int64)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy.stats import chi2_contingency\n",
"\n",
"ct_EVENT_CAUSE = pd.crosstab(df[('EVENT','No_Info')], df[('CAUSE','No_Info')], rownames=['ACTION'], colnames=['CAUSE'])\n",
"ct_EVENT_CAUSE"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EVENT by CAUSE:\n",
"chi: 292.4\n",
"p: 0.0\n",
"\n",
"\n",
"EVENT by ACTION:\n",
"chi: 103.3\n",
"p: 0.0\n",
"\n",
"\n",
"ACTION by CAUSE:\n",
"chi: 0.0\n",
"p: 0.857\n",
"\n",
"\n"
]
}
],
"source": [
"def chi_square_calc(row, col):\n",
" cross_tab = pd.crosstab(df[(row,'No_Info')], df[(col,'No_Info')], rownames=[row], colnames=[col])\n",
" stat, p, dof, expected = chi2_contingency(cross_tab)\n",
" results = f\"{row} by {col}:\\nchi: {round(stat,1)}\\np: {round(p,3)}\\n\\n\"\n",
" return results\n",
"\n",
"group_list = [('EVENT', 'CAUSE'), ('EVENT', 'ACTION'), ('ACTION', 'CAUSE')]\n",
"for row, col in group_list:\n",
" print(chi_square_calc(row, col))"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"def chi_sqr(obs):\n",
"    \"\"\"Return the matrix of per-cell chi-square contributions for a contingency table `obs`.\"\"\"\n",
"    size_x = obs.shape\n",
"    chi_mat = np.zeros(size_x)\n",
"    for row in range(size_x[0]):\n",
"        for col in range(size_x[1]):\n",
"            # expected count under independence: row total * column total / grand total\n",
"            # (fixed: previously summed the global `x` instead of the `obs` argument)\n",
"            exp = np.sum(obs[row]) * np.sum(obs[:,col]) / np.sum(obs)\n",
"            chi_mat[row, col] = np.sum((obs[row, col] - exp)**2 / exp)\n",
"    return chi_mat\n",
"\n",
"print(np.sum(chi_sqr(x)))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}