Upload 22 files
Browse files- .gitattributes +6 -0
- Images/LSTM.png +3 -0
- Images/LSTM_cell_backward_rev3a_5.png +0 -0
- Images/LSTM_cell_backward_rev3a_c2.png +0 -0
- Images/LSTM_figure4_v3a.png +0 -0
- Images/LSTM_rnn.png +0 -0
- Images/RNN.png +0 -0
- Images/initial_state.png +3 -0
- Images/old_rnn_cell_backward_3a_c.png +0 -0
- Images/rnn_backward_overview_3a_1.png +0 -0
- Images/rnn_cell_backprop.png +3 -0
- Images/rnn_cell_backward_3a_4.png +0 -0
- Images/rnn_cell_backward_3a_c.png +3 -0
- Images/rnn_forward_sequence_figure3_v3a.png +3 -0
- Images/rnn_step_forward.png +3 -0
- Images/rnn_step_forward_figure2_v3a.png +0 -0
- Natural-Language-Processing-with-Disaster-Tweets/V1/nlp-getting-started-tutorial.ipynb +1 -0
- Natural-Language-Processing-with-Disaster-Tweets/V1/nlp-getting-started.zip +3 -0
- Natural-Language-Processing-with-Disaster-Tweets/V2/nlp-getting-started.zip +3 -0
- Natural-Language-Processing-with-Disaster-Tweets/V2/nlp-to-monitor-twitter-for-natural-disaster.ipynb +1 -0
- Natural-Language-Processing-with-Disaster-Tweets/V2/opt-keras-opt_1.3b_en-v2.tar.gz +3 -0
- Natural-Language-Processing-with-Disaster-Tweets/V3/intro-to-transformers.ipynb +1 -0
- Natural-Language-Processing-with-Disaster-Tweets/V3/nlp-getting-started.zip +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Images/initial_state.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
Images/LSTM.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
Images/rnn_cell_backprop.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
Images/rnn_cell_backward_3a_c.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
Images/rnn_forward_sequence_figure3_v3a.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
Images/rnn_step_forward.png filter=lfs diff=lfs merge=lfs -text
|
Images/LSTM.png
ADDED
|
Git LFS Details
|
Images/LSTM_cell_backward_rev3a_5.png
ADDED
|
Images/LSTM_cell_backward_rev3a_c2.png
ADDED
|
Images/LSTM_figure4_v3a.png
ADDED
|
Images/LSTM_rnn.png
ADDED
|
Images/RNN.png
ADDED
|
Images/initial_state.png
ADDED
|
Git LFS Details
|
Images/old_rnn_cell_backward_3a_c.png
ADDED
|
Images/rnn_backward_overview_3a_1.png
ADDED
|
Images/rnn_cell_backprop.png
ADDED
|
Git LFS Details
|
Images/rnn_cell_backward_3a_4.png
ADDED
|
Images/rnn_cell_backward_3a_c.png
ADDED
|
Git LFS Details
|
Images/rnn_forward_sequence_figure3_v3a.png
ADDED
|
Git LFS Details
|
Images/rnn_step_forward.png
ADDED
|
Git LFS Details
|
Images/rnn_step_forward_figure2_v3a.png
ADDED
|
Natural-Language-Processing-with-Disaster-Tweets/V1/nlp-getting-started-tutorial.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.11","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":17777,"databundleVersionId":869809,"sourceType":"competition"}],"dockerImageVersionId":31040,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## NLP Tutorial\n\nNLP - or *Natural Language Processing* - is shorthand for a wide array of techniques designed to help machines learn from text. Natural Language Processing powers everything from chatbots to search engines, and is used in diverse tasks like sentiment analysis and machine translation.\n\nIn this tutorial we'll look at this competition's dataset, use a simple technique to process it, build a machine learning model, and submit predictions for a score!","metadata":{}},{"cell_type":"code","source":"import numpy as np \nimport pandas as pd\nfrom sklearn import feature_extraction, linear_model, model_selection, preprocessing","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:24.098404Z","iopub.execute_input":"2025-05-25T22:47:24.098899Z","iopub.status.idle":"2025-05-25T22:47:25.375244Z","shell.execute_reply.started":"2025-05-25T22:47:24.098871Z","shell.execute_reply":"2025-05-25T22:47:25.374249Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_df = 
pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.376579Z","iopub.execute_input":"2025-05-25T22:47:25.377071Z","iopub.status.idle":"2025-05-25T22:47:25.462128Z","shell.execute_reply.started":"2025-05-25T22:47:25.377046Z","shell.execute_reply":"2025-05-25T22:47:25.461191Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df.isnull().sum()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.463014Z","iopub.execute_input":"2025-05-25T22:47:25.463314Z","iopub.status.idle":"2025-05-25T22:47:25.477111Z","shell.execute_reply.started":"2025-05-25T22:47:25.463287Z","shell.execute_reply":"2025-05-25T22:47:25.476181Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"test_df.isnull().sum()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.479611Z","iopub.execute_input":"2025-05-25T22:47:25.479954Z","iopub.status.idle":"2025-05-25T22:47:25.502395Z","shell.execute_reply.started":"2025-05-25T22:47:25.479929Z","shell.execute_reply":"2025-05-25T22:47:25.501369Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df[\"keyword\"] = train_df[\"keyword\"].fillna(\"unknown\")\ntest_df[\"keyword\"] = test_df[\"keyword\"].fillna(\"unknown\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.503532Z","iopub.execute_input":"2025-05-25T22:47:25.503859Z","iopub.status.idle":"2025-05-25T22:47:25.527763Z","shell.execute_reply.started":"2025-05-25T22:47:25.503831Z","shell.execute_reply":"2025-05-25T22:47:25.526675Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df = train_df.drop(['location'],axis=1)\ntest_df = 
test_df.drop(['location'],axis=1)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.52883Z","iopub.execute_input":"2025-05-25T22:47:25.529124Z","iopub.status.idle":"2025-05-25T22:47:25.556655Z","shell.execute_reply.started":"2025-05-25T22:47:25.529101Z","shell.execute_reply":"2025-05-25T22:47:25.555595Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import re\n\ndef clean_text(text):\n text = text.lower()\n text = re.sub(r\"http\\S+|www\\S+\", '', text) # remove URLs\n text = re.sub(r'@\\w+|#\\w+', '', text) # remove mentions and hashtags\n text = re.sub(r\"[^a-z\\s]\", '', text) # keep only letters\n text = re.sub(r'\\s+', ' ', text).strip()\n return text","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.557745Z","iopub.execute_input":"2025-05-25T22:47:25.558183Z","iopub.status.idle":"2025-05-25T22:47:25.578076Z","shell.execute_reply.started":"2025-05-25T22:47:25.558157Z","shell.execute_reply":"2025-05-25T22:47:25.576893Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df[\"text\"] = train_df[\"text\"].apply(clean_text)\ntest_df[\"text\"] = test_df[\"text\"].apply(clean_text)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.57911Z","iopub.execute_input":"2025-05-25T22:47:25.579492Z","iopub.status.idle":"2025-05-25T22:47:25.739341Z","shell.execute_reply.started":"2025-05-25T22:47:25.579411Z","shell.execute_reply":"2025-05-25T22:47:25.738597Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df[\"full_text\"] = train_df[\"keyword\"] + \" \" + train_df[\"text\"]\ntest_df[\"full_text\"] = test_df[\"keyword\"] + \" \" + 
test_df[\"text\"]","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.740559Z","iopub.execute_input":"2025-05-25T22:47:25.740864Z","iopub.status.idle":"2025-05-25T22:47:25.749965Z","shell.execute_reply.started":"2025-05-25T22:47:25.740832Z","shell.execute_reply":"2025-05-25T22:47:25.748919Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"disaster_tweets = train_df[train_df[\"target\"] == 1]\nsecond_disaster_tweet = disaster_tweets[\"text\"].values[1]\nprint(second_disaster_tweet)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.75327Z","iopub.execute_input":"2025-05-25T22:47:25.753621Z","iopub.status.idle":"2025-05-25T22:47:25.772886Z","shell.execute_reply.started":"2025-05-25T22:47:25.753584Z","shell.execute_reply":"2025-05-25T22:47:25.771993Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"#🧹 Ensure All Text Is in String Format\ntrain_df['text'] = train_df['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\nvectorizer = TfidfVectorizer(\n max_features=10000, # or 5000 if memory limited\n ngram_range=(1, 2), # include bigrams\n stop_words='english' # remove common English words\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.773787Z","iopub.execute_input":"2025-05-25T22:47:25.77412Z","iopub.status.idle":"2025-05-25T22:47:25.794748Z","shell.execute_reply.started":"2025-05-25T22:47:25.7741Z","shell.execute_reply":"2025-05-25T22:47:25.793759Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_vectors = vectorizer.fit_transform(train_df[\"full_text\"])\ntest_vectors = 
vectorizer.transform(test_df[\"full_text\"])","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:25.795667Z","iopub.execute_input":"2025-05-25T22:47:25.795893Z","iopub.status.idle":"2025-05-25T22:47:26.223791Z","shell.execute_reply.started":"2025-05-25T22:47:25.795876Z","shell.execute_reply":"2025-05-25T22:47:26.22298Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"train_df[\"text_len\"] = train_df[\"text\"].apply(lambda x: len(x.split()))\ntest_df[\"text_len\"] = test_df[\"text\"].apply(lambda x: len(x.split()))\nfrom scipy.sparse import hstack\n\ntrain_vectors = hstack([train_vectors, train_df[[\"text_len\"]].values])\ntest_vectors = hstack([test_vectors, test_df[[\"text_len\"]].values])\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:26.224614Z","iopub.execute_input":"2025-05-25T22:47:26.224844Z","iopub.status.idle":"2025-05-25T22:47:26.250509Z","shell.execute_reply.started":"2025-05-25T22:47:26.224826Z","shell.execute_reply":"2025-05-25T22:47:26.249533Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from sklearn.metrics import f1_score\nfrom sklearn.model_selection import cross_val_score\n\nfrom sklearn.linear_model import LogisticRegression\nclf = LogisticRegression(max_iter=10000)\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:26.251504Z","iopub.execute_input":"2025-05-25T22:47:26.25187Z","iopub.status.idle":"2025-05-25T22:47:26.277915Z","shell.execute_reply.started":"2025-05-25T22:47:26.251839Z","shell.execute_reply":"2025-05-25T22:47:26.27686Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"scores = cross_val_score(clf, train_vectors, train_df[\"target\"], cv=5, scoring=\"f1\")\nprint(\"Average F1:\", 
scores.mean())","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:26.27921Z","iopub.execute_input":"2025-05-25T22:47:26.279515Z","iopub.status.idle":"2025-05-25T22:47:32.316944Z","shell.execute_reply.started":"2025-05-25T22:47:26.279489Z","shell.execute_reply":"2025-05-25T22:47:32.313564Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"clf.fit(train_vectors, train_df[\"target\"])","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:32.31782Z","iopub.execute_input":"2025-05-25T22:47:32.320676Z","iopub.status.idle":"2025-05-25T22:47:33.802758Z","shell.execute_reply.started":"2025-05-25T22:47:32.320643Z","shell.execute_reply":"2025-05-25T22:47:33.798958Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"sample_submission = pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")\nsample_submission[\"target\"] = clf.predict(test_vectors)\nsample_submission.to_csv(\"submission.csv\", index=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:33.803973Z","iopub.execute_input":"2025-05-25T22:47:33.80481Z","iopub.status.idle":"2025-05-25T22:47:33.833985Z","shell.execute_reply.started":"2025-05-25T22:47:33.804779Z","shell.execute_reply":"2025-05-25T22:47:33.833013Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"sample_submission","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-25T22:47:33.834674Z","iopub.execute_input":"2025-05-25T22:47:33.835918Z","iopub.status.idle":"2025-05-25T22:47:33.86579Z","shell.execute_reply.started":"2025-05-25T22:47:33.835886Z","shell.execute_reply":"2025-05-25T22:47:33.86414Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}
|
Natural-Language-Processing-with-Disaster-Tweets/V1/nlp-getting-started.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95b0e2b687762de374b2b6901547fa22d764807e72d344c829640f3756f5a70f
|
| 3 |
+
size 607343
|
Natural-Language-Processing-with-Disaster-Tweets/V2/nlp-getting-started.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95b0e2b687762de374b2b6901547fa22d764807e72d344c829640f3756f5a70f
|
| 3 |
+
size 607343
|
Natural-Language-Processing-with-Disaster-Tweets/V2/nlp-to-monitor-twitter-for-natural-disaster.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.11","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[{"sourceId":17777,"databundleVersionId":869809,"sourceType":"competition"},{"sourceId":6085,"sourceType":"modelInstanceVersion","modelInstanceId":4700,"modelId":2824}],"dockerImageVersionId":31011,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Using NLP with Twitter to monitor for natural disasters\nWhen disaster strikes, those affected often post on social media about the disaster, even before making official reports. Social media can serve as an early warning system for governmental and humanitarian organizations to detect natural disasters.\n\nThis notebook explores a collection of human-labeled tweets where some describe a natural disaster. The challenge is to predict whether the tweet describes a true natural disaster. 
Natural Language Processing (NLP) techniques including LSTM and GRU are compared.","metadata":{}},{"cell_type":"code","source":"import tensorflow as tf\nimport keras\nfrom keras.preprocessing.sequence import pad_sequences\nfrom keras.models import Sequential\nfrom keras import ops, mixed_precision\nmixed_precision.set_global_policy(\"mixed_float16\")\nfrom keras.layers import Input, GRU, Dense, Dropout, Masking\nimport keras_hub\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport spacy\nimport warnings\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix\nwarnings.simplefilter(action='ignore', category=FutureWarning)\n\nsns.set(style='whitegrid')\nprint(\"Tensorflow version \" + tf.__version__)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:15:29.500383Z","iopub.execute_input":"2025-04-16T00:15:29.500719Z","iopub.status.idle":"2025-04-16T00:15:50.683206Z","shell.execute_reply.started":"2025-04-16T00:15:29.500695Z","shell.execute_reply":"2025-04-16T00:15:50.682556Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"!python -m spacy download en_core_web_md --quiet","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:15:50.684046Z","iopub.execute_input":"2025-04-16T00:15:50.684434Z","iopub.status.idle":"2025-04-16T00:15:59.943968Z","shell.execute_reply.started":"2025-04-16T00:15:50.684416Z","shell.execute_reply":"2025-04-16T00:15:59.943212Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"df_train = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ndf_test = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")\n\nprint('Training Set Shape = {}'.format(df_train.shape))\nprint('Training Set Memory Usage = {:.2f} MB'.format(df_train.memory_usage().sum() / 1024**2))\nprint('Test Set Shape = {}'.format(df_test.shape))\nprint('Test Set Memory 
Usage = {:.2f} MB'.format(df_test.memory_usage().sum() / 1024**2))","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:15:59.946238Z","iopub.execute_input":"2025-04-16T00:15:59.946476Z","iopub.status.idle":"2025-04-16T00:16:00.032632Z","shell.execute_reply.started":"2025-04-16T00:15:59.946454Z","shell.execute_reply":"2025-04-16T00:16:00.032045Z"}},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Exploratory Data Analysis","metadata":{}},{"cell_type":"code","source":"df_train.head()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:00.033304Z","iopub.execute_input":"2025-04-16T00:16:00.033561Z","iopub.status.idle":"2025-04-16T00:16:00.051689Z","shell.execute_reply.started":"2025-04-16T00:16:00.033543Z","shell.execute_reply":"2025-04-16T00:16:00.051006Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"df_train.loc[1, 'text']","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:00.052404Z","iopub.execute_input":"2025-04-16T00:16:00.052573Z","iopub.status.idle":"2025-04-16T00:16:00.057342Z","shell.execute_reply.started":"2025-04-16T00:16:00.05256Z","shell.execute_reply":"2025-04-16T00:16:00.056797Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"df_train.isna().sum()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:00.058058Z","iopub.execute_input":"2025-04-16T00:16:00.058712Z","iopub.status.idle":"2025-04-16T00:16:00.074326Z","shell.execute_reply.started":"2025-04-16T00:16:00.058686Z","shell.execute_reply":"2025-04-16T00:16:00.073651Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"df_train.duplicated(subset=['text'], 
keep=False).sum()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:00.075037Z","iopub.execute_input":"2025-04-16T00:16:00.075271Z","iopub.status.idle":"2025-04-16T00:16:00.094215Z","shell.execute_reply.started":"2025-04-16T00:16:00.075255Z","shell.execute_reply":"2025-04-16T00:16:00.093612Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"df_train = df_train.drop_duplicates(subset=['text'])\ndf_train.duplicated(subset=['text'], keep=False).sum()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:00.094859Z","iopub.execute_input":"2025-04-16T00:16:00.095088Z","iopub.status.idle":"2025-04-16T00:16:00.113627Z","shell.execute_reply.started":"2025-04-16T00:16:00.095067Z","shell.execute_reply":"2025-04-16T00:16:00.112893Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Count labels in each class\nlabel_counts = df_train['target'].value_counts(normalize=True).reset_index()\nlabel_counts.columns = ['label', 'proportion']\n\n# Visualize class distribution\nsns.barplot(data=label_counts, x='label', y='proportion')\n\nplt.title('Relative Distribution of Tweets')\nplt.xlabel('Disaster==TRUE')\nplt.ylabel('Proportion')\nplt.ylim(0, 1)\nplt.show()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:00.115993Z","iopub.execute_input":"2025-04-16T00:16:00.116186Z","iopub.status.idle":"2025-04-16T00:16:00.322673Z","shell.execute_reply.started":"2025-04-16T00:16:00.116169Z","shell.execute_reply":"2025-04-16T00:16:00.3219Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"df_train.loc[:, \"length\"] = df_train.loc[:, \"text\"].apply(len)\n\nplt.figure(figsize=(10, 6))\n\nfor category in df_train[\"target\"].unique():\n subset = df_train[df_train[\"target\"] == category]\n sns.kdeplot(subset[\"length\"], label=category, fill=True, alpha=0.3)\n\nplt.xlabel(\"Text Length\")\nplt.ylabel(\"Density\")\nplt.title(\"Kernel Density Estimate of 
Text Length by Category\")\nplt.legend(title=\"Category\")\nplt.show();","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:00.323464Z","iopub.execute_input":"2025-04-16T00:16:00.323652Z","iopub.status.idle":"2025-04-16T00:16:00.687268Z","shell.execute_reply.started":"2025-04-16T00:16:00.323638Z","shell.execute_reply":"2025-04-16T00:16:00.68653Z"}},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Model Architecture (Analysis Plan)\n\nFirst, I'll use the library `spacy` to manage the pre-processing pipeline. Based on the EDA above, I can keep the pipeline very simple by tokenizing and vectorizing the text. \n\nI'll compare two models: GRU and a fine tuned LLM. \n\nSo that I can have a consistent input shape, I'll limit to the first 30 tokens and pad the input with 0s for input to the GRU model. \n\nFor the LLM, I will tokenize with the LLM's built-in tokenizer and in addition to padding I'll add a preprocessing step for tracking the final token in the input as the LLM reads from left to right and the only token with the full context is the final one.\n\nI'll compare the confusion matrices (F1 scores) and the stability of the validation accuracy over training epochs to check for overfitting. 
\n\nFinally, I'll create one submission for each model to compare to the hiddent test set.","metadata":{}},{"cell_type":"markdown","source":"### Model 1: GRU\nGated Recurrent Unit (GRU) is a type of Recurrent Neural Network (RNN) that evolved as a simplification and improvement on LSTM.","metadata":{}},{"cell_type":"code","source":"nlp = spacy.load(\n \"en_core_web_md\", \n disable=\n [\n \"tagger\", \"parser\", \"ner\", \n \"lemmatizer\", \"attribute_ruler\"\n ]\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:00.688195Z","iopub.execute_input":"2025-04-16T00:16:00.68893Z","iopub.status.idle":"2025-04-16T00:16:02.237637Z","shell.execute_reply.started":"2025-04-16T00:16:00.688904Z","shell.execute_reply":"2025-04-16T00:16:02.236853Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"MAX_TOKENS = 30\nEMBEDDING_DIM = nlp.vocab.vectors_length\n\ndef vectorize_sequence(text):\n doc = nlp(text)\n vecs = [\n token.vector for token in doc \n if token.has_vector and not token.is_stop\n ]\n \n if len(vecs) == 0:\n print(f'\"{text}\" returns 0 vectors')\n vecs = [np.zeros(EMBEDDING_DIM)]\n \n return vecs[:MAX_TOKENS]\n\nX_seq = [vectorize_sequence(text) for text in df_train[\"text\"]]\nX_seq_padded = pad_sequences(\n X_seq, \n maxlen=MAX_TOKENS, \n dtype='float32', \n padding='post', \n truncating='post', \n value=0.0\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:02.238348Z","iopub.execute_input":"2025-04-16T00:16:02.238556Z","iopub.status.idle":"2025-04-16T00:16:21.630481Z","shell.execute_reply.started":"2025-04-16T00:16:02.238534Z","shell.execute_reply":"2025-04-16T00:16:21.629902Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Split training and test data\ny = df_train[\"target\"].values\n\nX_train, X_val, y_train, y_val = train_test_split(\n X_seq_padded, y, test_size=0.2, 
random_state=42)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:21.631232Z","iopub.execute_input":"2025-04-16T00:16:21.631515Z","iopub.status.idle":"2025-04-16T00:16:21.718278Z","shell.execute_reply.started":"2025-04-16T00:16:21.631482Z","shell.execute_reply":"2025-04-16T00:16:21.717396Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Compile GRU model\nmodel = Sequential()\nmodel.add(Input(shape=(MAX_TOKENS, EMBEDDING_DIM)))\n# model.add(Masking(mask_value=0.0))\nmodel.add(GRU(64))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(1, activation='sigmoid'))\n\nmodel.compile(\n loss='binary_crossentropy', \n optimizer='adam', \n metrics=['accuracy'])\n\nmodel.summary()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:21.719281Z","iopub.execute_input":"2025-04-16T00:16:21.719828Z","iopub.status.idle":"2025-04-16T00:16:23.235787Z","shell.execute_reply.started":"2025-04-16T00:16:21.719803Z","shell.execute_reply":"2025-04-16T00:16:23.235251Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Fit GRU model\nhistory_gru = model.fit(\n X_train, \n y_train, \n validation_split=0.1, \n epochs=10, \n batch_size=32\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:23.236615Z","iopub.execute_input":"2025-04-16T00:16:23.237118Z","iopub.status.idle":"2025-04-16T00:16:38.648983Z","shell.execute_reply.started":"2025-04-16T00:16:23.237093Z","shell.execute_reply":"2025-04-16T00:16:38.648392Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def displayConfusionMatrix(y_true, y_pred, dataset):\n # Convert probabilities to binary predictions\n y_pred_binary = (y_pred > 0.5).astype(int).flatten()\n\n # Plot confusion matrix\n disp = ConfusionMatrixDisplay.from_predictions(\n y_true,\n y_pred_binary,\n display_labels=[\"Not Disaster\", \"Disaster\"],\n cmap=plt.cm.Blues\n )\n\n tn, fp, fn, tp = confusion_matrix(y_true, 
y_pred_binary).ravel()\n f1_score = tp / (tp + ((fn + fp) / 2))\n\n disp.ax_.set_title(f\"Confusion Matrix on {dataset} Dataset — F1 Score: {f1_score:.2f}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:38.64993Z","iopub.execute_input":"2025-04-16T00:16:38.65015Z","iopub.status.idle":"2025-04-16T00:16:38.655015Z","shell.execute_reply.started":"2025-04-16T00:16:38.650133Z","shell.execute_reply":"2025-04-16T00:16:38.654451Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"y_pred_train = model.predict(X_train)\ndisplayConfusionMatrix(y_train, y_pred_train, \"Training\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:38.65566Z","iopub.execute_input":"2025-04-16T00:16:38.655853Z","iopub.status.idle":"2025-04-16T00:16:40.096905Z","shell.execute_reply.started":"2025-04-16T00:16:38.655838Z","shell.execute_reply":"2025-04-16T00:16:40.096251Z"}},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"### Model 2: OPT (with Keras Hub)\nFollowing the tutorial for Keras Hub [here](https://keras.io/keras_hub/getting_started/), I'll finetune a pre-trained LLM to categorize the disaster tweets. 
","metadata":{}},{"cell_type":"code","source":"import os\nos.environ[\"XLA_PYTHON_CLIENT_MEM_FRACTION\"]=\"1.00\"","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:40.097776Z","iopub.execute_input":"2025-04-16T00:16:40.098026Z","iopub.status.idle":"2025-04-16T00:16:40.101675Z","shell.execute_reply.started":"2025-04-16T00:16:40.098009Z","shell.execute_reply":"2025-04-16T00:16:40.101128Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"texts = df_train[\"text\"].tolist()\nlabels = df_train[\"target\"].tolist()\n\nX_train, X_val, y_train, y_val = train_test_split(\n texts, labels, test_size=0.2, random_state=42)\n\nraw_train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))\nraw_val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))\n\nMODEL = \"opt_1.3b_en\"\nbatch_size = 1\n\ntokenizer = keras_hub.tokenizers.Tokenizer.from_preset(\n MODEL\n)\n\npacker = keras_hub.layers.StartEndPacker(\n start_value=tokenizer.start_token_id,\n end_value=tokenizer.end_token_id,\n pad_value=tokenizer.pad_token_id,\n sequence_length=None,\n)\n\ndef preprocess(x, y=None, sequence_length=256):\n x = tokenizer(x)\n x = packer(x, sequence_length=sequence_length)\n x = {\n \"token_ids\": x,\n \"padding_mask\": x != tokenizer.pad_token_id,\n }\n return keras.utils.pack_x_y_sample_weight(x, y)\n\ntrain_ds = raw_train_ds.map(preprocess, num_parallel_calls=16)\nval_ds = raw_val_ds.map(preprocess, num_parallel_calls=16)\n\ntrain_ds = train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)\nval_ds = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)\n\nbackbone = keras_hub.models.Backbone.from_preset(\n MODEL,\n)\n\nbackbone.enable_lora(4)\nbackbone.trainable = False\n\nclass LastTokenPooler(keras.layers.Layer):\n def call(self, hidden_states, padding_mask):\n lengths = tf.reduce_sum(tf.cast(padding_mask, tf.int32), axis=1) - 1 \n batch_size = tf.shape(hidden_states)[0]\n indices = tf.stack([tf.range(batch_size), lengths], axis=1)\n 
last_token_embeddings = tf.gather_nd(hidden_states, indices)\n return last_token_embeddings\n\ninputs = backbone.input\nx = backbone(inputs)\nx = LastTokenPooler(\n name=\"pooler\",\n)(x, inputs[\"padding_mask\"])\nx = keras.layers.Dense(\n 2048,\n activation=\"relu\",\n name=\"pooled_dense\",\n)(x)\nx = keras.layers.Dropout(\n 0.1,\n name=\"output_dropout\",\n)(x)\noutputs = keras.layers.Dense(\n 1,\n activation=\"sigmoid\",\n name=\"output_dense\",\n)(x)\ntext_classifier = keras.Model(inputs, outputs)\ntext_classifier.summary()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:16:40.10241Z","iopub.execute_input":"2025-04-16T00:16:40.102641Z","iopub.status.idle":"2025-04-16T00:17:24.140645Z","shell.execute_reply.started":"2025-04-16T00:16:40.102614Z","shell.execute_reply":"2025-04-16T00:17:24.140084Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"text_classifier.compile(\n optimizer=keras.optimizers.Adam(5e-5),\n loss=\"binary_crossentropy\",\n metrics=[\"accuracy\"],\n)\n\nhistory_opt = text_classifier.fit(\n train_ds,\n validation_data=val_ds,\n validation_split=0.1, \n epochs=10, \n batch_size=batch_size\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-16T00:17:24.141413Z","iopub.execute_input":"2025-04-16T00:17:24.141629Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"y_pred_train = text_classifier.predict(train_ds)\ndisplayConfusionMatrix(y_train, y_pred_train, \"Training\")","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Results & Analysis\nFrom the confusion matrices displayed above, we can see that that both models performed fairly well on the validation dataset. Each additional epoch for the LLM appeared to actually reduce the validation accuracy. 
The GRU model showed fairly good performance and stable validation accuracy indicating overfitting is not likely.","metadata":{}},{"cell_type":"code","source":"def plot_accuracy(model, ax, title):\n ax.plot(model.history['accuracy'], label='Train Accuracy', color='#1f77b4', linestyle='dashed')\n ax.plot(model.history['val_accuracy'], label='Val Accuracy', color='#1f77b4')\n ax.set_title(f'CNN ({title}) Accuracy')\n ax.set_xlabel('Epochs')\n ax.set_ylabel('Accuracy')\n ax.legend()\n\nfig, axes = plt.subplots(1, 2, figsize=(10, 6), sharey=True)\n\n# Plot history_basic\nplot_accuracy(history_gru, axes[0], \"GRU\")\nplot_accuracy(history_opt, axes[1], \"OPT\")\n\nplt.tight_layout()\nplt.show()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Create Submission\nI'll create a submission for both the GRU model and OPT model.","metadata":{}},{"cell_type":"code","source":"# Predict on GRU model\nX_seq_test = [vectorize_sequence(text) for text in df_test[\"text\"]]\nX_seq_test_padded = pad_sequences(\n X_seq_test, \n maxlen=MAX_TOKENS, \n dtype='float32', \n padding='post', \n truncating='post', \n value=0.0\n)\ny_pred_probs_gru = model.predict(X_seq_test_padded)\ny_pred_gru = (y_pred_probs_gru > 0.5).astype(int).flatten()\nsubmission_gru_df = pd.DataFrame(\n {\n 'id': df_test['id'],\n 'target': y_pred_gru\n }\n)\nsubmission_gru_df.to_csv('submission_gru.csv', index=False)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Predict on OPT model\nX_test = df_test[\"text\"].tolist()\nraw_test_ds = tf.data.Dataset.from_tensor_slices(X_test)\ntest_ds = raw_test_ds.map(preprocess, num_parallel_calls=16)\ntest_ds = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)\ny_pred_probs_opt = text_classifier.predict(test_ds)\ny_pred_opt = (y_pred_probs_opt > 0.5).astype(int).flatten()\nsubmission_opt_df = pd.DataFrame(\n {\n 'id': df_test['id'],\n 'target': y_pred_opt\n 
}\n)\nsubmission_opt_df.to_csv('submission_opt.csv', index=False)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Conclusions\nThe GRU model performed fairly well and was easy to set up, both in terms of text pre-processing and model architecture. The LLM, by contrast, was much more of a challenge. I initially attemted to use Gemma 2b Instruct, based on the Keras Hub tutorial, but found that the model weights alone almost maxed my GPU's memory of 16GB. Not surprisingly, I consistently hit OOM errors even after reducing the trainable parameters to less than 20MB through freezing the backbone and locking LoRA. The OPT model at 8GB was a more tractable model for this architecture. Deep learning tasks are far more hardware dependent than other machine learning tasks, which can be inefficient in Big-O terms without proper optimization, but training neural nets requires taking much more care in configuring the VM and writing code to match.","metadata":{}}]}
|
Natural-Language-Processing-with-Disaster-Tweets/V2/opt-keras-opt_1.3b_en-v2.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6eca1b8882b24311f3cf328da910beb87aa47232c2ec2ca92620016280b3008c
|
| 3 |
+
size 3144514453
|
Natural-Language-Processing-with-Disaster-Tweets/V3/intro-to-transformers.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"},"kaggle":{"accelerator":"gpu","dataSources":[{"sourceId":17777,"databundleVersionId":869809,"sourceType":"competition"}],"dockerImageVersionId":30356,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# Necessary Imports\nimport numpy as np \nimport pandas as pd\nimport matplotlib.pyplot as plt\nplt.style.use('ggplot')\n\nimport warnings\nwarnings.filterwarnings(\"ignore\")","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-01-16T12:42:23.441515Z","iopub.execute_input":"2023-01-16T12:42:23.44195Z","iopub.status.idle":"2023-01-16T12:42:23.456229Z","shell.execute_reply.started":"2023-01-16T12:42:23.441861Z","shell.execute_reply":"2023-01-16T12:42:23.455174Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# read the datasets\ntrain_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\", usecols=['id', 'text', 'target'])\ntest_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\", usecols=['text'])","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:23.45805Z","iopub.execute_input":"2023-01-16T12:42:23.458371Z","iopub.status.idle":"2023-01-16T12:42:23.531514Z","shell.execute_reply.started":"2023-01-16T12:42:23.45832Z","shell.execute_reply":"2023-01-16T12:42:23.530616Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# check target ratio\ntrain_df.target.value_counts().plot(kind='bar', title='Non-Disaster[0] vs 
Disaster[1]');","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:23.532927Z","iopub.execute_input":"2023-01-16T12:42:23.533253Z","iopub.status.idle":"2023-01-16T12:42:23.752773Z","shell.execute_reply.started":"2023-01-16T12:42:23.53322Z","shell.execute_reply":"2023-01-16T12:42:23.751759Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# inpect tweet length\ntrain_df[\"Words Per Tweet\"] = train_df[\"text\"].str.split().apply(len)\ntrain_df.boxplot(\"Words Per Tweet\", by=\"target\", grid=False,\n showfliers=False, color=\"black\")\nplt.suptitle(\"\")\nplt.xlabel(\"\");","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:23.755503Z","iopub.execute_input":"2023-01-16T12:42:23.755881Z","iopub.status.idle":"2023-01-16T12:42:23.964531Z","shell.execute_reply.started":"2023-01-16T12:42:23.755837Z","shell.execute_reply":"2023-01-16T12:42:23.963623Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"Transformer models have a maximum input sequence length. For applications using 🤗 DistilBERT or DeBERTa, the maximum context size -`max_position_embeddings`- is 512 tokens, which amounts to a few paragraphs of text. 
so this is not a problem here.","metadata":{}},{"cell_type":"markdown","source":"# Preprocessing","metadata":{}},{"cell_type":"code","source":"# remove duplicates\ntrain_df.drop_duplicates(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:23.965894Z","iopub.execute_input":"2023-01-16T12:42:23.966415Z","iopub.status.idle":"2023-01-16T12:42:23.976703Z","shell.execute_reply.started":"2023-01-16T12:42:23.966379Z","shell.execute_reply":"2023-01-16T12:42:23.97586Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import re \ndef decontracted(phrase):\n # specific\n phrase = re.sub(r\"won\\'t\", \"will not\", phrase)\n phrase = re.sub(r\"can\\'t\", \"can not\", phrase)\n phrase = re.sub(r\"don\\'t\", \"do not\", phrase)\n\n # general\n phrase = re.sub(r\"n\\'t\", \" not\", phrase)\n phrase = re.sub(r\"\\'re\", \" are\", phrase)\n phrase = re.sub(r\"\\'s\", \" is\", phrase)\n phrase = re.sub(r\"\\'d\", \" would\", phrase)\n phrase = re.sub(r\"\\'ll\", \" will\", phrase)\n phrase = re.sub(r\"\\'t\", \" not\", phrase)\n phrase = re.sub(r\"\\'ve\", \" have\", phrase)\n phrase = re.sub(r\"\\'m\", \" am\", phrase)\n return phrase\n\ndef preprocessing(text):\n text = text.replace('#','')\n text = decontracted(text)\n text = re.sub('\\S*@\\S*\\s?','',text)\n text = re.sub('http[s]?:(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',text)\n return text.strip()\n\ntrain_df['text'] = train_df['text'].apply(preprocessing)\ntest_df['text'] = test_df['text'].apply(preprocessing)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:23.978013Z","iopub.execute_input":"2023-01-16T12:42:23.978562Z","iopub.status.idle":"2023-01-16T12:42:24.197382Z","shell.execute_reply.started":"2023-01-16T12:42:23.978528Z","shell.execute_reply":"2023-01-16T12:42:24.196487Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from sklearn.model_selection import 
train_test_split\n\ntrain_df, valid_df = train_test_split(train_df, test_size=0.33, random_state=10)\n# print shapes\ntrain_df.shape, valid_df.shape, test_df.shape","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:24.198649Z","iopub.execute_input":"2023-01-16T12:42:24.198996Z","iopub.status.idle":"2023-01-16T12:42:24.679375Z","shell.execute_reply.started":"2023-01-16T12:42:24.198962Z","shell.execute_reply":"2023-01-16T12:42:24.678315Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# convert to Hugging Face Datasets\nfrom datasets import Dataset, load_dataset\n\ntrain_ds = Dataset.from_pandas(train_df)\nvalid_ds = Dataset.from_pandas(valid_df)\ntest_ds = Dataset.from_pandas(test_df)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:24.680828Z","iopub.execute_input":"2023-01-16T12:42:24.681142Z","iopub.status.idle":"2023-01-16T12:42:25.24823Z","shell.execute_reply.started":"2023-01-16T12:42:24.681115Z","shell.execute_reply":"2023-01-16T12:42:25.247285Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"you can try any model by replacing `model_ckpt` value.","metadata":{}},{"cell_type":"code","source":"from transformers import AutoConfig\n\n# model_ckpt = \"distilbert-base-uncased\"\nmodel_ckpt = \"microsoft/deberta-v3-base\"\nconfig = AutoConfig.from_pretrained(model_ckpt)\n\n# check model configrations\nconfig","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:25.249657Z","iopub.execute_input":"2023-01-16T12:42:25.250206Z","iopub.status.idle":"2023-01-16T12:42:27.592436Z","shell.execute_reply.started":"2023-01-16T12:42:25.25017Z","shell.execute_reply":"2023-01-16T12:42:27.591508Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from transformers import AutoTokenizer\n\ntokenizer = 
AutoTokenizer.from_pretrained(model_ckpt)","metadata":{"_kg_hide-output":false,"execution":{"iopub.status.busy":"2023-01-16T12:42:27.596814Z","iopub.execute_input":"2023-01-16T12:42:27.597802Z","iopub.status.idle":"2023-01-16T12:42:42.774673Z","shell.execute_reply.started":"2023-01-16T12:42:27.597763Z","shell.execute_reply":"2023-01-16T12:42:42.77352Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# apply it to the whole dataset\ntrain_tok = train_ds.map(lambda row: tokenizer(row['text'], padding=True, truncation=True), batched=True)\nvalid_tok = valid_ds.map(lambda row: tokenizer(row['text'], padding=True, truncation=True), batched=True)\ntest_tok = test_ds.map(lambda row: tokenizer(row['text'], padding=True, truncation=True), batched=True)","metadata":{"_kg_hide-output":false,"execution":{"iopub.status.busy":"2023-01-16T12:42:42.776135Z","iopub.execute_input":"2023-01-16T12:42:42.77736Z","iopub.status.idle":"2023-01-16T12:42:44.075615Z","shell.execute_reply.started":"2023-01-16T12:42:42.777303Z","shell.execute_reply":"2023-01-16T12:42:44.074636Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# inspect tokenizer results\nprint(train_tok[0]), tokenizer.decode(train_tok[0]['input_ids'])","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:44.077218Z","iopub.execute_input":"2023-01-16T12:42:44.07785Z","iopub.status.idle":"2023-01-16T12:42:48.813585Z","shell.execute_reply.started":"2023-01-16T12:42:44.077811Z","shell.execute_reply":"2023-01-16T12:42:48.812612Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# inspect sepcial tokens & ids\ntokenizer.all_special_tokens, 
tokenizer.all_special_ids","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:42:48.815197Z","iopub.execute_input":"2023-01-16T12:42:48.815957Z","iopub.status.idle":"2023-01-16T12:42:48.822474Z","shell.execute_reply.started":"2023-01-16T12:42:48.815918Z","shell.execute_reply":"2023-01-16T12:42:48.821388Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Pretrained Model\n1. Transformers as Feature Extractors\n2. Fine-Tuning Transformers\n\n\n## Transformers as Feature Extractors\nUsing a transformer as a feature extractor is fairly simple. We freeze the body's weights during training and use the hidden states as features for the classifier. With this approach we can quickly train a small or shallow model. ","metadata":{}},{"cell_type":"code","source":"from transformers import AutoModel\nimport torch\n\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Here CUDA\nmodel = AutoModel.from_pretrained(model_ckpt).to(device)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-01-16T12:42:48.824951Z","iopub.execute_input":"2023-01-16T12:42:48.826249Z","iopub.status.idle":"2023-01-16T12:43:34.413237Z","shell.execute_reply.started":"2023-01-16T12:42:48.826212Z","shell.execute_reply":"2023-01-16T12:43:34.412105Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Feature Extraction\ndef extract_hidden_state(batch):\n    # Place model inputs on the same device as the model\n    inputs = {k: v.to(device) for k,v in batch.items()\n              if k in tokenizer.model_input_names} # ['input_ids', 'attention_mask']\n    # Extract last hidden states\n    with torch.no_grad():\n        last_hidden_state = model(**inputs).last_hidden_state\n    # Return vector for [CLS] token & Place it on CPU for compatibility with other dataset columns\n    return {'hidden_state': 
last_hidden_state[:,0].cpu().numpy()}","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:34.415085Z","iopub.execute_input":"2023-01-16T12:43:34.415434Z","iopub.status.idle":"2023-01-16T12:43:34.421295Z","shell.execute_reply.started":"2023-01-16T12:43:34.415405Z","shell.execute_reply":"2023-01-16T12:43:34.420256Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Since our model expects tensors as inputs, the next thing to do is:\n# convert the input_ids and attention_mask columns to the \"torch\" format, as follows:\ntrain_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'target'])\nvalid_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'target'])","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:34.422764Z","iopub.execute_input":"2023-01-16T12:43:34.423341Z","iopub.status.idle":"2023-01-16T12:43:34.432566Z","shell.execute_reply.started":"2023-01-16T12:43:34.423304Z","shell.execute_reply":"2023-01-16T12:43:34.431656Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# extract features\ntrain_hidden = train_tok.map(extract_hidden_state, batched=True)\nvalid_hidden = valid_tok.map(extract_hidden_state, batched=True)","metadata":{"_kg_hide-output":false,"execution":{"iopub.status.busy":"2023-01-16T12:43:34.434086Z","iopub.execute_input":"2023-01-16T12:43:34.434442Z","iopub.status.idle":"2023-01-16T12:43:50.311458Z","shell.execute_reply.started":"2023-01-16T12:43:34.434408Z","shell.execute_reply":"2023-01-16T12:43:50.310371Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"X_train = np.array(train_hidden['hidden_state'])\ny_train = np.array(train_hidden['target'])\nX_valid = np.array(valid_hidden['hidden_state'])\ny_valid = np.array(valid_hidden['target'])\n\nX_train.shape, 
X_valid.shape","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:50.313985Z","iopub.execute_input":"2023-01-16T12:43:50.314625Z","iopub.status.idle":"2023-01-16T12:43:50.385062Z","shell.execute_reply.started":"2023-01-16T12:43:50.314586Z","shell.execute_reply":"2023-01-16T12:43:50.383907Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from sklearn.linear_model import LogisticRegression\n\nlr = LogisticRegression(max_iter=4000)\nlr.fit(X_train, y_train)\nlr.score(X_valid, y_valid)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:50.386678Z","iopub.execute_input":"2023-01-16T12:43:50.387193Z","iopub.status.idle":"2023-01-16T12:43:52.610237Z","shell.execute_reply.started":"2023-01-16T12:43:50.387154Z","shell.execute_reply":"2023-01-16T12:43:52.60923Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"It is often a good idea to have a few baseline metrics, or else we may end up thinking our model works great when in fact it is doing worse than basic models.","metadata":{}},{"cell_type":"code","source":"from sklearn.dummy import DummyClassifier\n\ndummy_clf = DummyClassifier(strategy=\"most_frequent\")\ndummy_clf.fit(X_train, y_train)\ndummy_clf.score(X_valid, y_valid)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:52.611832Z","iopub.execute_input":"2023-01-16T12:43:52.612893Z","iopub.status.idle":"2023-01-16T12:43:52.62545Z","shell.execute_reply.started":"2023-01-16T12:43:52.612847Z","shell.execute_reply":"2023-01-16T12:43:52.623465Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"Great, we have a 25% accuracy boost with Transformer Features 👏","metadata":{}},{"cell_type":"code","source":"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score\nimport seaborn as sns\n\ny_pred = lr.predict(X_valid)\n\n# plot the confusion 
matrix\nsns.heatmap(confusion_matrix(y_valid, y_pred), annot=True, fmt='.4g');\n\nprint('Accuracy', round(accuracy_score(y_valid, y_pred), 4)*100, '%')\nprint('Precision', round(precision_score(y_valid, y_pred), 2)*100, '%')\nprint('Recall', round(recall_score(y_valid, y_pred), 2)*100, '%')\nprint(classification_report(y_valid, y_pred, target_names=['Not Disaster', 'Disaster'])) # [0, 1]","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:52.627002Z","iopub.execute_input":"2023-01-16T12:43:52.627689Z","iopub.status.idle":"2023-01-16T12:43:52.99643Z","shell.execute_reply.started":"2023-01-16T12:43:52.627649Z","shell.execute_reply":"2023-01-16T12:43:52.995188Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Fine-Tuning Transformers\n","metadata":{}},{"cell_type":"code","source":"from transformers import AutoModelForSequenceClassification\n\nmodel = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2).to(device)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-01-16T12:43:52.997802Z","iopub.execute_input":"2023-01-16T12:43:52.998426Z","iopub.status.idle":"2023-01-16T12:43:57.197706Z","shell.execute_reply.started":"2023-01-16T12:43:52.998386Z","shell.execute_reply":"2023-01-16T12:43:57.196785Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score\n\n# defining metrics for evaluation\ndef compute_matrics(pred):\n labels = pred.label_ids\n preds = pred.predictions.argmax(-1)\n \n acc = accuracy_score(labels, preds)\n f1 = f1_score(labels, preds, average='weighted')\n rec = recall_score(labels, preds)\n pre = precision_score(labels, preds)\n \n return {'accuracy':acc,\n 'f1-score':f1,\n 'recall': rec,\n 'precession': 
pre}","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:57.199372Z","iopub.execute_input":"2023-01-16T12:43:57.199763Z","iopub.status.idle":"2023-01-16T12:43:57.206511Z","shell.execute_reply.started":"2023-01-16T12:43:57.199704Z","shell.execute_reply":"2023-01-16T12:43:57.205392Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"### Build the Trainer","metadata":{}},{"cell_type":"code","source":"from transformers import Trainer, TrainingArguments\n\n# Hyperparameters\nEPOCHS = 3 # 2 -> 82,6 | 5 -> 82.0\nBATCH_SIZE = 64\nLOGGING_STEP = len(train_ds) // BATCH_SIZE\nMODEL_NAME = f'{model_ckpt}_finetuned-disatser-tweets'\n\n# Arguments\ntraining_args = TrainingArguments(\n output_dir=MODEL_NAME,\n num_train_epochs=EPOCHS,\n learning_rate=1e-5, # 0.00001\n per_device_train_batch_size=BATCH_SIZE, # batch size per device during training\n per_device_eval_batch_size=BATCH_SIZE, # batch size for evaluation\n weight_decay=0.1,\n disable_tqdm=False,\n logging_steps=LOGGING_STEP,\n push_to_hub=False,\n log_level='error',\n report_to='none' # turn wandb off\n)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:57.208019Z","iopub.execute_input":"2023-01-16T12:43:57.208636Z","iopub.status.idle":"2023-01-16T12:43:57.348476Z","shell.execute_reply.started":"2023-01-16T12:43:57.208602Z","shell.execute_reply":"2023-01-16T12:43:57.347569Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# the Distilbert model needs the target named as label\ntrain_tok = train_tok.rename_column(\"target\", \"label\")\nvalid_tok = valid_tok.rename_column(\"target\", 
\"label\")","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:57.349762Z","iopub.execute_input":"2023-01-16T12:43:57.350986Z","iopub.status.idle":"2023-01-16T12:43:57.360609Z","shell.execute_reply.started":"2023-01-16T12:43:57.350949Z","shell.execute_reply":"2023-01-16T12:43:57.359593Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# train the model\ntrainer = Trainer(\n model=model,\n args=training_args,\n compute_metrics=compute_matrics,\n train_dataset=train_tok,\n eval_dataset=valid_tok,\n tokenizer=tokenizer\n)\n\ntrainer.train();","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:43:57.362268Z","iopub.execute_input":"2023-01-16T12:43:57.362693Z","iopub.status.idle":"2023-01-16T12:46:42.317216Z","shell.execute_reply.started":"2023-01-16T12:43:57.362658Z","shell.execute_reply":"2023-01-16T12:46:42.316302Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# get predictions\npreds_output = trainer.predict(valid_tok)\n\n# print metrics\npreds_output.metrics","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:46:42.319878Z","iopub.execute_input":"2023-01-16T12:46:42.320512Z","iopub.status.idle":"2023-01-16T12:46:48.446387Z","shell.execute_reply.started":"2023-01-16T12:46:42.320472Z","shell.execute_reply":"2023-01-16T12:46:48.445403Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Error analysis\n\nlet's investigate the model's predictions, A simple yet powerful technique is to sort the validation samples by the model loss. When we pass the label during the forward pass, the loss is automatically calculated and returned. 
Our goal here is to find **Wrong labels**.\n\nHere's a function that returns the loss along with the predicted label:","metadata":{}},{"cell_type":"code","source":"from torch.nn.functional import cross_entropy\n\ndef get_preditions_losses(batch):\n    # Place model inputs on the same device as the model\n    inputs = {k: v.to(device) for k,v in batch.items()\n              if k in tokenizer.model_input_names} # ['input_ids', 'attention_mask']\n    # get predictions & loss\n    with torch.no_grad():\n        output = model(**inputs)\n        preds = torch.argmax(output.logits, axis=-1)\n        loss = cross_entropy(output.logits, batch['label'].to(device),\n                             reduction='none')\n        \n    # Place outputs on CPU for compatibility with other dataset columns\n    return {'loss': loss.cpu().numpy(),\n            'predicted_label':preds.cpu().numpy()}","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:46:48.452118Z","iopub.execute_input":"2023-01-16T12:46:48.452442Z","iopub.status.idle":"2023-01-16T12:46:48.459839Z","shell.execute_reply.started":"2023-01-16T12:46:48.452412Z","shell.execute_reply":"2023-01-16T12:46:48.458803Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# get preditions & loss\nvalid_tok = valid_tok.map(get_preditions_losses, batched=True)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:46:48.46147Z","iopub.execute_input":"2023-01-16T12:46:48.462171Z","iopub.status.idle":"2023-01-16T12:46:53.17598Z","shell.execute_reply.started":"2023-01-16T12:46:48.462099Z","shell.execute_reply":"2023-01-16T12:46:53.175131Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# create a DataFrame with the texts, predicted_labels, and loss\n\n# Convert our dataset pandas\nvalid_tok.set_format(\"pandas\")\n\n# valid_tok\nerrors = valid_tok[:][['id', 'text', 'label', 'predicted_label', 'loss']]\nerrors['label'] = errors['label'].astype(str)\nerrors['predicted_label'] = 
errors['predicted_label'].astype(str)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:46:53.177502Z","iopub.execute_input":"2023-01-16T12:46:53.177892Z","iopub.status.idle":"2023-01-16T12:46:53.200068Z","shell.execute_reply.started":"2023-01-16T12:46:53.177846Z","shell.execute_reply":"2023-01-16T12:46:53.199238Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# inspect 100 highest losses\nerrors.sort_values(by='loss', ascending=False).iloc[:101, :]","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:46:53.201196Z","iopub.execute_input":"2023-01-16T12:46:53.201525Z","iopub.status.idle":"2023-01-16T12:46:53.221647Z","shell.execute_reply.started":"2023-01-16T12:46:53.201491Z","shell.execute_reply":"2023-01-16T12:46:53.22059Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Disaster miss labeled:\nincorrect_ids_1 = [4043, 5247, 5621, 7196, 6566, 952, 921, 1251, 3460, \n 853, 3395, 4787, 6450, 3079, 5124, 6211, 4083, 5109,\n 10499, 5990, 2675, 9432, 843, 3042, 875, 6159, 5760,\n 6862, 2119, 10650, 3475, 4596]\n\n# Non-Disaster miss labeled:\nincorrect_ids_0 = [8721, 3221, 8489, 10318, 10008, 8704, 7415, 2241, 516,\n 9607, 3903, 3240, 1560, 7984, 10237, 8578, 2582, 8450,\n 7862, 1239, 2929, 6624, 8880, 5576, 9931, 7174, 10418,\n 1222, 9029, 2076, 4882, 7002, 10575, 4576, 3005, 571,\n 8577, 6034, 4895, 1065, 513, 9337, 5186, 2969, 2340,\n 2121, 3591, 1865, 5559, 5791, 8528, 4675, 3613, 5713,\n 9738, 2780, 5638, 4395, 5479, 7362, 4441]","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:46:53.223066Z","iopub.execute_input":"2023-01-16T12:46:53.223402Z","iopub.status.idle":"2023-01-16T12:46:53.231527Z","shell.execute_reply.started":"2023-01-16T12:46:53.223369Z","shell.execute_reply":"2023-01-16T12:46:53.230524Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"We can see that there is alot of misslabeled tweets, a lot of tweets 
contain (Hazardous, explode, army, dead, collapse) are wrongly labeled as Disaster and, of course, there are a few wrong model predictions.","metadata":{}},{"cell_type":"markdown","source":"Deep learning models are exceptionally good at finding and exploiting shortcuts to get to a prediction. For this reason, it is also worth investing time into looking at the examples that the model is most confident about, so that we can be confident that the model does not improperly exploit certain features of the text.","metadata":{}},{"cell_type":"code","source":"# lowest 10 losses\nerrors.sort_values(by='loss', ascending=True).head(10)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T12:46:53.233009Z","iopub.execute_input":"2023-01-16T12:46:53.234145Z","iopub.status.idle":"2023-01-16T12:46:53.250381Z","shell.execute_reply.started":"2023-01-16T12:46:53.234105Z","shell.execute_reply":"2023-01-16T12:46:53.249712Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Edit miss-labeled labels","metadata":{}},{"cell_type":"code","source":"# re-read & correct labels\ntrain_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\", usecols=['id', 'text', 'target'])\ntrain_df.rename({'target':'label'}, axis='columns', inplace=True)\ntrain_df.drop_duplicates(inplace=True)\n\n# Edit Disaster miss-labeled labels\ntrain_df.loc[train_df['id'].isin(incorrect_ids_1), 'label'] = 1\n\n# Edit Non-Disaster miss-labeled labels\ntrain_df.loc[train_df['id'].isin(incorrect_ids_0), 'label'] = 0","metadata":{"execution":{"iopub.status.busy":"2023-01-16T13:53:42.531107Z","iopub.execute_input":"2023-01-16T13:53:42.531555Z","iopub.status.idle":"2023-01-16T13:53:42.572263Z","shell.execute_reply.started":"2023-01-16T13:53:42.531518Z","shell.execute_reply":"2023-01-16T13:53:42.570991Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Re-run the model","metadata":{}},{"cell_type":"code","source":"# split the 
dataset\ntrain_df, valid_df = train_test_split(train_df, test_size=0.33, random_state=10)\n\n# convert to Hugging Face Datasets\ntrain_ds = Dataset.from_pandas(train_df)\nvalid_ds = Dataset.from_pandas(valid_df)\n\n# apply tokenizer to the whole dataset\ntrain_tok = train_ds.map(lambda row: tokenizer(row['text'], padding=True, truncation=True), batched=True)\nvalid_tok = valid_ds.map(lambda row: tokenizer(row['text'], padding=True, truncation=True), batched=True)\n\n# train the model\nmodel = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2).to(device)\ntraining_args = TrainingArguments(\n output_dir=MODEL_NAME,\n num_train_epochs=5,\n learning_rate=1e-5, # 0.00001\n per_device_train_batch_size=BATCH_SIZE, # batch size per device during training\n per_device_eval_batch_size=BATCH_SIZE, # batch size for evaluation\n weight_decay=0.1,\n disable_tqdm=False,\n logging_steps=LOGGING_STEP,\n push_to_hub=False,\n log_level='error',\n report_to='none' # turn wandb off\n)\ntrainer = Trainer(\n model=model,\n args=training_args,\n compute_metrics=compute_matrics,\n train_dataset=train_tok,\n eval_dataset=valid_tok,\n tokenizer=tokenizer\n)\ntrainer.train();","metadata":{"execution":{"iopub.status.busy":"2023-01-16T13:53:43.875061Z","iopub.execute_input":"2023-01-16T13:53:43.875507Z","iopub.status.idle":"2023-01-16T13:58:47.799538Z","shell.execute_reply.started":"2023-01-16T13:53:43.875466Z","shell.execute_reply":"2023-01-16T13:58:47.798231Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# get predictions after edit labels\npreds_output = trainer.predict(valid_tok)\n\n# print 
metrics\npreds_output.metrics","metadata":{"execution":{"iopub.status.busy":"2023-01-16T13:58:47.802487Z","iopub.execute_input":"2023-01-16T13:58:47.803028Z","iopub.status.idle":"2023-01-16T13:58:55.26739Z","shell.execute_reply.started":"2023-01-16T13:58:47.802979Z","shell.execute_reply":"2023-01-16T13:58:55.265789Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"Wow we've reached ~ 88% accuracy! 👏","metadata":{}},{"cell_type":"markdown","source":"# Submit results","metadata":{}},{"cell_type":"code","source":"# get predictions\npreds_output = trainer.predict(test_tok)\ny_preds = np.argmax(preds_output.predictions, axis=1)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T13:59:01.600391Z","iopub.execute_input":"2023-01-16T13:59:01.600871Z","iopub.status.idle":"2023-01-16T13:59:09.235009Z","shell.execute_reply.started":"2023-01-16T13:59:01.600833Z","shell.execute_reply":"2023-01-16T13:59:09.233602Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# read the submission file\nsample_submission = pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")\n\n# assign the predictions to the submission df\nsample_submission.target = y_preds\nsample_submission.head()\n\n# save submission file\nsample_submission.to_csv(\"submission.csv\", columns = ['id','target'], index=False)","metadata":{"execution":{"iopub.status.busy":"2023-01-16T13:59:09.242137Z","iopub.execute_input":"2023-01-16T13:59:09.246011Z","iopub.status.idle":"2023-01-16T13:59:09.276998Z","shell.execute_reply.started":"2023-01-16T13:59:09.245944Z","shell.execute_reply":"2023-01-16T13:59:09.275668Z"},"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"## Conclusion\n\n- BERT and its variants, like RoBERTa and DistilBERT, belong to this class of architectures. 
The representation computed for a given token in this architecture depends both on the left (before the token) and the right (after the token) contexts. This is often called bidirectional attention.\n- Distilled model is 40% smaller than the original but still maintains about 97% performance on the various NLP tasks.\n- DeBERTa improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. With those two improvements, DeBERTa out perform RoBERTa on a majority of NLU tasks with 80GB training data. \n- These models convert an input sequence of text into a rich numerical representation that is well suited for tasks like text classification or named entity recognition.\n- Transformers as Feature Extractors are good start specially if you don't have a GPU.\n- Fine-Tuning Transformers give you the best results but require a GPU to save time.\n- Error Analysis can detect dataset issues, here we found lots of misslabeled data.\n\n\n\n**For any suggestions, please let me know in the comments! Thanks. \nHappy Learning🤗**","metadata":{}}]}
|
Natural-Language-Processing-with-Disaster-Tweets/V3/nlp-getting-started.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95b0e2b687762de374b2b6901547fa22d764807e72d344c829640f3756f5a70f
|
| 3 |
+
size 607343
|