{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "H1hq1Bwr02H_" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /opt/homebrew/lib/python3.11/site-packages (4.27.1)\n", "Requirement already satisfied: filelock in /opt/homebrew/lib/python3.11/site-packages (from transformers) (3.9.1)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (0.13.2)\n", "Requirement already satisfied: numpy>=1.17 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (1.24.2)\n", "Requirement already satisfied: packaging>=20.0 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from transformers) (23.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from transformers) (6.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: requests in /opt/homebrew/lib/python3.11/site-packages (from transformers) (2.28.2)\n", "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (0.13.2)\n", "Requirement already satisfied: tqdm>=4.27 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (4.65.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/homebrew/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/homebrew/lib/python3.11/site-packages (from requests->transformers) (3.1.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/lib/python3.11/site-packages (from requests->transformers) (1.26.15)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/lib/python3.11/site-packages (from requests->transformers) (2022.12.7)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n", "Requirement already satisfied: numpy in /opt/homebrew/lib/python3.11/site-packages (1.24.2)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n", "Requirement already satisfied: torch in /opt/homebrew/lib/python3.11/site-packages (2.0.0)\n", "Requirement already satisfied: filelock in /opt/homebrew/lib/python3.11/site-packages (from torch) (3.9.1)\n", "Requirement already satisfied: typing-extensions in /opt/homebrew/lib/python3.11/site-packages (from torch) (4.5.0)\n", "Requirement already satisfied: sympy in /opt/homebrew/lib/python3.11/site-packages (from torch) (1.11.1)\n", "Requirement already satisfied: networkx in /opt/homebrew/lib/python3.11/site-packages (from torch) (3.0)\n", "Requirement already satisfied: jinja2 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from torch) (3.1.2)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from jinja2->torch) (2.1.2)\n", "Requirement already satisfied: mpmath>=0.19 in /opt/homebrew/lib/python3.11/site-packages (from sympy->torch) (1.3.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n", "Requirement already satisfied: numpy in /opt/homebrew/lib/python3.11/site-packages (1.24.2)\n", "Requirement already satisfied: scikit-learn in /opt/homebrew/lib/python3.11/site-packages (1.2.2)\n", "Requirement already satisfied: scipy>=1.3.2 in /opt/homebrew/lib/python3.11/site-packages (from scikit-learn) (1.10.1)\n", "Requirement already satisfied: joblib>=1.1.1 in /opt/homebrew/lib/python3.11/site-packages (from scikit-learn) (1.2.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/homebrew/lib/python3.11/site-packages (from scikit-learn) (3.1.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip3 install transformers\n", "!pip3 install -q git+https://github.com/gmihaila/ml_things.git\n", "!pip3 install numpy\n", "!pip3 install torch\n", "!pip3 install numpy scikit-learn\n", "\n", "import io\n", "import os\n", "import torch\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "from torch.utils.data import Dataset, DataLoader\n", "from transformers import (AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, set_seed)\n", "from sklearn.metrics import classification_report, accuracy_score\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xHJyRk6MEENr", "outputId": "a6258cd9-61c0-4b68-9177-94190620158e" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at mideind/IceBERT were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']\n", "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Model loaded to `cpu`\n", "-----------------\n", "-----------------\n", "-----------------\n", "New text: Viðeigandi aðgerðir eru á næsta leiti en sá sakaði greiddi 15.000 kr.\n", "Formality: Formal\n", "Professional: Unprofessional\n", "Friendliness: Unfriendly\n", "Overall Classification: Bad\n" ] } ], "source": [ "# Setting a fixed random seed for reproducibility of results across runs.\n", "set_seed(123)\n", "epochs = 4 # Number of times to iterate over the entire dataset during training\n", "batch_size = 32 # Number of samples processed before the model is updated\n", "max_length = 200 # Maximum length of the input sequences\n", "# Setting the device to GPU if available, else CPU\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "model_name_or_path = 'mideind/IceBERT' # Specifying the pre-trained model to use\n", "# Dictionary mapping of labels to ids is commented out. Presumed defined elsewhere\n", "labels_ids = {'informal': 0, 'formal': 1}\n", "n_labels = len(labels_ids) # Counting the number of unique labels\n", "\n", "# Defining a custom Dataset class for handling the formality dataset\n", "class FormalityDataset(Dataset):\n", "\n", " def __init__(self, path, use_tokenizer, labels_ids, max_sequence_len=None):\n", " # Check if the provided path is a directory\n", " if not os.path.isdir(path):\n", " raise ValueError('Invalid `path` variable! Needs to be a directory')\n", " # Use the tokenizer's max length if no specific max_sequence_len is provided\n", " max_sequence_len = use_tokenizer.max_len if max_sequence_len is None else max_sequence_len\n", " texts = []\n", " labels = []\n", " print('Reading partitions...')\n", "\n", " # Reading data files for each label\n", " for label, label_id, in tqdm(labels_ids.items()):\n", " sentiment_path = os.path.join(path, label)\n", " files_names = os.listdir(sentiment_path)\n", " print('Reading %s files...' % label)\n", " # Reading individual files\n", " for file_name in tqdm(files_names):\n", " file_path = os.path.join(sentiment_path, file_name)\n", " with io.open(file_path, mode='r', encoding='ISO-8859-1') as f:\n", " lines = f.readlines()\n", " for line in lines:\n", " texts.append(line.strip())\n", " labels.append(label_id)\n", "\n", " self.n_examples = len(labels)\n", " print('Using tokenizer on all texts. This can take a while...')\n", " # Tokenizing all texts and adding special tokens, padding, and truncating to max_length\n", " self.inputs = use_tokenizer(texts, add_special_tokens=True, truncation=True, padding=True, return_tensors='pt', max_length=max_sequence_len)\n", " self.sequence_len = self.inputs['input_ids'].shape[-1]\n", " print('Texts padded or truncated to %d length!' % self.sequence_len)\n", " self.inputs.update({'labels':torch.tensor(labels)})\n", " print('Finished!\\n')\n", "\n", " def __len__(self):\n", " # Returns the number of examples\n", " return self.n_examples\n", "\n", " def __getitem__(self, item):\n", " # Returns a specific item from the dataset\n", " return {key: self.inputs[key][item] for key in self.inputs.keys()}\n", "\n", "# Training function, which updates the model's weights based on the training data\n", "def train(dataloader, optimizer_, scheduler_, device_):\n", " global model # Reference to the model being trained\n", " predictions_labels = []\n", " true_labels = []\n", " total_loss = 0\n", "\n", " model.train() # Set the model to training mode\n", "\n", " # Iterate over each batch in the dataloader\n", " for batch in tqdm(dataloader, total=len(dataloader)):\n", " true_labels += batch['labels'].numpy().flatten().tolist()\n", " batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n", " model.zero_grad() # Reset gradients\n", " outputs = model(**batch)\n", " loss, logits = outputs[:2]\n", " total_loss += loss.item()\n", " loss.backward() # Compute gradient of loss w.r.t. model parameters\n", " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Clip gradients to avoid explosion\n", " optimizer.step() # Update model parameters\n", " scheduler.step() # Update learning rate\n", " logits = logits.detach().cpu().numpy()\n", " predictions_labels += logits.argmax(axis=-1).flatten().tolist()\n", "\n", " avg_epoch_loss = total_loss / len(dataloader) # Compute average loss for the epoch\n", " return true_labels, predictions_labels, avg_epoch_loss\n", "\n", "# Function to evaluate the model on a validation set\n", "def validation(dataloader, device_):\n", " global model # Reference to the model being evaluated\n", " predictions_labels = []\n", " true_labels = []\n", " total_loss = 0\n", "\n", " model.eval() # Set the model to evaluation mode\n", "\n", " # Iterate over each batch in the dataloader\n", " for batch in tqdm(dataloader, total=len(dataloader)):\n", " true_labels += batch['labels'].numpy().flatten().tolist()\n", " batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n", "\n", " with torch.no_grad(): # Disable gradient computation\n", " outputs = model(**batch)\n", " loss, logits = outputs[:2]\n", " logits = logits.detach().cpu().numpy()\n", " total_loss += loss.item()\n", " predict_content = logits.argmax(axis=-1).flatten().tolist()\n", " predictions_labels += predict_content\n", "\n", " avg_epoch_loss = total_loss / len(dataloader) # Compute average loss for the validation\n", " return true_labels, predictions_labels, avg_epoch_loss\n", "\n", "# Load the model and tokenizer from Hugging Face's Transformers library\n", "model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)\n", "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)\n", "model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)\n", "\n", "model.to(device) # Move model to the specified device (GPU or CPU)\n", "\n", "print('Model loaded to `%s`' % device)\n", "\n", "# Load a CSV file into a DataFrame\n", "#df = pd.read_csv('Book3.csv')\n", "\n", "file_path = '/Users/karalifingibergsdottir/Desktop/Book3.csv'\n", "df = pd.read_csv(file_path)\n", "\n", "# Extract columns from the DataFrame\n", "sentences = df['Sentence'].values\n", "formality_labels = df['Formality'].values\n", "professional_labels = df['Professional'].values\n", "friendliness_labels = df['Friendlyness'].values # Note: Typo in the document itself\n", "\n", "# Tokenize sentences for TF-IDF vectorization\n", "tokenizer = AutoTokenizer.from_pretrained(\"mideind/IceBERT\")\n", "tokenized_sentences = [tokenizer(sentence, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=128) for sentence in sentences]\n", "input_ids = [tokenized_sentence.input_ids[0] for tokenized_sentence in tokenized_sentences]\n", "input_strings = [' '.join(map(str, input_id)) for input_id in input_ids]\n", "tfidf_vectorizer = TfidfVectorizer()\n", "X = tfidf_vectorizer.fit_transform(input_strings)\n", "\n", "# Function to train a Naive Bayes classifier\n", "def train_classifier(X, labels):\n", " classifier = MultinomialNB()\n", " classifier.fit(X, labels)\n", " return classifier\n", "\n", "# Train Naive Bayes classifiers for each aspect of text (formality, professionalism, friendliness)\n", "formality_classifier = train_classifier(X, formality_labels)\n", "professional_classifier = train_classifier(X, professional_labels)\n", "friendliness_classifier = train_classifier(X, friendliness_labels)\n", "\n", "# Function to predict classifications for a new text\n", "def predict_text_classifications(text):\n", " tokenized_text = tokenizer(text, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=128)\n", " input_id = tokenized_text.input_ids[0]\n", " input_string = ' '.join(map(str, input_id))\n", "\n", " X_new = tfidf_vectorizer.transform([input_string])\n", "\n", " formality_pred = formality_classifier.predict(X_new)[0]\n", " professional_pred = professional_classifier.predict(X_new)[0]\n", " friendliness_pred = friendliness_classifier.predict(X_new)[0]\n", "\n", " # Determine the overall classification based on a simple majority rule\n", " positive_count = formality_pred + professional_pred + friendliness_pred\n", " classification = \"Good\" if positive_count >= 2 else \"Bad\"\n", "\n", " return formality_pred, professional_pred, friendliness_pred, classification\n", "\n", "print(f\"-----------------\")\n", "print(f\"-----------------\")\n", "print(f\"-----------------\")\n", "\n", "# Example usage of the prediction function\n", "new_text = \"Viðeigandi aðgerðir eru á næsta leiti en sá sakaði greiddi 15.000 kr.\"\n", "formality_pred, professional_pred, friendliness_pred, overall_classification = predict_text_classifications(new_text)\n", "\n", "# Print predictions for the new text\n", "print(f\"New text: {new_text}\")\n", "print(f\"Formality: {'Formal' if formality_pred else 'Informal'}\")\n", "print(f\"Professional: {'Professional' if professional_pred else 'Unprofessional'}\")\n", "print(f\"Friendliness: {'Friendly' if friendliness_pred else 'Unfriendly'}\")\n", "print(f\"Overall Classification: {overall_classification}\")\n" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 0 }