{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "H1hq1Bwr02H_"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: transformers in /opt/homebrew/lib/python3.11/site-packages (4.27.1)\n",
            "Requirement already satisfied: filelock in /opt/homebrew/lib/python3.11/site-packages (from transformers) (3.9.1)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (0.13.2)\n",
            "Requirement already satisfied: numpy>=1.17 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (1.24.2)\n",
            "Requirement already satisfied: packaging>=20.0 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from transformers) (23.0)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from transformers) (6.0)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (2022.10.31)\n",
            "Requirement already satisfied: requests in /opt/homebrew/lib/python3.11/site-packages (from transformers) (2.28.2)\n",
            "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (0.13.2)\n",
            "Requirement already satisfied: tqdm>=4.27 in /opt/homebrew/lib/python3.11/site-packages (from transformers) (4.65.0)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/homebrew/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/homebrew/lib/python3.11/site-packages (from requests->transformers) (3.1.0)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from requests->transformers) (3.4)\n",
            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/lib/python3.11/site-packages (from requests->transformers) (1.26.15)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/lib/python3.11/site-packages (from requests->transformers) (2022.12.7)\n",
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
            "Requirement already satisfied: numpy in /opt/homebrew/lib/python3.11/site-packages (1.24.2)\n",
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
            "Requirement already satisfied: torch in /opt/homebrew/lib/python3.11/site-packages (2.0.0)\n",
            "Requirement already satisfied: filelock in /opt/homebrew/lib/python3.11/site-packages (from torch) (3.9.1)\n",
            "Requirement already satisfied: typing-extensions in /opt/homebrew/lib/python3.11/site-packages (from torch) (4.5.0)\n",
            "Requirement already satisfied: sympy in /opt/homebrew/lib/python3.11/site-packages (from torch) (1.11.1)\n",
            "Requirement already satisfied: networkx in /opt/homebrew/lib/python3.11/site-packages (from torch) (3.0)\n",
            "Requirement already satisfied: jinja2 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from torch) (3.1.2)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /Users/karalifingibergsdottir/Library/Python/3.11/lib/python/site-packages (from jinja2->torch) (2.1.2)\n",
            "Requirement already satisfied: mpmath>=0.19 in /opt/homebrew/lib/python3.11/site-packages (from sympy->torch) (1.3.0)\n",
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
            "Requirement already satisfied: numpy in /opt/homebrew/lib/python3.11/site-packages (1.24.2)\n",
            "Requirement already satisfied: scikit-learn in /opt/homebrew/lib/python3.11/site-packages (1.2.2)\n",
            "Requirement already satisfied: scipy>=1.3.2 in /opt/homebrew/lib/python3.11/site-packages (from scikit-learn) (1.10.1)\n",
            "Requirement already satisfied: joblib>=1.1.1 in /opt/homebrew/lib/python3.11/site-packages (from scikit-learn) (1.2.0)\n",
            "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/homebrew/lib/python3.11/site-packages (from scikit-learn) (3.1.0)\n",
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n"
          ]
        }
      ],
      "source": [
        "!pip3 install transformers\n",
        "!pip3 install -q git+https://github.com/gmihaila/ml_things.git\n",
        "!pip3 install numpy\n",
        "!pip3 install torch\n",
        "!pip3 install numpy scikit-learn\n",
        "\n",
        "import io\n",
        "import os\n",
        "import torch\n",
        "import pandas as pd\n",
        "from tqdm.notebook import tqdm\n",
        "from torch.utils.data import Dataset, DataLoader\n",
        "from transformers import (AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, set_seed)\n",
        "from sklearn.metrics import classification_report, accuracy_score\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.model_selection import train_test_split"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xHJyRk6MEENr",
        "outputId": "a6258cd9-61c0-4b68-9177-94190620158e"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Some weights of the model checkpoint at mideind/IceBERT were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']\n",
            "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
            "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']\n",
            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Model loaded to `cpu`\n",
            "-----------------\n",
            "-----------------\n",
            "-----------------\n",
            "New text: Viðeigandi aðgerðir eru á næsta leiti en sá sakaði greiddi 15.000 kr.\n",
            "Formality: Formal\n",
            "Professional: Unprofessional\n",
            "Friendliness: Unfriendly\n",
            "Overall Classification: Bad\n"
          ]
        }
      ],
      "source": [
        "# Setting a fixed random seed for reproducibility of results across runs.\n",
        "set_seed(123)\n",
        "epochs = 4  # Number of times to iterate over the entire dataset during training\n",
        "batch_size = 32  # Number of samples processed before the model is updated\n",
        "max_length = 200  # Maximum length of the input sequences\n",
        "# Setting the device to GPU if available, else CPU\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "model_name_or_path = 'mideind/IceBERT'  # Specifying the pre-trained model to use\n",
        "# Dictionary mapping of labels to ids is commented out. Presumed defined elsewhere\n",
        "labels_ids = {'informal': 0, 'formal': 1}\n",
        "n_labels = len(labels_ids)  # Counting the number of unique labels\n",
        "\n",
        "# Defining a custom Dataset class for handling the formality dataset\n",
        "class FormalityDataset(Dataset):\n",
        "\n",
        "    def __init__(self, path, use_tokenizer, labels_ids, max_sequence_len=None):\n",
        "        # Check if the provided path is a directory\n",
        "        if not os.path.isdir(path):\n",
        "            raise ValueError('Invalid `path` variable! Needs to be a directory')\n",
        "        # Use the tokenizer's max length if no specific max_sequence_len is provided\n",
        "        max_sequence_len = use_tokenizer.max_len if max_sequence_len is None else max_sequence_len\n",
        "        texts = []\n",
        "        labels = []\n",
        "        print('Reading partitions...')\n",
        "\n",
        "        # Reading data files for each label\n",
        "        for label, label_id, in tqdm(labels_ids.items()):\n",
        "            sentiment_path = os.path.join(path, label)\n",
        "            files_names = os.listdir(sentiment_path)\n",
        "            print('Reading %s files...' % label)\n",
        "            # Reading individual files\n",
        "            for file_name in tqdm(files_names):\n",
        "                file_path = os.path.join(sentiment_path, file_name)\n",
        "                with io.open(file_path, mode='r', encoding='ISO-8859-1') as f:\n",
        "                    lines = f.readlines()\n",
        "                    for line in lines:\n",
        "                        texts.append(line.strip())\n",
        "                        labels.append(label_id)\n",
        "\n",
        "        self.n_examples = len(labels)\n",
        "        print('Using tokenizer on all texts. This can take a while...')\n",
        "        # Tokenizing all texts and adding special tokens, padding, and truncating to max_length\n",
        "        self.inputs = use_tokenizer(texts, add_special_tokens=True, truncation=True, padding=True, return_tensors='pt',  max_length=max_sequence_len)\n",
        "        self.sequence_len = self.inputs['input_ids'].shape[-1]\n",
        "        print('Texts padded or truncated to %d length!' % self.sequence_len)\n",
        "        self.inputs.update({'labels':torch.tensor(labels)})\n",
        "        print('Finished!\\n')\n",
        "\n",
        "    def __len__(self):\n",
        "        # Returns the number of examples\n",
        "        return self.n_examples\n",
        "\n",
        "    def __getitem__(self, item):\n",
        "        # Returns a specific item from the dataset\n",
        "        return {key: self.inputs[key][item] for key in self.inputs.keys()}\n",
        "\n",
        "# Training function, which updates the model's weights based on the training data\n",
        "def train(dataloader, optimizer_, scheduler_, device_):\n",
        "    global model  # Reference to the model being trained\n",
        "    predictions_labels = []\n",
        "    true_labels = []\n",
        "    total_loss = 0\n",
        "\n",
        "    model.train()  # Set the model to training mode\n",
        "\n",
        "    # Iterate over each batch in the dataloader\n",
        "    for batch in tqdm(dataloader, total=len(dataloader)):\n",
        "        true_labels += batch['labels'].numpy().flatten().tolist()\n",
        "        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n",
        "        model.zero_grad()  # Reset gradients\n",
        "        outputs = model(**batch)\n",
        "        loss, logits = outputs[:2]\n",
        "        total_loss += loss.item()\n",
        "        loss.backward()  # Compute gradient of loss w.r.t. model parameters\n",
        "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Clip gradients to avoid explosion\n",
        "        optimizer.step()  # Update model parameters\n",
        "        scheduler.step()  # Update learning rate\n",
        "        logits = logits.detach().cpu().numpy()\n",
        "        predictions_labels += logits.argmax(axis=-1).flatten().tolist()\n",
        "\n",
        "    avg_epoch_loss = total_loss / len(dataloader)  # Compute average loss for the epoch\n",
        "    return true_labels, predictions_labels, avg_epoch_loss\n",
        "\n",
        "# Function to evaluate the model on a validation set\n",
        "def validation(dataloader, device_):\n",
        "    global model  # Reference to the model being evaluated\n",
        "    predictions_labels = []\n",
        "    true_labels = []\n",
        "    total_loss = 0\n",
        "\n",
        "    model.eval()  # Set the model to evaluation mode\n",
        "\n",
        "    # Iterate over each batch in the dataloader\n",
        "    for batch in tqdm(dataloader, total=len(dataloader)):\n",
        "        true_labels += batch['labels'].numpy().flatten().tolist()\n",
        "        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n",
        "\n",
        "        with torch.no_grad():  # Disable gradient computation\n",
        "            outputs = model(**batch)\n",
        "            loss, logits = outputs[:2]\n",
        "            logits = logits.detach().cpu().numpy()\n",
        "            total_loss += loss.item()\n",
        "            predict_content = logits.argmax(axis=-1).flatten().tolist()\n",
        "            predictions_labels += predict_content\n",
        "\n",
        "    avg_epoch_loss = total_loss / len(dataloader)  # Compute average loss for the validation\n",
        "    return true_labels, predictions_labels, avg_epoch_loss\n",
        "\n",
        "# Load the model and tokenizer from Hugging Face's Transformers library\n",
        "model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)\n",
        "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)\n",
        "model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)\n",
        "\n",
        "model.to(device)  # Move model to the specified device (GPU or CPU)\n",
        "\n",
        "print('Model loaded to `%s`' % device)\n",
        "\n",
        "# Load a CSV file into a DataFrame\n",
        "#df = pd.read_csv('Book3.csv')\n",
        "\n",
        "file_path = '/Users/karalifingibergsdottir/Desktop/Book3.csv'\n",
        "df = pd.read_csv(file_path)\n",
        "\n",
        "# Extract columns from the DataFrame\n",
        "sentences = df['Sentence'].values\n",
        "formality_labels = df['Formality'].values\n",
        "professional_labels = df['Professional'].values\n",
        "friendliness_labels = df['Friendlyness'].values  # Note: Typo in the document itself\n",
        "\n",
        "# Tokenize sentences for TF-IDF vectorization\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"mideind/IceBERT\")\n",
        "tokenized_sentences = [tokenizer(sentence, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=128) for sentence in sentences]\n",
        "input_ids = [tokenized_sentence.input_ids[0] for tokenized_sentence in tokenized_sentences]\n",
        "input_strings = [' '.join(map(str, input_id)) for input_id in input_ids]\n",
        "tfidf_vectorizer = TfidfVectorizer()\n",
        "X = tfidf_vectorizer.fit_transform(input_strings)\n",
        "\n",
        "# Function to train a Naive Bayes classifier\n",
        "def train_classifier(X, labels):\n",
        "    classifier = MultinomialNB()\n",
        "    classifier.fit(X, labels)\n",
        "    return classifier\n",
        "\n",
        "# Train Naive Bayes classifiers for each aspect of text (formality, professionalism, friendliness)\n",
        "formality_classifier = train_classifier(X, formality_labels)\n",
        "professional_classifier = train_classifier(X, professional_labels)\n",
        "friendliness_classifier = train_classifier(X, friendliness_labels)\n",
        "\n",
        "# Function to predict classifications for a new text\n",
        "def predict_text_classifications(text):\n",
        "    tokenized_text = tokenizer(text, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=128)\n",
        "    input_id = tokenized_text.input_ids[0]\n",
        "    input_string = ' '.join(map(str, input_id))\n",
        "\n",
        "    X_new = tfidf_vectorizer.transform([input_string])\n",
        "\n",
        "    formality_pred = formality_classifier.predict(X_new)[0]\n",
        "    professional_pred = professional_classifier.predict(X_new)[0]\n",
        "    friendliness_pred = friendliness_classifier.predict(X_new)[0]\n",
        "\n",
        "    # Determine the overall classification based on a simple majority rule\n",
        "    positive_count = formality_pred + professional_pred + friendliness_pred\n",
        "    classification = \"Good\" if positive_count >= 2 else \"Bad\"\n",
        "\n",
        "    return formality_pred, professional_pred, friendliness_pred, classification\n",
        "\n",
        "print(f\"-----------------\")\n",
        "print(f\"-----------------\")\n",
        "print(f\"-----------------\")\n",
        "\n",
        "# Example usage of the prediction function\n",
        "new_text = \"Viðeigandi aðgerðir eru á næsta leiti en sá sakaði greiddi 15.000 kr.\"\n",
        "formality_pred, professional_pred, friendliness_pred, overall_classification = predict_text_classifications(new_text)\n",
        "\n",
        "# Print predictions for the new text\n",
        "print(f\"New text: {new_text}\")\n",
        "print(f\"Formality: {'Formal' if formality_pred else 'Informal'}\")\n",
        "print(f\"Professional: {'Professional' if professional_pred else 'Unprofessional'}\")\n",
        "print(f\"Friendliness: {'Friendly' if friendliness_pred else 'Unfriendly'}\")\n",
        "print(f\"Overall Classification: {overall_classification}\")\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}