In [1]:
!pip3 install transformers
!pip3 install -q git+https://github.com/gmihaila/ml_things.git
!pip3 install numpy
!pip3 install torch
!pip3 install numpy scikit-learn

import io
import os
import torch
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import (AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, set_seed)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, r

In [5]:
# Fix the random seed (via transformers' `set_seed`) so results are repeatable across runs.
set_seed(123)
epochs = 4 # Number of passes over the training data (not referenced elsewhere in this section)
batch_size = 32 # Number of samples per batch (not referenced elsewhere in this section)
max_length = 200 # Maximum input sequence length (not referenced elsewhere in this section)
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name_or_path = 'mideind/IceBERT' # Pre-trained checkpoint passed to `from_pretrained` below
# Mapping from label name to class id
labels_ids = {'informal': 0, 'formal': 1}
n_labels = len(labels_ids) # Number of distinct labels; passed to the model config as `num_labels`

# Defining a custom Dataset class for handling the formality dataset
class FormalityDataset(Dataset):
    """In-memory text dataset read from one sub-directory per label.

    ``path`` must contain one sub-directory for every key of ``labels_ids``.
    Each line of each file in such a sub-directory becomes one example that
    carries the id mapped to that key. All texts are tokenized once, in the
    constructor, and kept in ``self.inputs``.

    Args:
        path: Directory that contains one sub-directory per label name.
        use_tokenizer: Callable tokenizer; it is invoked with the list of texts
            and the keyword arguments ``add_special_tokens``, ``truncation``,
            ``padding``, ``return_tensors`` and ``max_length``.
        labels_ids: Mapping of label name (sub-directory name) to label id.
        max_sequence_len: Truncation length. When ``None`` the tokenizer's own
            maximum length is used.

    Raises:
        ValueError: If ``path`` is not a directory.
    """

    def __init__(self, path, use_tokenizer, labels_ids, max_sequence_len=None):
        if not os.path.isdir(path):
            raise ValueError('Invalid `path` variable! Needs to be a directory')

        if max_sequence_len is None:
            # Current tokenizers expose their limit as `model_max_length`; the
            # legacy `max_len` attribute is only tried as a fallback so that
            # very old tokenizer objects keep working.
            max_sequence_len = getattr(use_tokenizer, 'model_max_length', None)
            if max_sequence_len is None:
                max_sequence_len = use_tokenizer.max_len

        texts = []
        labels = []
        print('Reading partitions...')

        # One sub-directory per label; every line of every file is one example.
        for label, label_id in tqdm(labels_ids.items()):
            label_dir = os.path.join(path, label)
            files_names = os.listdir(label_dir)
            print('Reading %s files...' % label)
            for file_name in tqdm(files_names):
                file_path = os.path.join(label_dir, file_name)
                # NOTE(review): ISO-8859-1 decoding is kept from the original;
                # confirm the data files are not actually UTF-8 encoded.
                with io.open(file_path, mode='r', encoding='ISO-8859-1') as f:
                    for line in f:
                        texts.append(line.strip())
                        labels.append(label_id)

        self.n_examples = len(labels)
        print('Using tokenizer on all texts. This can take a while...')
        # Tokenize everything in one call: special tokens added, padded to the
        # longest sequence in the batch and truncated to `max_sequence_len`.
        self.inputs = use_tokenizer(texts, add_special_tokens=True, truncation=True, padding=True, return_tensors='pt', max_length=max_sequence_len)
        self.sequence_len = self.inputs['input_ids'].shape[-1]
        print('Texts padded or truncated to %d length!' % self.sequence_len)
        # Store the labels next to the tokenizer outputs so that one index
        # retrieves both the inputs and the target.
        self.inputs.update({'labels': torch.tensor(labels)})
        print('Finished!\n')

    def __len__(self):
        """Return the number of examples that were read."""
        return self.n_examples

    def __getitem__(self, item):
        """Return a dict with entry ``item`` of every stored field (tokenizer outputs and ``labels``)."""
        return {key: self.inputs[key][item] for key in self.inputs.keys()}

# Training function, which updates the model's weights based on the training data
def train(dataloader, optimizer_, scheduler_, device_):
    """Run one training pass of the module-level ``model`` over ``dataloader``.

    Args:
        dataloader: Sized iterable of batches; each batch is a dict of tensors
            that includes a ``'labels'`` entry and is passed to the model as
            keyword arguments.
        optimizer_: Optimizer whose ``step()`` is called once per batch.
        scheduler_: Learning-rate scheduler whose ``step()`` is called once per
            batch, right after the optimizer.
        device_: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(true_labels, predictions_labels, avg_epoch_loss)``: the flat
        list of target labels, the flat list of argmax predictions, and the
        summed batch loss divided by the number of batches.
    """
    predictions_labels = []
    true_labels = []
    total_loss = 0

    # `model` is the module-level model; it is only read here.
    model.train()

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Collect the targets before the batch is moved to the device.
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

        model.zero_grad()
        outputs = model(**batch)
        # The first two outputs are the loss and the logits.
        loss, logits = outputs[:2]
        total_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Step the optimizer/scheduler that were passed in, not same-named globals.
        optimizer_.step()
        scheduler_.step()

        logits = logits.detach().cpu().numpy()
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss

# Function to evaluate the model on a validation set
def validation(dataloader, device_):
    """Evaluate the module-level ``model`` on ``dataloader`` without gradient tracking.

    Args:
        dataloader: Sized iterable of batches; each batch is a dict of tensors
            that includes a ``'labels'`` entry.
        device_: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(true_labels, predictions_labels, avg_epoch_loss)``: flat list
        of targets, flat list of argmax predictions, and the summed batch loss
        divided by the number of batches.
    """
    gold, predicted = [], []
    loss_sum = 0

    # Switch the model to evaluation mode.
    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Targets are read while the batch is still on the CPU.
        gold.extend(batch['labels'].numpy().flatten().tolist())
        on_device = {name: tensor.type(torch.long).to(device_) for name, tensor in batch.items()}

        # Forward pass only; no gradients are needed for evaluation.
        with torch.no_grad():
            loss, logits = model(**on_device)[:2]

        loss_sum += loss.item()
        scores = logits.detach().cpu().numpy()
        predicted.extend(scores.argmax(axis=-1).flatten().tolist())

    return gold, predicted, loss_sum / len(dataloader)

# Build the classification config, the tokenizer and the model from the same checkpoint
model_config = AutoConfig.from_pretrained(model_name_or_path, num_labels=n_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=model_config)

# Place the model on the selected device (GPU or CPU)
model = model.to(device)

print(f'Model loaded to `{device}`')

# Load the labelled sentences into a DataFrame. The original absolute path is
# tried first; when that file does not exist, the notebook-relative 'Book3.csv'
# is used instead so the notebook is not tied to one machine.
file_path = '/Users/karalifingibergsdottir/Desktop/Book3.csv'
if not os.path.exists(file_path):
    file_path = 'Book3.csv'
df = pd.read_csv(file_path)

# Extract columns from the DataFrame
sentences = df['Sentence'].values
formality_labels = df['Formality'].values
professional_labels = df['Professional'].values
friendliness_labels = df['Friendlyness'].values # Note: the column name is misspelled in the CSV itself

# Turn every sentence into a space-separated string of token ids for TF-IDF.
# The `tokenizer` loaded above from `model_name_or_path` is reused here; the
# original re-downloaded the identical 'mideind/IceBERT' tokenizer.
tokenized_sentences = [tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=128) for sentence in sentences]
input_ids = [tokenized_sentence.input_ids[0] for tokenized_sentence in tokenized_sentences]
# `.tolist()` yields plain ints. Iterating the tensor directly yields 0-d
# tensors whose str() is 'tensor(<id>)', which added a spurious 'tensor' term
# to every document.
input_strings = [' '.join(map(str, input_id.tolist())) for input_id in input_ids]
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of two
# or more word characters, so single-digit token ids are ignored - confirm this
# is acceptable.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(input_strings)

# Function to train a Naive Bayes classifier
def train_classifier(X, labels):
    """Fit a fresh ``MultinomialNB`` on ``X`` / ``labels`` and return the fitted model."""
    # `fit` returns the estimator itself, so the fitted model can be returned directly.
    return MultinomialNB().fit(X, labels)

# Train Naive Bayes classifiers for each aspect of text (formality, professionalism, friendliness)
# One classifier per label column, all fitted on the same TF-IDF matrix, in the
# order formality -> professional -> friendliness.
formality_classifier, professional_classifier, friendliness_classifier = (
    train_classifier(X, aspect_labels)
    for aspect_labels in (formality_labels, professional_labels, friendliness_labels)
)

# Function to predict classifications for a new text
def predict_text_classifications(text):
    """Classify ``text`` with the three fitted Naive Bayes models.

    The text is tokenized with the module-level ``tokenizer``, turned into a
    space-separated string of token ids, vectorized with the module-level
    ``tfidf_vectorizer`` and passed to the formality, professional and
    friendliness classifiers.

    Args:
        text: The sentence to classify.

    Returns:
        Tuple ``(formality_pred, professional_pred, friendliness_pred,
        classification)`` where ``classification`` is ``"Good"`` when the three
        predictions sum to at least 2 and ``"Bad"`` otherwise.
    """
    tokenized_text = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    input_id = tokenized_text.input_ids[0]
    # `.tolist()` yields plain ints. Iterating the tensor directly yields 0-d
    # tensors whose str() is 'tensor(<id>)' instead of the bare id.
    input_string = ' '.join(map(str, input_id.tolist()))

    X_new = tfidf_vectorizer.transform([input_string])

    formality_pred = formality_classifier.predict(X_new)[0]
    professional_pred = professional_classifier.predict(X_new)[0]
    friendliness_pred = friendliness_classifier.predict(X_new)[0]

    # Simple majority rule over the three predictions.
    # NOTE(review): assumes every classifier predicts 0/1 labels - confirm against the CSV.
    positive_count = formality_pred + professional_pred + friendliness_pred
    classification = "Good" if positive_count >= 2 else "Bad"

    return formality_pred, professional_pred, friendliness_pred, classification

# Visual separator: three identical dashed lines
print("\n".join(["-----------------"] * 3))

# Run the three classifiers on one sample sentence
new_text = "Viðeigandi aðgerðir eru á næsta leiti en sá sakaði greiddi 15.000 kr."
formality_pred, professional_pred, friendliness_pred, overall_classification = predict_text_classifications(new_text)

# Report the sentence, each per-aspect prediction and the overall verdict, one per line
print(
    f"New text: {new_text}",
    f"Formality: {'Formal' if formality_pred else 'Informal'}",
    f"Professional: {'Professional' if professional_pred else 'Unprofessional'}",
    f"Friendliness: {'Friendly' if friendliness_pred else 'Unfriendly'}",
    f"Overall Classification: {overall_classification}",
    sep="\n",
)


Some weights of the model checkpoint at mideind/IceBERT were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out

Model loaded to `cpu`
-----------------
-----------------
-----------------
New text: Viðeigandi aðgerðir eru á næsta leiti en sá sakaði greiddi 15.000 kr.
Formality: Formal
Professional: Unprofessional
Friendliness: Unfriendly
Overall Classification: Bad
