import re
import os
import json
import glob
import torch
import tarfile
import datetime

# for metrics
from torchmetrics.classification import BinaryAUROC
from sklearn.metrics import roc_auc_score  # NOTE(review): unused here; kept in case other modules import it from this file


def writeToJSON(data, where_to_save):
    """Serialize a dictionary to a JSON file.

    data: a dictionary that contains data to save
    where_to_save: the name of the file to write on
    """
    with open(where_to_save, "w", encoding="utf8") as outfile:
        json.dump(data, outfile)


def readJSON(input_file):
    """Read a JSON file and return the parsed object as a dictionary.

    input_file: a json file to read
    """
    with open(input_file, "r", encoding="utf-8", errors='ignore') as infile:
        json_object = json.load(infile, strict=False)
    return json_object


def writeTEXT(data, where_to_save):
    """Write each element of ``data`` on its own line in ``where_to_save``."""
    with open(where_to_save, "w", encoding="utf-8") as outfile:
        for d in data:
            outfile.write(str(d))
            outfile.write("\n")


def readTEXT_to_LIST(input_file):
    """Return the lines of ``input_file`` as a list (trailing newlines kept)."""
    with open(input_file, "r", encoding="utf-8") as infile:
        return list(infile)


def saveCSV(df, where_to_save):
    """Save a pandas DataFrame to CSV without the index column."""
    df.to_csv(where_to_save, index=False)


def time_format(total_time):
    """Convert a duration in seconds to an ``hh:mm:ss`` string."""
    total_time_rounded = int(round(total_time))
    return str(datetime.timedelta(seconds=total_time_rounded))


def z_normalizer(labels):
    """Implement a z-score normalization technique."""
    labels_mean = torch.mean(labels)
    # Guard against division by zero when all labels are identical (std == 0)
    labels_std = torch.std(labels).clamp(min=1e-8)
    return (labels - labels_mean) / labels_std


def z_denormalize(scaled_labels, labels_mean, labels_std):
    """Invert z-score normalization given the original mean and std."""
    return (scaled_labels * labels_std) + labels_mean


def min_max_scaling(labels):
    """Implement a min-max normalization technique (scales into [0, 1])."""
    min_val = torch.min(labels)
    max_val = torch.max(labels)
    # Guard against division by zero when all labels are identical
    diff = (max_val - min_val).clamp(min=1e-8)
    return (labels - min_val) / diff


def mm_denormalize(scaled_labels, min_val, max_val):
    """Invert min-max scaling given the original min and max."""
    diff = max_val - min_val
    return (scaled_labels * diff) + min_val


def log_scaling(labels):
    """Implement log-scaling normalization technique: log(1 + x)."""
    return torch.log1p(labels)


def ls_denormalize(scaled_labels):
    """Invert log scaling: exp(x) - 1."""
    return torch.expm1(scaled_labels)


def compressCheckpointsWithTar(filename):
    """Compress ``filename`` into ``<stem>.tar.gz`` next to it.

    BUGFIX: the archive stem was previously ``filename[0:-3]``, which assumed a
    3-character extension such as ``.pt`` and mangled any other name; the
    extension is now stripped properly with os.path.splitext. Context manager
    guarantees the archive is closed even on error.
    """
    stem = os.path.splitext(filename)[0]
    with tarfile.open(f"{stem}.tar.gz", "w:gz") as tar:
        tar.add(filename)


def decompressTarCheckpoints(tar_filename):
    """Extract ``tar_filename`` into the current working directory.

    NOTE(review): extractall on an untrusted archive can write outside the
    target directory ("tar-slip"); only use on checkpoints you created.
    """
    with tarfile.open(tar_filename) as tar:
        tar.extractall()


# Compiled once at module level: the same three patterns were previously
# duplicated as string literals across the replace_* helpers below.
_LEN_RE = re.compile(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*Å")        # bond lengths with Å unit
_ANG_RE = re.compile(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*°")        # angles with ° unit
_DEG_RE = re.compile(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*degrees")  # angles spelled "degrees"


def replace_bond_lengths_with_num(sentence):
    """Replace bond-length values (with Å unit) by the ``[NUM]`` token."""
    sentence = _LEN_RE.sub("[NUM]", sentence)
    return sentence.strip()


def replace_bond_angles_with_ang(sentence):
    """Replace angle values (``°`` or ``degrees``) by the ``[ANG]`` token."""
    sentence = _ANG_RE.sub("[ANG]", sentence)
    sentence = _DEG_RE.sub("[ANG]", sentence)
    return sentence.strip()


def replace_bond_lengths_and_angles_with_num_and_ang(sentence):
    """Replace bond lengths by ``[NUM]`` and angles by ``[ANG]``."""
    sentence = _LEN_RE.sub("[NUM]", sentence)
    sentence = _ANG_RE.sub("[ANG]", sentence)
    sentence = _DEG_RE.sub("[ANG]", sentence)
    return sentence.strip()


def get_cleaned_stopwords():
    """Load and merge the stopword lists under ``stopwords/en/``.

    Returns a set of stripped stopwords with pure-digit tokens and the
    spelled-out digits one..nine removed (kept so materials terms like
    "one-dimensional" survive downstream matching).

    from https://github.com/igorbrigadir/stopwords
    """
    stopword_files = glob.glob("stopwords/en/*.txt")
    num_str = {'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'}
    all_stopwords_list = set()
    for file_path in stopword_files:
        all_stopwords_list |= set(readTEXT_to_LIST(file_path))
    cleaned = {wrd.replace("\n", "").strip() for wrd in all_stopwords_list}
    # BUGFIX: the digit filter previously ran on the RAW tokens, which still
    # ended in "\n", so "5\n".isdigit() was False and digit stopwords were
    # never actually removed. Filter the stripped tokens instead.
    return cleaned - {wrd for wrd in cleaned if wrd.isdigit()} - num_str


def remove_mat_stopwords(sentence):
    """Remove stopwords from ``sentence`` (case-insensitive match, original
    casing preserved in the output)."""
    stopwords_list = get_cleaned_stopwords()
    words = sentence.split()
    words_lower = sentence.lower().split()
    kept = [w for w, wl in zip(words, words_lower) if wl not in stopwords_list]
    return ' '.join(kept)


def get_sequence_len_stats(df, tokenizer, max_len):
    """Percentage of rows in ``df['description']`` whose tokenized length
    fits within ``max_len``."""
    training_on = sum(
        1 for sent in df['description'].apply(tokenizer.tokenize) if len(sent) <= max_len
    )
    return (training_on / len(df)) * 100


def get_roc_score(predictions, targets):
    """Compute binary AUROC from raw prediction logits.

    predictions: raw model logits
    targets: binary ground-truth labels

    BUGFIX: (1) the scores were previously rounded to 0/1 via
    ``torch.round(torch.sigmoid(y))`` before computing AUROC, which destroys
    the ranking information AUROC measures and collapses the score to a
    near-degenerate value — AUROC must be computed on continuous scores;
    (2) ``BinaryAUROC(threshold=None)`` used a wrong keyword: the torchmetrics
    parameter is ``thresholds``.
    """
    roc_fn = BinaryAUROC(thresholds=None)
    x = torch.tensor(targets)
    y = torch.sigmoid(torch.tensor(predictions))
    return roc_fn(y, x)