import re
import json
import glob
import torch
import tarfile
import datetime
# for metrics
from torchmetrics.classification import BinaryAUROC
from sklearn.metrics import roc_auc_score
def writeToJSON(data, where_to_save):
"""
data: a dictionary that contains data to save
    where_to_save: the name of the file to write to
"""
with open(where_to_save, "w", encoding="utf8") as outfile:
json.dump(data, outfile)
def readJSON(input_file):
"""
1. arguments
input_file: a json file to read
2. output
    a JSON object in the form of a dictionary
"""
with open(input_file, "r", encoding="utf-8", errors='ignore') as infile:
json_object = json.load(infile, strict=False)
return json_object
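# Illustrative usage of the JSON helpers (sketch only; "config.json" is a
# hypothetical path, not a file shipped with this repo):
#   writeToJSON({"lr": 1e-3, "epochs": 5}, "config.json")
#   config = readJSON("config.json")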
def writeTEXT(data, where_to_save):
with open(where_to_save, "w", encoding="utf-8") as outfile:
for d in data:
outfile.write(str(d))
outfile.write("\n")
def readTEXT_to_LIST(input_file):
with open(input_file, "r", encoding="utf-8") as infile:
data = []
for line in infile:
data.append(line)
return data
def saveCSV(df, where_to_save):
df.to_csv(where_to_save, index=False)
def time_format(total_time):
"""
    Convert a duration from seconds to hh:mm:ss format
"""
total_time_rounded = int(round((total_time)))
total_time_final = str(datetime.timedelta(seconds=total_time_rounded))
return total_time_final
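# Example (sketch): time_format takes a duration in seconds and returns an
# hh:mm:ss string, e.g. time_format(3725) -> "1:02:05".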
def z_normalizer(labels):
""" Implement a z-score normalization technique"""
labels_mean = torch.mean(labels)
labels_std = torch.std(labels)
# Guard against division by zero when all labels are identical (std == 0)
labels_std = labels_std.clamp(min=1e-8)
scaled_labels = (labels - labels_mean) / labels_std
return scaled_labels
def z_denormalize(scaled_labels, labels_mean, labels_std):
labels = (scaled_labels * labels_std) + labels_mean
return labels
def min_max_scaling(labels):
""" Implement a min-max normalization technique"""
min_val = torch.min(labels)
max_val = torch.max(labels)
diff = max_val - min_val
# Guard against division by zero when all labels are identical
diff = diff.clamp(min=1e-8)
scaled_labels = (labels - min_val) / diff
return scaled_labels
def mm_denormalize(scaled_labels, min_val, max_val):
diff = max_val - min_val
denorm_labels = (scaled_labels * diff) + min_val
return denorm_labels
def log_scaling(labels):
""" Implement log-scaling normalization technique"""
scaled_labels = torch.log1p(labels)
return scaled_labels
def ls_denormalize(scaled_labels):
denorm_labels = torch.expm1(scaled_labels)
return denorm_labels
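# Illustrative round trips for the three label-scaling schemes above (sketch
# only; the caller is responsible for keeping the statistics needed to invert
# each transform):
#   labels = torch.tensor([1.0, 2.0, 3.0, 4.0])
#   z  = z_normalizer(labels);    back = z_denormalize(z, torch.mean(labels), torch.std(labels))
#   mm = min_max_scaling(labels); back = mm_denormalize(mm, torch.min(labels), torch.max(labels))
#   ls = log_scaling(labels);     back = ls_denormalize(ls)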
def compressCheckpointsWithTar(filename):
    # Strip the 3-character extension (e.g. ".pt") and archive the checkpoint as <name>.tar.gz
    filename_for_tar = filename[0:-3]
    with tarfile.open(f"{filename_for_tar}.tar.gz", "w:gz") as tar:
        tar.add(filename)

def decompressTarCheckpoints(tar_filename):
    # Extract the archived checkpoint into the current working directory
    with tarfile.open(tar_filename) as tar:
        tar.extractall()
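# Example usage (sketch; "model.pt" is a hypothetical checkpoint name — the
# compress helper assumes a three-character extension such as ".pt" when
# naming the archive):
#   compressCheckpointsWithTar("model.pt")     # writes model.tar.gz
#   decompressTarCheckpoints("model.tar.gz")   # restores model.pt in the cwd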
def replace_bond_lengths_with_num(sentence):
sentence = re.sub(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*Å", "[NUM]", sentence) # Regex pattern to match bond lengths and units
return sentence.strip()
def replace_bond_angles_with_ang(sentence):
sentence = re.sub(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*°", "[ANG]", sentence) # Regex pattern to match angles and units
sentence = re.sub(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*degrees", "[ANG]", sentence) # Regex pattern to match angles and units
return sentence.strip()
def replace_bond_lengths_and_angles_with_num_and_ang(sentence):
sentence = re.sub(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*Å", "[NUM]", sentence) # Regex pattern to match bond lengths and units
sentence = re.sub(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*°", "[ANG]", sentence) # Regex pattern to match angles and units
sentence = re.sub(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*degrees", "[ANG]", sentence) # Regex pattern to match angles and units
return sentence.strip()
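# Illustrative behaviour of the replacement helpers (sketch; the sentences are
# made-up examples):
#   replace_bond_lengths_with_num("The Fe–O bond length is 2.01 Å.")
#   # -> "The Fe–O bond length is [NUM]."
#   replace_bond_angles_with_ang("The O–Fe–O angle is 120.3°.")
#   # -> "The O–Fe–O angle is [ANG]."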
def get_cleaned_stopwords():
# from https://github.com/igorbrigadir/stopwords
stopword_files = glob.glob("stopwords/en/*.txt")
num_str = {'one','two','three','four','five','six','seven','eight','nine'}
all_stopwords_list = set()
for file_path in stopword_files:
all_stopwords_list |= set(readTEXT_to_LIST(file_path))
    # Strip newlines/whitespace, then drop purely numeric tokens and spelled-out numbers
    cleaned_words = {wrd.strip() for wrd in all_stopwords_list}
    cleaned_list_for_mat = cleaned_words - {wrd for wrd in cleaned_words if wrd.isdigit()} - num_str
return cleaned_list_for_mat
def remove_mat_stopwords(sentence):
stopwords_list = get_cleaned_stopwords()
words = sentence.split()
words_lower = sentence.lower().split()
    # Keep the original casing of each word; filter using the lowercased form
    sentence = ' '.join([words[i] for i in range(len(words)) if words_lower[i] not in stopwords_list])
return sentence
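# Illustrative usage (sketch; requires the stopword lists from
# https://github.com/igorbrigadir/stopwords under stopwords/en/, and the exact
# output depends on which lists are present):
#   remove_mat_stopwords("The band gap of the material is direct")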
def get_sequence_len_stats(df, tokenizer, max_len):
    # Count descriptions whose tokenized length fits within max_len
    training_on = sum(1 for sent in df['description'].apply(tokenizer.tokenize) if len(sent) <= max_len)
return (training_on/len(df))*100
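# Illustrative usage (sketch; the DataFrame and tokenizer are hypothetical —
# any tokenizer exposing a .tokenize() method, e.g. a Hugging Face tokenizer,
# and a DataFrame with a 'description' column would work):
#   pct = get_sequence_len_stats(df, tokenizer, max_len=512)
#   print(f"{pct:.1f}% of descriptions fit within 512 tokens")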
def get_roc_score(predictions, targets):
    # Binary AUROC via torchmetrics; `predictions` are raw logits, `targets` are 0/1 labels
    roc_fn = BinaryAUROC(thresholds=None)
    x = torch.tensor(targets)
    y = torch.tensor(predictions)
    # Convert logits to probabilities, then round to hard 0/1 predictions before scoring
    y = torch.round(torch.sigmoid(y))
    roc_score = roc_fn(y, x)
    return roc_score
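# Illustrative usage (sketch; the logits and labels are made-up values):
#   score = get_roc_score(predictions=[2.3, -1.1, 0.7], targets=[1, 0, 1])
#   # score is a torch scalar tensor in [0, 1]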