Albin Thörn Cleland
Clean initial commit with LFS
19b8775
"""
Utility functions.
"""
import argparse
from collections import Counter
from contextlib import contextmanager
import gzip
import json
import logging
import lzma
import os
import random
import re
import sys
import unicodedata
import zipfile
import torch
import numpy as np
from stanza.models.common.constant import lcode2lang
import stanza.models.common.seq2seq_constant as constant
from stanza.resources.default_packages import TRANSFORMER_NICKNAMES
import stanza.utils.conll18_ud_eval as ud_eval
from stanza.utils.conll18_ud_eval import UDError
logger = logging.getLogger('stanza')
# filenames
def get_wordvec_file(wordvec_dir, shorthand, wordvec_type=None):
    """
    Find the word vectors file for a language, given the vectors root directory.

    shorthand is expected to look like {lcode}_{tcode}; only the language code is used.
    If wordvec_type is given, only that subdirectory is considered.
    Otherwise word2vec is preferred, with fasttext as the fallback.

    Raises FileNotFoundError if no suitable language directory exists.
    The returned name gets an .xz or .txt suffix when such a file exists;
    otherwise the bare .vectors path is returned without checking it.
    """
    lcode, _tcode = shorthand.split('_', 1)
    lang = lcode2lang[lcode]

    word2vec_dir = os.path.join(wordvec_dir, 'word2vec', lang)
    fasttext_dir = os.path.join(wordvec_dir, 'fasttext', lang)

    if wordvec_type is not None:
        lang_dir = os.path.join(wordvec_dir, wordvec_type, lang)
        if not os.path.exists(lang_dir):
            raise FileNotFoundError("Word vector type {} was specified, but directory {} does not exist".format(wordvec_type, lang_dir))
    else:
        # word2vec takes priority over fasttext when both are present
        lang_dir = next((candidate for candidate in (word2vec_dir, fasttext_dir) if os.path.exists(candidate)), None)
        if lang_dir is None:
            raise FileNotFoundError("Cannot locate word vector directory for language: {} Looked in {} and {}".format(lang, word2vec_dir, fasttext_dir))

    # compressed vectors are preferred over plain text
    base_name = os.path.join(lang_dir, '{}.vectors'.format(lcode))
    for suffix in (".xz", ".txt"):
        if os.path.exists(base_name + suffix):
            return base_name + suffix
    return base_name
@contextmanager
def output_stream(filename=None):
    """
    Context manager which provides a writable text stream.

    With no filename, sys.stdout is yielded and left open.
    With a filename, the file is opened for writing as utf-8
    and closed when the context exits.
    """
    if filename is not None:
        with open(filename, "w", encoding="utf-8") as fout:
            yield fout
        return
    yield sys.stdout
@contextmanager
def open_read_text(filename, encoding="utf-8"):
    """
    Open a file for reading text, transparently decompressing .xz and .gz files.

    The compression is chosen purely from the filename suffix.
    Anything else is read as a regular text file.

    Use as a context, eg:
      with open_read_text(filename) as fin:
        do stuff

    The file is closed once the context exits.
    """
    if filename.endswith(".xz"):
        opener = lzma.open
    elif filename.endswith(".gz"):
        opener = gzip.open
    else:
        opener = open
    with opener(filename, mode='rt', encoding=encoding) as fin:
        yield fin
@contextmanager
def open_read_binary(filename):
    """
    Open a file for reading bytes, transparently decompressing .xz and .gz files.

    A .zip file can also be read, provided it contains exactly one member.
    The format is chosen purely from the filename suffix;
    anything else is read as a regular binary file.

    Use as a context, eg:
      with open_read_binary(filename) as fin:
        do stuff

    The file is closed once the context exits.

    Raises ValueError for a .zip archive with zero members or more than one member.
    """
    if filename.endswith(".xz"):
        with lzma.open(filename, mode='rb') as fin:
            yield fin
    elif filename.endswith(".gz"):
        with gzip.open(filename, mode='rb') as fin:
            yield fin
    elif filename.endswith(".zip"):
        with zipfile.ZipFile(filename) as zin:
            input_names = zin.namelist()
            if len(input_names) == 0:
                raise ValueError("Empty zip archive")
            if len(input_names) > 1:
                # the filename was previously never substituted into this message
                raise ValueError("zip file %s has more than one file in it" % filename)
            with zin.open(input_names[0]) as fin:
                yield fin
    else:
        with open(filename, mode='rb') as fin:
            yield fin
# training schedule
def get_adaptive_eval_interval(cur_dev_size, thres_dev_size, base_interval):
    """
    Scale the evaluation interval with the size of the dev set.

    A dev set no larger than thres_dev_size keeps base_interval.
    A larger one gets base_interval multiplied by the rounded ratio
    of cur_dev_size to thres_dev_size.
    """
    if cur_dev_size <= thres_dev_size:
        return base_interval
    multiplier = round(cur_dev_size / thres_dev_size)
    return base_interval * multiplier
# ud utils
def ud_scores(gold_conllu_file, system_conllu_file):
    """
    Evaluate a system conllu against a gold conllu using the conll18 UD eval module.

    Each argument may be either something with a callable readline
    (read directly with load_conllu) or a value handed to load_conllu_file.
    A UDError raised while loading is re-raised as a new UDError
    saying which input failed.
    """
    def load(source, stream_error):
        # anything with a callable readline is treated as an already open stream
        is_stream = hasattr(source, 'readline') and callable(source.readline)
        if is_stream:
            try:
                return ud_eval.load_conllu(source, '', {})
            except UDError as e:
                raise UDError(stream_error) from e
        try:
            return ud_eval.load_conllu_file(source)
        except UDError as e:
            raise UDError("Could not read %s" % source) from e

    gold_ud = load(gold_conllu_file, "Could not process gold UD file")
    system_ud = load(system_conllu_file, "Could not process system UD file")
    return ud_eval.evaluate(gold_ud, system_ud)
def harmonic_mean(a, weights=None):
    """
    Compute the (optionally weighted) harmonic mean of the values in a.

    Returns 0 if any value is 0.
    If weights are given, there must be exactly one weight per value.
    """
    if any(x == 0 for x in a):
        return 0

    assert weights is None or len(weights) == len(a), 'Weights has length {} which is different from that of the array ({}).'.format(len(weights), len(a))
    if weights is not None:
        return sum(weights) / sum(w/x for x, w in zip(a, weights))
    return len(a) / sum([1/x for x in a])
# torch utils
def dispatch_optimizer(name, parameters, opt_logger, lr=None, betas=None, eps=None, momentum=None, **extra_args):
    """
    Build the optimizer identified by name over the given parameters.

    opt_logger receives a debug message describing what was built.
    Only the hyperparameters relevant to the chosen optimizer are passed along;
    adamax ignores lr and uses its own default.
    extra_args are forwarded untouched to the optimizer constructor.

    adabelief, madgrad, and mirror_madgrad are imported on demand,
    raising ModuleNotFoundError if the package is missing.
    Raises ValueError for an unknown name.
    """
    if extra_args:
        extra_logging = ", " + ", ".join("%s=%s" % item for item in extra_args.items())
    else:
        extra_logging = ""

    # optimizers which take lr, betas, and eps: (log message, constructor, fixed kwargs)
    adam_family = {
        'amsgrad': ("Building Adam w/ amsgrad with lr=%f, betas=%s, eps=%f%s", torch.optim.Adam, {'amsgrad': True}),
        'amsgradw': ("Building AdamW w/ amsgrad with lr=%f, betas=%s, eps=%f%s", torch.optim.AdamW, {'amsgrad': True}),
        'adam': ("Building Adam with lr=%f, betas=%s, eps=%f%s", torch.optim.Adam, {}),
        'adamw': ("Building AdamW with lr=%f, betas=%s, eps=%f%s", torch.optim.AdamW, {}),
    }
    if name in adam_family:
        message, constructor, fixed_args = adam_family[name]
        opt_logger.debug(message, lr, betas, eps, extra_logging)
        return constructor(parameters, lr=lr, betas=betas, eps=eps, **fixed_args, **extra_args)

    # optimizers which only take lr: (log message, constructor)
    lr_only = {
        'adagrad': ("Building Adagrad with lr=%f%s", torch.optim.Adagrad),
        'adadelta': ("Building Adadelta with lr=%f%s", torch.optim.Adadelta),
    }
    if name in lr_only:
        message, constructor = lr_only[name]
        opt_logger.debug(message, lr, extra_logging)
        return constructor(parameters, lr=lr, **extra_args)

    if name == 'sgd':
        opt_logger.debug("Building SGD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return torch.optim.SGD(parameters, lr=lr, momentum=momentum, **extra_args)

    if name == 'adamax':
        opt_logger.debug("Building Adamax%s", extra_logging)
        # use default lr
        return torch.optim.Adamax(parameters, **extra_args)

    if name == 'adabelief':
        try:
            from adabelief_pytorch import AdaBelief
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create adabelief optimizer. Perhaps the adabelief-pytorch package is not installed") from e
        opt_logger.debug("Building AdaBelief with lr=%f, eps=%f%s", lr, eps, extra_logging)
        # TODO: add weight_decouple and rectify as extra args?
        return AdaBelief(parameters, lr=lr, eps=eps, weight_decouple=True, rectify=True, **extra_args)

    if name == 'madgrad':
        try:
            import madgrad
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create madgrad optimizer. Perhaps the madgrad package is not installed") from e
        opt_logger.debug("Building MADGRAD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return madgrad.MADGRAD(parameters, lr=lr, momentum=momentum, **extra_args)

    if name == 'mirror_madgrad':
        try:
            import madgrad
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create mirror_madgrad optimizer. Perhaps the madgrad package is not installed") from e
        opt_logger.debug("Building MirrorMADGRAD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return madgrad.MirrorMADGRAD(parameters, lr=lr, momentum=momentum, **extra_args)

    raise ValueError("Unsupported optimizer: {}".format(name))
def get_optimizer(name, model, lr, betas=(0.9, 0.999), eps=1e-8, momentum=0, weight_decay=None, bert_learning_rate=0.0, bert_weight_decay=None, charlm_learning_rate=0.0, is_peft=False, bert_finetune_layers=None, opt_logger=None):
    """
    Build a single optimizer over the trainable parameters of the model.

    Parameters are split into groups by name prefix:
      base:   everything not under bert_model., charmodel_forward., or charmodel_backward.
      charlm: the charmodel parameters, only added if charlm_learning_rate > 0
      bert:   the bert_model parameters, only added if bert_learning_rate > 0

    The charlm and bert learning rates are multipliers applied to lr.
    bert_finetune_layers restricts the bert group to the last N layers (non-PEFT only).
    With is_peft, all of model.bert_model.parameters() is handed over as the bert group.
    """
    if opt_logger is None:
        opt_logger = logger

    charlm_prefixes = ("charmodel_forward.", "charmodel_backward.")
    base_parameters = []
    charlm_parameters = []
    for param_name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if param_name.startswith(charlm_prefixes):
            charlm_parameters.append(param)
        elif not param_name.startswith("bert_model."):
            base_parameters.append(param)

    parameters = [{'param_group_name': 'base', 'params': base_parameters}]
    if len(charlm_parameters) > 0 and charlm_learning_rate > 0:
        parameters.append({'param_group_name': 'charlm', 'params': charlm_parameters, 'lr': lr * charlm_learning_rate})

    if is_peft:
        # some optimizers seem to train some even with a learning rate of 0...
        if bert_learning_rate > 0:
            # because PEFT handles what to hand to an optimizer, we don't want to touch that
            bert_group = {'param_group_name': 'bert', 'params': model.bert_model.parameters(), 'lr': lr * bert_learning_rate}
            if bert_weight_decay is not None:
                bert_group['weight_decay'] = bert_weight_decay
            parameters.append(bert_group)
    else:
        bert_parameters = [param for param_name, param in model.named_parameters()
                           if param.requires_grad and param_name.startswith("bert_model.")]

        # bert_finetune_layers limits the bert finetuning to the *last* N layers of the model
        if len(bert_parameters) > 0 and bert_finetune_layers is not None:
            num_layers = model.bert_model.config.num_hidden_layers
            start_layer = num_layers - bert_finetune_layers
            bert_parameters = [param
                               for layer_num in range(start_layer, num_layers)
                               for param_name, param in model.named_parameters()
                               if param.requires_grad and param_name.startswith("bert_model.") and "layer.%d." % layer_num in param_name]

        if len(bert_parameters) > 0 and bert_learning_rate > 0:
            opt_logger.debug("Finetuning %d bert parameters with LR %s and WD %s", len(bert_parameters), lr * bert_learning_rate, bert_weight_decay)
            bert_group = {'param_group_name': 'bert', 'params': bert_parameters, 'lr': lr * bert_learning_rate}
            if bert_weight_decay is not None:
                bert_group['weight_decay'] = bert_weight_decay
            parameters.append(bert_group)

    extra_args = {}
    if weight_decay is not None:
        extra_args["weight_decay"] = weight_decay

    return dispatch_optimizer(name, parameters, opt_logger=opt_logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args)
def get_split_optimizer(name, model, lr, betas=(0.9, 0.999), eps=1e-8, momentum=0, weight_decay=None, bert_learning_rate=0.0, bert_weight_decay=None, charlm_learning_rate=0.0, is_peft=False, bert_finetune_layers=None):
    """
    Same as `get_optimizer`, but the Bert parameters get a separate optimizer

    Returns a dict with a "general_optimizer" entry for the base (and possibly charlm) parameters.
    A "bert_optimizer" entry is added only when there are bert parameters to train
    and bert_learning_rate > 0.
    """
    charlm_prefixes = ("charmodel_forward.", "charmodel_backward.")
    base_parameters = []
    charlm_parameters = []
    for param_name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if param_name.startswith(charlm_prefixes):
            charlm_parameters.append(param)
        elif not param_name.startswith("bert_model."):
            base_parameters.append(param)

    parameters = [{'param_group_name': 'base', 'params': base_parameters}]
    if len(charlm_parameters) > 0 and charlm_learning_rate > 0:
        parameters.append({'param_group_name': 'charlm', 'params': charlm_parameters, 'lr': lr * charlm_learning_rate})

    bert_parameters = None
    if is_peft:
        # because PEFT handles what to hand to an optimizer, we don't want to touch that
        bert_parameters = [{'param_group_name': 'bert', 'params': model.bert_model.parameters(), 'lr': lr * bert_learning_rate}]
    else:
        trainable_parameters = [param for param_name, param in model.named_parameters()
                                if param.requires_grad and param_name.startswith("bert_model.")]

        # bert_finetune_layers limits the bert finetuning to the *last* N layers of the model
        if len(trainable_parameters) > 0 and bert_finetune_layers is not None:
            num_layers = model.bert_model.config.num_hidden_layers
            start_layer = num_layers - bert_finetune_layers
            trainable_parameters = [param
                                    for layer_num in range(start_layer, num_layers)
                                    for param_name, param in model.named_parameters()
                                    if param.requires_grad and param_name.startswith("bert_model.") and "layer.%d." % layer_num in param_name]

        if len(trainable_parameters) > 0:
            bert_parameters = [{'param_group_name': 'bert', 'params': trainable_parameters, 'lr': lr * bert_learning_rate}]

    extra_args = {}
    if weight_decay is not None:
        extra_args["weight_decay"] = weight_decay

    general_optimizer = dispatch_optimizer(name, parameters, opt_logger=logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args)
    optimizers = {"general_optimizer": general_optimizer}

    if bert_parameters is not None and bert_learning_rate > 0.0:
        # the bert optimizer inherits the general weight decay unless it has its own
        if bert_weight_decay is not None:
            extra_args['weight_decay'] = bert_weight_decay
        optimizers["bert_optimizer"] = dispatch_optimizer(name, bert_parameters, opt_logger=logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args)
    return optimizers
def change_lr(optimizer, new_lr):
    """
    Set the learning rate of every param group in the optimizer to new_lr.
    """
    for group in optimizer.param_groups:
        group.update(lr=new_lr)
def flatten_indices(seq_lens, width):
    """
    Return the flat indices of the first seq_lens[i] positions of each row i,
    where each row is width items wide.
    """
    return [row * width + col
            for row, length in enumerate(seq_lens)
            for col in range(length)]
def keep_partial_grad(grad, topk):
    """
    Zero out all rows of grad after the first topk, in place.

    Returns the same grad object which was passed in.
    """
    assert topk < grad.size(0)
    discarded_rows = grad.data[topk:]
    discarded_rows.zero_()
    return grad
# other utils
def ensure_dir(d, verbose=True):
    """
    Create directory d, along with any missing parents, if it does not exist yet.

    If verbose, the creation is logged.
    """
    if os.path.exists(d):
        return
    if verbose:
        logger.info("Directory {} does not exist; creating...".format(d))
    # exist_ok: another process may create the directory between the check and this call
    os.makedirs(d, exist_ok=True)
def save_config(config, path, verbose=True):
    """
    Write config to path as indented json, then return the config.

    If verbose, a confirmation is printed to stdout.
    """
    with open(path, 'w') as fout:
        json.dump(config, fout, indent=2)
    if verbose:
        print("Config saved to file {}".format(path))
    return config
def load_config(path, verbose=True):
    """
    Read a json config from path and return it.

    If verbose, a confirmation is printed to stdout.
    """
    with open(path) as fin:
        loaded = json.load(fin)
    if verbose:
        print("Config loaded from file {}".format(path))
    return loaded
def print_config(config):
    """
    Log each key and value of the config, one per line.
    """
    header = "Running with the following configs:\n"
    body = "".join("\t{} : {}\n".format(k, str(v)) for k, v in config.items())
    logger.info("\n" + header + body + "\n")
def normalize_text(text):
    """
    Return text in unicode NFD (canonical decomposition) form.
    """
    normalized = unicodedata.normalize('NFD', text)
    return normalized
def unmap_with_copy(indices, src_tokens, vocab):
    """
    Turn a list of lists of indices back into words.

    A non-negative index is looked up in vocab.id2word.
    A negative index idx copies position -idx - 1 from the matching src_tokens entry.
    """
    def lookup(idx, tokens):
        if idx >= 0:
            return vocab.id2word[idx]
        # flip and minus 1 to recover the position in the source
        return tokens[-idx - 1]

    return [[lookup(idx, tokens) for idx in ind]
            for ind, tokens in zip(indices, src_tokens)]
def prune_decoded_seqs(seqs):
    """
    Prune decoded sequences after EOS token.

    Each sequence is cut just before its first constant.EOS.
    Sequences without an EOS are returned unchanged.
    """
    out = []
    for s in seqs:
        if constant.EOS in s:
            # look up the same symbol that was tested for;
            # this previously indexed with constant.EOS_TOKEN instead
            out.append(s[:s.index(constant.EOS)])
        else:
            out.append(s)
    return out
def prune_hyp(hyp):
    """
    Cut a decoded hypothesis just before its first EOS_ID, if it has one
    """
    if constant.EOS_ID not in hyp:
        return hyp
    eos_position = hyp.index(constant.EOS_ID)
    return hyp[:eos_position]
def prune(data_list, lens):
    """
    Truncate each item of data_list to the matching length in lens.
    """
    assert len(data_list) == len(lens)
    return [item[:length] for item, length in zip(data_list, lens)]
def sort(packed, ref, reverse=True):
    """
    Sort each of the lists in packed by the values of the ref list.

    Returns a tuple whose first element is the original indices in sorted order,
    followed by each of the packed lists in sorted order.
    """
    assert isinstance(packed, (tuple, list)) and isinstance(ref, list)
    # ref goes first so it is the primary sort key; the index column comes right after it
    columns = [ref, range(len(ref)), *packed]
    rows = sorted(zip(*columns), reverse=reverse)
    sorted_columns = [list(column) for column in zip(*rows)]
    return tuple(sorted_columns[1:])
def unsort(sorted_list, oidx):
    """
    Put a sorted list back in its original order, given the original index of each item.
    """
    assert len(sorted_list) == len(oidx), "Number of list elements must match with original indices."
    return [item for _, item in sorted(zip(oidx, sorted_list))]
def sort_with_indices(data, key=None, reverse=False):
    """
    Sort data, keeping track of where each item came from.

    For example, key=len sorts by length.

    Returns the sorted items, then the original index of each item.
    Both are tuples, except that empty data gives two empty lists.
    """
    if not data:
        return [], []

    if key:
        def sort_key(pair):
            return key(pair[1])
    else:
        def sort_key(pair):
            return pair[1]

    ordered = sorted(enumerate(data), key=sort_key, reverse=reverse)
    indices, items = zip(*ordered)
    return items, indices
def split_into_batches(data, batch_size):
    """
    Returns a list of intervals so that each interval is either <= batch_size or one element long.

    Long elements are not dropped from the intervals.
    Zero length elements are not dropped either: the intervals always cover all of data.

    data is a list of lists
    batch_size is how long to make each batch
    return value is a list of pairs, start_idx end_idx
    """
    intervals = []
    interval_start = 0
    interval_size = 0
    for idx, line in enumerate(data):
        if len(line) > batch_size:
            # guess we'll just hope the model can handle a batch of this size after all
            # compare indices rather than sizes to flush the pending interval,
            # so that an interval made up only of empty items is still emitted
            if interval_start < idx:
                intervals.append((interval_start, idx))
            intervals.append((idx, idx+1))
            interval_start = idx+1
            interval_size = 0
        elif len(line) + interval_size > batch_size:
            # this line puts us over batch_size
            # interval_size must be > 0 here, so the pending interval is not empty
            intervals.append((interval_start, idx))
            interval_start = idx
            interval_size = len(line)
        else:
            interval_size = interval_size + len(line)
    if interval_start < len(data):
        # there's some leftover, possibly only empty items
        intervals.append((interval_start, len(data)))
    return intervals
def tensor_unsort(sorted_tensor, oidx):
    """
    Put the rows (0-th dimension) of a sorted tensor back in their original order,
    given the original index of each row.
    """
    assert sorted_tensor.size(0) == len(oidx), "Number of list elements must match with original indices."
    # the position of each row in the sorted tensor, ordered by original index
    positions = sorted(enumerate(oidx), key=lambda pair: pair[1])
    backidx = [position for position, _ in positions]
    return sorted_tensor[backidx]
def set_random_seed(seed):
    """
    Seed every source of randomness in use: python random, np, torch, and torch.cuda

    If seed is None, a seed is first drawn from python's random module.
    Returns the seed which was used.
    """
    if seed is None:
        seed = random.randint(0, 1000000000)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # some of these calls are probably redundant
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    return seed
def find_missing_tags(known_tags, test_tags):
    """
    Return a sorted list of the tags in test_tags which are not in known_tags.

    Either argument can be a list of lists, in which case it is flattened first.
    A flattened test_tags is also deduplicated; a flat one is not.
    Empty lists are accepted for both arguments.
    """
    # the "and known_tags" guards keep an empty list from being indexed
    if isinstance(known_tags, list) and known_tags and isinstance(known_tags[0], list):
        known_tags = set(x for y in known_tags for x in y)
    if isinstance(test_tags, list) and test_tags and isinstance(test_tags[0], list):
        test_tags = sorted(set(x for y in test_tags for x in y))
    missing_tags = sorted(x for x in test_tags if x not in known_tags)
    return missing_tags
def warn_missing_tags(known_tags, test_tags, test_set_name):
    """
    Log a warning if test_tags has any tags which are not in known_tags.

    Both arguments may also be lists of lists.
    Returns True if a warning was issued, False otherwise.
    """
    missing_tags = find_missing_tags(known_tags, test_tags)
    if len(missing_tags) == 0:
        return False
    logger.warning("Found tags in {} missing from the expected tag set: {}".format(test_set_name, missing_tags))
    return True
def checkpoint_name(save_dir, save_name, checkpoint_name):
    """
    Build a recommended checkpoint path from the save dir, save name, and an optional checkpoint name.

    For example, can pass in args['save_dir'], args['save_name'], args['checkpoint_save_name']

    A given checkpoint_name is used as is if it is already in save_dir, otherwise it is put in save_dir.
    With no checkpoint_name, the name is derived from save_name by adding _checkpoint
    (before the .pt extension, if there is one).
    """
    if checkpoint_name:
        if os.path.dirname(checkpoint_name) == save_dir:
            return checkpoint_name
        return os.path.join(save_dir, checkpoint_name)

    if os.path.dirname(save_name) != save_dir:
        save_name = os.path.join(save_dir, save_name)
    if not save_name.endswith(".pt"):
        return save_name + "_checkpoint"
    return save_name[:-3] + "_checkpoint.pt"
def default_device():
    """
    Return 'cuda' if torch reports CUDA as available, 'cpu' otherwise
    """
    return 'cuda' if torch.cuda.is_available() else 'cpu'
def add_device_args(parser):
    """
    Add the --device argument, along with --cuda and --cpu shortcuts which set the same value
    """
    parser.add_argument('--device', type=str, default=default_device(), help='Which device to run on - use a torch device string name')
    shortcuts = (
        ('--cuda', 'cuda', 'Run on CUDA'),
        ('--cpu', 'cpu', 'Ignore CUDA and run on CPU'),
    )
    for flag, device, description in shortcuts:
        parser.add_argument(flag, dest='device', action='store_const', const=device, help=description)
def load_elmo(elmo_model):
    """
    Build an elmoformanylangs Embedder from the given model and return it.
    """
    # Imported here rather than at module level so that
    # Elmo integration remains an optional feature
    import elmoformanylangs

    logger.info("Loading elmo: %s" % elmo_model)
    return elmoformanylangs.Embedder(elmo_model)
def log_training_args(args, args_logger, name="training"):
    """
    Log all of the arguments, sorted by key, for record keeping purposes

    args can be either a dict or an argparse.Namespace.
    name says what kind of run this is and is upper cased in the log header.
    """
    if isinstance(args, argparse.Namespace):
        args = vars(args)
    arg_text = '\n'.join('%s: %s' % (k, args[k]) for k in sorted(args.keys()))
    args_logger.info('ARGS USED AT %s TIME:\n%s\n', name.upper(), arg_text)
def embedding_name(args):
    """
    Return the generic name of the biggest embedding used by a model.

    Used by POS and depparse, for example.

    In order of priority: a transformer (by nickname if it has one, "transformer" otherwise),
    then "charlm", then "nopretrain" if there are no word vectors, then "nocharlm".

    TODO: Probably will make the transformer names a bit more informative,
    such as electra, roberta, etc. Maybe even phobert for VI, for example
    """
    no_pretrain = args['wordvec_pretrain_file'] is None and args['wordvec_file'] is None
    uses_charlm = args.get('charlm', True) and (args['charlm_forward_file'] or args['charlm_backward_file'])
    bert_model = args['bert_model']

    if bert_model:
        return TRANSFORMER_NICKNAMES.get(bert_model, "transformer")
    if uses_charlm:
        return "charlm"
    if no_pretrain:
        return "nopretrain"
    return "nocharlm"
def standard_model_file_name(args, model_type, **kwargs):
    """
    Returns a model file name based on some common args found in the various models.

    The expectation is that the args will have something like

      parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_parser.pt", help="File name to save the model")

    Then the model shorthand, embedding type, and other args will be
    turned into arguments in a format string.  kwargs can add more
    format arguments or override the standard ones.

    Runs of underscores in the formatted name are collapsed to one.
    The result is placed in args['save_dir'], unless it does not exist there
    and the formatted name by itself is an existing path.
    """
    embedding = embedding_name(args)

    is_finetuned = args.get("bert_finetune", False)
    is_peft = args.get("use_peft", False)

    finetune = "finetuned" if is_finetuned else ""
    transformer_lr = ""
    if is_finetuned and "bert_learning_rate" in args:
        transformer_lr = "{}".format(args["bert_learning_rate"])

    use_peft = "peft" if is_finetuned and is_peft else "nopeft"

    if not is_finetuned:
        bert_finetuning = ""
    elif is_peft:
        bert_finetuning = "peft"
    else:
        bert_finetuning = "ft"

    seed = args.get('seed', None)
    seed = "" if seed is None else str(seed)

    format_args = {
        "batch_size": args['batch_size'],
        "bert_finetuning": bert_finetuning,
        "embedding": embedding,
        "finetune": finetune,
        "peft": use_peft,
        "seed": seed,
        "shorthand": args['shorthand'],
        "transformer_lr": transformer_lr,
    }
    format_args.update(**kwargs)

    # empty format arguments can leave repeated underscores behind
    model_file = re.sub("_+", "_", args['save_name'].format(**format_args))

    path_in_save_dir = os.path.join(args['save_dir'], model_file)
    if not os.path.exists(path_in_save_dir) and os.path.exists(model_file):
        return model_file
    return path_in_save_dir
def escape_misc_space(space):
    """
    Escape whitespace and other special characters for use in a MISC column annotation.

    space, tab, CR, and newline become \\s, \\t, \\r, \\n
    | becomes \\p, a backslash is doubled,
    and a non-breaking space (U+00A0) becomes the literal text \\u00A0
    Any other character is kept as is.
    """
    # the non-breaking space is spelled as an escape so it cannot be
    # confused with, or silently replaced by, a regular space
    escapes = {
        ' ': '\\s',
        '\t': '\\t',
        '\r': '\\r',
        '\n': '\\n',
        '|': '\\p',
        '\\': '\\\\',
        '\u00A0': '\\u00A0',
    }
    return "".join(escapes.get(char, char) for char in space)
def unescape_misc_space(misc_space):
    """
    Turn an escaped MISC column space annotation back into the actual characters.

    \\s, \\t, \\r, \\n become space, tab, CR, and newline
    \\p becomes |, a doubled backslash becomes a single backslash,
    and the literal text \\u00A0 becomes a non-breaking space (U+00A0)
    Anything else, including an unrecognized escape, is kept as is.
    """
    two_char_escapes = {
        '\\s': ' ',
        '\\t': '\t',
        '\\r': '\r',
        '\\n': '\n',
        '\\p': '|',
        '\\\\': '\\',
    }
    nbsp_escape = '\\u00A0'

    spaces = []
    pos = 0
    while pos < len(misc_space):
        pair = misc_space[pos:pos+2]
        if pair in two_char_escapes:
            spaces.append(two_char_escapes[pair])
            pos += 2
        elif misc_space[pos:pos+len(nbsp_escape)] == nbsp_escape:
            # spelled as an escape so that a real non-breaking space is
            # produced, not a regular space
            spaces.append('\u00A0')
            pos += len(nbsp_escape)
        else:
            spaces.append(misc_space[pos])
            pos += 1
    return "".join(spaces)
def space_before_to_misc(space):
    """
    Convert whitespace to SpacesBefore specifically for the start of a document.

    In general, UD datasets do not have both SpacesAfter on a token and SpacesBefore on the next token.
    The space(s) are only marked on one of the tokens.

    Only at the very beginning of a document is it necessary to mark what spaces occurred before the actual text,
    and the default assumption is that there is no space if there is no SpacesBefore annotation.

    Returns "" if there is no space to annotate.
    """
    if space:
        return "SpacesBefore=%s" % escape_misc_space(space)
    return ""
def space_after_to_misc(space):
    """
    Convert whitespace back to the escaped format - either SpaceAfter=No or SpacesAfter=...

    A single regular space is the default and so needs no annotation at all.
    """
    if not space:
        return "SpaceAfter=No"
    if space != " ":
        return "SpacesAfter=%s" % escape_misc_space(space)
    return ""
def misc_to_space_before(misc):
    """
    Find any SpacesBefore annotation in the MISC column and turn it into a space value

    The annotation name is matched case insensitively, and the first match is used.
    Returns "" if there is no such annotation.
    """
    if not misc:
        return ""
    for piece in misc.split("|"):
        if piece.lower().startswith("spacesbefore="):
            return unescape_misc_space(piece.split("=", maxsplit=1)[1])
    return ""
def misc_to_space_after(misc):
    """
    Convert either SpaceAfter=No or the SpacesAfter annotation

    see https://universaldependencies.org/misc.html#spacesafter

    We compensate for some treebanks using SpaceAfter=\\n instead of SpacesAfter=\\n
    On the way back, though, those annotations will be turned into SpacesAfter

    With no relevant annotation, the result is a single space.
    """
    if not misc:
        return " "

    pieces = misc.split("|")
    for piece in pieces:
        if piece.lower() == "spaceafter=no":
            return ""
    if "SpaceAfter=Yes" in pieces:
        # as of UD 2.11, the Cantonese treebank had this as a misc feature
        return " "
    if "SpaceAfter=No~" in pieces:
        # as of UD 2.11, a weird typo in the Russian Taiga dataset
        return ""

    for piece in pieces:
        if piece.startswith(("SpaceAfter=", "SpacesAfter=")):
            return unescape_misc_space(piece.split("=", maxsplit=1)[1])
    return " "
def log_norms(model):
    """
    Log the name, norm, and number of elements of each trainable parameter of the model.

    The columns are aligned to the widest name and norm.
    A model with no trainable parameters logs only the header line.
    """
    lines = ["NORMS FOR MODEL PARAMTERS"]
    pieces = [(name, "%.6g" % torch.norm(param).item(), "%d" % param.numel())
              for name, param in model.named_parameters()
              if param.requires_grad]
    # max() raises ValueError on an empty sequence, so only size the columns if there are rows
    if pieces:
        name_len = max(len(x[0]) for x in pieces)
        norm_len = max(len(x[1]) for x in pieces)
        line_format = " %-" + str(name_len) + "s %" + str(norm_len) + "s %s"
        for line in pieces:
            lines.append(line_format % line)
    logger.info("\n".join(lines))
def attach_bert_model(model, bert_model, bert_tokenizer, use_peft, force_bert_saved):
    """
    Attach the bert model and its tokenizer to the given model.

    use_peft:           attach as an unsaved module and put it in train mode
    force_bert_saved:   attach as a regular attribute of the model
    otherwise:          attach as an unsaved module with requires_grad turned off,
                        or set model.bert_model to None if there is no bert model

    The tokenizer is always attached as an unsaved module.
    """
    if use_peft:
        # we use a peft-specific pathway for saving peft weights
        model.add_unsaved_module('bert_model', bert_model)
        model.bert_model.train()
    elif force_bert_saved:
        model.bert_model = bert_model
    elif bert_model is None:
        model.bert_model = None
    else:
        model.add_unsaved_module('bert_model', bert_model)
        for _, bert_parameter in bert_model.named_parameters():
            bert_parameter.requires_grad = False
    model.add_unsaved_module('bert_tokenizer', bert_tokenizer)
def build_save_each_filename(base_filename):
    """
    If the given name doesn't have %d in it, add _%04d before the extension of the filename

    This way, there's something to count how many models have been saved

    A name which already accepts a number via % formatting is returned unchanged.
    """
    try:
        # only checking whether the name accepts a number; the result is not needed
        base_filename % 1
    except TypeError:
        # so models.pt -> models_0001.pt, etc
        # this previously split an undefined name instead of base_filename
        pieces = os.path.splitext(base_filename)
        base_filename = pieces[0] + "_%04d" + pieces[1]
    return base_filename