|
|
""" |
|
|
Utility functions. |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
from collections import Counter |
|
|
from contextlib import contextmanager |
|
|
import gzip |
|
|
import json |
|
|
import logging |
|
|
import lzma |
|
|
import os |
|
|
import random |
|
|
import re |
|
|
import sys |
|
|
import unicodedata |
|
|
import zipfile |
|
|
|
|
|
import torch |
|
|
import numpy as np |
|
|
|
|
|
from stanza.models.common.constant import lcode2lang |
|
|
import stanza.models.common.seq2seq_constant as constant |
|
|
from stanza.resources.default_packages import TRANSFORMER_NICKNAMES |
|
|
import stanza.utils.conll18_ud_eval as ud_eval |
|
|
from stanza.utils.conll18_ud_eval import UDError |
|
|
|
|
|
logger = logging.getLogger('stanza') |
|
|
|
|
|
|
|
|
def get_wordvec_file(wordvec_dir, shorthand, wordvec_type=None): |
|
|
""" Lookup the name of the word vectors file, given a directory and the language shorthand. |
|
|
""" |
|
|
lcode, tcode = shorthand.split('_', 1) |
|
|
lang = lcode2lang[lcode] |
|
|
|
|
|
word2vec_dir = os.path.join(wordvec_dir, 'word2vec', lang) |
|
|
fasttext_dir = os.path.join(wordvec_dir, 'fasttext', lang) |
|
|
lang_dir = None |
|
|
if wordvec_type is not None: |
|
|
lang_dir = os.path.join(wordvec_dir, wordvec_type, lang) |
|
|
if not os.path.exists(lang_dir): |
|
|
raise FileNotFoundError("Word vector type {} was specified, but directory {} does not exist".format(wordvec_type, lang_dir)) |
|
|
elif os.path.exists(word2vec_dir): |
|
|
lang_dir = word2vec_dir |
|
|
elif os.path.exists(fasttext_dir): |
|
|
lang_dir = fasttext_dir |
|
|
else: |
|
|
raise FileNotFoundError("Cannot locate word vector directory for language: {} Looked in {} and {}".format(lang, word2vec_dir, fasttext_dir)) |
|
|
|
|
|
filename = os.path.join(lang_dir, '{}.vectors'.format(lcode)) |
|
|
if os.path.exists(filename + ".xz"): |
|
|
filename = filename + ".xz" |
|
|
elif os.path.exists(filename + ".txt"): |
|
|
filename = filename + ".txt" |
|
|
return filename |
|
|
|
|
|
@contextmanager |
|
|
def output_stream(filename=None): |
|
|
""" |
|
|
    Yields an open handle to the given file, or sys.stdout if filename is None
|
|
|
|
|
Opens the file in a context manager so it closes nicely |
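
    eg:
      with output_stream(filename) as fout:
        print("text", file=fout)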
|
|
""" |
|
|
if filename is None: |
|
|
yield sys.stdout |
|
|
else: |
|
|
with open(filename, "w", encoding="utf-8") as fout: |
|
|
yield fout |
|
|
|
|
|
|
|
|
@contextmanager |
|
|
def open_read_text(filename, encoding="utf-8"): |
|
|
""" |
|
|
    Opens a file as an .xz or .gz file if its name ends with .xz or .gz, or as regular text otherwise.
|
|
|
|
|
Use as a context |
|
|
|
|
|
eg: |
|
|
with open_read_text(filename) as fin: |
|
|
do stuff |
|
|
|
|
|
File will be closed once the context exits |
|
|
""" |
|
|
if filename.endswith(".xz"): |
|
|
with lzma.open(filename, mode='rt', encoding=encoding) as fin: |
|
|
yield fin |
|
|
elif filename.endswith(".gz"): |
|
|
with gzip.open(filename, mode='rt', encoding=encoding) as fin: |
|
|
yield fin |
|
|
else: |
|
|
with open(filename, encoding=encoding) as fin: |
|
|
yield fin |
|
|
|
|
|
@contextmanager |
|
|
def open_read_binary(filename): |
|
|
""" |
|
|
    Opens a file as an .xz or .gz file if its name ends with .xz or .gz, or as a regular binary file otherwise.
|
|
|
|
|
    If a .zip file is given, it can be read as long as it contains exactly one file
|
|
|
|
|
Use as a context |
|
|
|
|
|
eg: |
|
|
with open_read_binary(filename) as fin: |
|
|
do stuff |
|
|
|
|
|
File will be closed once the context exits |
|
|
""" |
|
|
if filename.endswith(".xz"): |
|
|
with lzma.open(filename, mode='rb') as fin: |
|
|
yield fin |
|
|
elif filename.endswith(".gz"): |
|
|
with gzip.open(filename, mode='rb') as fin: |
|
|
yield fin |
|
|
elif filename.endswith(".zip"): |
|
|
with zipfile.ZipFile(filename) as zin: |
|
|
input_names = zin.namelist() |
|
|
if len(input_names) == 0: |
|
|
raise ValueError("Empty zip archive") |
|
|
if len(input_names) > 1: |
|
|
raise ValueError("zip file %s has more than one file in it") |
|
|
with zin.open(input_names[0]) as fin: |
|
|
yield fin |
|
|
else: |
|
|
with open(filename, mode='rb') as fin: |
|
|
yield fin |
|
|
|
|
|
|
|
|
def get_adaptive_eval_interval(cur_dev_size, thres_dev_size, base_interval): |
|
|
""" Adjust the evaluation interval adaptively. |
|
|
If cur_dev_size <= thres_dev_size, return base_interval; |
|
|
    else, linearly increase the interval, rounded to an integer multiple of the base interval.
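    For example, with thres_dev_size=1000 and base_interval=100, a dev set
    of 3000 examples gives an eval interval of 300.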
|
|
""" |
|
|
if cur_dev_size <= thres_dev_size: |
|
|
return base_interval |
|
|
else: |
|
|
alpha = round(cur_dev_size / thres_dev_size) |
|
|
return base_interval * alpha |
|
|
|
|
|
|
|
|
def ud_scores(gold_conllu_file, system_conllu_file): |
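    """
    Run the CoNLL 2018 UD evaluation on gold and system outputs.

    Each argument may be either a filename or an open file-like object
    (anything with a readline method).
    """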
|
|
def has_readline(f): |
|
|
return hasattr(f, 'readline') and callable(f.readline) |
|
|
|
|
|
if has_readline(gold_conllu_file): |
|
|
try: |
|
|
gold_ud = ud_eval.load_conllu(gold_conllu_file, '', {}) |
|
|
except UDError as e: |
|
|
raise UDError("Could not process gold UD file") from e |
|
|
else: |
|
|
try: |
|
|
gold_ud = ud_eval.load_conllu_file(gold_conllu_file) |
|
|
except UDError as e: |
|
|
raise UDError("Could not read %s" % gold_conllu_file) from e |
|
|
|
|
|
if has_readline(system_conllu_file): |
|
|
try: |
|
|
system_ud = ud_eval.load_conllu(system_conllu_file, '', {}) |
|
|
except UDError as e: |
|
|
raise UDError("Could not process system UD file") from e |
|
|
else: |
|
|
try: |
|
|
system_ud = ud_eval.load_conllu_file(system_conllu_file) |
|
|
except UDError as e: |
|
|
raise UDError("Could not read %s" % system_conllu_file) from e |
|
|
|
|
|
evaluation = ud_eval.evaluate(gold_ud, system_ud) |
|
|
|
|
|
return evaluation |
|
|
|
|
|
def harmonic_mean(a, weights=None): |
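    """
    Harmonic mean of the values in a, returning 0 if any value is 0.

    Unweighted: len(a) / sum(1/x).  Weighted: sum(w) / sum(w_i / x_i).
    """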
|
|
    if any(x == 0 for x in a):
|
|
return 0 |
|
|
else: |
|
|
assert weights is None or len(weights) == len(a), 'Weights has length {} which is different from that of the array ({}).'.format(len(weights), len(a)) |
|
|
if weights is None: |
|
|
            return len(a) / sum(1/x for x in a)
|
|
else: |
|
|
return sum(weights) / sum(w/x for x, w in zip(a, weights)) |
|
|
|
|
|
|
|
|
def dispatch_optimizer(name, parameters, opt_logger, lr=None, betas=None, eps=None, momentum=None, **extra_args): |
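    """
    Build an optimizer by name.

    lr, betas, eps, and momentum are used as appropriate for each optimizer;
    extra_args (such as weight_decay) are passed through to the constructor
    unchanged.  A couple of the optimizers (adabelief, madgrad) require
    optional packages.
    """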
|
|
extra_logging = "" |
|
|
if len(extra_args) > 0: |
|
|
extra_logging = ", " + ", ".join("%s=%s" % (x, y) for x, y in extra_args.items()) |
|
|
|
|
|
if name == 'amsgrad': |
|
|
opt_logger.debug("Building Adam w/ amsgrad with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging) |
|
|
return torch.optim.Adam(parameters, amsgrad=True, lr=lr, betas=betas, eps=eps, **extra_args) |
|
|
elif name == 'amsgradw': |
|
|
opt_logger.debug("Building AdamW w/ amsgrad with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging) |
|
|
return torch.optim.AdamW(parameters, amsgrad=True, lr=lr, betas=betas, eps=eps, **extra_args) |
|
|
elif name == 'sgd': |
|
|
opt_logger.debug("Building SGD with lr=%f, momentum=%f%s", lr, momentum, extra_logging) |
|
|
return torch.optim.SGD(parameters, lr=lr, momentum=momentum, **extra_args) |
|
|
elif name == 'adagrad': |
|
|
opt_logger.debug("Building Adagrad with lr=%f%s", lr, extra_logging) |
|
|
return torch.optim.Adagrad(parameters, lr=lr, **extra_args) |
|
|
elif name == 'adam': |
|
|
opt_logger.debug("Building Adam with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging) |
|
|
return torch.optim.Adam(parameters, lr=lr, betas=betas, eps=eps, **extra_args) |
|
|
elif name == 'adamw': |
|
|
opt_logger.debug("Building AdamW with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging) |
|
|
return torch.optim.AdamW(parameters, lr=lr, betas=betas, eps=eps, **extra_args) |
|
|
elif name == 'adamax': |
|
|
opt_logger.debug("Building Adamax%s", extra_logging) |
|
|
return torch.optim.Adamax(parameters, **extra_args) |
|
|
elif name == 'adadelta': |
|
|
opt_logger.debug("Building Adadelta with lr=%f%s", lr, extra_logging) |
|
|
return torch.optim.Adadelta(parameters, lr=lr, **extra_args) |
|
|
elif name == 'adabelief': |
|
|
try: |
|
|
from adabelief_pytorch import AdaBelief |
|
|
except ModuleNotFoundError as e: |
|
|
raise ModuleNotFoundError("Could not create adabelief optimizer. Perhaps the adabelief-pytorch package is not installed") from e |
|
|
opt_logger.debug("Building AdaBelief with lr=%f, eps=%f%s", lr, eps, extra_logging) |
|
|
|
|
|
return AdaBelief(parameters, lr=lr, eps=eps, weight_decouple=True, rectify=True, **extra_args) |
|
|
elif name == 'madgrad': |
|
|
try: |
|
|
import madgrad |
|
|
except ModuleNotFoundError as e: |
|
|
raise ModuleNotFoundError("Could not create madgrad optimizer. Perhaps the madgrad package is not installed") from e |
|
|
opt_logger.debug("Building MADGRAD with lr=%f, momentum=%f%s", lr, momentum, extra_logging) |
|
|
return madgrad.MADGRAD(parameters, lr=lr, momentum=momentum, **extra_args) |
|
|
elif name == 'mirror_madgrad': |
|
|
try: |
|
|
import madgrad |
|
|
except ModuleNotFoundError as e: |
|
|
raise ModuleNotFoundError("Could not create mirror_madgrad optimizer. Perhaps the madgrad package is not installed") from e |
|
|
opt_logger.debug("Building MirrorMADGRAD with lr=%f, momentum=%f%s", lr, momentum, extra_logging) |
|
|
return madgrad.MirrorMADGRAD(parameters, lr=lr, momentum=momentum, **extra_args) |
|
|
else: |
|
|
raise ValueError("Unsupported optimizer: {}".format(name)) |
|
|
|
|
|
|
|
|
def get_optimizer(name, model, lr, betas=(0.9, 0.999), eps=1e-8, momentum=0, weight_decay=None, bert_learning_rate=0.0, bert_weight_decay=None, charlm_learning_rate=0.0, is_peft=False, bert_finetune_layers=None, opt_logger=None): |
|
|
opt_logger = opt_logger if opt_logger is not None else logger |
|
|
base_parameters = [p for n, p in model.named_parameters() |
|
|
if p.requires_grad and not n.startswith("bert_model.") |
|
|
and not n.startswith("charmodel_forward.") and not n.startswith("charmodel_backward.")] |
|
|
parameters = [{'param_group_name': 'base', 'params': base_parameters}] |
|
|
|
|
|
charlm_parameters = [p for n, p in model.named_parameters() |
|
|
if p.requires_grad and (n.startswith("charmodel_forward.") or n.startswith("charmodel_backward."))] |
|
|
if len(charlm_parameters) > 0 and charlm_learning_rate > 0: |
|
|
parameters.append({'param_group_name': 'charlm', 'params': charlm_parameters, 'lr': lr * charlm_learning_rate}) |
|
|
|
|
|
if not is_peft: |
|
|
bert_parameters = [p for n, p in model.named_parameters() if p.requires_grad and n.startswith("bert_model.")] |
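        # when bert_finetune_layers is set, restrict finetuning to the
        # last N transformer layers instead of the whole bert_model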
|
|
|
|
|
|
|
|
if len(bert_parameters) > 0 and bert_finetune_layers is not None: |
|
|
num_layers = model.bert_model.config.num_hidden_layers |
|
|
start_layer = num_layers - bert_finetune_layers |
|
|
bert_parameters = [] |
|
|
for layer_num in range(start_layer, num_layers): |
|
|
bert_parameters.extend([param for name, param in model.named_parameters() |
|
|
if param.requires_grad and name.startswith("bert_model.") and "layer.%d." % layer_num in name]) |
|
|
|
|
|
if len(bert_parameters) > 0 and bert_learning_rate > 0: |
|
|
opt_logger.debug("Finetuning %d bert parameters with LR %s and WD %s", len(bert_parameters), lr * bert_learning_rate, bert_weight_decay) |
|
|
parameters.append({'param_group_name': 'bert', 'params': bert_parameters, 'lr': lr * bert_learning_rate}) |
|
|
if bert_weight_decay is not None: |
|
|
parameters[-1]['weight_decay'] = bert_weight_decay |
|
|
else: |
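        # under PEFT, only the adapter weights inside bert_model require
        # grad, so the whole module's parameters can go to the optimizer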
|
|
|
|
|
if bert_learning_rate > 0: |
|
|
|
|
|
parameters.append({'param_group_name': 'bert', 'params': model.bert_model.parameters(), 'lr': lr * bert_learning_rate}) |
|
|
if bert_weight_decay is not None: |
|
|
parameters[-1]['weight_decay'] = bert_weight_decay |
|
|
|
|
|
extra_args = {} |
|
|
if weight_decay is not None: |
|
|
extra_args["weight_decay"] = weight_decay |
|
|
|
|
|
return dispatch_optimizer(name, parameters, opt_logger=opt_logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args) |
|
|
|
|
|
def get_split_optimizer(name, model, lr, betas=(0.9, 0.999), eps=1e-8, momentum=0, weight_decay=None, bert_learning_rate=0.0, bert_weight_decay=None, charlm_learning_rate=0.0, is_peft=False, bert_finetune_layers=None): |
|
|
"""Same as `get_optimizer`, but splits the optimizer for Bert into a seperate optimizer""" |
|
|
base_parameters = [p for n, p in model.named_parameters() |
|
|
if p.requires_grad and not n.startswith("bert_model.") |
|
|
and not n.startswith("charmodel_forward.") and not n.startswith("charmodel_backward.")] |
|
|
parameters = [{'param_group_name': 'base', 'params': base_parameters}] |
|
|
|
|
|
charlm_parameters = [p for n, p in model.named_parameters() |
|
|
if p.requires_grad and (n.startswith("charmodel_forward.") or n.startswith("charmodel_backward."))] |
|
|
if len(charlm_parameters) > 0 and charlm_learning_rate > 0: |
|
|
parameters.append({'param_group_name': 'charlm', 'params': charlm_parameters, 'lr': lr * charlm_learning_rate}) |
|
|
|
|
|
bert_parameters = None |
|
|
if not is_peft: |
|
|
trainable_parameters = [p for n, p in model.named_parameters() if p.requires_grad and n.startswith("bert_model.")] |
|
|
|
|
|
|
|
|
if len(trainable_parameters) > 0 and bert_finetune_layers is not None: |
|
|
num_layers = model.bert_model.config.num_hidden_layers |
|
|
start_layer = num_layers - bert_finetune_layers |
|
|
trainable_parameters = [] |
|
|
for layer_num in range(start_layer, num_layers): |
|
|
trainable_parameters.extend([param for name, param in model.named_parameters() |
|
|
if param.requires_grad and name.startswith("bert_model.") and "layer.%d." % layer_num in name]) |
|
|
|
|
|
if len(trainable_parameters) > 0: |
|
|
bert_parameters = [{'param_group_name': 'bert', 'params': trainable_parameters, 'lr': lr * bert_learning_rate}] |
|
|
else: |
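        # under PEFT, pass along all of bert_model's parameters; only the
        # adapter weights will actually have gradients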
|
|
|
|
|
bert_parameters = [{'param_group_name': 'bert', 'params': model.bert_model.parameters(), 'lr': lr * bert_learning_rate}] |
|
|
|
|
|
extra_args = {} |
|
|
if weight_decay is not None: |
|
|
extra_args["weight_decay"] = weight_decay |
|
|
|
|
|
optimizers = { |
|
|
"general_optimizer": dispatch_optimizer(name, parameters, opt_logger=logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args) |
|
|
} |
|
|
if bert_parameters is not None and bert_learning_rate > 0.0: |
|
|
if bert_weight_decay is not None: |
|
|
extra_args['weight_decay'] = bert_weight_decay |
|
|
optimizers["bert_optimizer"] = dispatch_optimizer(name, bert_parameters, opt_logger=logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args) |
|
|
return optimizers |
|
|
|
|
|
|
|
|
def change_lr(optimizer, new_lr): |
|
|
for param_group in optimizer.param_groups: |
|
|
param_group['lr'] = new_lr |
|
|
|
|
|
def flatten_indices(seq_lens, width): |
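    """
    Flatten positions in a (batch, width) padded layout into a single index list.

    For example:
        flatten_indices([2, 3], 4) == [0, 1, 4, 5, 6]
    """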
|
|
flat = [] |
|
|
for i, l in enumerate(seq_lens): |
|
|
for j in range(l): |
|
|
flat.append(i * width + j) |
|
|
return flat |
|
|
|
|
|
def keep_partial_grad(grad, topk): |
|
|
""" |
|
|
    Keep only the first topk rows of the gradient, zeroing out the rest.
|
|
""" |
|
|
assert topk < grad.size(0) |
|
|
grad.data[topk:].zero_() |
|
|
return grad |
|
|
|
|
|
|
|
|
def ensure_dir(d, verbose=True): |
|
|
if not os.path.exists(d): |
|
|
if verbose: |
|
|
logger.info("Directory {} does not exist; creating...".format(d)) |
|
|
|
|
|
os.makedirs(d, exist_ok=True) |
|
|
|
|
|
def save_config(config, path, verbose=True): |
|
|
with open(path, 'w') as outfile: |
|
|
json.dump(config, outfile, indent=2) |
|
|
if verbose: |
|
|
print("Config saved to file {}".format(path)) |
|
|
return config |
|
|
|
|
|
def load_config(path, verbose=True): |
|
|
with open(path) as f: |
|
|
config = json.load(f) |
|
|
if verbose: |
|
|
print("Config loaded from file {}".format(path)) |
|
|
return config |
|
|
|
|
|
def print_config(config): |
|
|
info = "Running with the following configs:\n" |
|
|
for k,v in config.items(): |
|
|
info += "\t{} : {}\n".format(k, str(v)) |
|
|
logger.info("\n" + info + "\n") |
|
|
|
|
|
def normalize_text(text): |
|
|
return unicodedata.normalize('NFD', text) |
|
|
|
|
|
def unmap_with_copy(indices, src_tokens, vocab): |
|
|
""" |
|
|
    Unmap a list of lists of indices, optionally copying from src_tokens.
|
|
""" |
|
|
result = [] |
|
|
for ind, tokens in zip(indices, src_tokens): |
|
|
words = [] |
|
|
for idx in ind: |
|
|
if idx >= 0: |
|
|
words.append(vocab.id2word[idx]) |
|
|
else: |
|
|
idx = -idx - 1 |
|
|
words.append(tokens[idx]) |
|
|
result += [words] |
|
|
return result |
|
|
|
|
|
def prune_decoded_seqs(seqs): |
|
|
""" |
|
|
Prune decoded sequences after EOS token. |
|
|
""" |
|
|
out = [] |
|
|
for s in seqs: |
|
|
if constant.EOS in s: |
|
|
            idx = s.index(constant.EOS)
|
|
out += [s[:idx]] |
|
|
else: |
|
|
out += [s] |
|
|
return out |
|
|
|
|
|
def prune_hyp(hyp): |
|
|
""" |
|
|
    Prune a decoded hypothesis after the EOS id
|
|
""" |
|
|
if constant.EOS_ID in hyp: |
|
|
idx = hyp.index(constant.EOS_ID) |
|
|
return hyp[:idx] |
|
|
else: |
|
|
return hyp |
|
|
|
|
|
def prune(data_list, lens): |
|
|
assert len(data_list) == len(lens) |
|
|
nl = [] |
|
|
for d, l in zip(data_list, lens): |
|
|
nl.append(d[:l]) |
|
|
return nl |
|
|
|
|
|
def sort(packed, ref, reverse=True): |
|
|
""" |
|
|
    Sort a series of packed lists according to a ref list.

    Also returns the original indices before the sort.
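
    For example:
        sort(([10, 20, 30],), [2, 3, 1]) == ([1, 0, 2], [20, 10, 30])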
|
|
""" |
|
|
assert (isinstance(packed, tuple) or isinstance(packed, list)) and isinstance(ref, list) |
|
|
packed = [ref] + [range(len(ref))] + list(packed) |
|
|
sorted_packed = [list(t) for t in zip(*sorted(zip(*packed), reverse=reverse))] |
|
|
return tuple(sorted_packed[1:]) |
|
|
|
|
|
def unsort(sorted_list, oidx): |
|
|
""" |
|
|
Unsort a sorted list, based on the original idx. |
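
    For example:
        unsort([20, 10, 30], [1, 0, 2]) == [10, 20, 30]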
|
|
""" |
|
|
assert len(sorted_list) == len(oidx), "Number of list elements must match with original indices." |
|
|
if len(sorted_list) == 0: |
|
|
return [] |
|
|
_, unsorted = [list(t) for t in zip(*sorted(zip(oidx, sorted_list)))] |
|
|
return unsorted |
|
|
|
|
|
def sort_with_indices(data, key=None, reverse=False): |
|
|
""" |
|
|
Sort data and return both the data and the original indices. |
|
|
|
|
|
One useful application is to sort by length, which can be done with key=len |
|
|
Returns the data as a sorted list, then the indices of the original list. |
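
    For example:
        sort_with_indices(["aaa", "bb", "c"], key=len) == (('c', 'bb', 'aaa'), (2, 1, 0))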
|
|
""" |
|
|
if not data: |
|
|
return [], [] |
|
|
if key: |
|
|
ordered = sorted(enumerate(data), key=lambda x: key(x[1]), reverse=reverse) |
|
|
else: |
|
|
ordered = sorted(enumerate(data), key=lambda x: x[1], reverse=reverse) |
|
|
|
|
|
result = tuple(zip(*ordered)) |
|
|
return result[1], result[0] |
|
|
|
|
|
def split_into_batches(data, batch_size): |
|
|
""" |
|
|
    Returns a list of intervals such that the lines in each interval total <= batch_size, or the interval is a single long element.
|
|
|
|
|
Long elements are not dropped from the intervals. |
|
|
data is a list of lists |
|
|
batch_size is how long to make each batch |
|
|
    the return value is a list of (start_idx, end_idx) pairs
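
    For example, with batch_size 5 and lines of length 3, 2, 6, 1:
        split_into_batches(data, 5) == [(0, 2), (2, 3), (3, 4)]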
|
|
""" |
|
|
intervals = [] |
|
|
interval_start = 0 |
|
|
interval_size = 0 |
|
|
for idx, line in enumerate(data): |
|
|
if len(line) > batch_size: |
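            # a single line longer than batch_size gets a batch all to itself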
|
|
|
|
|
if interval_size > 0: |
|
|
intervals.append((interval_start, idx)) |
|
|
intervals.append((idx, idx+1)) |
|
|
interval_start = idx+1 |
|
|
interval_size = 0 |
|
|
elif len(line) + interval_size > batch_size: |
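            # adding this line would overflow the current batch; start a new one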
|
|
|
|
|
intervals.append((interval_start, idx)) |
|
|
interval_start = idx |
|
|
interval_size = len(line) |
|
|
else: |
|
|
interval_size = interval_size + len(line) |
|
|
if interval_size > 0: |
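        # flush the final partial batch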
|
|
|
|
|
intervals.append((interval_start, len(data))) |
|
|
return intervals |
|
|
|
|
|
def tensor_unsort(sorted_tensor, oidx): |
|
|
""" |
|
|
Unsort a sorted tensor on its 0-th dimension, based on the original idx. |
|
|
""" |
|
|
assert sorted_tensor.size(0) == len(oidx), "Number of list elements must match with original indices." |
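    # backidx[i] is the position in sorted_tensor of the element which was
    # originally at index i, i.e. the inverse permutation of oidx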
|
|
backidx = [x[0] for x in sorted(enumerate(oidx), key=lambda x: x[1])] |
|
|
return sorted_tensor[backidx] |
|
|
|
|
|
|
|
|
def set_random_seed(seed): |
|
|
""" |
|
|
Set a random seed on all of the things which might need it. |
|
|
    torch, numpy, Python's random module, and torch.cuda
|
|
""" |
|
|
if seed is None: |
|
|
seed = random.randint(0, 1000000000) |
|
|
|
|
|
torch.manual_seed(seed) |
|
|
np.random.seed(seed) |
|
|
random.seed(seed) |
|
|
|
|
|
|
|
if torch.cuda.is_available(): |
|
|
torch.cuda.manual_seed(seed) |
|
|
torch.cuda.manual_seed_all(seed) |
|
|
return seed |
|
|
|
|
|
def find_missing_tags(known_tags, test_tags): |
|
|
if isinstance(known_tags, list) and isinstance(known_tags[0], list): |
|
|
known_tags = set(x for y in known_tags for x in y) |
|
|
if isinstance(test_tags, list) and isinstance(test_tags[0], list): |
|
|
test_tags = sorted(set(x for y in test_tags for x in y)) |
|
|
missing_tags = sorted(x for x in test_tags if x not in known_tags) |
|
|
return missing_tags |
|
|
|
|
|
def warn_missing_tags(known_tags, test_tags, test_set_name): |
|
|
""" |
|
|
Print a warning if any tags present in the second list are not in the first list. |
|
|
|
|
|
Can also handle a list of lists. |
|
|
""" |
|
|
missing_tags = find_missing_tags(known_tags, test_tags) |
|
|
if len(missing_tags) > 0: |
|
|
logger.warning("Found tags in {} missing from the expected tag set: {}".format(test_set_name, missing_tags)) |
|
|
return True |
|
|
return False |
|
|
|
|
|
def checkpoint_name(save_dir, save_name, checkpoint_name): |
|
|
""" |
|
|
    Return a recommended checkpoint name for the given save_dir, save_name, and optional checkpoint_name
|
|
|
|
|
For example, can pass in args['save_dir'], args['save_name'], args['checkpoint_save_name'] |
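
    For example, save_dir "saved_models" with save_name "en_test_parser.pt"
    and no checkpoint_name produces "saved_models/en_test_parser_checkpoint.pt"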
|
|
""" |
|
|
if checkpoint_name: |
|
|
model_dir = os.path.split(checkpoint_name)[0] |
|
|
if model_dir == save_dir: |
|
|
return checkpoint_name |
|
|
return os.path.join(save_dir, checkpoint_name) |
|
|
|
|
|
model_dir = os.path.split(save_name)[0] |
|
|
if model_dir != save_dir: |
|
|
save_name = os.path.join(save_dir, save_name) |
|
|
if save_name.endswith(".pt"): |
|
|
return save_name[:-3] + "_checkpoint.pt" |
|
|
|
|
|
return save_name + "_checkpoint" |
|
|
|
|
|
def default_device(): |
|
|
""" |
|
|
Pick a default device based on what's available on this system |
|
|
""" |
|
|
if torch.cuda.is_available(): |
|
|
return 'cuda' |
|
|
return 'cpu' |
|
|
|
|
|
def add_device_args(parser): |
|
|
""" |
|
|
Add args which specify cpu, cuda, or arbitrary device |
|
|
""" |
|
|
parser.add_argument('--device', type=str, default=default_device(), help='Which device to run on - use a torch device string name') |
|
|
parser.add_argument('--cuda', dest='device', action='store_const', const='cuda', help='Run on CUDA') |
|
|
parser.add_argument('--cpu', dest='device', action='store_const', const='cpu', help='Ignore CUDA and run on CPU') |
|
|
|
|
|
def load_elmo(elmo_model): |
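    # elmoformanylangs is an optional dependency, so only import it when
    # elmo is actually requested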
|
|
|
|
|
|
|
|
import elmoformanylangs |
|
|
|
|
|
logger.info("Loading elmo: %s" % elmo_model) |
|
|
elmo_model = elmoformanylangs.Embedder(elmo_model) |
|
|
return elmo_model |
|
|
|
|
|
def log_training_args(args, args_logger, name="training"): |
|
|
""" |
|
|
For record keeping purposes, log the arguments when training |
|
|
""" |
|
|
if isinstance(args, argparse.Namespace): |
|
|
args = vars(args) |
|
|
keys = sorted(args.keys()) |
|
|
log_lines = ['%s: %s' % (k, args[k]) for k in keys] |
|
|
args_logger.info('ARGS USED AT %s TIME:\n%s\n', name.upper(), '\n'.join(log_lines)) |
|
|
|
|
|
def embedding_name(args): |
|
|
""" |
|
|
Return the generic name of the biggest embedding used by a model. |
|
|
|
|
|
Used by POS and depparse, for example. |
|
|
|
|
|
TODO: Probably will make the transformer names a bit more informative, |
|
|
such as electra, roberta, etc. Maybe even phobert for VI, for example |
|
|
""" |
|
|
embedding = "nocharlm" |
|
|
if args['wordvec_pretrain_file'] is None and args['wordvec_file'] is None: |
|
|
embedding = "nopretrain" |
|
|
if args.get('charlm', True) and (args['charlm_forward_file'] or args['charlm_backward_file']): |
|
|
embedding = "charlm" |
|
|
if args['bert_model']: |
|
|
if args['bert_model'] in TRANSFORMER_NICKNAMES: |
|
|
embedding = TRANSFORMER_NICKNAMES[args['bert_model']] |
|
|
else: |
|
|
embedding = "transformer" |
|
|
|
|
|
return embedding |
|
|
|
|
|
def standard_model_file_name(args, model_type, **kwargs): |
|
|
""" |
|
|
Returns a model file name based on some common args found in the various models. |
|
|
|
|
|
The expectation is that the args will have something like |
|
|
|
|
|
parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_parser.pt", help="File name to save the model") |
|
|
|
|
|
Then the model shorthand, embedding type, and other args will be |
|
|
turned into arguments in a format string |
|
|
""" |
|
|
embedding = embedding_name(args) |
|
|
|
|
|
finetune = "" |
|
|
transformer_lr = "" |
|
|
if args.get("bert_finetune", False): |
|
|
finetune = "finetuned" |
|
|
if "bert_learning_rate" in args: |
|
|
transformer_lr = "{}".format(args["bert_learning_rate"]) |
|
|
|
|
|
use_peft = "nopeft" |
|
|
if args.get("bert_finetune", False) and args.get("use_peft", False): |
|
|
use_peft = "peft" |
|
|
|
|
|
bert_finetuning = "" |
|
|
if args.get("bert_finetune", False): |
|
|
if args.get("use_peft", False): |
|
|
bert_finetuning = "peft" |
|
|
else: |
|
|
bert_finetuning = "ft" |
|
|
|
|
|
seed = args.get('seed', None) |
|
|
if seed is None: |
|
|
seed = "" |
|
|
else: |
|
|
seed = str(seed) |
|
|
|
|
|
format_args = { |
|
|
"batch_size": args['batch_size'], |
|
|
"bert_finetuning": bert_finetuning, |
|
|
"embedding": embedding, |
|
|
"finetune": finetune, |
|
|
"peft": use_peft, |
|
|
"seed": seed, |
|
|
"shorthand": args['shorthand'], |
|
|
"transformer_lr": transformer_lr, |
|
|
} |
|
|
format_args.update(**kwargs) |
|
|
model_file = args['save_name'].format(**format_args) |
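    # empty format fields (eg no seed) can leave doubled _ in the name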
|
|
model_file = re.sub("_+", "_", model_file) |
|
|
|
|
|
|
|
|
|
|
if not os.path.exists(os.path.join(args['save_dir'], model_file)) and os.path.exists(model_file): |
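        # save_name already points at an existing model outside save_dir; use it as-is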
|
|
return model_file |
|
|
return os.path.join(args['save_dir'], model_file) |
|
|
|
|
|
def escape_misc_space(space): |
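    """
    Escape whitespace characters for the MISC column, following the UD
    SpacesAfter conventions: https://universaldependencies.org/misc.html#spacesafter
    """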
|
|
spaces = [] |
|
|
for char in space: |
|
|
if char == ' ': |
|
|
spaces.append('\\s') |
|
|
elif char == '\t': |
|
|
spaces.append('\\t') |
|
|
elif char == '\r': |
|
|
spaces.append('\\r') |
|
|
elif char == '\n': |
|
|
spaces.append('\\n') |
|
|
elif char == '|': |
|
|
spaces.append('\\p') |
|
|
elif char == '\\': |
|
|
spaces.append('\\\\') |
|
|
        elif char == '\u00A0':
|
|
spaces.append('\\u00A0') |
|
|
else: |
|
|
spaces.append(char) |
|
|
escaped_space = "".join(spaces) |
|
|
return escaped_space |
|
|
|
|
|
def unescape_misc_space(misc_space): |
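    """
    Invert escape_misc_space, turning escapes such as \\s, \\t, and \\n
    back into the original whitespace characters.
    """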
|
|
spaces = [] |
|
|
pos = 0 |
|
|
while pos < len(misc_space): |
|
|
if misc_space[pos:pos+2] == '\\s': |
|
|
spaces.append(' ') |
|
|
pos += 2 |
|
|
elif misc_space[pos:pos+2] == '\\t': |
|
|
spaces.append('\t') |
|
|
pos += 2 |
|
|
elif misc_space[pos:pos+2] == '\\r': |
|
|
spaces.append('\r') |
|
|
pos += 2 |
|
|
elif misc_space[pos:pos+2] == '\\n': |
|
|
spaces.append('\n') |
|
|
pos += 2 |
|
|
elif misc_space[pos:pos+2] == '\\p': |
|
|
spaces.append('|') |
|
|
pos += 2 |
|
|
elif misc_space[pos:pos+2] == '\\\\': |
|
|
spaces.append('\\') |
|
|
pos += 2 |
|
|
elif misc_space[pos:pos+6] == '\\u00A0': |
|
|
            spaces.append('\u00A0')
|
|
pos += 6 |
|
|
else: |
|
|
spaces.append(misc_space[pos]) |
|
|
pos += 1 |
|
|
unescaped_space = "".join(spaces) |
|
|
return unescaped_space |
|
|
|
|
|
def space_before_to_misc(space): |
|
|
""" |
|
|
Convert whitespace to SpacesBefore specifically for the start of a document. |
|
|
|
|
|
In general, UD datasets do not have both SpacesAfter on a token and SpacesBefore on the next token. |
|
|
|
|
|
The space(s) are only marked on one of the tokens. |
|
|
|
|
|
Only at the very beginning of a document is it necessary to mark what spaces occurred before the actual text, |
|
|
and the default assumption is that there is no space if there is no SpacesBefore annotation. |
|
|
""" |
|
|
if not space: |
|
|
return "" |
|
|
escaped_space = escape_misc_space(space) |
|
|
return "SpacesBefore=%s" % escaped_space |
|
|
|
|
|
def space_after_to_misc(space): |
|
|
""" |
|
|
Convert whitespace back to the escaped format - either SpaceAfter=No or SpacesAfter=... |
|
|
""" |
|
|
if not space: |
|
|
return "SpaceAfter=No" |
|
|
if space == " ": |
|
|
return "" |
|
|
escaped_space = escape_misc_space(space) |
|
|
return "SpacesAfter=%s" % escaped_space |
|
|
|
|
|
def misc_to_space_before(misc): |
|
|
""" |
|
|
Find any SpacesBefore annotation in the MISC column and turn it into a space value |
|
|
""" |
|
|
if not misc: |
|
|
return "" |
|
|
pieces = misc.split("|") |
|
|
for piece in pieces: |
|
|
if not piece.lower().startswith("spacesbefore="): |
|
|
continue |
|
|
misc_space = piece.split("=", maxsplit=1)[1] |
|
|
return unescape_misc_space(misc_space) |
|
|
return "" |
|
|
|
|
|
def misc_to_space_after(misc): |
|
|
""" |
|
|
Convert either SpaceAfter=No or the SpacesAfter annotation |
|
|
|
|
|
see https://universaldependencies.org/misc.html#spacesafter |
|
|
|
|
|
We compensate for some treebanks using SpaceAfter=\n instead of SpacesAfter=\n |
|
|
On the way back, though, those annotations will be turned into SpacesAfter |
|
|
""" |
|
|
if not misc: |
|
|
return " " |
|
|
pieces = misc.split("|") |
|
|
if any(piece.lower() == "spaceafter=no" for piece in pieces): |
|
|
return "" |
|
|
if "SpaceAfter=Yes" in pieces: |
|
|
|
|
|
return " " |
|
|
if "SpaceAfter=No~" in pieces: |
|
|
|
|
|
return "" |
|
|
for piece in pieces: |
|
|
if piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter="): |
|
|
misc_space = piece.split("=", maxsplit=1)[1] |
|
|
return unescape_misc_space(misc_space) |
|
|
return " " |
|
|
|
|
|
def log_norms(model): |
|
|
lines = ["NORMS FOR MODEL PARAMTERS"] |
|
|
pieces = [] |
|
|
for name, param in model.named_parameters(): |
|
|
if param.requires_grad: |
|
|
pieces.append((name, "%.6g" % torch.norm(param).item(), "%d" % param.numel())) |
|
|
name_len = max(len(x[0]) for x in pieces) |
|
|
norm_len = max(len(x[1]) for x in pieces) |
|
|
line_format = " %-" + str(name_len) + "s %" + str(norm_len) + "s %s" |
|
|
for line in pieces: |
|
|
lines.append(line_format % line) |
|
|
logger.info("\n".join(lines)) |
|
|
|
|
|
def attach_bert_model(model, bert_model, bert_tokenizer, use_peft, force_bert_saved): |
|
|
if use_peft: |
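        # not saved with the model (the PEFT adapter weights are handled
        # separately); kept in train mode so the adapter can update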
|
|
|
|
|
model.add_unsaved_module('bert_model', bert_model) |
|
|
model.bert_model.train() |
|
|
elif force_bert_saved: |
|
|
model.bert_model = bert_model |
|
|
elif bert_model is not None: |
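        # frozen transformer: not saved with the model, gradients turned off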
|
|
model.add_unsaved_module('bert_model', bert_model) |
|
|
for _, parameter in bert_model.named_parameters(): |
|
|
parameter.requires_grad = False |
|
|
else: |
|
|
model.bert_model = None |
|
|
model.add_unsaved_module('bert_tokenizer', bert_tokenizer) |
|
|
|
|
|
def build_save_each_filename(base_filename): |
|
|
""" |
|
|
    If the given name doesn't have a %d in it, add _%04d before the file extension
|
|
|
|
|
This way, there's something to count how many models have been saved |
|
|
""" |
|
|
try: |
|
|
base_filename % 1 |
|
|
except TypeError: |
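        # no %d in the pattern, so add a counter before the file extension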
|
|
|
|
|
        pieces = os.path.splitext(base_filename)
|
|
base_filename = pieces[0] + "_%04d" + pieces[1] |
|
|
return base_filename |
|
|
|