"""
Utility functions.
"""

import argparse
from collections import Counter
from contextlib import contextmanager
import gzip
import json
import logging
import lzma
import os
import random
import re
import sys
import unicodedata
import zipfile

import torch
import numpy as np

from stanza.models.common.constant import lcode2lang
import stanza.models.common.seq2seq_constant as constant
from stanza.resources.default_packages import TRANSFORMER_NICKNAMES
import stanza.utils.conll18_ud_eval as ud_eval
from stanza.utils.conll18_ud_eval import UDError

logger = logging.getLogger('stanza')

# filenames
def get_wordvec_file(wordvec_dir, shorthand, wordvec_type=None):
    """
    Build the path of the word vectors file for a language.

    wordvec_dir: root directory holding one subdirectory per vector type
    shorthand: string of the form <lcode>_<rest>; only the language code is used
    wordvec_type: if given, only that subdirectory is considered

    Without a wordvec_type, the word2vec directory is preferred over fasttext.
    Raises FileNotFoundError if no suitable language directory exists.
    The returned file name gets an .xz or .txt suffix if such a file exists,
    otherwise the bare {lcode}.vectors path is returned.
    """
    # raises ValueError if the shorthand has no underscore
    lcode, _ = shorthand.split('_', 1)
    lang = lcode2lang[lcode]

    word2vec_dir = os.path.join(wordvec_dir, 'word2vec', lang)
    fasttext_dir = os.path.join(wordvec_dir, 'fasttext', lang)

    if wordvec_type is not None:
        lang_dir = os.path.join(wordvec_dir, wordvec_type, lang)
        if not os.path.exists(lang_dir):
            raise FileNotFoundError("Word vector type {} was specified, but directory {} does not exist".format(wordvec_type, lang_dir))
    else:
        # first existing directory wins: word2vec, then fasttext
        lang_dir = next((candidate for candidate in (word2vec_dir, fasttext_dir) if os.path.exists(candidate)), None)
        if lang_dir is None:
            raise FileNotFoundError("Cannot locate word vector directory for language: {}  Looked in {} and {}".format(lang, word2vec_dir, fasttext_dir))

    base_name = os.path.join(lang_dir, '{}.vectors'.format(lcode))
    for suffix in (".xz", ".txt"):
        if os.path.exists(base_name + suffix):
            return base_name + suffix
    return base_name

@contextmanager
def output_stream(filename=None):
    """
    Context manager which provides a writable text stream.

    With no filename, sys.stdout is yielded (and left open afterwards).
    Otherwise the file is opened for writing as utf-8 and closed when the context exits.
    """
    if filename is not None:
        with open(filename, "w", encoding="utf-8") as fout:
            yield fout
        return

    yield sys.stdout


@contextmanager
def open_read_text(filename, encoding="utf-8"):
    """
    Context manager which opens a file for reading text.

    Files ending in .xz are read with lzma, files ending in .gz with gzip,
    and anything else is opened as a regular text file.

    eg:
    with open_read_text(filename) as fin:
        do stuff

    The file is closed once the context exits
    """
    if filename.endswith(".xz"):
        opener = lzma.open
    elif filename.endswith(".gz"):
        opener = gzip.open
    else:
        opener = open

    # 'rt' is the default mode for the builtin open, so all three openers take the same arguments
    with opener(filename, mode='rt', encoding=encoding) as fin:
        yield fin

@contextmanager
def open_read_binary(filename):
    """
    Opens a file as an .xz file or .gz if it ends with .xz or .gz, or regular binary file otherwise.

    If a .zip file is given, it can be read if there is a single file in there

    Use as a context

    eg:
    with open_read_binary(filename) as fin:
        do stuff

    File will be closed once the context exits

    Raises ValueError if a .zip archive is empty or contains more than one member.
    """
    if filename.endswith(".xz"):
        with lzma.open(filename, mode='rb') as fin:
            yield fin
    elif filename.endswith(".gz"):
        with gzip.open(filename, mode='rb') as fin:
            yield fin
    elif filename.endswith(".zip"):
        with zipfile.ZipFile(filename) as zin:
            input_names = zin.namelist()
            if len(input_names) == 0:
                raise ValueError("Empty zip archive")
            if len(input_names) > 1:
                # the filename is needed to fill in the %s of the message
                raise ValueError("zip file %s has more than one file in it" % filename)
            with zin.open(input_names[0]) as fin:
                yield fin
    else:
        with open(filename, mode='rb') as fin:
            yield fin

# training schedule
def get_adaptive_eval_interval(cur_dev_size, thres_dev_size, base_interval):
    """
    Scale the evaluation interval with the size of the dev set.

    Dev sets no larger than thres_dev_size use base_interval unchanged.
    Larger dev sets use base_interval multiplied by the rounded ratio
    cur_dev_size / thres_dev_size, so the result is always a multiple of base_interval.
    """
    if cur_dev_size <= thres_dev_size:
        return base_interval

    multiplier = round(cur_dev_size / thres_dev_size)
    return base_interval * multiplier

# ud utils
def ud_scores(gold_conllu_file, system_conllu_file):
    """
    Evaluate a system CoNLL-U output against a gold CoNLL-U file.

    Each argument may be either an object with a callable readline (read directly)
    or something handed to ud_eval.load_conllu_file.

    A UDError raised while loading either side is re-raised as a new UDError
    saying which input failed, chained to the original error.

    Returns the result of ud_eval.evaluate on the two loaded files.
    """
    def load(source, stream_error_message):
        # objects with a readline method are parsed in place; anything else goes to load_conllu_file
        is_stream = hasattr(source, 'readline') and callable(source.readline)
        if is_stream:
            try:
                return ud_eval.load_conllu(source, '', {})
            except UDError as e:
                raise UDError(stream_error_message) from e
        try:
            return ud_eval.load_conllu_file(source)
        except UDError as e:
            raise UDError("Could not read %s" % source) from e

    gold_ud = load(gold_conllu_file, "Could not process gold UD file")
    system_ud = load(system_conllu_file, "Could not process system UD file")

    return ud_eval.evaluate(gold_ud, system_ud)

def harmonic_mean(a, weights=None):
    """
    Compute the (optionally weighted) harmonic mean of the values in a.

    If any value is 0, the result is 0.
    If weights are given, there must be exactly one weight per value.
    """
    if any(x == 0 for x in a):
        return 0

    assert weights is None or len(weights) == len(a), 'Weights has length {} which is different from that of the array ({}).'.format(len(weights), len(a))

    if weights is not None:
        weighted_reciprocals = sum(w/x for x, w in zip(a, weights))
        return sum(weights) / weighted_reciprocals

    reciprocals = sum(1/x for x in a)
    return len(a) / reciprocals

# torch utils
def dispatch_optimizer(name, parameters, opt_logger, lr=None, betas=None, eps=None, momentum=None, **extra_args):
    """
    Build the optimizer selected by name over the given parameters.

    name: one of amsgrad, amsgradw, sgd, adagrad, adam, adamw, adamax,
      adadelta, adabelief, madgrad, mirror_madgrad
    parameters: passed to the optimizer constructor unchanged
    opt_logger: logger which receives a debug line describing what is built
    lr, betas, eps, momentum: forwarded only to the optimizers which use them
      (adamax is built without lr, so it uses its own default)
    extra_args: forwarded to every optimizer constructor as keyword arguments

    adabelief and madgrad are imported on demand; a ModuleNotFoundError with
    an explanation is raised if the package is missing.
    Raises ValueError for an unknown name.
    """
    # each extra arg is rendered as ", key=value", so the suffix is empty when there are none
    extra_logging = "".join(", %s=%s" % (key, value) for key, value in extra_args.items())

    if name == 'amsgrad':
        opt_logger.debug("Building Adam w/ amsgrad with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging)
        return torch.optim.Adam(parameters, amsgrad=True, lr=lr, betas=betas, eps=eps, **extra_args)

    if name == 'amsgradw':
        opt_logger.debug("Building AdamW w/ amsgrad with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging)
        return torch.optim.AdamW(parameters, amsgrad=True, lr=lr, betas=betas, eps=eps, **extra_args)

    if name == 'sgd':
        opt_logger.debug("Building SGD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return torch.optim.SGD(parameters, lr=lr, momentum=momentum, **extra_args)

    if name == 'adagrad':
        opt_logger.debug("Building Adagrad with lr=%f%s", lr, extra_logging)
        return torch.optim.Adagrad(parameters, lr=lr, **extra_args)

    if name == 'adam':
        opt_logger.debug("Building Adam with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging)
        return torch.optim.Adam(parameters, lr=lr, betas=betas, eps=eps, **extra_args)

    if name == 'adamw':
        opt_logger.debug("Building AdamW with lr=%f, betas=%s, eps=%f%s", lr, betas, eps, extra_logging)
        return torch.optim.AdamW(parameters, lr=lr, betas=betas, eps=eps, **extra_args)

    if name == 'adamax':
        opt_logger.debug("Building Adamax%s", extra_logging)
        # lr is deliberately not passed: use the default lr
        return torch.optim.Adamax(parameters, **extra_args)

    if name == 'adadelta':
        opt_logger.debug("Building Adadelta with lr=%f%s", lr, extra_logging)
        return torch.optim.Adadelta(parameters, lr=lr, **extra_args)

    if name == 'adabelief':
        try:
            from adabelief_pytorch import AdaBelief
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create adabelief optimizer.  Perhaps the adabelief-pytorch package is not installed") from e
        opt_logger.debug("Building AdaBelief with lr=%f, eps=%f%s", lr, eps, extra_logging)
        # TODO: add weight_decouple and rectify as extra args?
        return AdaBelief(parameters, lr=lr, eps=eps, weight_decouple=True, rectify=True, **extra_args)

    if name == 'madgrad':
        try:
            import madgrad
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create madgrad optimizer.  Perhaps the madgrad package is not installed") from e
        opt_logger.debug("Building MADGRAD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return madgrad.MADGRAD(parameters, lr=lr, momentum=momentum, **extra_args)

    if name == 'mirror_madgrad':
        try:
            import madgrad
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("Could not create mirror_madgrad optimizer.  Perhaps the madgrad package is not installed") from e
        opt_logger.debug("Building MirrorMADGRAD with lr=%f, momentum=%f%s", lr, momentum, extra_logging)
        return madgrad.MirrorMADGRAD(parameters, lr=lr, momentum=momentum, **extra_args)

    raise ValueError("Unsupported optimizer: {}".format(name))


def get_optimizer(name, model, lr, betas=(0.9, 0.999), eps=1e-8, momentum=0, weight_decay=None, bert_learning_rate=0.0, bert_weight_decay=None, charlm_learning_rate=0.0, is_peft=False, bert_finetune_layers=None, opt_logger=None):
    """
    Build a single optimizer over the trainable parameters of model.

    Parameters are split into groups by the prefix of their name:
      base: everything not under bert_model., charmodel_forward., or charmodel_backward.
      charlm: the charmodel_* parameters, only added if charlm_learning_rate > 0
      bert: the bert_model. parameters, only added if bert_learning_rate > 0

    The charlm and bert groups use lr scaled by their respective learning rate factor.
    bert_weight_decay, if given, is set on the bert group only.
    weight_decay, if given, is passed to the optimizer constructor.
    bert_finetune_layers limits the bert group to the *last* N layers (non-peft only).
    With is_peft, all of model.bert_model.parameters() are handed over untouched.
    opt_logger defaults to the module logger.
    """
    if opt_logger is None:
        opt_logger = logger

    bert_prefix = "bert_model."
    charlm_prefixes = ("charmodel_forward.", "charmodel_backward.")

    base_parameters = []
    charlm_parameters = []
    for param_name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if param_name.startswith(charlm_prefixes):
            charlm_parameters.append(param)
        elif not param_name.startswith(bert_prefix):
            base_parameters.append(param)

    parameters = [{'param_group_name': 'base', 'params': base_parameters}]
    if len(charlm_parameters) > 0 and charlm_learning_rate > 0:
        parameters.append({'param_group_name': 'charlm', 'params': charlm_parameters, 'lr': lr * charlm_learning_rate})

    bert_group = None
    if is_peft:
        # some optimizers seem to train some even with a learning rate of 0...
        if bert_learning_rate > 0:
            # because PEFT handles what to hand to an optimizer, we don't want to touch that
            bert_group = {'param_group_name': 'bert', 'params': model.bert_model.parameters(), 'lr': lr * bert_learning_rate}
    else:
        named_bert = [(param_name, param) for param_name, param in model.named_parameters()
                      if param.requires_grad and param_name.startswith(bert_prefix)]
        bert_parameters = [param for _, param in named_bert]

        # bert_finetune_layers limits the bert finetuning to the *last* N layers of the model
        if len(bert_parameters) > 0 and bert_finetune_layers is not None:
            num_layers = model.bert_model.config.num_hidden_layers
            start_layer = num_layers - bert_finetune_layers
            layer_tags = ["layer.%d." % layer_num for layer_num in range(start_layer, num_layers)]
            bert_parameters = [param for tag in layer_tags for param_name, param in named_bert if tag in param_name]

        if len(bert_parameters) > 0 and bert_learning_rate > 0:
            opt_logger.debug("Finetuning %d bert parameters with LR %s and WD %s", len(bert_parameters), lr * bert_learning_rate, bert_weight_decay)
            bert_group = {'param_group_name': 'bert', 'params': bert_parameters, 'lr': lr * bert_learning_rate}

    if bert_group is not None:
        if bert_weight_decay is not None:
            bert_group['weight_decay'] = bert_weight_decay
        parameters.append(bert_group)

    extra_args = {} if weight_decay is None else {"weight_decay": weight_decay}

    return dispatch_optimizer(name, parameters, opt_logger=opt_logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args)

def get_split_optimizer(name, model, lr, betas=(0.9, 0.999), eps=1e-8, momentum=0, weight_decay=None, bert_learning_rate=0.0, bert_weight_decay=None, charlm_learning_rate=0.0, is_peft=False, bert_finetune_layers=None):
    """
    Same as `get_optimizer`, but splits the optimizer for Bert into a separate optimizer

    Returns a dict with a "general_optimizer" covering the base (and possibly charlm)
    parameters, plus a "bert_optimizer" when there are bert parameters to train
    and bert_learning_rate > 0.

    bert_weight_decay, if given, replaces weight_decay for the bert optimizer only.
    """
    bert_prefix = "bert_model."
    charlm_prefixes = ("charmodel_forward.", "charmodel_backward.")

    base_parameters = []
    charlm_parameters = []
    for param_name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if param_name.startswith(charlm_prefixes):
            charlm_parameters.append(param)
        elif not param_name.startswith(bert_prefix):
            base_parameters.append(param)

    parameters = [{'param_group_name': 'base', 'params': base_parameters}]
    if len(charlm_parameters) > 0 and charlm_learning_rate > 0:
        parameters.append({'param_group_name': 'charlm', 'params': charlm_parameters, 'lr': lr * charlm_learning_rate})

    bert_parameters = None
    if is_peft:
        # because PEFT handles what to hand to an optimizer, we don't want to touch that
        bert_parameters = [{'param_group_name': 'bert', 'params': model.bert_model.parameters(), 'lr': lr * bert_learning_rate}]
    else:
        named_bert = [(param_name, param) for param_name, param in model.named_parameters()
                      if param.requires_grad and param_name.startswith(bert_prefix)]
        trainable_parameters = [param for _, param in named_bert]

        # bert_finetune_layers limits the bert finetuning to the *last* N layers of the model
        if len(trainable_parameters) > 0 and bert_finetune_layers is not None:
            num_layers = model.bert_model.config.num_hidden_layers
            start_layer = num_layers - bert_finetune_layers
            layer_tags = ["layer.%d." % layer_num for layer_num in range(start_layer, num_layers)]
            trainable_parameters = [param for tag in layer_tags for param_name, param in named_bert if tag in param_name]

        if len(trainable_parameters) > 0:
            bert_parameters = [{'param_group_name': 'bert', 'params': trainable_parameters, 'lr': lr * bert_learning_rate}]

    extra_args = {} if weight_decay is None else {"weight_decay": weight_decay}

    optimizers = {
        "general_optimizer": dispatch_optimizer(name, parameters, opt_logger=logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **extra_args)
    }

    if bert_parameters is not None and bert_learning_rate > 0.0:
        # the bert optimizer starts from the general args, with its own weight decay if one was given
        bert_args = dict(extra_args)
        if bert_weight_decay is not None:
            bert_args['weight_decay'] = bert_weight_decay
        optimizers["bert_optimizer"] = dispatch_optimizer(name, bert_parameters, opt_logger=logger, lr=lr, betas=betas, eps=eps, momentum=momentum, **bert_args)

    return optimizers


def change_lr(optimizer, new_lr):
    """
    Set the learning rate of every param group in the optimizer to new_lr.
    """
    for group in optimizer.param_groups:
        group.update(lr=new_lr)

def flatten_indices(seq_lens, width):
    """
    Return the flat indices i * width + j for each j < seq_lens[i], in row order.
    """
    return [row * width + col
            for row, length in enumerate(seq_lens)
            for col in range(length)]

def keep_partial_grad(grad, topk):
    """
    Zero out, in place, every row of grad from index topk onward.

    topk must be smaller than the number of rows.  The same grad object is returned.
    """
    num_rows = grad.size(0)
    assert topk < num_rows
    discarded_rows = grad.data[topk:]
    discarded_rows.zero_()
    return grad

# other utils
def ensure_dir(d, verbose=True):
    """
    Create directory d (including parents) if it does not exist yet.

    If verbose, a log message is written before the directory is created.
    """
    if os.path.exists(d):
        return

    if verbose:
        logger.info("Directory {} does not exist; creating...".format(d))
    # exist_ok: something else may have created the directory since the check above
    os.makedirs(d, exist_ok=True)

def save_config(config, path, verbose=True):
    """
    Write config to path as JSON with an indent of 2 and return the config.

    If verbose, a confirmation is printed once the file has been written.
    """
    with open(path, 'w') as handle:
        json.dump(config, fp=handle, indent=2)

    if verbose:
        print("Config saved to file {}".format(path))
    return config

def load_config(path, verbose=True):
    """
    Read a JSON config from path and return the parsed object.

    The file is always decoded as utf-8 rather than with the platform's
    default encoding, so configs containing non-ASCII text load the same
    way on every system.

    If verbose, a confirmation is printed once the file has been read.
    """
    with open(path, encoding="utf-8") as f:
        config = json.load(f)
    if verbose:
        print("Config loaded from file {}".format(path))
    return config

def print_config(config):
    """
    Log every key and value of config, one tab-indented line per entry.
    """
    entries = ["\t{} : {}\n".format(k, str(v)) for k, v in config.items()]
    info = "Running with the following configs:\n" + "".join(entries)
    logger.info("\n" + info + "\n")

def normalize_text(text):
    """
    Return text in Unicode normalization form NFD.
    """
    normalization_form = 'NFD'
    return unicodedata.normalize(normalization_form, text)

def unmap_with_copy(indices, src_tokens, vocab):
    """
    Turn a list of lists of indices back into words.

    A non-negative index is looked up in vocab.id2word.
    A negative index idx copies the source token at position -idx - 1
    from the matching entry of src_tokens.
    """
    def lookup(idx, tokens):
        if idx < 0:
            # flip and minus 1 to get the position in the source tokens
            return tokens[-idx - 1]
        return vocab.id2word[idx]

    return [[lookup(idx, tokens) for idx in ind]
            for ind, tokens in zip(indices, src_tokens)]

def prune_decoded_seqs(seqs):
    """
    Prune decoded sequences after EOS token.

    Each sequence containing constant.EOS is cut just before its first
    occurrence; sequences without it are returned unchanged.

    The same constant is used both for the membership test and for finding
    the cut position, so a sequence that passes the test can always be cut.
    """
    out = []
    for s in seqs:
        if constant.EOS in s:
            idx = s.index(constant.EOS)
            out.append(s[:idx])
        else:
            out.append(s)
    return out

def prune_hyp(hyp):
    """
    Cut a decoded hypothesis just before its first constant.EOS_ID.

    A hypothesis without constant.EOS_ID is returned unchanged.
    """
    if constant.EOS_ID not in hyp:
        return hyp

    eos_position = hyp.index(constant.EOS_ID)
    return hyp[:eos_position]

def prune(data_list, lens):
    """
    Truncate each item of data_list to the matching length in lens.

    data_list and lens must have the same number of entries.
    """
    assert len(data_list) == len(lens)
    return [item[:length] for item, length in zip(data_list, lens)]

def sort(packed, ref, reverse=True):
    """
    Sort several parallel lists by the values of a reference list.

    packed is a tuple or list of lists, each as long as ref.
    Returns a tuple whose first element is the list of original indices
    in sorted order, followed by each list of packed in sorted order.
    The sorted ref itself is not returned.
    """
    assert isinstance(packed, (tuple, list)) and isinstance(ref, list)
    # ref goes first so it drives the ordering; the original positions come next
    columns = [ref, range(len(ref)), *packed]
    rows = sorted(zip(*columns), reverse=reverse)
    sorted_columns = [list(column) for column in zip(*rows)]
    return tuple(sorted_columns[1:])

def unsort(sorted_list, oidx):
    """
    Put the items of sorted_list back in their original order.

    oidx holds the original index of each item of sorted_list.
    """
    assert len(sorted_list) == len(oidx), "Number of list elements must match with original indices."
    # an empty input naturally produces an empty list here
    return [item for _, item in sorted(zip(oidx, sorted_list))]

def sort_with_indices(data, key=None, reverse=False):
    """
    Sort data, also reporting where each item came from.

    key, if given, is applied to each item to get its sort value;
    for example, key=len sorts by length.

    Returns (sorted items, original indices).  Both are tuples,
    except for empty data where two empty lists are returned.
    """
    if not data:
        return [], []

    if key:
        sort_key = lambda pair: key(pair[1])
    else:
        sort_key = lambda pair: pair[1]

    ordered = sorted(enumerate(data), key=sort_key, reverse=reverse)
    original_indices, sorted_items = zip(*ordered)
    return sorted_items, original_indices

def split_into_batches(data, batch_size):
    """
    Returns a list of intervals so that each interval is either <= batch_size or one element long.

    Long elements are not dropped from the intervals.
    Zero length elements are not dropped either: every index of data
    is covered by exactly one interval.

    data is a list of lists
    batch_size is how long to make each batch
    return value is a list of pairs, start_idx end_idx
    """
    intervals = []
    interval_start = 0
    interval_size = 0
    for idx, line in enumerate(data):
        line_len = len(line)
        if line_len > batch_size:
            # guess we'll just hope the model can handle a batch of this size after all
            # close the pending interval based on the indices it covers, not its
            # total size, so that pending zero length lines are kept
            if interval_start < idx:
                intervals.append((interval_start, idx))
            intervals.append((idx, idx+1))
            interval_start = idx+1
            interval_size = 0
        elif line_len + interval_size > batch_size:
            # this line puts us over batch_size
            intervals.append((interval_start, idx))
            interval_start = idx
            interval_size = line_len
        else:
            interval_size += line_len
    if interval_start < len(data):
        # there's some leftover, possibly made up of only zero length lines
        intervals.append((interval_start, len(data)))
    return intervals

def tensor_unsort(sorted_tensor, oidx):
    """
    Put the entries along dimension 0 of sorted_tensor back in their original order.

    oidx holds the original index of each entry of sorted_tensor.
    """
    assert sorted_tensor.size(0) == len(oidx), "Number of list elements must match with original indices."
    # pair each current position with its original index, then order by the original index
    by_original = sorted(enumerate(oidx), key=lambda pair: pair[1])
    backidx = [position for position, _ in by_original]
    return sorted_tensor[backidx]


def set_random_seed(seed):
    """
    Seed torch, numpy, python's random, and (when available) torch.cuda.

    If seed is None, a seed is drawn with random.randint(0, 1000000000).
    Returns the seed which was used.
    """
    if seed is None:
        seed = random.randint(0, 1000000000)

    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # some of these calls are probably redundant
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return seed

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

def find_missing_tags(known_tags, test_tags):
    """
    Return a sorted list of the tags in test_tags which are not in known_tags.

    Either argument may be a list of lists, in which case it is flattened first.
    Whether a list is nested is decided by looking at its first element,
    so empty lists are treated as flat rather than being indexed.
    """
    if isinstance(known_tags, list) and known_tags and isinstance(known_tags[0], list):
        known_tags = set(x for y in known_tags for x in y)
    if isinstance(test_tags, list) and test_tags and isinstance(test_tags[0], list):
        test_tags = sorted(set(x for y in test_tags for x in y))
    missing_tags = sorted(x for x in test_tags if x not in known_tags)
    return missing_tags

def warn_missing_tags(known_tags, test_tags, test_set_name):
    """
    Log a warning if test_tags has any tags which are not in known_tags.

    Both arguments can also be lists of lists.
    test_set_name is used in the warning to say where the tags were found.

    Returns True if a warning was logged, False otherwise.
    """
    missing_tags = find_missing_tags(known_tags, test_tags)
    if not missing_tags:
        return False

    logger.warning("Found tags in {} missing from the expected tag set: {}".format(test_set_name, missing_tags))
    return True

def checkpoint_name(save_dir, save_name, checkpoint_name):
    """
    Build a recommended checkpoint path from the dir, save_name, and optional checkpoint_name

    For example, can pass in args['save_dir'], args['save_name'], args['checkpoint_save_name']

    An explicit checkpoint_name is used as is when it is already directly inside
    save_dir, and otherwise is joined onto save_dir.
    Without one, the name is derived from save_name (placed in save_dir if needed):
    x.pt becomes x_checkpoint.pt, anything else gets _checkpoint appended.
    """
    if checkpoint_name:
        if os.path.dirname(checkpoint_name) == save_dir:
            return checkpoint_name
        return os.path.join(save_dir, checkpoint_name)

    if os.path.dirname(save_name) != save_dir:
        save_name = os.path.join(save_dir, save_name)

    extension = ".pt"
    if save_name.endswith(extension):
        return save_name[:-len(extension)] + "_checkpoint.pt"
    return save_name + "_checkpoint"

def default_device():
    """
    Return 'cuda' if torch reports CUDA as available, otherwise 'cpu'
    """
    return 'cuda' if torch.cuda.is_available() else 'cpu'

def add_device_args(parser):
    """
    Add the device selection args to parser: --device, --cuda, and --cpu

    All three write to the same 'device' destination.
    --device takes any string and defaults to default_device();
    --cuda and --cpu are shortcuts which store a fixed value.
    """
    parser.add_argument('--device', type=str, default=default_device(), help='Which device to run on - use a torch device string name')

    shortcuts = (
        ('--cuda', 'cuda', 'Run on CUDA'),
        ('--cpu', 'cpu', 'Ignore CUDA and run on CPU'),
    )
    for flag, device, description in shortcuts:
        parser.add_argument(flag, dest='device', action='store_const', const=device, help=description)

def load_elmo(elmo_model):
    """
    Log which elmo model is being loaded, then build and return
    an elmoformanylangs.Embedder from elmo_model.
    """
    # Imported inside the function so that Elmo integration
    # can be treated as an optional feature
    import elmoformanylangs

    logger.info("Loading elmo: %s" % elmo_model)
    return elmoformanylangs.Embedder(elmo_model)

def log_training_args(args, args_logger, name="training"):
    """
    For record keeping purposes, log the arguments, sorted by key, one per line

    args may be an argparse.Namespace or a dict.
    name is upper cased and used in the header of the log message.
    """
    arg_dict = vars(args) if isinstance(args, argparse.Namespace) else args
    log_lines = ['%s: %s' % (k, arg_dict[k]) for k in sorted(arg_dict.keys())]
    args_logger.info('ARGS USED AT %s TIME:\n%s\n', name.upper(), '\n'.join(log_lines))

def embedding_name(args):
    """
    Return the generic name of the biggest embedding used by a model.

    Used by POS and depparse, for example.

    From weakest to strongest, later entries overriding earlier ones:
      nopretrain: neither wordvec_pretrain_file nor wordvec_file is set
      nocharlm: a pretrain is set
      charlm: charlm is not turned off and a forward or backward charlm file is set
      the nickname from TRANSFORMER_NICKNAMES, or "transformer" if the
        bert_model has no nickname: bert_model is set

    TODO: Probably will make the transformer names a bit more informative,
    such as electra, roberta, etc.  Maybe even phobert for VI, for example
    """
    has_no_pretrain = args['wordvec_pretrain_file'] is None and args['wordvec_file'] is None
    embedding = "nopretrain" if has_no_pretrain else "nocharlm"

    uses_charlm = args.get('charlm', True) and (args['charlm_forward_file'] or args['charlm_backward_file'])
    if uses_charlm:
        embedding = "charlm"

    bert_model = args['bert_model']
    if bert_model:
        embedding = TRANSFORMER_NICKNAMES[bert_model] if bert_model in TRANSFORMER_NICKNAMES else "transformer"

    return embedding

def standard_model_file_name(args, model_type, **kwargs):
    """
    Returns a model file name based on some common args found in the various models.

    The expectation is that the args will have something like

      parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_parser.pt", help="File name to save the model")

    Then the model shorthand, embedding type, and other args will be
    turned into arguments in a format string

    The available format fields are batch_size, bert_finetuning, embedding,
    finetune, peft, seed, shorthand, transformer_lr, and anything in kwargs
    (which takes priority).  Runs of underscores left behind by empty
    fields are collapsed into one.

    The result is placed in args['save_dir'], unless no file exists there
    and the bare name does exist, in which case the bare name is returned.
    """
    embedding = embedding_name(args)

    is_finetuned = args.get("bert_finetune", False)
    is_peft = is_finetuned and args.get("use_peft", False)

    finetune = "finetuned" if is_finetuned else ""
    # the learning rate is only reported when the transformer is actually finetuned
    if is_finetuned and "bert_learning_rate" in args:
        transformer_lr = "{}".format(args["bert_learning_rate"])
    else:
        transformer_lr = ""

    use_peft = "peft" if is_peft else "nopeft"

    if not is_finetuned:
        bert_finetuning = ""
    elif is_peft:
        bert_finetuning = "peft"
    else:
        bert_finetuning = "ft"

    seed = args.get('seed', None)
    seed = "" if seed is None else str(seed)

    format_args = {
        "batch_size":      args['batch_size'],
        "bert_finetuning": bert_finetuning,
        "embedding":       embedding,
        "finetune":        finetune,
        "peft":            use_peft,
        "seed":            seed,
        "shorthand":       args['shorthand'],
        "transformer_lr":  transformer_lr,
    }
    format_args.update(**kwargs)

    model_file = re.sub("_+", "_", args['save_name'].format(**format_args))
    save_path = os.path.join(args['save_dir'], model_file)

    if not os.path.exists(save_path) and os.path.exists(model_file):
        return model_file
    return save_path

def escape_misc_space(space):
    """
    Escape whitespace and special characters for use in a MISC annotation.

    space -> \\s, tab -> \\t, carriage return -> \\r, newline -> \\n,
    | -> \\p, backslash -> \\\\, no-break space (U+00A0) -> \\u00A0
    Any other character is kept as is.
    """
    # U+00A0 is written as an escape so it cannot be confused with a regular space
    escapes = {
        ' ':      '\\s',
        '\t':     '\\t',
        '\r':     '\\r',
        '\n':     '\\n',
        '|':      '\\p',
        '\\':     '\\\\',
        '\u00a0': '\\u00A0',
    }
    return "".join(escapes.get(char, char) for char in space)

def unescape_misc_space(misc_space):
    """
    Turn an escaped MISC space annotation back into the characters it represents.

    \\s -> space, \\t -> tab, \\r -> carriage return, \\n -> newline,
    \\p -> |, \\\\ -> backslash, \\u00A0 -> no-break space (U+00A0)
    Anything which is not one of these escapes is copied one character at a time.
    """
    two_char_escapes = {
        '\\s':  ' ',
        '\\t':  '\t',
        '\\r':  '\r',
        '\\n':  '\n',
        '\\p':  '|',
        '\\\\': '\\',
    }
    nbsp_escape = '\\u00A0'

    spaces = []
    pos = 0
    while pos < len(misc_space):
        pair = misc_space[pos:pos+2]
        if pair in two_char_escapes:
            spaces.append(two_char_escapes[pair])
            pos += 2
        elif misc_space.startswith(nbsp_escape, pos):
            # U+00A0 is written as an escape so it cannot be confused with a regular space
            spaces.append('\u00a0')
            pos += len(nbsp_escape)
        else:
            spaces.append(misc_space[pos])
            pos += 1
    return "".join(spaces)

def space_before_to_misc(space):
    """
    Convert whitespace to SpacesBefore specifically for the start of a document.

    In general, UD datasets mark the space(s) between two tokens on only one
    of them, rather than as both SpacesAfter on a token and SpacesBefore on the next.

    Only at the very beginning of a document is it necessary to mark what spaces
    occurred before the actual text, and the default assumption is that there
    is no space if there is no SpacesBefore annotation.

    Returns "" when there is no space to annotate.
    """
    if space:
        return "SpacesBefore=%s" % escape_misc_space(space)
    return ""

def space_after_to_misc(space):
    """
    Convert whitespace back to the escaped format - either SpaceAfter=No or SpacesAfter=...

    No whitespace at all gives SpaceAfter=No.
    A single regular space needs no annotation, so "" is returned.
    """
    if not space:
        return "SpaceAfter=No"
    if space != " ":
        return "SpacesAfter=%s" % escape_misc_space(space)
    return ""

def misc_to_space_before(misc):
    """
    Find any SpacesBefore annotation in the MISC column and turn it into a space value

    The annotation name is matched case insensitively, and the first match is used.
    Returns "" if there is no such annotation.
    """
    if not misc:
        return ""

    for piece in misc.split("|"):
        if piece.lower().startswith("spacesbefore="):
            # everything after the first = is the escaped space
            _, _, misc_space = piece.partition("=")
            return unescape_misc_space(misc_space)
    return ""

def misc_to_space_after(misc):
    """
    Convert either SpaceAfter=No or the SpacesAfter annotation

    see https://universaldependencies.org/misc.html#spacesafter

    We compensate for some treebanks using SpaceAfter=\n instead of SpacesAfter=\n
    On the way back, though, those annotations will be turned into SpacesAfter

    Without any relevant annotation, the result is a single space.
    """
    if not misc:
        return " "

    pieces = misc.split("|")
    for piece in pieces:
        if piece.lower() == "spaceafter=no":
            return ""
    if "SpaceAfter=Yes" in pieces:
        # as of UD 2.11, the Cantonese treebank had this as a misc feature
        return " "
    if "SpaceAfter=No~" in pieces:
        # as of UD 2.11, a weird typo in the Russian Taiga dataset
        return ""

    for piece in pieces:
        if piece.startswith(("SpaceAfter=", "SpacesAfter=")):
            # everything after the first = is the escaped space
            _, _, misc_space = piece.partition("=")
            return unescape_misc_space(misc_space)
    return " "

def log_norms(model):
    """
    Log a table with the name, norm, and number of elements of each trainable parameter.

    Only parameters with requires_grad are listed.  The name and norm
    columns are padded to the width of their longest entry.
    If the model has no trainable parameters, only the header line is logged.
    """
    lines = ["NORMS FOR MODEL PARAMTERS"]
    pieces = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            pieces.append((name, "%.6g" % torch.norm(param).item(), "%d" % param.numel()))
    # default=0 keeps max() from raising when there is nothing to report
    name_len = max((len(x[0]) for x in pieces), default=0)
    norm_len = max((len(x[1]) for x in pieces), default=0)
    line_format = "  %-" + str(name_len) + "s   %" + str(norm_len) + "s     %s"
    lines.extend(line_format % line for line in pieces)
    logger.info("\n".join(lines))

def attach_bert_model(model, bert_model, bert_tokenizer, use_peft, force_bert_saved):
    """
    Attach a transformer and its tokenizer to model.

    use_peft: the transformer is registered as an unsaved module and put in train mode
    force_bert_saved: the transformer is set as a regular attribute of the model
    otherwise: a transformer, if there is one, is registered as an unsaved module
      with requires_grad turned off on all of its parameters;
      if there is none, model.bert_model is set to None

    The tokenizer is always registered as an unsaved module.
    """
    if use_peft:
        # we use a peft-specific pathway for saving peft weights
        model.add_unsaved_module('bert_model', bert_model)
        model.bert_model.train()
    elif force_bert_saved:
        model.bert_model = bert_model
    elif bert_model is None:
        model.bert_model = None
    else:
        model.add_unsaved_module('bert_model', bert_model)
        for _, bert_param in bert_model.named_parameters():
            bert_param.requires_grad = False

    model.add_unsaved_module('bert_tokenizer', bert_tokenizer)

def build_save_each_filename(base_filename):
    """
    If the given name doesn't have %d in it, add _%04d before the extension of the filename

    This way, there's something to count how many models have been saved

    A name which already accepts an integer format argument is returned unchanged.
    """
    try:
        # formatting raises TypeError when the name has no slot for the number
        base_filename % 1
    except TypeError:
        # so models.pt -> models_0001.pt, etc
        pieces = os.path.splitext(base_filename)
        base_filename = pieces[0] + "_%04d" + pieces[1]
    return base_filename