import functools

import numpy as np


@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    """Return (bos, suffix, prefix, middle, pad) special token ids for `tokenizer`."""
    if "codellama" in tokenizer.name_or_path:
        return (
            tokenizer.bos_token_id,
            tokenizer.suffix_id,
            tokenizer.prefix_id,
            tokenizer.middle_id,
            0,
        )
    elif "deepseek-coder" in tokenizer.name_or_path:
        return (
            tokenizer.bos_token_id,
            tokenizer.encode("<|fim▁hole|>", add_special_tokens=False)[0],
            tokenizer.encode("<|fim▁begin|>", add_special_tokens=False)[0],
            tokenizer.encode("<|fim▁end|>", add_special_tokens=False)[0],
            tokenizer.encode("<pad>", add_special_tokens=False)[0],
        )
    elif "stable-code" in tokenizer.name_or_path:
        return (
            tokenizer.bos_token_id,
            tokenizer.encode("<fim_suffix>")[0],
            tokenizer.encode("<fim_prefix>")[0],
            tokenizer.encode("<fim_middle>")[0],
            tokenizer.encode("<fim_pad>")[0],
        )
    else:
        # Fall back to the tokenizer's declared additional special tokens
        # (StarCoder-style tokenizers list the FIM sentinels there).
        bos_token_id = None
        try:
            FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[
                "additional_special_tokens"
            ][1:5]
            suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
                tokenizer.vocab[tok]
                for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
            )
        except KeyError:
            # Tokenizer declares no FIM sentinels; callers should skip FIM.
            suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
                None,
                None,
                None,
                None,
            )
        return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id
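
# A minimal usage sketch (an illustration, not part of the original module;
# "bigcode/starcoderbase" is an assumed FIM-capable checkpoint):
#
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase")
#     bos_id, suffix_id, prefix_id, middle_id, pad_id = get_fim_token_ids(tokenizer)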


def _bos_token_processing(prefix_token_list, bos_token):
    # Prepend the BOS token (when the tokenizer defines one) so that the
    # transformed sample still starts with a beginning-of-sequence marker.
    if bos_token is not None:
        prefix_token_list.insert(0, bos_token)
    return prefix_token_list


def permute(
    sample,
    np_rng,
    suffix_tok_id,
    prefix_tok_id,
    middle_tok_id,
    pad_tok_id,
    fim_rate=0.5,
    fim_spm_rate=0.5,
    truncate_or_pad=False,
    bos_token_id=None,
):
    """
    Take a sample (list of token ids) and, with probability `fim_rate`, apply a
    fill-in-the-middle (FIM) transformation to it, choosing the SPM mode with
    probability `fim_spm_rate` and the PSM mode otherwise.
    """
    if np_rng.binomial(1, fim_rate):
        # Pick two random cut points and sort them, splitting the sample into
        # prefix / middle / suffix segments.
        boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
        boundaries.sort()

        prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
        middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
        suffix = np.array(sample[boundaries[1] :], dtype=np.int64)

        if truncate_or_pad:
            # The three FIM sentinel tokens grow the sample by 3; trim the
            # suffix (or pad it) so the transformed sample keeps its length.
            new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
            diff = new_length - len(sample)
            if diff > 0:
                if suffix.shape[0] <= diff:
                    return sample, np_rng
                suffix = suffix[: suffix.shape[0] - diff]
            elif diff < 0:
                suffix = np.concatenate([suffix, np.full(-diff, pad_tok_id)])

        if np_rng.binomial(1, fim_spm_rate):
            # SPM mode: <prefix_tok><suffix_tok>{suffix}<middle_tok>{prefix}{middle}
            prefix_special_tokens = _bos_token_processing(
                [prefix_tok_id, suffix_tok_id], bos_token_id
            )
            new_sample = np.concatenate(
                [
                    prefix_special_tokens,
                    suffix,
                    [middle_tok_id],
                    prefix,
                    middle,
                ]
            )
        else:
            # PSM mode: <prefix_tok>{prefix}<suffix_tok>{suffix}<middle_tok>{middle}
            prefix_special_tokens = _bos_token_processing([prefix_tok_id], bos_token_id)
            new_sample = np.concatenate(
                [
                    prefix_special_tokens,
                    prefix,
                    [suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    middle,
                ]
            )
    else:
        # No FIM transformation: pass the sample through unchanged.
        new_sample = sample

    return list(new_sample), np_rng
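

if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch, not part of the original
    # module): run `permute` on a dummy token sequence with made-up sentinel
    # ids and fim_rate=1.0 so the FIM branch always executes.
    rng = np.random.RandomState(seed=0)
    tokens = list(range(100, 120))
    transformed, rng = permute(
        tokens,
        rng,
        suffix_tok_id=1,
        prefix_tok_id=2,
        middle_tok_id=3,
        pad_tok_id=4,
        fim_rate=1.0,
        truncate_or_pad=True,
    )
    # With truncate_or_pad=True the transformed sample keeps its length.
    assert len(transformed) == len(tokens)
    print(transformed)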