# NOTE: the lines "Spaces:" / "Runtime error" here were page chrome scraped
# along with the source (Hugging Face Spaces status banner), not Python code;
# they are preserved as a comment so the file parses.
| from functools import lru_cache | |
def bytes_to_unicode_dict():
    """
    Build the decoding table for byte-level BPE: a dict mapping each of the 256
    printable stand-in unicode characters back to its raw utf-8 byte value.

    The reversible bpe codes work on unicode strings, and the bpe code barfs on
    whitespace/control characters, so every byte is represented by a printable
    character: bytes that are already printable map to themselves, and the
    remaining bytes are assigned consecutive code points starting at 256
    (e.g. chr(0x120) 'Ġ' stands for the space byte 0x20).
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    byte_values = list(printable)
    char_codes = list(printable)
    next_code = 256
    for byte in range(256):
        if byte not in printable:
            byte_values.append(byte)
            char_codes.append(next_code)
            next_code += 1
    # stand-in character -> raw byte value
    return {chr(code): byte for code, byte in zip(char_codes, byte_values)}
# Decoder table: printable stand-in character -> raw utf-8 byte value.
ORD_UNICODE_MAP = bytes_to_unicode_dict()


@lru_cache(maxsize=None)
def byte_to_char(bytestr):
    """
    Decode one byte-level BPE token (a string of stand-in characters) back to
    text: map each character to its raw byte via ORD_UNICODE_MAP, then decode
    the byte sequence as utf-8, substituting U+FFFD for invalid sequences.

    Memoized with lru_cache (str arguments are hashable, unlike the token
    *lists* the commented-out cache below was abandoned on): vocabularies are
    small and tokens repeat constantly, so repeat decodes become dict lookups.
    Raises KeyError if a character is not a valid stand-in character.
    """
    return bytearray([ORD_UNICODE_MAP[c] for c in bytestr]).decode("utf-8", errors="replace")
# @lru_cache() left disabled here: a list argument is not hashable.
def bytetokens_to_unicdode(byte_tokens: list):
    """Decode a sequence of byte-level BPE tokens into unicode strings, one per token."""
    return list(map(byte_to_char, byte_tokens))
if __name__ == '__main__':
    # Sample output of a byte-level BPE tokenizer over a Korean news snippet.
    # The mojibake-looking strings are byte-level stand-in characters
    # ('Ġ' = leading space, 'Ċ' = newline); they must go through byte_to_char.
    tokens = ['<s>',
              'ì¹´ì¹´ìĺ¤',
              'ìĹĶ',
              'íĦ°',
              'íĶĦëĿ¼ìĿ´',
              'ì¦Ī',
              '(',
              'ëĮĢíijľ',
              'Ġë°±',
              'ìĥģ',
              'ìĹ½',
              ')',
              'ê°Ģ',
              'Ġìĺ¬íķ´',
              'Ġ8',
              'ìĽĶ',
              'Ġ기ì¤Ģ',
              'Ġëĭ¤ìĪĺ',
              'Ġê¶Į',
              'ìľĦ',
              'ĠìŀĪëĬĶ',
              'Ġê¸Ģë¡ľë²Į',
              'ĠíķĻ',
              'íļĮìĹIJìĦľ',
              'Ġì´Ŀ',
              'Ġ16',
              'ê±´',
              'ìĿĺ',
              'ĠìĿ¸ê³µ',
              'ì§Ģ',
              'ëĬ¥',
              '(',
              'A',
              'I',
              ')',
              'Ġëħ¼ë¬¸',
              'ìĿĦ',
              'Ġëĵ±',
              'ìŀ¬',
              'íĸĪëĭ¤ê³ł',
              'Ġ9',
              'ìĿ¼',
              'Ġë°ĿíĺĶ',
              'ëĭ¤',
              '.',
              'Ġì§ĢëĤľíķ´',
              'Ġëĵ±',
              'ìŀ¬',
              'íķľ',
              'Ġ13',
              'ê±´ë',
              '³´ëĭ¤',
              'Ġ3',
              'ê±´',
              'Ġë§İìĿĢ',
              'Ġëħ¼ë¬¸',
              'ìĿ´',
              'Ġë°ĺ',
              'ëħĦ',
              'ìŬ',
              'Ġë§ĮìĹIJ',
              'Ġì±Ħ',
              'íĥĿ',
              'ëIJIJëĭ¤',
              '.',
              'Ġì¹´ì¹´ìĺ¤',
              'ìĹĶ',
              'íĦ°',
              'íĶĦëĿ¼ìĿ´',
              'ì¦Ī',
              '(',
              'ìĿ´',
              'íķĺ',
              'Ġì¹´ì¹´ìĺ¤',
              'ìĹĶ',
              'íĦ°',
              ')',
              'ëĬĶ',
              'ĠA',
              'I',
              'ĠìĹ°êµ¬',
              'ĠìĦ±',
              '과를',
              'ĠìĿ´',
              'ìĸ´ê°Ģ',
              '기',
              'ĠìľĦíķ´',
              'ĠìĿ¸ìŀ¬',
              'ĠíĻķë³´',
              'ìĹIJ',
              'ĠìĨį',
              'ëıĦ를',
              'ĠëĨĴìĿ´',
              'ê²łëĭ¤ëĬĶ',
              'Ġë°©',
              '침',
              'ìĿ´ëĭ¤',
              '.',
              'Ċ',
              'Ċ',
              'ì¹´ì¹´ìĺ¤',
              'ìĹĶ',
              'íĦ°',
              'ëĬĶ',
              'Ġ8',
              'ìĽĶ',
              'ĠìŀIJìŰ',
              'ìĸ´',
              'ì²ĺ리',
              'Ġë¶Ħìķ¼',
              'ìĿĺ',
              'Ġê¸Ģë¡ľë²Į',
              'Ġíĥij',
              'ĠíķĻ',
              'íļĮ',
              'ìĿ¸',
              "Ġ'",
              'A',
              'C',
              'L',
              '-',
              'I',
              'J',
              'C',
              'N',
              'L',
              'P',
              "'",
              'ìĹIJ',
              'Ġëħ¼ë¬¸',
              'ìĿĦ',
              'Ġë°ľíijľ',
              'íķľ',
              'ĠìĤ¬ë¡Ģ',
              'ê¹Įì§Ģ',
              'Ġíķ©',
              'íķ´',
              'Ġìĺ¬íķ´',
              'Ġì´Ŀ',
              'Ġ16',
              'ê±´',
              'ìĿĺ',
              'ĠA',
              'I',
              'Ġëħ¼ë¬¸',
              'ìĿĦ',
              'Ġëĵ±',
              'ìŀ¬',
              'íĸĪëĭ¤ê³ł',
              'Ġë°ĿíĺĶ',
              'ëĭ¤',
              '.',
              'ĠìĿ´',
              'Ġëħ¼ë¬¸',
              'ìĿĢ',
              'ĠìĿ¸ëıĦ',
              'ë©Ķ',
              'ìĿ¸',
              '(',
              'in',
              '-',
              'd',
              'om',
              'a',
              'in',
              ')',
              'Ġìĥĺ',
              'íĶĮ',
              'ìĿĦ',
              'ĠìĤ¬ìļ©',
              'íķ´',
              'ĠìŀIJìŰ',
              'ìĸ´',
              'Ġ공격',
              'Ġë°©ìĭĿìľ¼ë¡ľ',
              'ĠìķĦìĽĥ',
              'ìĺ¤',
              'ë¸Į',
              'ëıĦ',
              'ë©Ķ',
              'ìĿ¸',
              '(',
              'out',
              '-',
              'of',
              '-',
              'd',
              'om',
              'a',
              'in',
              ')',
              'Ġìĥĺ',
              'íĶĮ',
              'ìĿĦ',
              'ĠìŀIJëıĻ',
              'ìľ¼ë¡ľ',
              'ĠìĥĿ',
              'ìĦ±',
              ',',
              'Ġë¶Ħ',
              'ë¥ĺ',
              'Ġ모ëį¸',
              'ìĿĺ',
              'Ġê°IJ',
              'ì§Ģ',
              'ĠëĬ¥ëł¥ìĿĦ',
              'Ġíĸ¥',
              'ìĥģ',
              'ìĭľíĤ¤ëĬĶ',
              'ĠëĤ´ìļ©',
              'ìĿĺ',
              'Ġëħ¼ë¬¸',
              'ìĿ´ëĭ¤',
              '.',
              'Ċ',
              'Ċ',
              '7',
              'ìĽĶ',
              'ìĹIJëĬĶ',
              'Ġ머',
              'ìĭł',
              '룬',
              'ëĭĿ',
              'ĠíķĻ',
              'íļĮ',
              "Ġ'",
              'I',
              'C',
              'M',
              'L',
              "'",
              'ìĹIJ',
              'Ġíļ¨ìľ¨',
              'ìłģìĿ¸',
              'Ġê³ł',
              'íĴĪ',
              'ì§Ī',
              'ĠìĿĮ',
              'ìĦ±',
              'íķ©',
              'ìĦ±ìĿ´',
              'Ġê°ĢëĬ¥íķľ',
              "Ġ'",
              'ìĹĶ',
              'ëĵľ',
              'Ġíά',
              'ĠìĹĶ',
              'ëĵľ',
              '(',
              'en',
              'd',
              '-',
              't',
              'o',
              '-',
              'en',
              'd',
              ')',
              "'",
              'Ġ모ëį¸',
              'ìĿĦ',
              'ĠìłľìķĪ',
              'íķĺëĬĶ',
              'Ġëħ¼ë¬¸',
              'ìĿĦ',
              'Ġë°ľíijľ',
              'íĸĪëĭ¤',
              '.',
              'Ġ6',
              'ìĽĶ',
              'ìĹIJëĬĶ',
              'ĠìĿĮ',
              'íĸ¥',
              '·',
              'ìĿĮ',
              'ìĦ±',
              'Ġìĭł',
              'íĺ¸',
              'ì²ĺ리',
              'Ġë¶Ħìķ¼',
              'ĠíķĻ',
              'ìĪł',
              'ëĮĢíļĮ',
              "Ġ'",
              'I',
              'C',
              'A',
              'S',
              'S',
              'P',
              "'",
              'ìĹIJ',
              'ĠëĮĢ',
              'ê·ľëª¨',
              'Ġíħ',
              'į',
              'ìĬ¤íĬ¸',
              'Ġì½Ķ',
              'íį¼ìĬ¤',
              '(',
              'ìĸ¸',
              'ìĸ´',
              'ĠìŰ',
              '구를',
              'ĠìľĦíķ´',
              'Ġíħ',
              'į',
              'ìĬ¤íĬ¸ë¥¼',
              'Ġì»´íĵ¨íĦ°',
              'ê°Ģ',
              'ĠìĿ½ìĿĦ',
              'ĠìĪĺ',
              'ĠìŀĪëĬĶ',
              'Ġíĺķíĥľë¡ľ',
              'Ġ모ìķĦ',
              'ĠëĨĵìĿĢ',
              'Ġìĸ¸ìĸ´',
              'ĠìŀIJë£Į',
              ')',
              'Ġìłķë³´',
              'ĠíķĻìĬµ',
              'ìĹIJ',
              'ĠëĮĢíķľ',
              'Ġëħ¼ë¬¸',
              'Ġ1',
              'ê±´ìĿĦ',
              'Ġìĭ¤',
              'ìĹĪëĭ¤',
              '.',
              'Ċ',
              '</s>']
    import time

    # Micro-benchmark: decode the same token sequence 1000 times.
    # perf_counter() is a monotonic, high-resolution clock meant for interval
    # timing; time.time() is wall-clock time and can jump if the clock adjusts.
    start = time.perf_counter()
    for _ in range(1000):
        result = bytetokens_to_unicdode(tokens)
    end = time.perf_counter()
    print(result)
    print(f'time: {end-start}')