# norae/docker/deezer-spleeter-env/lib/python3.10/site-packages/future/utils/surrogateescape.py
"""
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
handler of Python 3.

Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
"""
| # This code is released under the Python license and the BSD 2-clause license | |
| import codecs | |
| import sys | |
| from future import utils | |
# Name of the codec error handler passed to encode()/decode() by the filename
# helpers in this module and registered by register_surrogateescape().
FS_ERRORS = "surrogateescape"

# -- Python 2/3 compatibility -------------------------------------
# Alternative handler name that was used at some point:
# FS_ERRORS = 'my_surrogateescape'
def u(text):
    """
    Return *text* as a text string.

    On Python 3 the argument is handed back untouched; otherwise it is decoded
    with the ``unicode_escape`` codec.
    """
    return text if utils.PY3 else text.decode('unicode_escape')
def b(data):
    """
    Return *data* as a byte string.

    On Python 3 the argument is encoded as latin1; otherwise it is handed back
    untouched.
    """
    return data.encode('latin1') if utils.PY3 else data
# Pick the character constructors for the running interpreter:
#   _unichr(code)   -> one-character text string for a code point
#   bytes_chr(code) -> one-byte byte string for a byte value
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr
    bytes_chr = lambda code: bytes((code,))
def surrogateescape_handler(exc):
    """
    Codec error handler: pure Python version of the PEP 383 "surrogateescape"
    handler of Python 3.

    When decoding, undecodable bytes become Unicode characters U+DCxx; when
    encoding, those characters are translated back to the original bytes.

    *exc* is the UnicodeDecodeError / UnicodeEncodeError being handled. The
    return value is the tuple ``(replacement, exc.end)``. *exc* itself is
    re-raised when it is neither of those two error types or when the
    offending slice cannot be converted.
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte-string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called, even though I think
        # it's doing what it should. It seems that the strict encoder is called
        # to encode the unicode string that this function returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(offending)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
class NotASurrogateError(Exception):
    """Signals that a code unit cannot be handled by the surrogateescape helpers."""


def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogate characters back to their original values.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Each character of *mystring* must lie in U+DC00..U+DCFF and is mapped to
    the character whose code point is ``ord(ch) - 0xDC00``. PEP 383 itself
    only defines U+DC80..U+DCFF; the lower half (U+DC00..U+DC7F) is accepted
    here as well, because this function has always accepted it.

    Raises NotASurrogateError for any other character. This includes the high
    surrogates U+D800..U+DBFF, which previously slipped through the range
    check and triggered a ValueError from a negative code point instead.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Only U+DCxx can be un-escaped; anything else makes the caller fail
        # with the original exception.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
def replace_surrogate_decode(mybytes):
    """
    Returns a (unicode) string

    Values up to 0x7F are kept as the character with that code point, values
    0x80..0xFF become the character U+DC80..U+DCFF, and anything larger raises
    NotASurrogateError.
    """
    pieces = []
    for item in mybytes:
        # Iterating may yield ints (newbytes) or one-character native str
        # objects (Py2), so normalise to an integer first.
        value = item if isinstance(item, int) else ord(item)
        if value <= 0x7F:
            pieces.append(_unichr(value))
        elif value <= 0xFF:
            pieces.append(_unichr(0xDC00 + value))
        else:
            # Swallowing such a value was considered at some point, but it is
            # rejected instead.
            raise NotASurrogateError
    return "".join(pieces)
def encodefilename(fn):
    """
    Encode the text filename *fn* to bytes using FS_ENCODING.

    Characters U+DC80..U+DCFF are turned back into the single bytes
    0x80..0xFF. The 'ascii' and 'utf-8' encodings are handled by hand; every
    other encoding is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    Raises UnicodeEncodeError for characters that cannot be represented.
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        chunks = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
            elif codepoint < 128:
                chunks.append(bytes_chr(codepoint))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'ordinal not in range(128)')
        return b''.join(chunks)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        chunks = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if not 0xD800 <= codepoint <= 0xDFFF:
                chunks.append(char.encode('utf-8'))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'surrogates not allowed')
        return b''.join(chunks)

    return fn.encode(FS_ENCODING, FS_ERRORS)
def decodefilename(fn):
    """
    Decode the byte filename *fn* using FS_ENCODING and the FS_ERRORS error
    handler, and return the result.
    """
    text = fn.decode(FS_ENCODING, FS_ERRORS)
    return text
# Encoding used by encodefilename()/decodefilename(), together with sample
# data (`fn` as bytes, `encoded` as text) used by the commented-out self-test
# at the bottom of this file.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative settings:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')

# Normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only)

    Nothing happens on Python 3, or when an error handler is already
    registered under the FS_ERRORS name.
    """
    if not utils.PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler with that name yet: install ours.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
if __name__ == "__main__":
    # Running the module directly is a no-op; the former self-test is kept
    # below, commented out.
    pass
    # # Tests:
    # register_surrogateescape()
    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")