Upload folder using huggingface_hub

096c926 verified about 1 year ago

6.1 kB

	"""
	This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
	handler of Python 3.

	Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
	"""

	# This code is released under the Python license and the BSD 2-clause license

	import codecs
	import sys

	from future import utils


	FS_ERRORS = 'surrogateescape'

	# # -- Python 2/3 compatibility -------------------------------------
	# FS_ERRORS = 'my_surrogateescape'

	def u(text):
	if utils.PY3:
	return text
	else:
	return text.decode('unicode_escape')

	def b(data):
	if utils.PY3:
	return data.encode('latin1')
	else:
	return data

	if utils.PY3:
	_unichr = chr
	bytes_chr = lambda code: bytes((code,))
	else:
	_unichr = unichr
	bytes_chr = chr

	def surrogateescape_handler(exc):
	"""
	Pure Python implementation of the PEP 383: the "surrogateescape" error
	handler of Python 3. Undecodable bytes will be replaced by a Unicode
	character U+DCxx on decoding, and these are translated into the
	original bytes on encoding.
	"""
	mystring = exc.object[exc.start:exc.end]

	try:
	if isinstance(exc, UnicodeDecodeError):
	# mystring is a byte-string in this case
	decoded = replace_surrogate_decode(mystring)
	elif isinstance(exc, UnicodeEncodeError):
	# In the case of u'\udcc3'.encode('ascii',
	# 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
	# exception anyway after this function is called, even though I think
	# it's doing what it should. It seems that the strict encoder is called
	# to encode the unicode string that this function returns ...
	decoded = replace_surrogate_encode(mystring)
	else:
	raise exc
	except NotASurrogateError:
	raise exc
	return (decoded, exc.end)


	class NotASurrogateError(Exception):
	pass


	def replace_surrogate_encode(mystring):
	"""
	Returns a (unicode) string, not the more logical bytes, because the codecs
	register_error functionality expects this.
	"""
	decoded = []
	for ch in mystring:
	# if utils.PY3:
	# code = ch
	# else:
	code = ord(ch)

	# The following magic comes from Py3.3's Python/codecs.c file:
	if not 0xD800 <= code <= 0xDCFF:
	# Not a surrogate. Fail with the original exception.
	raise NotASurrogateError
	# mybytes = [0xe0 \| (code >> 12),
	# 0x80 \| ((code >> 6) & 0x3f),
	# 0x80 \| (code & 0x3f)]
	# Is this a good idea?
	if 0xDC00 <= code <= 0xDC7F:
	decoded.append(_unichr(code - 0xDC00))
	elif code <= 0xDCFF:
	decoded.append(_unichr(code - 0xDC00))
	else:
	raise NotASurrogateError
	return str().join(decoded)


	def replace_surrogate_decode(mybytes):
	"""
	Returns a (unicode) string
	"""
	decoded = []
	for ch in mybytes:
	# We may be parsing newbytes (in which case ch is an int) or a native
	# str on Py2
	if isinstance(ch, int):
	code = ch
	else:
	code = ord(ch)
	if 0x80 <= code <= 0xFF:
	decoded.append(_unichr(0xDC00 + code))
	elif code <= 0x7F:
	decoded.append(_unichr(code))
	else:
	# # It may be a bad byte
	# # Try swallowing it.
	# continue
	# print("RAISE!")
	raise NotASurrogateError
	return str().join(decoded)


	def encodefilename(fn):
	if FS_ENCODING == 'ascii':
	# ASCII encoder of Python 2 expects that the error handler returns a
	# Unicode string encodable to ASCII, whereas our surrogateescape error
	# handler has to return bytes in 0x80-0xFF range.
	encoded = []
	for index, ch in enumerate(fn):
	code = ord(ch)
	if code < 128:
	ch = bytes_chr(code)
	elif 0xDC80 <= code <= 0xDCFF:
	ch = bytes_chr(code - 0xDC00)
	else:
	raise UnicodeEncodeError(FS_ENCODING,
	fn, index, index+1,
	'ordinal not in range(128)')
	encoded.append(ch)
	return bytes().join(encoded)
	elif FS_ENCODING == 'utf-8':
	# UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
	# doesn't go through our error handler
	encoded = []
	for index, ch in enumerate(fn):
	code = ord(ch)
	if 0xD800 <= code <= 0xDFFF:
	if 0xDC80 <= code <= 0xDCFF:
	ch = bytes_chr(code - 0xDC00)
	encoded.append(ch)
	else:
	raise UnicodeEncodeError(
	FS_ENCODING,
	fn, index, index+1, 'surrogates not allowed')
	else:
	ch_utf8 = ch.encode('utf-8')
	encoded.append(ch_utf8)
	return bytes().join(encoded)
	else:
	return fn.encode(FS_ENCODING, FS_ERRORS)

	def decodefilename(fn):
	return fn.decode(FS_ENCODING, FS_ERRORS)

	FS_ENCODING = 'ascii'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
	# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
	# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


	# normalize the filesystem encoding name.
	# For example, we expect "utf-8", not "UTF8".
	FS_ENCODING = codecs.lookup(FS_ENCODING).name


	def register_surrogateescape():
	"""
	Registers the surrogateescape error handler on Python 2 (only)
	"""
	if utils.PY3:
	return
	try:
	codecs.lookup_error(FS_ERRORS)
	except LookupError:
	codecs.register_error(FS_ERRORS, surrogateescape_handler)


	if __name__ == '__main__':
	pass
	# # Tests:
	# register_surrogateescape()

	# b = decodefilename(fn)
	# assert b == encoded, "%r != %r" % (b, encoded)
	# c = encodefilename(b)
	# assert c == fn, '%r != %r' % (c, fn)
	# # print("ok")