|
|
|
|
|
|
|
|
| from __future__ import absolute_import
|
|
|
| import re
|
| import sys
|
|
|
# True when running under Python 3 or later.
IS_PYTHON3 = sys.version_info[0] >= 3

# Version-neutral aliases for the text type, the native str type, the byte
# string type and the "code point -> character" function.
if IS_PYTHON3:
    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
else:
    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr

# Empty instances of both string kinds.
empty_unicode = _unicode()
empty_bytes = _bytes()

# Concatenates an iterable of byte strings without a separator.
join_bytes = empty_bytes.join
|
|
|
|
|
class UnicodeLiteralBuilder(object):
    """Collect text fragments and code points and join them into one
    EncodedString.
    """
    def __init__(self):
        # Unicode fragments in the order they were added.
        self.chars = []

    def append(self, characters):
        """Add a fragment; byte strings are decoded as ASCII first."""
        text = characters
        if isinstance(text, _bytes):
            text = text.decode("ASCII")
        assert isinstance(text, _unicode), str(type(text))
        self.chars.append(text)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Add the character with the given code point.

            This interpreter cannot represent code points above 0xFFFF
            as one character, so those are stored as a surrogate pair.
            """
            if char_number <= 65535:
                self.chars.append(_unichr(char_number))
                return
            high, low = divmod(char_number - 0x10000, 1024)
            self.chars.extend((_unichr(high + 0xD800), _unichr(low + 0xDC00)))
    else:
        def append_charval(self, char_number):
            """Add the character with the given code point."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Add the character for a unicode escape; 'escape_string' is unused."""
        self.append_charval(char_number)

    def getstring(self):
        """Return everything collected so far as an EncodedString."""
        joined = u''.join(self.chars)
        return EncodedString(joined)

    def getstrings(self):
        """Return a (bytes, unicode) pair; the bytes slot is always None."""
        return (None, self.getstring())
|
|
|
|
|
class BytesLiteralBuilder(object):
    """Collect byte string fragments and join them into one bytes literal.
    """
    def __init__(self, target_encoding):
        # Encoding applied to unicode fragments and attached to the result.
        self.target_encoding = target_encoding
        # Byte string fragments in the order they were added.
        self.chars = []

    def append(self, characters):
        """Add a fragment; unicode input is encoded with the target encoding."""
        data = characters
        if isinstance(data, _unicode):
            data = data.encode(self.target_encoding)
        assert isinstance(data, _bytes), str(type(data))
        self.chars.append(data)

    def append_charval(self, char_number):
        """Add the single byte that ISO-8859-1 assigns to 'char_number'."""
        character = _unichr(char_number)
        self.chars.append(character.encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Add the escape sequence text itself; 'char_number' is unused."""
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a bytes literal (never unicode)."""
        joined = join_bytes(self.chars)
        return bytes_literal(joined, self.target_encoding)

    def getchar(self):
        """Return the same byte string value as getstring()."""
        return self.getstring()

    def getstrings(self):
        """Return a (bytes, unicode) pair; the unicode slot is always None."""
        return (self.getstring(), None)
|
|
|
|
|
class StrLiteralBuilder(object):
    """Build the bytes and the unicode form of a string side by side.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Add the fragment to both underlying builders."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Add the character value to both underlying builders."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape: the bytes form keeps the escape text,
        the unicode form gets the character itself.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the (bytes, unicode) pair of assembled strings."""
        bytes_value = self._bytes.getstring()
        unicode_value = self._unicode.getstring()
        return (bytes_value, unicode_value)
|
|
|
|
|
class EncodedString(_unicode):
    """Unicode string subclass that carries an optional 'encoding'.

    With 'encoding' left at None the object reports itself as a unicode
    string (see is_unicode); otherwise 'encoding' names the codec used by
    byteencode().
    """
    encoding = None

    def __deepcopy__(self, memo):
        # Deep copies share the same object.
        return self

    def byteencode(self):
        """Encode with the attached encoding, which must be set."""
        source_encoding = self.encoding
        assert source_encoding is not None
        return self.encode(source_encoding)

    def utf8encode(self):
        """Encode as UTF-8; only allowed when no encoding is attached."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no encoding is attached."""
        return self.encoding is None

    def contains_surrogates(self):
        """Delegate to string_contains_surrogates() for this string."""
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        """Return the UTF-8 encoded content as a bytes literal."""
        utf8_bytes = self.utf8encode()
        return bytes_literal(utf8_bytes, 'utf8')

    def as_c_string_literal(self):
        """Encode the string, then return the C string literal of the
        resulting bytes literal.
        """
        if self.encoding is not None:
            literal = bytes_literal(self.byteencode(), self.encoding)
        else:
            literal = self.as_utf8_string()
        return literal.as_c_string_literal()

    if not hasattr(_unicode, "isascii"):
        def isascii(self):
            """Fallback for base types without isascii(): True if the
            string can be encoded as ASCII.
            """
            try:
                self.encode("ascii")
            except UnicodeEncodeError:
                return False
            return True
|
|
|
|
|
def string_contains_surrogates(ustring):
    """
    Tell whether the unicode string holds anything that needs surrogates.

    Returns True if any character is either a surrogate code point
    (0xD800-0xDFFF) or lies above 0xFFFF, i.e. would take two code units
    on a narrow (UTF-16) platform.  Returns False otherwise, including
    for the empty string.
    """
    return any(
        code > 65535 or 0xD800 <= code <= 0xDFFF
        for code in map(ord, ustring)
    )
|
|
|
|
|
def string_contains_lone_surrogates(ustring):
    """
    Tell whether the unicode string holds surrogates that do not pair up.

    On a narrow build (sys.maxunicode == 65535) a high surrogate
    immediately followed by a low surrogate is accepted as a pair.
    Otherwise every surrogate code point counts as lone.
    """
    narrow_build = sys.maxunicode == 65535
    pending_high = False
    for char in ustring:
        code = ord(char)
        if code < 0xD800 or code > 0xDFFF:
            # Ordinary character: a high surrogate before it was unpaired.
            if pending_high:
                return True
            continue
        if not narrow_build:
            # Without surrogate encoding no surrogate can be part of a pair.
            return True
        is_high = code <= 0xDBFF
        if is_high == pending_high:
            # Either two high surrogates in a row, or a low surrogate
            # with no high surrogate in front of it.
            return True
        pending_high = is_high
    # A trailing high surrogate is unpaired as well.
    return pending_high
|
|
|
|
|
class BytesLiteral(_bytes):
    """Byte string subclass that carries an 'encoding' attribute and
    offers the same helper methods as EncodedString.
    """
    # Encoding name attached to the bytes; None until a caller assigns one.
    encoding = None

    def __deepcopy__(self, memo):
        # Deep copies share the same object.
        return self

    def byteencode(self):
        """Return the content as a plain byte string object."""
        if IS_PYTHON3:
            return _bytes(self)
        else:
            # Round-trip through ISO-8859-1, which maps every byte value
            # to one character, to obtain a plain byte string.
            return self.decode('ISO-8859-1').encode('ISO-8859-1')

    def utf8encode(self):
        """Always fails: byte strings cannot be UTF-8 encoded.

        Raises AssertionError explicitly so the failure is not lost when
        assert statements are stripped by 'python -O'.
        """
        raise AssertionError("this is not a unicode string: %r" % self)

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    # Counterpart of EncodedString.is_unicode; never True for bytes.
    is_unicode = False

    def as_c_string_literal(self):
        """Return the escaped content wrapped in double quotes."""
        value = split_string_literal(escape_byte_string(self))
        return '"%s"' % value

    if not hasattr(_bytes, "isascii"):
        def isascii(self):
            """Fallback for base types without isascii(): True only if
            every byte is in the ASCII range, matching the built-in
            bytes.isascii() of newer Python versions.
            """
            try:
                self.decode("ascii")
            except UnicodeDecodeError:
                return False
            else:
                return True
|
|
|
|
|
def bytes_literal(s, encoding):
    """Wrap the byte string 's' in a BytesLiteral tagged with 'encoding'."""
    assert isinstance(s, bytes)
    literal = BytesLiteral(s)
    literal.encoding = encoding
    return literal
|
|
|
|
|
def encoded_string(s, encoding):
    """Wrap 's' in an EncodedString, attaching 'encoding' when it is not None.

    's' may be a unicode string or a byte string.  Byte strings are
    decoded as ASCII before wrapping: on Python 3, passing bytes straight
    to the str constructor would store their repr ("b'...'") instead of
    their content, whereas Python 2 implicitly decodes them as ASCII.
    Non-ASCII byte strings raise UnicodeDecodeError.
    """
    assert isinstance(s, (_unicode, bytes))
    if isinstance(s, bytes):
        s = s.decode('ASCII')
    s = EncodedString(s)
    if encoding is not None:
        s.encoding = encoding
    return s
|
|
|
def encoded_string_or_bytes_literal(s, encoding):
    """Dispatch on the type of 's': byte strings go to bytes_literal(),
    anything else to encoded_string().
    """
    factory = bytes_literal if isinstance(s, bytes) else encoded_string
    return factory(s, encoding)
|
|
|
|
|
# Lookup function mapping a two-character escape sequence (backslash plus
# letter) to the control character it denotes; returns None for anything else.
char_from_escape_sequence = dict([
    (r'\a', u'\a'),
    (r'\b', u'\b'),
    (r'\f', u'\f'),
    (r'\n', u'\n'),
    (r'\r', u'\r'),
    (r'\t', u'\t'),
    (r'\v', u'\v'),
]).get

# Sequences that are replaced by escape sequences in C string output:
# backslash, two question marks, double quote and all characters below 32.
_c_special = tuple(['\\', '??', '"'] + [chr(code) for code in range(32)])
|
|
|
|
|
def _to_escape_sequence(s):
    """Return the backslash escape text that replaces the sequence 's'.

    Newline, carriage return and tab use their short form, double quote
    and backslash are prefixed with a backslash, and everything else is
    spelled as one three-digit octal escape per character.
    """
    if s in '\n\r\t':
        # repr() already yields the short escape; strip its quotes.
        return repr(s)[1:-1]
    quoted = {'"': r'\"', '\\': r'\\'}.get(s)
    if quoted is not None:
        return quoted
    return ''.join('\\%03o' % ord(char) for char in s)
|
|
|
|
|
def _build_specials_replacer():
    """Create a function that escapes every _c_special sequence in a
    byte string.

    The returned function takes a byte string and returns a copy in
    which each occurrence of a special sequence has been replaced by the
    ASCII bytes of its escape sequence.
    """
    escapes = {}
    alternatives = []
    for special in _c_special:
        # One single-character class per character of the sequence, with
        # backslashes doubled so they stay literal inside the class.
        char_classes = ['[%s]' % char.replace('\\', '\\\\') for char in special]
        alternatives.append(''.join(char_classes))
        escapes[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')

    pattern_source = '(%s)' % '|'.join(alternatives)
    substitute = re.compile(pattern_source.encode('ASCII')).sub

    def lookup_escape(match):
        return escapes[match.group(1)]

    def replace(s):
        return substitute(lookup_escape, s)

    return replace


# Module-level replacer, built once at import time.
_replace_specials = _build_specials_replacer()
|
|
|
|
|
def escape_char(c):
    """Return the escaped text for a single character 'c'.

    On Python 3 'c' is first decoded from bytes as ISO-8859-1.  Newline,
    carriage return, tab and backslash use their short backslash form,
    a single quote is prefixed with a backslash, characters with a code
    below 32 or above 127 become a two-digit hex escape, and anything
    else is returned unchanged.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        # repr() already yields the short escape; strip its quotes.
        return repr(c)[1:-1]
    if c == "'":
        return "\\'"
    code = ord(c)
    if 32 <= code <= 127:
        return c
    return "\\x%02X" % code
|
|
|
def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.

    The result is a Unicode string, not bytes; encoding it as
    ISO-8859-1 gives the byte sequence that should be written.
    """
    s = _replace_specials(s)
    try:
        # Fast path: nothing but ASCII is left after escaping.
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass
    # Iterating a bytearray yields integers on every Python version, so
    # one loop covers both Python 2 and Python 3.
    escaped = bytearray()
    for byte in bytearray(s):
        if byte >= 128:
            # Values from 128 upwards always produce three octal digits.
            escaped.extend(('\\%3o' % byte).encode('ASCII'))
        else:
            escaped.append(byte)
    return escaped.decode('ISO-8859-1')
|
|
|
def split_string_literal(s, limit=2000):
    """Break a long escaped string into chunks joined by '""'.

    Strings shorter than 'limit' are returned unchanged.  Otherwise the
    string is cut into pieces of at most 'limit' characters; when a
    backslash occurs within the last four characters of a piece, the cut
    is moved in front of it so the split does not land inside that
    escape sequence.
    """
    total = len(s)
    if total < limit:
        return s

    pieces = []
    begin = 0
    while begin < total:
        stop = begin + limit
        tail = s[stop - 4:stop]
        if total > stop - 4 and '\\' in tail:
            # Cut just before the first backslash near the boundary ...
            stop -= 4 - tail.find('\\')
            # ... and keep backing up over any backslashes before it.
            while s[stop - 1] == '\\':
                stop -= 1
                if stop == begin:
                    # Nothing but backslashes in this piece: fall back
                    # to a fixed cut position.
                    stop = begin + limit - (limit % 2) - 4
                    break
        pieces.append(s[begin:stop])
        begin = stop
    return '""'.join(pieces)
|
|
|
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair of comma-separated decimal number strings, both
    terminated by 0: the UTF-16 code units and the UTF-32 code points.
    The UTF-16 part is empty when both sequences are identical.
    """
    values = [ord(char) for char in s]
    values.append(0)

    if sys.maxunicode >= 0x10000:
        # Characters are full code points: derive the UTF-16 units by
        # splitting everything above 0xFFFF into a surrogate pair.
        utf32 = values
        utf16 = []
        for code_point in utf32:
            if code_point < 0x10000:
                utf16.append(code_point)
                continue
            offset = code_point - 0x10000
            utf16.extend((0xD800 + offset // 1024, 0xDC00 + offset % 1024))
    else:
        # Characters are UTF-16 units: derive the code points by merging
        # each high surrogate with the low surrogate that follows it.
        utf16 = values
        utf32 = []
        for code_unit in utf16:
            completes_pair = (
                0xDC00 <= code_unit <= 0xDFFF
                and utf32
                and 0xD800 <= utf32[-1] <= 0xDBFF
            )
            if completes_pair:
                utf32[-1] = ((utf32[-1] & 0x3FF) << 10) + (code_unit & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []

    utf16_text = ",".join([_unicode(value) for value in utf16])
    utf32_text = ",".join([_unicode(value) for value in utf32])
    return utf16_text, utf32_text
|
|
|