| | |
| | |
| | |
| |
|
| | from __future__ import absolute_import |
| |
|
| | import re |
| | import sys |
| |
|
# Version-portable aliases for the text/bytes types and the chr() builtin.
IS_PYTHON3 = sys.version_info[0] >= 3

if IS_PYTHON3:
    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
else:
    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
| |
|
# Canonical empty instances of the two string types.
empty_bytes = _bytes()
empty_unicode = _unicode()

# Bound method that concatenates an iterable of byte strings.
join_bytes = empty_bytes.join
| |
|
| |
|
class UnicodeLiteralBuilder(object):
    """Incrementally assemble a unicode string from characters and chunks."""

    def __init__(self):
        self.chars = []

    def append(self, characters):
        """Append a unicode chunk (plain ASCII bytes are decoded first)."""
        if isinstance(characters, _bytes):
            # only ASCII bytes are accepted here
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        # Narrow Unicode build: non-BMP code points must be spelled
        # as a UTF-16 surrogate pair.
        def append_charval(self, char_number):
            if char_number > 65535:
                high, low = divmod(char_number - 0x10000, 1024)
                self.chars.append(_unichr(0xD800 + high))
                self.chars.append(_unichr(0xDC00 + low))
            else:
                self.chars.append(_unichr(char_number))
    else:
        def append_charval(self, char_number):
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        # the unicode representation keeps the decoded character,
        # not the escape text
        self.append_charval(char_number)

    def getstring(self):
        """Return the accumulated value as an EncodedString."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        # (bytes value, unicode value) pair; no bytes form here
        return (None, self.getstring())
| |
|
| |
|
class BytesLiteralBuilder(object):
    """Incrementally assemble a byte string (or single char value)."""

    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a bytes chunk (text is encoded to the target encoding)."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        # Latin-1 maps code points 0-255 directly to byte values
        self.chars.append(_unichr(char_number).encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        # the bytes representation keeps the literal escape text
        self.append(escape_string)

    def getstring(self):
        """Return the accumulated value as a BytesLiteral."""
        return bytes_literal(join_bytes(self.chars), self.target_encoding)

    def getchar(self):
        # a char value is represented the same way as a string here
        return self.getstring()

    def getstrings(self):
        # (bytes value, unicode value) pair; no unicode form here
        return (self.getstring(), None)
| |
|
| |
|
class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string."""

    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        # feed both representations in lockstep
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        # bytes keep the raw escape text, unicode stores the real character
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the (bytes value, unicode value) pair."""
        return (self._bytes.getstring(), self._unicode.getstring())
| |
|
| |
|
class EncodedString(_unicode):
    """Unicode string subclass that remembers its original encoding.

    ``encoding`` is None for true unicode strings and the name of the
    original source encoding otherwise.
    """

    # None => real unicode string; otherwise the source encoding name.
    encoding = None

    def __deepcopy__(self, memo):
        # strings are immutable, so a deep copy can share the instance
        return self

    def byteencode(self):
        # re-encode using the original source encoding
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        # only valid for true unicode strings (no source encoding)
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        return self.encoding is None

    def contains_surrogates(self):
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        # UTF-8 encoded BytesLiteral view of this string
        return bytes_literal(self.utf8encode(), 'utf8')
| |
|
| |
|
def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.
    """
    # non-BMP code points (wide build) or raw surrogate code units both count
    return any(code > 65535 or 0xD800 <= code <= 0xDFFF
               for code in map(ord, ustring))
| |
|
| |
|
def string_contains_lone_surrogates(ustring):
    """
    Check if the unicode string contains lone surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform, but that do not form a pair.
    """
    narrow_build = sys.maxunicode == 65535
    expecting_low = False  # True right after an unpaired high surrogate
    for code in map(ord, ustring):
        if code < 0xD800 or code > 0xDFFF:
            # ordinary character: any pending high surrogate was lone
            if expecting_low:
                return True
        elif not narrow_build:
            # wide build: surrogates are never legitimate pair halves
            return True
        elif code < 0xDC00:
            # high surrogate: two in a row means the first was lone
            if expecting_low:
                return True
            expecting_low = True
        else:
            # low surrogate: lone unless preceded by a high surrogate
            if not expecting_low:
                return True
            expecting_low = False
    # a trailing high surrogate is also lone
    return expecting_low
| |
|
| |
|
class BytesLiteral(_bytes):
    """Byte string subclass that remembers its intended target encoding."""

    # target encoding name; set externally (see bytes_literal())
    encoding = None

    def __deepcopy__(self, memo):
        # byte strings are immutable, so sharing the instance is safe
        return self

    def byteencode(self):
        if IS_PYTHON3:
            return _bytes(self)
        # fake-recode on Python 2 to return a plain str value
        return self.decode('ISO-8859-1').encode('ISO-8859-1')

    def utf8encode(self):
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    is_unicode = False

    def as_c_string_literal(self):
        # escape special bytes and chunk the result into a C string literal
        escaped = split_string_literal(escape_byte_string(self))
        return '"%s"' % escaped
| |
|
| |
|
def bytes_literal(s, encoding):
    """Wrap the byte string *s* in a BytesLiteral tagged with *encoding*."""
    assert isinstance(s, bytes)
    literal = BytesLiteral(s)
    literal.encoding = encoding
    return literal
| |
|
| |
|
def encoded_string(s, encoding):
    """Wrap *s* in an EncodedString, recording *encoding* when given."""
    assert isinstance(s, (_unicode, bytes))
    result = EncodedString(s)
    if encoding is not None:
        result.encoding = encoding
    return result
| |
|
| |
|
# Maps the simple single-letter backslash escapes to the character they
# denote; returns None for any other escape sequence.
char_from_escape_sequence = {
    r'\a' : u'\a',
    r'\b' : u'\b',
    r'\f' : u'\f',
    r'\n' : u'\n',
    r'\r' : u'\r',
    r'\t' : u'\t',
    r'\v' : u'\v',
    }.get

# Characters/sequences that must be escaped when written into C source:
# backslash, '??' (trigraph introducer), double quote, and all control
# characters (code points 0-31).
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
| |
|
| |
|
def _to_escape_sequence(s):
    """Return the C escape-sequence spelling for the special string *s*."""
    if s in '\n\r\t':
        # repr() spells these as \n / \r / \t already
        return repr(s)[1:-1]
    if s == '"':
        return r'\"'
    if s == '\\':
        return r'\\'
    # everything else (control chars, '??') is written as octal escapes
    return ''.join('\\%03o' % ord(c) for c in s)
| |
|
| |
|
def _build_specials_replacer():
    """Build a function that rewrites all C-special bytes as escape sequences."""
    patterns = []
    escapes = {}
    for special in _c_special:
        # one character class per character, so multi-char specials
        # (like '??') match as a unit
        patterns.append(''.join('[%s]' % ch.replace('\\', '\\\\') for ch in special))
        escapes[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
    pattern = ('(%s)' % '|'.join(patterns)).encode('ASCII')
    subst = re.compile(pattern).sub

    def _replace_match(m):
        return escapes[m.group(1)]

    def replace(s):
        return subst(_replace_match, s)
    return replace


_replace_specials = _build_specials_replacer()
| |
|
| |
|
def escape_char(c):
    """Return the single character *c* escaped for a C char literal."""
    if IS_PYTHON3:
        # a one-byte bytes value: map it to the corresponding character
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    if c == "'":
        return "\\'"
    code = ord(c)
    # non-printable or non-ASCII characters become hex escapes
    return "\\x%02X" % code if code < 32 or code > 127 else c
| |
|
def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    # First rewrite all C-special bytes (backslash, quote, '??',
    # control characters) as their escape sequences.
    s = _replace_specials(s)
    try:
        # fast path: after escaping, most strings are pure ASCII
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass
    if IS_PYTHON3:
        # iterate over ints and escape every non-ASCII byte as octal
        s_new = bytearray()
        append, extend = s_new.append, s_new.extend
        for b in s:
            if b >= 128:
                extend(('\\%3o' % b).encode('ASCII'))
            else:
                append(b)
        return s_new.decode('ISO-8859-1')
    else:
        # Python 2: iterate over one-char strings instead
        l = []
        append = l.append
        for c in s:
            o = ord(c)
            if o >= 128:
                append('\\%3o' % o)
            else:
                append(c)
        return join_bytes(l).decode('ISO-8859-1')
| |
|
def split_string_literal(s, limit=2000):
    """Split a long C string literal into '""'-joined chunks of at most
    *limit* characters (MSVC can't handle long string literals).
    """
    if len(s) < limit:
        return s
    total = len(s)
    pieces = []
    pos = 0
    while pos < total:
        cut = pos + limit
        # never cut inside a backslash escape sequence: back up to the
        # last backslash near the cut point, then step before any run
        # of consecutive backslashes
        if total > cut - 4 and '\\' in s[cut-4:cut]:
            cut -= 4 - s[cut-4:cut].find('\\')
            while s[cut-1] == '\\':
                cut -= 1
                if cut == pos:
                    # must have been a long line of backslashes
                    cut = pos + limit - (limit % 2) - 4
                    break
        pieces.append(s[pos:cut])
        pos = cut
    return '""'.join(pieces)
| |
|
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair of comma-joined number strings (utf16, utf32);
    the utf16 part is left empty when both representations are equal.
    """
    # work on code point values, with a terminating NUL appended
    s = list(map(ord, s)) + [0]

    if sys.maxunicode >= 0x10000:
        # wide build: the input is already UTF-32; derive UTF-16 by
        # splitting non-BMP code points into surrogate pairs
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        # narrow build: the input is UTF-16 code units; derive UTF-32 by
        # merging adjacent high/low surrogate pairs back into code points
        utf16, utf32 = s, []
        for code_unit in s:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    # identical representations: emit only the UTF-32 form
    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))
| |
|