| | """ |
| | This is a python implementation of wcwidth() and wcswidth(). |
| | |
| | https://github.com/jquast/wcwidth |
| | |
| | from Markus Kuhn's C code, retrieved from: |
| | |
| | http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c |
| | |
| | This is an implementation of wcwidth() and wcswidth() (defined in |
| | IEEE Std 1003.1-2001) for Unicode. |
| | |
| | http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html |
| | http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html |
| | |
| | In fixed-width output devices, Latin characters all occupy a single |
| | "cell" position of equal width, whereas ideographic CJK characters |
| | occupy two such cells. Interoperability between terminal-line |
| | applications and (teletype-style) character terminals using the |
| | UTF-8 encoding requires agreement on which character should advance |
| | the cursor by how many cell positions. No established formal |
| | standards exist at present on which Unicode character shall occupy |
| | how many cell positions on character terminals. These routines are |
| | a first attempt at defining such behavior based on simple rules |
| | applied to data provided by the Unicode Consortium. |
| | |
| | For some graphical characters, the Unicode standard explicitly |
| | defines a character-cell width via the definition of the East Asian |
| | FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. |
| | In all these cases, there is no ambiguity about which width a |
| | terminal shall use. For characters in the East Asian Ambiguous (A) |
| | class, the width choice depends purely on a preference of backward |
| | compatibility with either historic CJK or Western practice. |
| | Choosing single-width for these characters is easy to justify as |
| | the appropriate long-term solution, as the CJK practice of |
| | displaying these characters as double-width comes from historic |
| | implementation simplicity (8-bit encoded characters were displayed |
| | single-width and 16-bit ones double-width, even for Greek, |
| | Cyrillic, etc.) and not any typographic considerations. |
| | |
| | Much less clear is the choice of width for the Not East Asian |
| | (Neutral) class. Existing practice does not dictate a width for any |
| | of these characters. It would nevertheless make sense |
| | typographically to allocate two character cells to characters such |
| | as for instance EM SPACE or VOLUME INTEGRAL, which cannot be |
| | represented adequately with a single-width glyph. The following |
| | routines at present merely assign a single-cell width to all |
| | neutral characters, in the interest of simplicity. This is not |
| | entirely satisfactory and should be reconsidered before |
| | establishing a formal standard in this area. At the moment, the |
| | decision which Not East Asian (Neutral) characters should be |
| | represented by double-width glyphs cannot yet be answered by |
| | applying a simple rule from the Unicode database content. Setting |
| | up a proper standard for the behavior of UTF-8 character terminals |
| | will require a careful analysis not only of each Unicode character, |
| | but also of each presentation form, something the author of these |
| | routines has so far avoided doing. |
| | |
| | http://www.unicode.org/unicode/reports/tr11/ |
| | |
| | Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c |
| | """ |
| | from __future__ import division |
| |
|
| | |
| | import os |
| | import sys |
| | import warnings |
| |
|
| | |
| | from .table_vs16 import VS16_NARROW_TO_WIDE |
| | from .table_wide import WIDE_EASTASIAN |
| | from .table_zero import ZERO_WIDTH |
| | from .unicode_versions import list_versions |
| |
|
| | try: |
| | |
| | from functools import lru_cache |
| | except ImportError: |
| | |
| | |
| | from backports.functools_lru_cache import lru_cache |
| |
|
| | |
# True when the interpreter's major version is 3 or greater.  Read by
# _wcmatch_version() to decide whether byte-string (python 2 ``str``)
# handling of version strings is needed.
_PY3 = sys.version_info[0] >= 3
| |
|
| |
|
| | def _bisearch(ucs, table): |
| | """ |
| | Auxiliary function for binary search in interval table. |
| | |
| | :arg int ucs: Ordinal value of unicode character. |
| | :arg list table: List of starting and ending ranges of ordinal values, |
| | in form of ``[(start, end), ...]``. |
| | :rtype: int |
| | :returns: 1 if ordinal value ucs is found within lookup table, else 0. |
| | """ |
| | lbound = 0 |
| | ubound = len(table) - 1 |
| |
|
| | if ucs < table[0][0] or ucs > table[ubound][1]: |
| | return 0 |
| | while ubound >= lbound: |
| | mid = (lbound + ubound) // 2 |
| | if ucs > table[mid][1]: |
| | lbound = mid + 1 |
| | elif ucs < table[mid][0]: |
| | ubound = mid - 1 |
| | else: |
| | return 1 |
| |
|
| | return 0 |
| |
|
| |
|
@lru_cache(maxsize=1000)
def wcwidth(wc, unicode_version='auto'):
    r"""
    Return the number of terminal cells needed to display one character.

    :param str wc: A single Unicode character.  An empty string is measured
        as code point 0 (NUL, '\0').
    :param str unicode_version: A Unicode version number, such as
        ``'6.0.0'``.  The value is resolved by ``_wcmatch_version``, so it
        does not need to be an exact supported level; the default is
        ``'auto'``.  Supported levels are given by :func:`list_versions`.
    :return: ``1`` for printable ASCII (U+0020 through U+007E); ``-1`` for
        control characters U+0001 through U+001F and U+007F through U+009F;
        ``0`` when the character is found in the zero-width table of the
        selected version; otherwise ``2`` when it is found in the wide
        East Asian table, else ``1``.
    :rtype: int

    See :ref:`Specification` for details of cell measurement.
    """
    codepoint = ord(wc) if wc else 0

    # Printable ASCII is answered immediately, before any version matching
    # or table lookup takes place.
    if 0x20 <= codepoint < 0x7f:
        return 1

    # Control characters, NUL excluded, have no determinate width.
    if 0 < codepoint < 0x20 or 0x7f <= codepoint < 0xa0:
        return -1

    version = _wcmatch_version(unicode_version)

    if _bisearch(codepoint, ZERO_WIDTH[version]):
        return 0

    if _bisearch(codepoint, WIDE_EASTASIAN[version]):
        return 2
    return 1
| |
|
| |
|
def wcswidth(pwcs, n=None, unicode_version='auto'):
    """
    Given a unicode string, return its printable length on a terminal.

    :param str pwcs: Measure width of given unicode string.
    :param int n: When ``n`` is None (default), return the length of the entire
        string, otherwise only the first ``n`` characters are measured.  A
        value of ``n`` greater than ``len(pwcs)`` measures the whole string.
        This argument exists only for compatibility with the C POSIX function
        signature. It is suggested instead to use python's string slicing
        capability, ``wcswidth(pwcs[:n])``
    :param str unicode_version: An explicit definition of the unicode version
        level to use for determination, may be ``auto`` (default), which uses
        the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest
        available unicode version, otherwise.
    :rtype: int
    :returns: The width, in cells, needed to display the first ``n`` characters
        of the unicode string ``pwcs``.  Returns ``-1`` for C0 and C1 control
        characters!

    See :ref:`Specification` for details of cell measurement.
    """
    # Clamp ``n`` to the string length so that a caller-supplied ``n`` larger
    # than ``len(pwcs)`` measures the whole string instead of indexing past
    # its end.
    length = len(pwcs)
    end = length if n is None else min(n, length)

    # Resolved lazily: only needed when a variation selector is encountered.
    _unicode_version = None
    width = 0
    idx = 0
    last_measured_char = None
    while idx < end:
        char = pwcs[idx]
        if char == u'\u200D':
            # Zero Width Joiner: neither it nor the character that follows
            # it is measured.
            idx += 2
            continue
        if char == u'\uFE0F' and last_measured_char:
            # Variation Selector-16 following a measured character: for
            # unicode version 9.0.0 or later, add one cell when that
            # character is listed in the VS16 narrow-to-wide table.
            if _unicode_version is None:
                _unicode_version = _wcversion_value(_wcmatch_version(unicode_version))
            if _unicode_version >= (9, 0, 0):
                width += _bisearch(ord(last_measured_char), VS16_NARROW_TO_WIDE["9.0.0"])
                # A character is adjusted by at most one selector.
                last_measured_char = None
            idx += 1
            continue
        wcw = wcwidth(char, unicode_version)
        if wcw < 0:
            # A character of indeterminate width makes the whole result
            # indeterminate.
            return wcw
        if wcw > 0:
            # Zero-width characters are not remembered, so a later selector
            # applies to the last character that occupied cells.
            last_measured_char = char
        width += wcw
        idx += 1
    return width
| |
|
| |
|
| | @lru_cache(maxsize=128) |
| | def _wcversion_value(ver_string): |
| | """ |
| | Integer-mapped value of given dotted version string. |
| | |
| | :param str ver_string: Unicode version string, of form ``n.n.n``. |
| | :rtype: tuple(int) |
| | :returns: tuple of digit tuples, ``tuple(int, [...])``. |
| | """ |
| | retval = tuple(map(int, (ver_string.split('.')))) |
| | return retval |
| |
|
| |
|
@lru_cache(maxsize=8)
def _wcmatch_version(given_version):
    """
    Resolve a requested Unicode version to a supported version level.

    Resolution order:

    1. ``auto`` is replaced by the value of the environment variable
       ``UNICODE_VERSION``, or by the latest version when it is not set.
    2. ``latest`` selects the last entry of the supported versions.
    3. A value equal to a supported version is returned as given.
    4. A value that cannot be parsed as dotted integers emits a warning and
       selects the latest version.
    5. A value that compares less than or equal to the earliest supported
       version emits a warning and selects the earliest version.
    6. Otherwise a supported version whose leading components equal the
       given value is selected (``8.0`` selects ``8.0.0``), else the highest
       supported version below the given value; values beyond every
       supported version select the latest.

    For example, given supported levels ``4.1.0`` and ``5.0.0``, a version
    string of ``4.9.9`` selects ``4.1.0``.

    :param str given_version: given version for compare, may be ``auto``
        (default), to select Unicode Version from Environment Variable,
        ``UNICODE_VERSION``. If the environment variable is not set, then the
        latest is used.
    :rtype: str
    :returns: unicode string, or non-unicode ``str`` type for python 2
        when given ``version`` is also type ``str``.
    """
    # Only python 2 callers that pass a byte ``str`` get byte strings back;
    # under python 3 this is always False.
    _return_str = not _PY3 and isinstance(given_version, str)

    def _as_caller_type(version):
        # Mirror the string type of the value the caller supplied.
        return version.encode() if _return_str else version

    if _return_str:
        # Compare like with like: encode every supported version so that
        # membership tests against a byte ``str`` behave.
        unicode_versions = [version.encode() for version in list_versions()]
    else:
        unicode_versions = list_versions()
    latest_version = unicode_versions[-1]

    if given_version in (u'auto', 'auto'):
        fallback = latest_version.encode() if _return_str else 'latest'
        given_version = os.environ.get('UNICODE_VERSION', fallback)

    if given_version in (u'latest', 'latest'):
        return _as_caller_type(latest_version)

    if given_version in unicode_versions:
        # exact match
        return _as_caller_type(given_version)

    try:
        cmp_given = _wcversion_value(given_version)
    except ValueError:
        # Not parseable as dotted integers: warn and use the latest version.
        message = ("UNICODE_VERSION value, {given_version!r}, is invalid. "
                   "Value should be in form of `integer[.]+', the latest "
                   "supported unicode version {latest_version!r} has been "
                   "inferred.")
        warnings.warn(message.format(given_version=given_version,
                                     latest_version=latest_version))
        return _as_caller_type(latest_version)

    earliest_version = unicode_versions[0]

    if cmp_given <= _wcversion_value(earliest_version):
        # Nothing at or below the requested level: warn and use the earliest.
        message = ("UNICODE_VERSION value, {given_version!r}, is lower "
                   "than any available unicode version. Returning lowest "
                   "version level, {earliest_version!r}")
        warnings.warn(message.format(given_version=given_version,
                                     earliest_version=earliest_version))
        return _as_caller_type(earliest_version)

    # Walk each supported version together with its successor.
    for position in range(len(unicode_versions) - 1):
        candidate = unicode_versions[position]
        successor = unicode_versions[position + 1]
        cmp_successor = _wcversion_value(successor)

        # A shortened request such as '8' or '8.0' matches '8.0.0'.
        if cmp_given == cmp_successor[:len(cmp_given)]:
            return successor

        # The successor is already beyond the request, so the candidate is
        # the nearest lower level.
        if cmp_successor > cmp_given:
            return candidate

    # The request exceeds every supported version.
    return _as_caller_type(latest_version)
| |
|