"""
Tools to open .py files as Unicode, using the encoding specified within the file,
as per PEP 263.

Much of the code is taken from the tokenize module in Python 3.2.
"""

import io
from io import TextIOWrapper, BytesIO
from pathlib import Path
import re
# NOTE: ``open`` below is ``tokenize.open`` and deliberately shadows the
# builtin in this module: it opens a file read-only in text mode using the
# encoding found by ``detect_encoding``.
from tokenize import open, detect_encoding

# Matches an encoding declaration ("coding:NAME" or "coding=NAME") anywhere in
# a string; group 1 captures the encoding name.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
# Same declaration, but only when it appears inside a comment that is the
# first non-whitespace content of the line; group 1 captures the encoding name.
cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Convert Python source code given as bytes to a unicode string.

    Parameters
    ----------
    txt : str, bytes-like or binary file-like
        The source code. ``str`` is returned unchanged. ``bytes``,
        ``bytearray`` and ``memoryview`` objects are wrapped in a ``BytesIO``.
        Any other object is used directly as a binary buffer and must
        provide ``readline`` and ``seek``; it is rewound to position 0 and
        is closed by the time this function returns.
    errors : str
        Error handler passed to the decoder ('replace' by default).
    skip_encoding_cookie : bool
        If True (the default), the result is assembled from
        ``strip_encoding_cookie`` so the encoding declaration line is left out.

    Returns
    -------
    A unicode string containing the decoded source.
    """
    if isinstance(txt, str):
        return txt
    if isinstance(txt, (bytes, bytearray, memoryview)):
        buffer = BytesIO(txt)
    else:
        buffer = txt
    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        # detect_encoding rejects invalid or conflicting declarations; fall
        # back to ASCII and let ``errors`` deal with non-ASCII bytes.
        encoding = "ascii"
    # detect_encoding consumed up to two lines; decode from the start again.
    buffer.seek(0)
    with TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True) as text:
        text.mode = 'r'
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(text))
        return text.read()

def strip_encoding_cookie(filelike):
    """Generator to pull lines from a text-mode file, skipping the encoding
    cookie if it is found in the first two lines.

    Parameters
    ----------
    filelike : iterable of str
        Anything that yields text lines, e.g. a file opened in text mode.

    Yields
    ------
    Every line of ``filelike`` in order, except the first of the leading two
    lines that matches ``cookie_comment_re``. At most one line is dropped.
    """
    lines = iter(filelike)
    end = object()
    # PEP 263 only honours a declaration on line 1 or line 2.
    for _ in range(2):
        line = next(lines, end)
        if line is end:
            return
        if cookie_comment_re.match(line):
            # This is the declaration: drop it and stop looking, so that a
            # later comment that merely resembles a cookie is preserved.
            break
        yield line

    yield from lines

def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python file, using the encoding declared inside the file.

    Parameters
    ----------
    filename : str or path-like
        The path to the file to read. It is converted with ``Path``.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    filepath = Path(filename)
    # ``open`` is ``tokenize.open`` in this module: it detects the declared
    # encoding and returns a read-only text-mode file.
    with open(filepath) as f:
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(f))
        return f.read()

def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
        The URL from which to fetch the file.
    errors : str
        How to handle decoding errors in the file. Options are the same as for
        bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Local import: urllib is only loaded when a URL is actually fetched.
    from urllib.request import urlopen

    # Use the response as a context manager so the connection is released
    # even if reading fails.
    with urlopen(url) as response:
        raw = response.read()
    buffer = io.BytesIO(raw)
    return source_to_unicode(buffer, errors, skip_encoding_cookie)
