| """ |
| Tools to open .py files as Unicode, using the encoding specified within the file, |
| as per PEP 263. |
| |
| Much of the code is taken from the tokenize module in Python 3.2. |
| """ |
|
|
| import io |
| from io import TextIOWrapper, BytesIO |
| from pathlib import Path |
| import re |
| from tokenize import open, detect_encoding |
|
|
# Encoding declaration ("coding:" or "coding=" followed by a codec name)
# found anywhere in a string; group 1 captures the codec name.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", flags=re.UNICODE)

# The same declaration, but only when it sits inside a comment that starts
# the line (optionally preceded by whitespace); group 1 is the codec name.
cookie_comment_re = re.compile(
    r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.UNICODE
)
|
|
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Decode Python source code to a unicode string.

    ``txt`` may be a ``str``, a ``bytes`` object, or a binary buffer exposing
    ``readline`` and ``seek``.  A ``str`` is returned as-is.  For anything
    else the encoding is detected with ``tokenize.detect_encoding``; when
    detection raises ``SyntaxError`` the data is decoded as ASCII instead.

    ``errors`` is forwarded to the text wrapper as its decoding error
    handler.  When ``skip_encoding_cookie`` is true the decoded lines are
    passed through ``strip_encoding_cookie`` before being joined.

    The buffer is wrapped in a ``TextIOWrapper`` used as a context manager,
    so a caller-supplied buffer is closed by the time this returns.
    """
    if isinstance(txt, str):
        return txt

    stream = BytesIO(txt) if isinstance(txt, bytes) else txt

    try:
        detected = detect_encoding(stream.readline)
    except SyntaxError:
        codec = "ascii"
    else:
        codec = detected[0]

    # Detection consumed the first line(s); rewind before decoding everything.
    stream.seek(0)

    with TextIOWrapper(stream, codec, errors=errors, line_buffering=True) as reader:
        reader.mode = 'r'
        if not skip_encoding_cookie:
            return reader.read()
        return "".join(strip_encoding_cookie(reader))
|
|
def strip_encoding_cookie(filelike):
    """Yield the lines of a text-mode file, dropping encoding-cookie lines.

    Only the first two lines are tested against ``cookie_comment_re``; any of
    them that matches is skipped.  Every later line is yielded unchanged.  If
    the input runs out within those first two lines the generator simply
    finishes.
    """
    lines = iter(filelike)

    # The cookie can only appear on line one or line two.
    for _ in range(2):
        try:
            candidate = next(lines)
        except StopIteration:
            return
        if cookie_comment_re.match(candidate) is None:
            yield candidate

    # Everything after the first two lines passes straight through.
    yield from lines
|
|
def read_py_file(filename, skip_encoding_cookie=True):
    """Return the text of a Python file, decoded with its declared encoding.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    skip_encoding_cookie : bool
        When True (the default), an encoding declaration found in the first
        two lines is left out of the returned text.

    Returns
    -------
    A unicode string with the file's contents.
    """
    # ``open`` is the one imported from ``tokenize`` at the top of this
    # module, not the builtin.
    with open(Path(filename)) as source:
        if not skip_encoding_cookie:
            return source.read()
        return "".join(strip_encoding_cookie(source))
|
|
def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
        The URL from which to fetch the file.
    errors : str
        How to handle decoding errors in the file. Options are the same as for
        bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Function-local import, as in the original implementation.
    from urllib.request import urlopen

    # Read inside a ``with`` block so the response is closed as soon as the
    # body has been fetched, even if reading raises.
    with urlopen(url) as response:
        payload = response.read()

    # source_to_unicode accepts raw bytes and wraps them in a buffer itself.
    return source_to_unicode(payload, errors, skip_encoding_cookie)
|
|