| | from __future__ import annotations |
| |
|
| | from collections.abc import Sequence |
| | import functools |
| | import re |
| |
|
# Default `exclude` set for `decode`: a `%XX` escape whose decoded value is one
# of these characters is left percent-encoded instead of being decoded.
DECODE_DEFAULT_CHARS = ";/?:@&=+$,#"
# Empty `exclude` set: when passed to `decode`, no ASCII escape is kept encoded.
# NOTE(review): not referenced in the visible part of this file — presumably
# intended for decoding individual URI components; confirm against callers.
DECODE_COMPONENT_CHARS = ""
| |
|
# Memoized lookup tables, keyed by the `exclude` string they were built for.
decode_cache: dict[str, list[str]] = {}


def get_decode_cache(exclude: str) -> Sequence[str]:
    """Return the 128-entry ASCII lookup table for *exclude*.

    Entry ``i`` of the table is ``chr(i)``, unless ``chr(i)`` occurs in
    *exclude*, in which case it is the upper-case two-digit percent escape
    of that character (e.g. ``"%2F"`` for ``"/"``).

    Tables are memoized in ``decode_cache``; calling again with the same
    *exclude* returns the same list object.

    Args:
        exclude: Characters whose escapes must stay percent-encoded.
            Non-ASCII characters are ignored, because the table only has
            slots for code points below 128.

    Returns:
        The lookup table, indexable by byte value ``0..127``.
    """
    try:
        return decode_cache[exclude]
    except KeyError:
        pass

    cache = [chr(code) for code in range(128)]
    for ch in exclude:
        code = ord(ch)
        # The table has no slot for code points >= 128; indexing there used
        # to raise IndexError and leave a half-built table memoized.
        if code < 128:
            cache[code] = f"%{code:02X}"

    # Publish only once the table is fully built.
    decode_cache[exclude] = cache
    return cache
| |
|
| |
|
| | |
| | |
def decode(string: str, exclude: str = DECODE_DEFAULT_CHARS) -> str:
    """Decode the percent-encoded sequences in *string*.

    Every run of consecutive ``%XX`` escapes (hex digits matched
    case-insensitively) is replaced by the result of
    ``repl_func_with_cache``; text outside such runs is returned unchanged.

    Args:
        string: The text to decode.
        exclude: Characters whose escapes are kept percent-encoded; the
            lookup table for them comes from ``get_decode_cache``.

    Returns:
        The decoded string.
    """
    lookup = get_decode_cache(exclude)

    def replace_run(found: re.Match) -> str:
        # Bind the per-call lookup table to the shared replacement routine.
        return repl_func_with_cache(found, cache=lookup)

    escape_run = re.compile(r"(%[a-f0-9]{2})+", flags=re.IGNORECASE)
    return escape_run.sub(replace_run, string)
| |
|
| |
|
def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str:
    """Decode one matched run of ``%XX`` escapes.

    The matched text is walked three characters (one escaped byte) at a
    time and the bytes are interpreted as UTF-8:

    * a byte below 0x80 is replaced by ``cache[byte]``;
    * a 2-, 3- or 4-byte sequence with a valid lead byte, enough following
      escapes and well-formed continuation bytes is decoded as UTF-8, or
      replaced by one U+FFFD per byte if the decoding fails;
    * any other byte is replaced by a single U+FFFD.

    Args:
        match: Regex match whose whole text is a run of ``%XX`` escapes.
        cache: Table indexed by byte value (0..127) giving the replacement
            text for single-byte escapes.

    Returns:
        The decoded replacement text for the run.
    """
    run = match.group()
    end = len(run)
    pieces: list[str] = []

    pos = 0
    while pos < end:
        lead = int(run[pos + 1 : pos + 3], 16)

        if lead < 0x80:
            # Single-byte escape: the table decides decoded vs. kept encoded.
            pieces.append(cache[lead])
            pos += 3
            continue

        # Number of continuation bytes announced by the lead byte.
        if (lead & 0xE0) == 0xC0:  # 110xxxxx
            extra = 1
        elif (lead & 0xF0) == 0xE0:  # 1110xxxx
            extra = 2
        elif (lead & 0xF8) == 0xF0:  # 11110xxx
            extra = 3
        else:
            extra = 0

        # Only attempt a multi-byte decode when the run is long enough to
        # hold every announced continuation escape.
        if extra and pos + 3 * extra < end:
            tail = [
                int(run[at + 1 : at + 3], 16)
                for at in range(pos + 3, pos + 3 * (extra + 1), 3)
            ]
            if all((byte & 0xC0) == 0x80 for byte in tail):  # 10xxxxxx
                try:
                    pieces.append(bytes((lead, *tail)).decode())
                except UnicodeDecodeError:
                    # Structurally valid but not decodable: one
                    # replacement character per consumed byte.
                    pieces.append("\ufffd" * (extra + 1))
                pos += 3 * (extra + 1)
                continue

        # Stray continuation byte, invalid lead, or truncated/broken
        # sequence: replace this byte alone and resynchronize on the next.
        pieces.append("\ufffd")
        pos += 3

    return "".join(pieces)
| |
|