Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/INSTALLER +1 -0
- phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/METADATA +750 -0
- phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/RECORD +35 -0
- phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/WHEEL +5 -0
- phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/entry_points.txt +2 -0
- phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/licenses/LICENSE +21 -0
- phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/top_level.txt +1 -0
- phivenv/Lib/site-packages/charset_normalizer/__init__.py +48 -0
- phivenv/Lib/site-packages/charset_normalizer/__main__.py +6 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/cd.py +395 -0
- phivenv/Lib/site-packages/charset_normalizer/cli/__init__.py +8 -0
- phivenv/Lib/site-packages/charset_normalizer/cli/__main__.py +381 -0
- phivenv/Lib/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/charset_normalizer/constant.py +2015 -0
- phivenv/Lib/site-packages/charset_normalizer/legacy.py +80 -0
- phivenv/Lib/site-packages/charset_normalizer/md.cp39-win_amd64.pyd +0 -0
- phivenv/Lib/site-packages/charset_normalizer/md.py +635 -0
- phivenv/Lib/site-packages/charset_normalizer/models.py +360 -0
- phivenv/Lib/site-packages/charset_normalizer/py.typed +0 -0
- phivenv/Lib/site-packages/charset_normalizer/utils.py +414 -0
- phivenv/Lib/site-packages/charset_normalizer/version.py +8 -0
- phivenv/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER +1 -0
- phivenv/Lib/site-packages/colorama-0.4.6.dist-info/METADATA +441 -0
- phivenv/Lib/site-packages/colorama-0.4.6.dist-info/RECORD +31 -0
- phivenv/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL +5 -0
- phivenv/Lib/site-packages/colorama-0.4.6.dist-info/licenses/LICENSE.txt +27 -0
- phivenv/Lib/site-packages/colorama/__init__.py +7 -0
- phivenv/Lib/site-packages/colorama/__pycache__/__init__.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/colorama/__pycache__/ansi.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/colorama/__pycache__/ansitowin32.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/colorama/__pycache__/initialise.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/colorama/__pycache__/win32.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/colorama/__pycache__/winterm.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/colorama/ansi.py +102 -0
- phivenv/Lib/site-packages/colorama/ansitowin32.py +277 -0
- phivenv/Lib/site-packages/colorama/initialise.py +121 -0
- phivenv/Lib/site-packages/colorama/tests/__init__.py +1 -0
- phivenv/Lib/site-packages/colorama/tests/__pycache__/__init__.cpython-39.pyc +0 -0
- phivenv/Lib/site-packages/colorama/tests/__pycache__/ansi_test.cpython-39.pyc +0 -0
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/METADATA
ADDED
|
@@ -0,0 +1,750 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: charset-normalizer
|
| 3 |
+
Version: 3.4.3
|
| 4 |
+
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
| 5 |
+
Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
| 6 |
+
Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
| 7 |
+
License: MIT
|
| 8 |
+
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
|
| 9 |
+
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
|
| 10 |
+
Project-URL: Code, https://github.com/jawah/charset_normalizer
|
| 11 |
+
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
|
| 12 |
+
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
| 13 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 14 |
+
Classifier: Intended Audience :: Developers
|
| 15 |
+
Classifier: Operating System :: OS Independent
|
| 16 |
+
Classifier: Programming Language :: Python
|
| 17 |
+
Classifier: Programming Language :: Python :: 3
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.7
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 22 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 23 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 24 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 25 |
+
Classifier: Programming Language :: Python :: 3.14
|
| 26 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
| 27 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
| 28 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
| 29 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
| 30 |
+
Classifier: Topic :: Utilities
|
| 31 |
+
Classifier: Typing :: Typed
|
| 32 |
+
Requires-Python: >=3.7
|
| 33 |
+
Description-Content-Type: text/markdown
|
| 34 |
+
License-File: LICENSE
|
| 35 |
+
Provides-Extra: unicode-backport
|
| 36 |
+
Dynamic: license-file
|
| 37 |
+
|
| 38 |
+
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
| 39 |
+
|
| 40 |
+
<p align="center">
|
| 41 |
+
<sup>The Real First Universal Charset Detector</sup><br>
|
| 42 |
+
<a href="https://pypi.org/project/charset-normalizer">
|
| 43 |
+
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
| 44 |
+
</a>
|
| 45 |
+
<a href="https://pepy.tech/project/charset-normalizer/">
|
| 46 |
+
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
| 47 |
+
</a>
|
| 48 |
+
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
| 49 |
+
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
| 50 |
+
</a>
|
| 51 |
+
</p>
|
| 52 |
+
<p align="center">
|
| 53 |
+
<sup><i>Featured Packages</i></sup><br>
|
| 54 |
+
<a href="https://github.com/jawah/niquests">
|
| 55 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Most_Advanced_HTTP_Client-cyan">
|
| 56 |
+
</a>
|
| 57 |
+
<a href="https://github.com/jawah/wassima">
|
| 58 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Replacement-cyan">
|
| 59 |
+
</a>
|
| 60 |
+
</p>
|
| 61 |
+
<p align="center">
|
| 62 |
+
<sup><i>In other languages (unofficial ports by the community)</i></sup><br>
|
| 63 |
+
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
| 64 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
| 65 |
+
</a>
|
| 66 |
+
</p>
|
| 67 |
+
|
| 68 |
+
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
| 69 |
+
> I'm trying to resolve the issue by taking a new approach.
|
| 70 |
+
> All IANA character set names for which the Python core library provides codecs are supported.
|
| 71 |
+
|
| 72 |
+
<p align="center">
|
| 73 |
+
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
| 74 |
+
</p>
|
| 75 |
+
|
| 76 |
+
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
| 77 |
+
|
| 78 |
+
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
| 79 |
+
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
| 80 |
+
| `Fast` | ❌ | ✅ | ✅ |
|
| 81 |
+
| `Universal**` | ❌ | ✅ | ❌ |
|
| 82 |
+
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
| 83 |
+
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
| 84 |
+
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
| 85 |
+
| `Native Python` | ✅ | ✅ | ❌ |
|
| 86 |
+
| `Detect spoken language` | ❌ | ✅ | N/A |
|
| 87 |
+
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
| 88 |
+
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
| 89 |
+
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
| 90 |
+
|
| 91 |
+
<p align="center">
|
| 92 |
+
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
| 93 |
+
</p>
|
| 94 |
+
|
| 95 |
+
*\*\* : They clearly use encoding-specific code, even if it covers most of the commonly used encodings*<br>
|
| 96 |
+
|
| 97 |
+
## ⚡ Performance
|
| 98 |
+
|
| 99 |
+
This package offers better performance than its counterpart Chardet. Here are some numbers.
|
| 100 |
+
|
| 101 |
+
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
| 102 |
+
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
| 103 |
+
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
|
| 104 |
+
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
| 105 |
+
|
| 106 |
+
| Package | 99th percentile | 95th percentile | 50th percentile |
|
| 107 |
+
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
| 108 |
+
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
|
| 109 |
+
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
| 110 |
+
|
| 111 |
+
_updated as of december 2024 using CPython 3.12_
|
| 112 |
+
|
| 113 |
+
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
|
| 114 |
+
|
| 115 |
+
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
| 116 |
+
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
| 117 |
+
> The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
|
| 118 |
+
> Keep in mind that the stats are generous and that Chardet's accuracy vs ours is measured using Chardet's initial capability
|
| 119 |
+
> (e.g. Supported Encoding) Challenge-them if you want.
|
| 120 |
+
|
| 121 |
+
## ✨ Installation
|
| 122 |
+
|
| 123 |
+
Using pip:
|
| 124 |
+
|
| 125 |
+
```sh
|
| 126 |
+
pip install charset-normalizer -U
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
## 🚀 Basic Usage
|
| 130 |
+
|
| 131 |
+
### CLI
|
| 132 |
+
This package comes with a CLI.
|
| 133 |
+
|
| 134 |
+
```
|
| 135 |
+
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
| 136 |
+
file [file ...]
|
| 137 |
+
|
| 138 |
+
The Real First Universal Charset Detector. Discover originating encoding used
|
| 139 |
+
on text file. Normalize text to unicode.
|
| 140 |
+
|
| 141 |
+
positional arguments:
|
| 142 |
+
files File(s) to be analysed
|
| 143 |
+
|
| 144 |
+
optional arguments:
|
| 145 |
+
-h, --help show this help message and exit
|
| 146 |
+
-v, --verbose Display complementary information about file if any.
|
| 147 |
+
Stdout will contain logs about the detection process.
|
| 148 |
+
-a, --with-alternative
|
| 149 |
+
Output complementary possibilities if any. Top-level
|
| 150 |
+
JSON WILL be a list.
|
| 151 |
+
-n, --normalize Permit to normalize input file. If not set, program
|
| 152 |
+
does not write anything.
|
| 153 |
+
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
| 154 |
+
JSON output.
|
| 155 |
+
-r, --replace Replace file when trying to normalize it instead of
|
| 156 |
+
creating a new one.
|
| 157 |
+
-f, --force Replace file without asking if you are sure, use this
|
| 158 |
+
flag with caution.
|
| 159 |
+
-t THRESHOLD, --threshold THRESHOLD
|
| 160 |
+
Define a custom maximum amount of chaos allowed in
|
| 161 |
+
decoded content. 0. <= chaos <= 1.
|
| 162 |
+
--version Show version information and exit.
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
```bash
|
| 166 |
+
normalizer ./data/sample.1.fr.srt
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
or
|
| 170 |
+
|
| 171 |
+
```bash
|
| 172 |
+
python -m charset_normalizer ./data/sample.1.fr.srt
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
🎉 Since version 1.4.0, the CLI produces easily usable stdout results in JSON format.
|
| 176 |
+
|
| 177 |
+
```json
|
| 178 |
+
{
|
| 179 |
+
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
| 180 |
+
"encoding": "cp1252",
|
| 181 |
+
"encoding_aliases": [
|
| 182 |
+
"1252",
|
| 183 |
+
"windows_1252"
|
| 184 |
+
],
|
| 185 |
+
"alternative_encodings": [
|
| 186 |
+
"cp1254",
|
| 187 |
+
"cp1256",
|
| 188 |
+
"cp1258",
|
| 189 |
+
"iso8859_14",
|
| 190 |
+
"iso8859_15",
|
| 191 |
+
"iso8859_16",
|
| 192 |
+
"iso8859_3",
|
| 193 |
+
"iso8859_9",
|
| 194 |
+
"latin_1",
|
| 195 |
+
"mbcs"
|
| 196 |
+
],
|
| 197 |
+
"language": "French",
|
| 198 |
+
"alphabets": [
|
| 199 |
+
"Basic Latin",
|
| 200 |
+
"Latin-1 Supplement"
|
| 201 |
+
],
|
| 202 |
+
"has_sig_or_bom": false,
|
| 203 |
+
"chaos": 0.149,
|
| 204 |
+
"coherence": 97.152,
|
| 205 |
+
"unicode_path": null,
|
| 206 |
+
"is_preferred": true
|
| 207 |
+
}
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
### Python
|
| 211 |
+
*Just print out normalized text*
|
| 212 |
+
```python
|
| 213 |
+
from charset_normalizer import from_path
|
| 214 |
+
|
| 215 |
+
results = from_path('./my_subtitle.srt')
|
| 216 |
+
|
| 217 |
+
print(str(results.best()))
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
*Upgrade your code without effort*
|
| 221 |
+
```python
|
| 222 |
+
from charset_normalizer import detect
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
| 226 |
+
|
| 227 |
+
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
| 228 |
+
|
| 229 |
+
## 😇 Why
|
| 230 |
+
|
| 231 |
+
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
| 232 |
+
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
| 233 |
+
|
| 234 |
+
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
| 235 |
+
produce **two identical rendered strings.**
|
| 236 |
+
What I want is to get readable text, the best I can.
|
| 237 |
+
|
| 238 |
+
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
| 239 |
+
|
| 240 |
+
Don't confuse the **ftfy** package with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
|
| 241 |
+
|
| 242 |
+
## 🍰 How
|
| 243 |
+
|
| 244 |
+
- Discard all charset encoding tables that could not fit the binary content.
|
| 245 |
+
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
| 246 |
+
- Extract matches with the lowest mess detected.
|
| 247 |
+
- Additionally, we measure coherence / probe for a language.
|
| 248 |
+
|
| 249 |
+
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
| 250 |
+
|
| 251 |
+
*Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
| 252 |
+
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
|
| 253 |
+
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
| 254 |
+
improve or rewrite it.
|
| 255 |
+
|
| 256 |
+
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
| 257 |
+
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
| 258 |
+
|
| 259 |
+
## ⚡ Known limitations
|
| 260 |
+
|
| 261 |
+
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
| 262 |
+
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
|
| 263 |
+
|
| 264 |
+
## ⚠️ About Python EOLs
|
| 265 |
+
|
| 266 |
+
**If you are running:**
|
| 267 |
+
|
| 268 |
+
- Python >=2.7,<3.5: Unsupported
|
| 269 |
+
- Python 3.5: charset-normalizer < 2.1
|
| 270 |
+
- Python 3.6: charset-normalizer < 3.1
|
| 271 |
+
- Python 3.7: charset-normalizer < 4.0
|
| 272 |
+
|
| 273 |
+
Upgrade your Python interpreter as soon as possible.
|
| 274 |
+
|
| 275 |
+
## 👤 Contributing
|
| 276 |
+
|
| 277 |
+
Contributions, issues and feature requests are very much welcome.<br />
|
| 278 |
+
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
| 279 |
+
|
| 280 |
+
## 📝 License
|
| 281 |
+
|
| 282 |
+
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
| 283 |
+
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
| 284 |
+
|
| 285 |
+
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
| 286 |
+
|
| 287 |
+
## 💼 For Enterprise
|
| 288 |
+
|
| 289 |
+
Professional support for charset-normalizer is available as part of the [Tidelift
|
| 290 |
+
Subscription][1]. Tidelift gives software development teams a single source for
|
| 291 |
+
purchasing and maintaining their software, with professional grade assurances
|
| 292 |
+
from the experts who know it best, while seamlessly integrating with existing
|
| 293 |
+
tools.
|
| 294 |
+
|
| 295 |
+
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
| 296 |
+
|
| 297 |
+
[OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297)
|
| 298 |
+
|
| 299 |
+
# Changelog
|
| 300 |
+
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
| 301 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
| 302 |
+
|
| 303 |
+
## [3.4.3](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.3) (2025-08-09)
|
| 304 |
+
|
| 305 |
+
### Changed
|
| 306 |
+
- mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
|
| 307 |
+
- automatically lower confidence on small bytes samples that are not Unicode in `detect` output legacy function. (#391)
|
| 308 |
+
|
| 309 |
+
### Added
|
| 310 |
+
- Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
|
| 311 |
+
- Support for Python 3.14
|
| 312 |
+
|
| 313 |
+
### Fixed
|
| 314 |
+
- sdist archive contained useless directories.
|
| 315 |
+
- automatically fallback on valid UTF-16 or UTF-32 even if the md says it's noisy. (#633)
|
| 316 |
+
|
| 317 |
+
### Misc
|
| 318 |
+
- SBOMs are automatically published to the relevant GitHub release to comply with regulatory changes.
|
| 319 |
+
Each published wheel comes with its SBOM. We choose CycloneDX as the format.
|
| 320 |
+
- Prebuilt optimized wheels are no longer distributed by default for CPython 3.7 due to a change in cibuildwheel.
|
| 321 |
+
|
| 322 |
+
## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
|
| 323 |
+
|
| 324 |
+
### Fixed
|
| 325 |
+
- Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
|
| 326 |
+
- Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
|
| 327 |
+
|
| 328 |
+
### Changed
|
| 329 |
+
- Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
|
| 330 |
+
|
| 331 |
+
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
|
| 332 |
+
|
| 333 |
+
### Changed
|
| 334 |
+
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
|
| 335 |
+
- Enforce delayed loading of annotations for simpler and more consistent types in the project.
|
| 336 |
+
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
|
| 337 |
+
|
| 338 |
+
### Added
|
| 339 |
+
- pre-commit configuration.
|
| 340 |
+
- noxfile.
|
| 341 |
+
|
| 342 |
+
### Removed
|
| 343 |
+
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
|
| 344 |
+
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
|
| 345 |
+
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
|
| 346 |
+
- Unused `utils.range_scan` function.
|
| 347 |
+
|
| 348 |
+
### Fixed
|
| 349 |
+
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
|
| 350 |
+
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
|
| 351 |
+
|
| 352 |
+
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
|
| 353 |
+
|
| 354 |
+
### Added
|
| 355 |
+
- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.
|
| 356 |
+
- Support for Python 3.13 (#512)
|
| 357 |
+
|
| 358 |
+
### Fixed
|
| 359 |
+
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
|
| 360 |
+
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
|
| 361 |
+
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
|
| 362 |
+
|
| 363 |
+
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
| 364 |
+
|
| 365 |
+
### Fixed
|
| 366 |
+
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
| 367 |
+
- Regression on some detection case showcased in the documentation (#371)
|
| 368 |
+
|
| 369 |
+
### Added
|
| 370 |
+
- Noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife)
|
| 371 |
+
|
| 372 |
+
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
| 373 |
+
|
| 374 |
+
### Changed
|
| 375 |
+
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
| 376 |
+
- Improved the general detection reliability based on reports from the community
|
| 377 |
+
|
| 378 |
+
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
| 379 |
+
|
| 380 |
+
### Added
|
| 381 |
+
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
| 382 |
+
- Support for 9 forgotten encodings that are supported by Python but unlisted in `encodings.aliases` as they have no alias (#323)
|
| 383 |
+
|
| 384 |
+
### Removed
|
| 385 |
+
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
| 386 |
+
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
| 387 |
+
|
| 388 |
+
### Changed
|
| 389 |
+
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
| 390 |
+
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
| 391 |
+
|
| 392 |
+
### Fixed
|
| 393 |
+
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
| 394 |
+
|
| 395 |
+
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
| 396 |
+
|
| 397 |
+
### Changed
|
| 398 |
+
- Typehint for function `from_path` no longer enforces `PathLike` as its first argument
|
| 399 |
+
- Minor improvement over the global detection reliability
|
| 400 |
+
|
| 401 |
+
### Added
|
| 402 |
+
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
| 403 |
+
- Propagate the `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp`, allowing deeper control over the detection (default True)
|
| 404 |
+
- Explicit support for Python 3.12
|
| 405 |
+
|
| 406 |
+
### Fixed
|
| 407 |
+
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
| 408 |
+
|
| 409 |
+
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
| 410 |
+
|
| 411 |
+
### Added
|
| 412 |
+
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
| 413 |
+
|
| 414 |
+
### Removed
|
| 415 |
+
- Support for Python 3.6 (PR #260)
|
| 416 |
+
|
| 417 |
+
### Changed
|
| 418 |
+
- Optional speedup provided by mypy/c 1.0.1
|
| 419 |
+
|
| 420 |
+
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
| 421 |
+
|
| 422 |
+
### Fixed
|
| 423 |
+
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
| 424 |
+
|
| 425 |
+
### Changed
|
| 426 |
+
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
| 427 |
+
|
| 428 |
+
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
| 429 |
+
|
| 430 |
+
### Added
|
| 431 |
+
- Extend the capability of explain=True: when cp_isolation contains at most two entries (min one), the Mess-detector results are logged in detail
|
| 432 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
| 433 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
| 434 |
+
- `normalizer --version` now specifies whether the current version provides extra speedup (meaning mypyc compilation whl)
|
| 435 |
+
|
| 436 |
+
### Changed
|
| 437 |
+
- Build with static metadata using 'build' frontend
|
| 438 |
+
- Make the language detection stricter
|
| 439 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
| 440 |
+
|
| 441 |
+
### Fixed
|
| 442 |
+
- CLI with opt --normalize fails when using full path for files
|
| 443 |
+
- TooManyAccentuatedPlugin induces false positives in the mess detection when too few alpha characters have been fed to it
|
| 444 |
+
- Sphinx warnings when generating the documentation
|
| 445 |
+
|
| 446 |
+
### Removed
|
| 447 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
| 448 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
| 449 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
| 450 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
| 451 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
| 452 |
+
- Breaking: Top-level function `normalize`
|
| 453 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
| 454 |
+
- Support for the backport `unicodedata2`
|
| 455 |
+
|
| 456 |
+
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
| 457 |
+
|
| 458 |
+
### Added
|
| 459 |
+
- Extend the capability of explain=True: when cp_isolation contains at most two entries (min one), the Mess-detector results are logged in detail
|
| 460 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
| 461 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
| 462 |
+
|
| 463 |
+
### Changed
|
| 464 |
+
- Build with static metadata using 'build' frontend
|
| 465 |
+
- Make the language detection stricter
|
| 466 |
+
|
| 467 |
+
### Fixed
|
| 468 |
+
- CLI with opt --normalize fails when using full path for files
|
| 469 |
+
- TooManyAccentuatedPlugin induces false positives in the mess detection when too few alpha characters have been fed to it
|
| 470 |
+
|
| 471 |
+
### Removed
|
| 472 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
| 473 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
| 474 |
+
|
| 475 |
+
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
| 476 |
+
|
| 477 |
+
### Added
|
| 478 |
+
- `normalizer --version` now specifies whether the current version provides extra speedup (meaning mypyc compilation whl)
|
| 479 |
+
|
| 480 |
+
### Removed
|
| 481 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
| 482 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
| 483 |
+
|
| 484 |
+
### Fixed
|
| 485 |
+
- Sphinx warnings when generating the documentation
|
| 486 |
+
|
| 487 |
+
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
| 488 |
+
|
| 489 |
+
### Changed
|
| 490 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
| 491 |
+
|
| 492 |
+
### Removed
|
| 493 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
| 494 |
+
- Breaking: Top-level function `normalize`
|
| 495 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
| 496 |
+
- Support for the backport `unicodedata2`
|
| 497 |
+
|
| 498 |
+
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
| 499 |
+
|
| 500 |
+
### Deprecated
|
| 501 |
+
- Function `normalize` scheduled for removal in 3.0
|
| 502 |
+
|
| 503 |
+
### Changed
|
| 504 |
+
- Removed useless call to decode in fn is_unprintable (#206)
|
| 505 |
+
|
| 506 |
+
### Fixed
|
| 507 |
+
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
| 508 |
+
|
| 509 |
+
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
| 510 |
+
|
| 511 |
+
### Added
|
| 512 |
+
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
| 513 |
+
|
| 514 |
+
### Changed
|
| 515 |
+
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
| 516 |
+
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
| 517 |
+
|
| 518 |
+
### Fixed
|
| 519 |
+
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
| 520 |
+
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
| 521 |
+
|
| 522 |
+
### Removed
|
| 523 |
+
- Support for Python 3.5 (PR #192)
|
| 524 |
+
|
| 525 |
+
### Deprecated
|
| 526 |
+
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
| 527 |
+
|
| 528 |
+
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
| 529 |
+
|
| 530 |
+
### Fixed
|
| 531 |
+
- ASCII miss-detection on rare cases (PR #170)
|
| 532 |
+
|
| 533 |
+
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
| 534 |
+
|
| 535 |
+
### Added
|
| 536 |
+
- Explicit support for Python 3.11 (PR #164)
|
| 537 |
+
|
| 538 |
+
### Changed
|
| 539 |
+
- The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
| 540 |
+
|
| 541 |
+
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
| 542 |
+
|
| 543 |
+
### Fixed
|
| 544 |
+
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
| 545 |
+
|
| 546 |
+
### Changed
|
| 547 |
+
- Skipping the language-detection (CD) on ASCII (PR #155)
|
| 548 |
+
|
| 549 |
+
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
| 550 |
+
|
| 551 |
+
### Changed
|
| 552 |
+
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
| 553 |
+
|
| 554 |
+
### Fixed
|
| 555 |
+
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
| 556 |
+
|
| 557 |
+
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
| 558 |
+
### Changed
|
| 559 |
+
- Improvement over Vietnamese detection (PR #126)
|
| 560 |
+
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
| 561 |
+
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
| 562 |
+
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
| 563 |
+
- Code style as refactored by Sourcery-AI (PR #131)
|
| 564 |
+
- Minor adjustment on the MD around european words (PR #133)
|
| 565 |
+
- Remove and replace SRTs from assets / tests (PR #139)
|
| 566 |
+
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
| 567 |
+
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
| 568 |
+
|
| 569 |
+
### Fixed
|
| 570 |
+
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
| 571 |
+
- Avoid using too insignificant chunk (PR #137)
|
| 572 |
+
|
| 573 |
+
### Added
|
| 574 |
+
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
| 575 |
+
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
| 576 |
+
|
| 577 |
+
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
| 578 |
+
### Added
|
| 579 |
+
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
| 580 |
+
|
| 581 |
+
### Changed
|
| 582 |
+
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
| 583 |
+
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
| 584 |
+
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
| 585 |
+
- Various detection improvement (MD+CD) (PR #117)
|
| 586 |
+
|
| 587 |
+
### Removed
|
| 588 |
+
- Remove redundant logging entry about detected language(s) (PR #115)
|
| 589 |
+
|
| 590 |
+
### Fixed
|
| 591 |
+
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
| 592 |
+
|
| 593 |
+
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
| 594 |
+
### Fixed
|
| 595 |
+
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
| 596 |
+
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
| 597 |
+
|
| 598 |
+
### Changed
|
| 599 |
+
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
| 600 |
+
|
| 601 |
+
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
| 602 |
+
### Changed
|
| 603 |
+
- The project now complies with flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
| 604 |
+
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
| 605 |
+
- The Unicode detection is slightly improved (PR #93)
|
| 606 |
+
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
| 607 |
+
|
| 608 |
+
### Removed
|
| 609 |
+
- The project no longer raises a warning on tiny content given for detection; it is simply logged as a warning instead (PR #92)
|
| 610 |
+
|
| 611 |
+
### Fixed
|
| 612 |
+
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
| 613 |
+
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
| 614 |
+
- The MANIFEST.in was not exhaustive (PR #78)
|
| 615 |
+
|
| 616 |
+
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
| 617 |
+
### Fixed
|
| 618 |
+
- The CLI no longer raises an unexpected exception when no encoding has been found (PR #70)
|
| 619 |
+
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
| 620 |
+
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
| 621 |
+
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
| 622 |
+
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
| 623 |
+
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
| 624 |
+
|
| 625 |
+
### Changed
|
| 626 |
+
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
| 627 |
+
- Allow fallback on specified encoding if any (PR #71)
|
| 628 |
+
|
| 629 |
+
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
| 630 |
+
### Changed
|
| 631 |
+
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
| 632 |
+
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
| 633 |
+
|
| 634 |
+
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
| 635 |
+
### Fixed
|
| 636 |
+
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
| 637 |
+
|
| 638 |
+
### Changed
|
| 639 |
+
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
| 640 |
+
|
| 641 |
+
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
| 642 |
+
### Fixed
|
| 643 |
+
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
| 644 |
+
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
|
| 645 |
+
- One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
|
| 646 |
+
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
| 647 |
+
|
| 648 |
+
### Changed
|
| 649 |
+
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
| 650 |
+
|
| 651 |
+
### Added
|
| 652 |
+
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
| 653 |
+
|
| 654 |
+
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
| 655 |
+
### Changed
|
| 656 |
+
- 4 to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
| 657 |
+
- Emphasis has been placed on UTF-8 detection, which should now be nearly instantaneous.
|
| 658 |
+
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
| 659 |
+
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
| 660 |
+
- The program has been rewritten to ease the readability and maintainability. (+ Using static typing)
|
| 661 |
+
- utf_7 detection has been reinstated.
|
| 662 |
+
|
| 663 |
+
### Removed
|
| 664 |
+
- This package no longer requires anything when used with Python 3.5 (Dropped cached_property)
|
| 665 |
+
- Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
| 666 |
+
- The exception hook on UnicodeDecodeError has been removed.
|
| 667 |
+
|
| 668 |
+
### Deprecated
|
| 669 |
+
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
| 670 |
+
|
| 671 |
+
### Fixed
|
| 672 |
+
- The CLI output used the relative path of the file(s). Should be absolute.
|
| 673 |
+
|
| 674 |
+
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
| 675 |
+
### Fixed
|
| 676 |
+
- Logger configuration/usage no longer conflict with others (PR #44)
|
| 677 |
+
|
| 678 |
+
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
| 679 |
+
### Removed
|
| 680 |
+
- Using standard logging instead of using the package loguru.
|
| 681 |
+
- Dropping nose test framework in favor of the maintained pytest.
|
| 682 |
+
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
| 683 |
+
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
| 684 |
+
- Stop support for UTF-7 that does not contain a SIG.
|
| 685 |
+
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
| 686 |
+
|
| 687 |
+
### Fixed
|
| 688 |
+
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
| 689 |
+
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
| 690 |
+
|
| 691 |
+
### Changed
|
| 692 |
+
- Improving the package final size by compressing frequencies.json.
|
| 693 |
+
- Huge improvement on the largest payloads.
|
| 694 |
+
|
| 695 |
+
### Added
|
| 696 |
+
- CLI now produces JSON consumable output.
|
| 697 |
+
- Return ASCII if given sequences fit. Given reasonable confidence.
|
| 698 |
+
|
| 699 |
+
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
| 700 |
+
|
| 701 |
+
### Fixed
|
| 702 |
+
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
| 703 |
+
|
| 704 |
+
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
| 705 |
+
|
| 706 |
+
### Fixed
|
| 707 |
+
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
| 708 |
+
|
| 709 |
+
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
| 710 |
+
|
| 711 |
+
### Fixed
|
| 712 |
+
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
| 713 |
+
|
| 714 |
+
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
| 715 |
+
|
| 716 |
+
### Changed
|
| 717 |
+
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
| 718 |
+
|
| 719 |
+
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
| 720 |
+
|
| 721 |
+
### Fixed
|
| 722 |
+
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
| 723 |
+
|
| 724 |
+
### Changed
|
| 725 |
+
- Dependencies refactoring, constraints revised.
|
| 726 |
+
|
| 727 |
+
### Added
|
| 728 |
+
- Add python 3.9 and 3.10 to the supported interpreters
|
| 729 |
+
|
| 730 |
+
MIT License
|
| 731 |
+
|
| 732 |
+
Copyright (c) 2025 TAHRI Ahmed R.
|
| 733 |
+
|
| 734 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 735 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 736 |
+
in the Software without restriction, including without limitation the rights
|
| 737 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 738 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 739 |
+
furnished to do so, subject to the following conditions:
|
| 740 |
+
|
| 741 |
+
The above copyright notice and this permission notice shall be included in all
|
| 742 |
+
copies or substantial portions of the Software.
|
| 743 |
+
|
| 744 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 745 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 746 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 747 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 748 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 749 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 750 |
+
SOFTWARE.
|
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/RECORD
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
../../Scripts/normalizer.exe,sha256=IvfL1xIwLcN8AdjczrodNaXLaHBUERWBgh7YbfqJYUw,106364
|
| 2 |
+
charset_normalizer-3.4.3.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 3 |
+
charset_normalizer-3.4.3.dist-info/METADATA,sha256=tqX3UoI-UkqIN99aZsk646yI4NgMbu1MjlKr6BbITG4,37450
|
| 4 |
+
charset_normalizer-3.4.3.dist-info/RECORD,,
|
| 5 |
+
charset_normalizer-3.4.3.dist-info/WHEEL,sha256=XkFE14KmFh7mutkkb-qn_ueuH2lwfT8rLdfc5xpQ7wE,99
|
| 6 |
+
charset_normalizer-3.4.3.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
|
| 7 |
+
charset_normalizer-3.4.3.dist-info/licenses/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
|
| 8 |
+
charset_normalizer-3.4.3.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
| 9 |
+
charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
|
| 10 |
+
charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
|
| 11 |
+
charset_normalizer/__pycache__/__init__.cpython-39.pyc,,
|
| 12 |
+
charset_normalizer/__pycache__/__main__.cpython-39.pyc,,
|
| 13 |
+
charset_normalizer/__pycache__/api.cpython-39.pyc,,
|
| 14 |
+
charset_normalizer/__pycache__/cd.cpython-39.pyc,,
|
| 15 |
+
charset_normalizer/__pycache__/constant.cpython-39.pyc,,
|
| 16 |
+
charset_normalizer/__pycache__/legacy.cpython-39.pyc,,
|
| 17 |
+
charset_normalizer/__pycache__/md.cpython-39.pyc,,
|
| 18 |
+
charset_normalizer/__pycache__/models.cpython-39.pyc,,
|
| 19 |
+
charset_normalizer/__pycache__/utils.cpython-39.pyc,,
|
| 20 |
+
charset_normalizer/__pycache__/version.cpython-39.pyc,,
|
| 21 |
+
charset_normalizer/api.py,sha256=ODy4hX78b3ldTl5sViYPU1yzQ5qkclfgSIFE8BtNrTI,23337
|
| 22 |
+
charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
|
| 23 |
+
charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
|
| 24 |
+
charset_normalizer/cli/__main__.py,sha256=-pdJCyPywouPyFsC8_eTSgTmvh1YEvgjsvy1WZ0XjaA,13027
|
| 25 |
+
charset_normalizer/cli/__pycache__/__init__.cpython-39.pyc,,
|
| 26 |
+
charset_normalizer/cli/__pycache__/__main__.cpython-39.pyc,,
|
| 27 |
+
charset_normalizer/constant.py,sha256=mCJmYzpBU27Ut9kiNWWoBbhhxQ-aRVw3K7LSwoFwBGI,44728
|
| 28 |
+
charset_normalizer/legacy.py,sha256=ui08NlKqAXU3Y7smK-NFJjEgRRQz9ruM7aNCbT0OOrE,2811
|
| 29 |
+
charset_normalizer/md.cp39-win_amd64.pyd,sha256=GBRkMtCJSwm_0H_fJ-Jus0DdpkxHcWVC4XcSnC_seLk,10752
|
| 30 |
+
charset_normalizer/md.py,sha256=LSuW2hNgXSgF7JGdRapLAHLuj6pABHiP85LTNAYmu7c,20780
|
| 31 |
+
charset_normalizer/md__mypyc.cp39-win_amd64.pyd,sha256=CZOPvYPp7PJ4wdp_LKOtla0M0e856CwbTsusjGtnb_k,125440
|
| 32 |
+
charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
|
| 33 |
+
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 34 |
+
charset_normalizer/utils.py,sha256=XtWIQeOuz7cnGebMzyi4Vvi1JtA84QBSIeR9PDzF7pw,12584
|
| 35 |
+
charset_normalizer/version.py,sha256=laniWEeVCCfwRgYLf_rZ2f0qWaNwWTEXQEfUUL_MMvw,123
|
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: setuptools (80.9.0)
|
| 3 |
+
Root-Is-Purelib: false
|
| 4 |
+
Tag: cp39-cp39-win_amd64
|
| 5 |
+
|
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
normalizer = charset_normalizer.cli:cli_detect
|
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/licenses/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 TAHRI Ahmed R.
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
charset_normalizer
|
phivenv/Lib/site-packages/charset_normalizer/__init__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Charset-Normalizer
|
| 3 |
+
~~~~~~~~~~~~~~
|
| 4 |
+
The Real First Universal Charset Detector.
|
| 5 |
+
A library that helps you read text from an unknown charset encoding.
|
| 6 |
+
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
| 7 |
+
All IANA character set names for which the Python core library provides codecs are supported.
|
| 8 |
+
|
| 9 |
+
Basic usage:
|
| 10 |
+
>>> from charset_normalizer import from_bytes
|
| 11 |
+
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
| 12 |
+
>>> best_guess = results.best()
|
| 13 |
+
>>> str(best_guess)
|
| 14 |
+
'Bсеки човек има право на образование. Oбразованието!'
|
| 15 |
+
|
| 16 |
+
Others methods and usages are available - see the full documentation
|
| 17 |
+
at <https://github.com/Ousret/charset_normalizer>.
|
| 18 |
+
:copyright: (c) 2021 by Ahmed TAHRI
|
| 19 |
+
:license: MIT, see LICENSE for more details.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import logging
|
| 25 |
+
|
| 26 |
+
from .api import from_bytes, from_fp, from_path, is_binary
|
| 27 |
+
from .legacy import detect
|
| 28 |
+
from .models import CharsetMatch, CharsetMatches
|
| 29 |
+
from .utils import set_logging_handler
|
| 30 |
+
from .version import VERSION, __version__
|
| 31 |
+
|
| 32 |
+
__all__ = (
|
| 33 |
+
"from_fp",
|
| 34 |
+
"from_path",
|
| 35 |
+
"from_bytes",
|
| 36 |
+
"is_binary",
|
| 37 |
+
"detect",
|
| 38 |
+
"CharsetMatch",
|
| 39 |
+
"CharsetMatches",
|
| 40 |
+
"__version__",
|
| 41 |
+
"VERSION",
|
| 42 |
+
"set_logging_handler",
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
# Attach a NullHandler to the top level logger by default
|
| 46 |
+
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
| 47 |
+
|
| 48 |
+
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
phivenv/Lib/site-packages/charset_normalizer/__main__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations

from .cli import cli_detect

# Entry point for ``python -m charset_normalizer``.
if __name__ == "__main__":
    # cli_detect() returns an int status (0 when everything is fine); hand it
    # to the interpreter so the process exit code reflects failures instead
    # of always being 0.
    raise SystemExit(cli_detect())
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (1.61 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-39.pyc
ADDED
|
Binary file (289 Bytes). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-39.pyc
ADDED
|
Binary file (11.6 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-39.pyc
ADDED
|
Binary file (9.59 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-39.pyc
ADDED
|
Binary file (27.3 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-39.pyc
ADDED
|
Binary file (2.25 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-39.pyc
ADDED
|
Binary file (16.8 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-39.pyc
ADDED
|
Binary file (11.9 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-39.pyc
ADDED
|
Binary file (9.1 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-39.pyc
ADDED
|
Binary file (297 Bytes). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/cd.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import importlib
|
| 4 |
+
from codecs import IncrementalDecoder
|
| 5 |
+
from collections import Counter
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from typing import Counter as TypeCounter
|
| 8 |
+
|
| 9 |
+
from .constant import (
|
| 10 |
+
FREQUENCIES,
|
| 11 |
+
KO_NAMES,
|
| 12 |
+
LANGUAGE_SUPPORTED_COUNT,
|
| 13 |
+
TOO_SMALL_SEQUENCE,
|
| 14 |
+
ZH_NAMES,
|
| 15 |
+
)
|
| 16 |
+
from .md import is_suspiciously_successive_range
|
| 17 |
+
from .models import CoherenceMatches
|
| 18 |
+
from .utils import (
|
| 19 |
+
is_accentuated,
|
| 20 |
+
is_latin,
|
| 21 |
+
is_multi_byte_encoding,
|
| 22 |
+
is_unicode_range_secondary,
|
| 23 |
+
unicode_range,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    List the main unicode ranges produced by a single-byte code page.

    Every byte from 0x40 up to (but excluding) 0xFF is decoded on its own with
    the codec's incremental decoder. Ranges flagged as secondary are not
    counted. A range is reported when it represents at least 15% of the
    counted characters; the result is sorted alphabetically.

    Raises OSError when the code page is a multi-byte one.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module(f"encodings.{iana_name}")
    # errors="ignore": bytes the code page cannot map simply decode to "".
    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    hits_per_range: dict[str, int] = {}
    total_hits: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = incremental.decode(bytes([byte_value]))

        if not decoded:
            continue

        range_name: str | None = unicode_range(decoded)

        if range_name is None:
            continue

        if is_unicode_range_secondary(range_name) is not False:
            continue

        hits_per_range[range_name] = hits_per_range.get(range_name, 0) + 1
        total_hits += 1

    dominant_ranges = [
        range_name
        for range_name, hits in hits_per_range.items()
        if hits / total_hits >= 0.15
    ]
    dominant_ranges.sort()

    return dominant_ranges
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    List the languages whose frequency table contains at least one character
    belonging to the given unicode range, in FREQUENCIES order.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        # any() stops at the first character found in the range.
        if any(unicode_range(character) == primary_range for character in characters)
    ]
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Associate a single-byte code page with the language(s) it is tied to.

    The first unicode range of the code page whose name does not contain
    "Latin" drives the lookup. When every range is a Latin one, the generic
    ["Latin Based"] marker is returned instead.
    """
    first_non_latin: str | None = next(
        (
            candidate
            for candidate in encoding_unicode_range(iana_name)
            if "Latin" not in candidate
        ),
        None,
    )

    if first_non_latin is None:
        return ["Latin Based"]

    return unicode_range_languages(first_non_latin)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
    """
    Associate a multi-byte code page with the language it is tied to.

    Returns ["Japanese"], ["Chinese"] or ["Korean"] depending on the code page
    name, or an empty list when the name matches none of the known families.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name == "cp932" or iana_name.startswith(japanese_prefixes):
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Summarise a supported language as a pair of flags.

    The first flag tells whether any character of its frequency table is
    accentuated; the second tells whether the table is made of Latin
    characters only.
    """
    alphabet = FREQUENCIES[language]

    uses_accents: bool = any(is_accentuated(character) for character in alphabet)
    only_latin: bool = not any(is_latin(character) is False for character in alphabet)

    return uses_accents, only_latin
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return the languages compatible with the given characters, best match first.

    A language is kept when at least 20% of the characters of its frequency
    table appear in ``characters``. Languages are skipped when:

    - ``ignore_non_latin`` is set and the language is not purely Latin;
    - the given characters contain accents but the language has none.
    """
    scored_languages: list[tuple[str, float]] = []

    # Build the lookup once: the membership test below runs for every
    # character of every language table, so a set avoids rescanning the list.
    observed_characters = set(characters)
    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and not target_pure_latin:
            continue

        if source_have_accents and not target_have_accents:
            continue

        character_match_count: int = sum(
            1 for character in language_characters if character in observed_characters
        )
        ratio: float = character_match_count / len(language_characters)

        if ratio >= 0.2:
            scored_languages.append((language, ratio))

    # Stable sort: languages with equal ratios keep their FREQUENCIES order.
    scored_languages.sort(key=lambda scored: scored[1], reverse=True)

    return [language for language, _ in scored_languages]
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if an ordered characters list (by occurrence, from most frequent to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    An empty ``ordered_characters`` yields 0. rather than a division by zero.

    Raises ValueError when ``language`` has no entry in FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: report "no correspondence" instead of dividing by zero.
    if ordered_characters_count == 0:
        return 0.0

    language_characters = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # Rank of each character in the language table. setdefault keeps the first
    # occurrence, which is what list.index() would return.
    rank_in_language: dict[str, int] = {}
    for rank, language_character in enumerate(language_characters):
        rank_in_language.setdefault(language_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor mapping a rank in the source onto a rank in the language
    # table; constant for the whole loop.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language = rank_in_language.get(character)

        # Character unknown to this language.
        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: reject characters that are too far from their expected rank.
        if not large_alphabet and rank_distance > 4:
            continue

        # Large alphabets: a rank within a third of the table is good enough.
        if large_alphabet and rank_distance < target_language_characters_count / 3:
            character_approved_count += 1
            continue

        characters_before_source: list[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # Approve when enough of the neighbours on either side are shared
        # between the source ordering and the language ordering.
        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Split the alphabetic characters of a decoded text into per-alphabet layers.

    Non-alphabetic characters, and characters without a known unicode range,
    are dropped. Each remaining character is lower-cased and appended to the
    first existing layer whose range is not a suspicious neighbour of the
    character's range; otherwise it goes to the layer named after its own range.
    Ex. English/Latin text with a bit of Hebrew gives two items: one holding
    the Latin letters, the other the Hebrew ones.
    """
    buckets: dict[str, str] = {}

    for symbol in decoded_sequence:
        if not symbol.isalpha():
            continue

        symbol_range: str | None = unicode_range(symbol)

        if symbol_range is None:
            continue

        # First compatible layer wins; fall back to the character's own range.
        destination: str = next(
            (
                known_range
                for known_range in buckets
                if is_suspiciously_successive_range(known_range, symbol_range) is False
            ),
            symbol_range,
        )

        buckets[destination] = buckets.get(destination, "") + symbol.lower()

    return list(buckets.values())
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several coherence_ratio outputs into a single one.

    Ratios reported for the same language are averaged (rounded to 4 digits)
    and the merged list is ordered from the highest ratio to the lowest.
    The return type is the same as coherence_ratio.
    """
    ratios_by_language: dict[str, list[float]] = {}

    for matches in results:
        for language, ratio in matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]
    averaged.sort(key=lambda pair: pair[1], reverse=True)

    return averaged
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    Collapse a language and its em-dash alternative (e.g. "English" and
    "English—") into a single entry.

    When at least one language appears more than once after removing the
    em-dash, every language is returned under its em-dash-free name with its
    best ratio. Otherwise the input is returned untouched.
    """
    ratios_by_name: dict[str, list[float]] = {}

    for language, ratio in results:
        ratios_by_name.setdefault(language.replace("—", ""), []).append(ratio)

    # No language has an alternative in the results: nothing to collapse.
    if all(len(ratios) <= 1 for ratios in ratios_by_name.values()):
        return results

    return [(name, max(ratios)) for name, ratios in ratios_by_name.items()]
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence.

    The sequence is analysed layer by layer, a layer being the characters
    extracted for one alphabet/range. ``lg_inclusion`` is an optional
    comma-separated list of languages to test instead of guessing them from
    the alphabet; the special entry "Latin Based" restricts the guess to pure
    Latin languages. Matches scoring below ``threshold`` are discarded.
    """
    requested_languages: list[str] = (
        [] if lg_inclusion is None else lg_inclusion.split(",")
    )

    latin_only: bool = "Latin Based" in requested_languages
    if latin_only:
        requested_languages.remove("Latin Based")

    matches: list[tuple[str, float]] = []
    strong_match_count: int = 0

    for layer in alpha_unicode_split(decoded_sequence):
        ranked_characters = Counter(layer).most_common()

        # Layers that are too short are not statistically meaningful.
        if sum(count for _, count in ranked_characters) <= TOO_SMALL_SEQUENCE:
            continue

        characters_by_popularity: list[str] = [
            character for character, _ in ranked_characters
        ]

        candidates = requested_languages or alphabet_languages(
            characters_by_popularity, latin_only
        )

        for language in candidates:
            score: float = characters_popularity_compare(
                language, characters_by_popularity
            )

            if score < threshold:
                continue

            if score >= 0.8:
                strong_match_count += 1

            matches.append((language, round(score, 4)))

            # Enough confident matches: stop testing languages for this layer.
            if strong_match_count >= 3:
                break

    deduplicated = filter_alt_coherence_matches(matches)

    return sorted(deduplicated, key=lambda match: match[1], reverse=True)
|
phivenv/Lib/site-packages/charset_normalizer/cli/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from .__main__ import cli_detect, query_yes_no
|
| 4 |
+
|
| 5 |
+
__all__ = (
|
| 6 |
+
"cli_detect",
|
| 7 |
+
"query_yes_no",
|
| 8 |
+
)
|
phivenv/Lib/site-packages/charset_normalizer/cli/__main__.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import sys
|
| 5 |
+
import typing
|
| 6 |
+
from json import dumps
|
| 7 |
+
from os.path import abspath, basename, dirname, join, realpath
|
| 8 |
+
from platform import python_version
|
| 9 |
+
from unicodedata import unidata_version
|
| 10 |
+
|
| 11 |
+
import charset_normalizer.md as md_module
|
| 12 |
+
from charset_normalizer import from_fp
|
| 13 |
+
from charset_normalizer.models import CliDetectionResult
|
| 14 |
+
from charset_normalizer.version import __version__
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def query_yes_no(question: str, default: str = "yes") -> bool:
    """Prompt the user with a yes/no question on stdout and read the reply with input().

    "question" is the text shown to the user.
    "default" is the answer assumed when the user just hits <Enter>.
            It must be "yes" (the default), "no" or None (meaning
            an answer is required of the user).

    Returns True for "yes" and False for "no". The question is asked again
    until a recognised answer is given. Raises ValueError for any other
    "default".

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    answers = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # The capital letter in the hint shows which answer <Enter> selects.
    if default is None:
        hint = " [y/n] "
    elif default == "yes":
        hint = " [Y/n] "
    elif default == "no":
        hint = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + hint)
        reply = input().lower()

        if reply == "" and default is not None:
            return answers[default]

        if reply in answers:
            return answers[reply]

        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class FileType:
    """Callable that turns a command-line string into an open file object.

    Instances are meant to be given as the type= argument of
    ArgumentParser.add_argument().

    Keyword Arguments:
        - mode -- How the file is to be opened; same values as the builtin
            open() function.
        - bufsize -- Desired buffer size; same values as the builtin open()
            function.
        - encoding -- Encoding of the file; same values as the builtin open()
            function.
        - errors -- How encoding and decoding errors are to be handled; same
            values as the builtin open() function.

    Backported from CPython 3.12
    """

    def __init__(
        self,
        mode: str = "r",
        bufsize: int = -1,
        encoding: str | None = None,
        errors: str | None = None,
    ):
        self._mode = mode
        self._bufsize = bufsize
        self._encoding = encoding
        self._errors = errors

    def __call__(self, string: str) -> typing.IO:  # type: ignore[type-arg]
        if string != "-":
            # Anything but "-" is a file name handed to the builtin open().
            try:
                return open(
                    string, self._mode, self._bufsize, self._encoding, self._errors
                )
            except OSError as e:
                message = f"can't open '{string}': {e}"
                raise argparse.ArgumentTypeError(message)

        # The special argument "-" means sys.std{in,out}.
        wants_binary = "b" in self._mode

        if "r" in self._mode:
            return sys.stdin.buffer if wants_binary else sys.stdin

        if any(flag in self._mode for flag in "wax"):
            return sys.stdout.buffer if wants_binary else sys.stdout

        raise ValueError(f'argument "-" with mode {self._mode}')

    def __repr__(self) -> str:
        # Positional part: mode and bufsize, leaving out a -1 value.
        parts = [repr(value) for value in (self._mode, self._bufsize) if value != -1]

        # Keyword part: only the options that were actually set.
        for keyword, value in (("encoding", self._encoding), ("errors", self._errors)):
            if value is not None:
                parts.append(f"{keyword}={value!r}")

        return f"{type(self).__name__}({', '.join(parts)})"
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _close_files(files) -> None:
    """Close every file object in *files* that is still open."""
    for handle in files or ():
        if handle.closed is False:
            handle.close()


def cli_detect(argv: list[str] | None = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    Parses the command line, runs the detection on every given file,
    optionally writes a normalized copy next to (or over) the input, and
    prints the findings either as JSON or, with --minimal, as bare
    encoding names.

    :param argv: Arguments to parse; passed straight to ``parse_args``.
    :return: 0 if everything is fine, anything else equal trouble
             (1 for an invalid option combination or threshold,
             2 when the normalized file cannot be written).
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-i",
        "--no-preemptive",
        action="store_true",
        default=False,
        dest="no_preemptive",
        help="Disable looking at a charset declaration to hint the detector.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    # The inputs were already opened by FileType during parsing, so every
    # early exit below has to close them first.
    if args.replace is True and args.normalize is False:
        _close_files(args.files)
        print("Use --replace in addition of --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        _close_files(args.files)
        print("Use --force in addition of --replace only.", file=sys.stderr)
        return 1

    if args.threshold < 0.0 or args.threshold > 1.0:
        _close_files(args.files)
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    x_ = []

    for my_file in args.files:
        matches = from_fp(
            my_file,
            threshold=args.threshold,
            explain=args.verbose,
            preemptive_behaviour=args.no_preemptive is False,
        )

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    (
                        "Maybe try increasing maximum amount of chaos."
                        if args.threshold < 1.0
                        else ""
                    ),
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            # Keep a handle on this file's own result: the normalized path
            # must be recorded on it, not on the first entry of the list.
            primary = CliDetectionResult(
                abspath(my_file.name),
                best_guess.encoding,
                best_guess.encoding_aliases,
                [
                    cp
                    for cp in best_guess.could_be_from_charset
                    if cp != best_guess.encoding
                ],
                best_guess.language,
                best_guess.alphabets,
                best_guess.bom,
                best_guess.percent_chaos,
                best_guess.percent_coherence,
                None,
                True,
            )
            x_.append(primary)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp
                                    for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            )
                        )

            if args.normalize is True:
                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                dir_path = dirname(realpath(my_file.name))
                file_name = basename(realpath(my_file.name))

                o_: list[str] = file_name.split(".")

                if args.replace is False:
                    # Writing a sibling file: slot the detected encoding in
                    # before the last dot-separated part of the name.
                    o_.insert(-1, best_guess.encoding)
                    if my_file.closed is False:
                        my_file.close()
                elif (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    if my_file.closed is False:
                        my_file.close()
                    continue

                try:
                    primary.unicode_path = join(dir_path, ".".join(o_))

                    with open(primary.unicode_path, "wb") as fp:
                        fp.write(best_guess.output())
                except OSError as e:
                    print(str(e), file=sys.stderr)
                    # Aborting the whole run: release every input, not only
                    # the one being processed.
                    _close_files(args.files)
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
if __name__ == "__main__":
    # Hand cli_detect()'s status code to the interpreter so that running the
    # module as a script reports failures (non-zero) instead of always 0.
    raise SystemExit(cli_detect())
|
phivenv/Lib/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (298 Bytes). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-39.pyc
ADDED
|
Binary file (9.25 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/constant.py
ADDED
|
@@ -0,0 +1,2015 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
| 4 |
+
from encodings.aliases import aliases
|
| 5 |
+
from re import IGNORECASE
|
| 6 |
+
from re import compile as re_compile
|
| 7 |
+
|
| 8 |
+
# For each eligible encoding, the byte signature (SIG/BOM), or the list of signatures, that marks it
|
| 9 |
+
ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
|
| 10 |
+
"utf_8": BOM_UTF8,
|
| 11 |
+
"utf_7": [
|
| 12 |
+
b"\x2b\x2f\x76\x38",
|
| 13 |
+
b"\x2b\x2f\x76\x39",
|
| 14 |
+
b"\x2b\x2f\x76\x2b",
|
| 15 |
+
b"\x2b\x2f\x76\x2f",
|
| 16 |
+
b"\x2b\x2f\x76\x38\x2d",
|
| 17 |
+
],
|
| 18 |
+
"gb18030": b"\x84\x31\x95\x33",
|
| 19 |
+
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
|
| 20 |
+
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
TOO_SMALL_SEQUENCE: int = 32
|
| 24 |
+
TOO_BIG_SEQUENCE: int = int(10e6)
|
| 25 |
+
|
| 26 |
+
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
|
| 27 |
+
|
| 28 |
+
# Named Unicode code point ranges, up to date with UCD 15.0.0
|
| 29 |
+
UNICODE_RANGES_COMBINED: dict[str, range] = {
|
| 30 |
+
"Control character": range(32),
|
| 31 |
+
"Basic Latin": range(32, 128),
|
| 32 |
+
"Latin-1 Supplement": range(128, 256),
|
| 33 |
+
"Latin Extended-A": range(256, 384),
|
| 34 |
+
"Latin Extended-B": range(384, 592),
|
| 35 |
+
"IPA Extensions": range(592, 688),
|
| 36 |
+
"Spacing Modifier Letters": range(688, 768),
|
| 37 |
+
"Combining Diacritical Marks": range(768, 880),
|
| 38 |
+
"Greek and Coptic": range(880, 1024),
|
| 39 |
+
"Cyrillic": range(1024, 1280),
|
| 40 |
+
"Cyrillic Supplement": range(1280, 1328),
|
| 41 |
+
"Armenian": range(1328, 1424),
|
| 42 |
+
"Hebrew": range(1424, 1536),
|
| 43 |
+
"Arabic": range(1536, 1792),
|
| 44 |
+
"Syriac": range(1792, 1872),
|
| 45 |
+
"Arabic Supplement": range(1872, 1920),
|
| 46 |
+
"Thaana": range(1920, 1984),
|
| 47 |
+
"NKo": range(1984, 2048),
|
| 48 |
+
"Samaritan": range(2048, 2112),
|
| 49 |
+
"Mandaic": range(2112, 2144),
|
| 50 |
+
"Syriac Supplement": range(2144, 2160),
|
| 51 |
+
"Arabic Extended-B": range(2160, 2208),
|
| 52 |
+
"Arabic Extended-A": range(2208, 2304),
|
| 53 |
+
"Devanagari": range(2304, 2432),
|
| 54 |
+
"Bengali": range(2432, 2560),
|
| 55 |
+
"Gurmukhi": range(2560, 2688),
|
| 56 |
+
"Gujarati": range(2688, 2816),
|
| 57 |
+
"Oriya": range(2816, 2944),
|
| 58 |
+
"Tamil": range(2944, 3072),
|
| 59 |
+
"Telugu": range(3072, 3200),
|
| 60 |
+
"Kannada": range(3200, 3328),
|
| 61 |
+
"Malayalam": range(3328, 3456),
|
| 62 |
+
"Sinhala": range(3456, 3584),
|
| 63 |
+
"Thai": range(3584, 3712),
|
| 64 |
+
"Lao": range(3712, 3840),
|
| 65 |
+
"Tibetan": range(3840, 4096),
|
| 66 |
+
"Myanmar": range(4096, 4256),
|
| 67 |
+
"Georgian": range(4256, 4352),
|
| 68 |
+
"Hangul Jamo": range(4352, 4608),
|
| 69 |
+
"Ethiopic": range(4608, 4992),
|
| 70 |
+
"Ethiopic Supplement": range(4992, 5024),
|
| 71 |
+
"Cherokee": range(5024, 5120),
|
| 72 |
+
"Unified Canadian Aboriginal Syllabics": range(5120, 5760),
|
| 73 |
+
"Ogham": range(5760, 5792),
|
| 74 |
+
"Runic": range(5792, 5888),
|
| 75 |
+
"Tagalog": range(5888, 5920),
|
| 76 |
+
"Hanunoo": range(5920, 5952),
|
| 77 |
+
"Buhid": range(5952, 5984),
|
| 78 |
+
"Tagbanwa": range(5984, 6016),
|
| 79 |
+
"Khmer": range(6016, 6144),
|
| 80 |
+
"Mongolian": range(6144, 6320),
|
| 81 |
+
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
|
| 82 |
+
"Limbu": range(6400, 6480),
|
| 83 |
+
"Tai Le": range(6480, 6528),
|
| 84 |
+
"New Tai Lue": range(6528, 6624),
|
| 85 |
+
"Khmer Symbols": range(6624, 6656),
|
| 86 |
+
"Buginese": range(6656, 6688),
|
| 87 |
+
"Tai Tham": range(6688, 6832),
|
| 88 |
+
"Combining Diacritical Marks Extended": range(6832, 6912),
|
| 89 |
+
"Balinese": range(6912, 7040),
|
| 90 |
+
"Sundanese": range(7040, 7104),
|
| 91 |
+
"Batak": range(7104, 7168),
|
| 92 |
+
"Lepcha": range(7168, 7248),
|
| 93 |
+
"Ol Chiki": range(7248, 7296),
|
| 94 |
+
"Cyrillic Extended-C": range(7296, 7312),
|
| 95 |
+
"Georgian Extended": range(7312, 7360),
|
| 96 |
+
"Sundanese Supplement": range(7360, 7376),
|
| 97 |
+
"Vedic Extensions": range(7376, 7424),
|
| 98 |
+
"Phonetic Extensions": range(7424, 7552),
|
| 99 |
+
"Phonetic Extensions Supplement": range(7552, 7616),
|
| 100 |
+
"Combining Diacritical Marks Supplement": range(7616, 7680),
|
| 101 |
+
"Latin Extended Additional": range(7680, 7936),
|
| 102 |
+
"Greek Extended": range(7936, 8192),
|
| 103 |
+
"General Punctuation": range(8192, 8304),
|
| 104 |
+
"Superscripts and Subscripts": range(8304, 8352),
|
| 105 |
+
"Currency Symbols": range(8352, 8400),
|
| 106 |
+
"Combining Diacritical Marks for Symbols": range(8400, 8448),
|
| 107 |
+
"Letterlike Symbols": range(8448, 8528),
|
| 108 |
+
"Number Forms": range(8528, 8592),
|
| 109 |
+
"Arrows": range(8592, 8704),
|
| 110 |
+
"Mathematical Operators": range(8704, 8960),
|
| 111 |
+
"Miscellaneous Technical": range(8960, 9216),
|
| 112 |
+
"Control Pictures": range(9216, 9280),
|
| 113 |
+
"Optical Character Recognition": range(9280, 9312),
|
| 114 |
+
"Enclosed Alphanumerics": range(9312, 9472),
|
| 115 |
+
"Box Drawing": range(9472, 9600),
|
| 116 |
+
"Block Elements": range(9600, 9632),
|
| 117 |
+
"Geometric Shapes": range(9632, 9728),
|
| 118 |
+
"Miscellaneous Symbols": range(9728, 9984),
|
| 119 |
+
"Dingbats": range(9984, 10176),
|
| 120 |
+
"Miscellaneous Mathematical Symbols-A": range(10176, 10224),
|
| 121 |
+
"Supplemental Arrows-A": range(10224, 10240),
|
| 122 |
+
"Braille Patterns": range(10240, 10496),
|
| 123 |
+
"Supplemental Arrows-B": range(10496, 10624),
|
| 124 |
+
"Miscellaneous Mathematical Symbols-B": range(10624, 10752),
|
| 125 |
+
"Supplemental Mathematical Operators": range(10752, 11008),
|
| 126 |
+
"Miscellaneous Symbols and Arrows": range(11008, 11264),
|
| 127 |
+
"Glagolitic": range(11264, 11360),
|
| 128 |
+
"Latin Extended-C": range(11360, 11392),
|
| 129 |
+
"Coptic": range(11392, 11520),
|
| 130 |
+
"Georgian Supplement": range(11520, 11568),
|
| 131 |
+
"Tifinagh": range(11568, 11648),
|
| 132 |
+
"Ethiopic Extended": range(11648, 11744),
|
| 133 |
+
"Cyrillic Extended-A": range(11744, 11776),
|
| 134 |
+
"Supplemental Punctuation": range(11776, 11904),
|
| 135 |
+
"CJK Radicals Supplement": range(11904, 12032),
|
| 136 |
+
"Kangxi Radicals": range(12032, 12256),
|
| 137 |
+
"Ideographic Description Characters": range(12272, 12288),
|
| 138 |
+
"CJK Symbols and Punctuation": range(12288, 12352),
|
| 139 |
+
"Hiragana": range(12352, 12448),
|
| 140 |
+
"Katakana": range(12448, 12544),
|
| 141 |
+
"Bopomofo": range(12544, 12592),
|
| 142 |
+
"Hangul Compatibility Jamo": range(12592, 12688),
|
| 143 |
+
"Kanbun": range(12688, 12704),
|
| 144 |
+
"Bopomofo Extended": range(12704, 12736),
|
| 145 |
+
"CJK Strokes": range(12736, 12784),
|
| 146 |
+
"Katakana Phonetic Extensions": range(12784, 12800),
|
| 147 |
+
"Enclosed CJK Letters and Months": range(12800, 13056),
|
| 148 |
+
"CJK Compatibility": range(13056, 13312),
|
| 149 |
+
"CJK Unified Ideographs Extension A": range(13312, 19904),
|
| 150 |
+
"Yijing Hexagram Symbols": range(19904, 19968),
|
| 151 |
+
"CJK Unified Ideographs": range(19968, 40960),
|
| 152 |
+
"Yi Syllables": range(40960, 42128),
|
| 153 |
+
"Yi Radicals": range(42128, 42192),
|
| 154 |
+
"Lisu": range(42192, 42240),
|
| 155 |
+
"Vai": range(42240, 42560),
|
| 156 |
+
"Cyrillic Extended-B": range(42560, 42656),
|
| 157 |
+
"Bamum": range(42656, 42752),
|
| 158 |
+
"Modifier Tone Letters": range(42752, 42784),
|
| 159 |
+
"Latin Extended-D": range(42784, 43008),
|
| 160 |
+
"Syloti Nagri": range(43008, 43056),
|
| 161 |
+
"Common Indic Number Forms": range(43056, 43072),
|
| 162 |
+
"Phags-pa": range(43072, 43136),
|
| 163 |
+
"Saurashtra": range(43136, 43232),
|
| 164 |
+
"Devanagari Extended": range(43232, 43264),
|
| 165 |
+
"Kayah Li": range(43264, 43312),
|
| 166 |
+
"Rejang": range(43312, 43360),
|
| 167 |
+
"Hangul Jamo Extended-A": range(43360, 43392),
|
| 168 |
+
"Javanese": range(43392, 43488),
|
| 169 |
+
"Myanmar Extended-B": range(43488, 43520),
|
| 170 |
+
"Cham": range(43520, 43616),
|
| 171 |
+
"Myanmar Extended-A": range(43616, 43648),
|
| 172 |
+
"Tai Viet": range(43648, 43744),
|
| 173 |
+
"Meetei Mayek Extensions": range(43744, 43776),
|
| 174 |
+
"Ethiopic Extended-A": range(43776, 43824),
|
| 175 |
+
"Latin Extended-E": range(43824, 43888),
|
| 176 |
+
"Cherokee Supplement": range(43888, 43968),
|
| 177 |
+
"Meetei Mayek": range(43968, 44032),
|
| 178 |
+
"Hangul Syllables": range(44032, 55216),
|
| 179 |
+
"Hangul Jamo Extended-B": range(55216, 55296),
|
| 180 |
+
"High Surrogates": range(55296, 56192),
|
| 181 |
+
"High Private Use Surrogates": range(56192, 56320),
|
| 182 |
+
"Low Surrogates": range(56320, 57344),
|
| 183 |
+
"Private Use Area": range(57344, 63744),
|
| 184 |
+
"CJK Compatibility Ideographs": range(63744, 64256),
|
| 185 |
+
"Alphabetic Presentation Forms": range(64256, 64336),
|
| 186 |
+
"Arabic Presentation Forms-A": range(64336, 65024),
|
| 187 |
+
"Variation Selectors": range(65024, 65040),
|
| 188 |
+
"Vertical Forms": range(65040, 65056),
|
| 189 |
+
"Combining Half Marks": range(65056, 65072),
|
| 190 |
+
"CJK Compatibility Forms": range(65072, 65104),
|
| 191 |
+
"Small Form Variants": range(65104, 65136),
|
| 192 |
+
"Arabic Presentation Forms-B": range(65136, 65280),
|
| 193 |
+
"Halfwidth and Fullwidth Forms": range(65280, 65520),
|
| 194 |
+
"Specials": range(65520, 65536),
|
| 195 |
+
"Linear B Syllabary": range(65536, 65664),
|
| 196 |
+
"Linear B Ideograms": range(65664, 65792),
|
| 197 |
+
"Aegean Numbers": range(65792, 65856),
|
| 198 |
+
"Ancient Greek Numbers": range(65856, 65936),
|
| 199 |
+
"Ancient Symbols": range(65936, 66000),
|
| 200 |
+
"Phaistos Disc": range(66000, 66048),
|
| 201 |
+
"Lycian": range(66176, 66208),
|
| 202 |
+
"Carian": range(66208, 66272),
|
| 203 |
+
"Coptic Epact Numbers": range(66272, 66304),
|
| 204 |
+
"Old Italic": range(66304, 66352),
|
| 205 |
+
"Gothic": range(66352, 66384),
|
| 206 |
+
"Old Permic": range(66384, 66432),
|
| 207 |
+
"Ugaritic": range(66432, 66464),
|
| 208 |
+
"Old Persian": range(66464, 66528),
|
| 209 |
+
"Deseret": range(66560, 66640),
|
| 210 |
+
"Shavian": range(66640, 66688),
|
| 211 |
+
"Osmanya": range(66688, 66736),
|
| 212 |
+
"Osage": range(66736, 66816),
|
| 213 |
+
"Elbasan": range(66816, 66864),
|
| 214 |
+
"Caucasian Albanian": range(66864, 66928),
|
| 215 |
+
"Vithkuqi": range(66928, 67008),
|
| 216 |
+
"Linear A": range(67072, 67456),
|
| 217 |
+
"Latin Extended-F": range(67456, 67520),
|
| 218 |
+
"Cypriot Syllabary": range(67584, 67648),
|
| 219 |
+
"Imperial Aramaic": range(67648, 67680),
|
| 220 |
+
"Palmyrene": range(67680, 67712),
|
| 221 |
+
"Nabataean": range(67712, 67760),
|
| 222 |
+
"Hatran": range(67808, 67840),
|
| 223 |
+
"Phoenician": range(67840, 67872),
|
| 224 |
+
"Lydian": range(67872, 67904),
|
| 225 |
+
"Meroitic Hieroglyphs": range(67968, 68000),
|
| 226 |
+
"Meroitic Cursive": range(68000, 68096),
|
| 227 |
+
"Kharoshthi": range(68096, 68192),
|
| 228 |
+
"Old South Arabian": range(68192, 68224),
|
| 229 |
+
"Old North Arabian": range(68224, 68256),
|
| 230 |
+
"Manichaean": range(68288, 68352),
|
| 231 |
+
"Avestan": range(68352, 68416),
|
| 232 |
+
"Inscriptional Parthian": range(68416, 68448),
|
| 233 |
+
"Inscriptional Pahlavi": range(68448, 68480),
|
| 234 |
+
"Psalter Pahlavi": range(68480, 68528),
|
| 235 |
+
"Old Turkic": range(68608, 68688),
|
| 236 |
+
"Old Hungarian": range(68736, 68864),
|
| 237 |
+
"Hanifi Rohingya": range(68864, 68928),
|
| 238 |
+
"Rumi Numeral Symbols": range(69216, 69248),
|
| 239 |
+
"Yezidi": range(69248, 69312),
|
| 240 |
+
"Arabic Extended-C": range(69312, 69376),
|
| 241 |
+
"Old Sogdian": range(69376, 69424),
|
| 242 |
+
"Sogdian": range(69424, 69488),
|
| 243 |
+
"Old Uyghur": range(69488, 69552),
|
| 244 |
+
"Chorasmian": range(69552, 69600),
|
| 245 |
+
"Elymaic": range(69600, 69632),
|
| 246 |
+
"Brahmi": range(69632, 69760),
|
| 247 |
+
"Kaithi": range(69760, 69840),
|
| 248 |
+
"Sora Sompeng": range(69840, 69888),
|
| 249 |
+
"Chakma": range(69888, 69968),
|
| 250 |
+
"Mahajani": range(69968, 70016),
|
| 251 |
+
"Sharada": range(70016, 70112),
|
| 252 |
+
"Sinhala Archaic Numbers": range(70112, 70144),
|
| 253 |
+
"Khojki": range(70144, 70224),
|
| 254 |
+
"Multani": range(70272, 70320),
|
| 255 |
+
"Khudawadi": range(70320, 70400),
|
| 256 |
+
"Grantha": range(70400, 70528),
|
| 257 |
+
"Newa": range(70656, 70784),
|
| 258 |
+
"Tirhuta": range(70784, 70880),
|
| 259 |
+
"Siddham": range(71040, 71168),
|
| 260 |
+
"Modi": range(71168, 71264),
|
| 261 |
+
"Mongolian Supplement": range(71264, 71296),
|
| 262 |
+
"Takri": range(71296, 71376),
|
| 263 |
+
"Ahom": range(71424, 71504),
|
| 264 |
+
"Dogra": range(71680, 71760),
|
| 265 |
+
"Warang Citi": range(71840, 71936),
|
| 266 |
+
"Dives Akuru": range(71936, 72032),
|
| 267 |
+
"Nandinagari": range(72096, 72192),
|
| 268 |
+
"Zanabazar Square": range(72192, 72272),
|
| 269 |
+
"Soyombo": range(72272, 72368),
|
| 270 |
+
"Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
|
| 271 |
+
"Pau Cin Hau": range(72384, 72448),
|
| 272 |
+
"Devanagari Extended-A": range(72448, 72544),
|
| 273 |
+
"Bhaiksuki": range(72704, 72816),
|
| 274 |
+
"Marchen": range(72816, 72896),
|
| 275 |
+
"Masaram Gondi": range(72960, 73056),
|
| 276 |
+
"Gunjala Gondi": range(73056, 73136),
|
| 277 |
+
"Makasar": range(73440, 73472),
|
| 278 |
+
"Kawi": range(73472, 73568),
|
| 279 |
+
"Lisu Supplement": range(73648, 73664),
|
| 280 |
+
"Tamil Supplement": range(73664, 73728),
|
| 281 |
+
"Cuneiform": range(73728, 74752),
|
| 282 |
+
"Cuneiform Numbers and Punctuation": range(74752, 74880),
|
| 283 |
+
"Early Dynastic Cuneiform": range(74880, 75088),
|
| 284 |
+
"Cypro-Minoan": range(77712, 77824),
|
| 285 |
+
"Egyptian Hieroglyphs": range(77824, 78896),
|
| 286 |
+
"Egyptian Hieroglyph Format Controls": range(78896, 78944),
|
| 287 |
+
"Anatolian Hieroglyphs": range(82944, 83584),
|
| 288 |
+
"Bamum Supplement": range(92160, 92736),
|
| 289 |
+
"Mro": range(92736, 92784),
|
| 290 |
+
"Tangsa": range(92784, 92880),
|
| 291 |
+
"Bassa Vah": range(92880, 92928),
|
| 292 |
+
"Pahawh Hmong": range(92928, 93072),
|
| 293 |
+
"Medefaidrin": range(93760, 93856),
|
| 294 |
+
"Miao": range(93952, 94112),
|
| 295 |
+
"Ideographic Symbols and Punctuation": range(94176, 94208),
|
| 296 |
+
"Tangut": range(94208, 100352),
|
| 297 |
+
"Tangut Components": range(100352, 101120),
|
| 298 |
+
"Khitan Small Script": range(101120, 101632),
|
| 299 |
+
"Tangut Supplement": range(101632, 101760),
|
| 300 |
+
"Kana Extended-B": range(110576, 110592),
|
| 301 |
+
"Kana Supplement": range(110592, 110848),
|
| 302 |
+
"Kana Extended-A": range(110848, 110896),
|
| 303 |
+
"Small Kana Extension": range(110896, 110960),
|
| 304 |
+
"Nushu": range(110960, 111360),
|
| 305 |
+
"Duployan": range(113664, 113824),
|
| 306 |
+
"Shorthand Format Controls": range(113824, 113840),
|
| 307 |
+
"Znamenny Musical Notation": range(118528, 118736),
|
| 308 |
+
"Byzantine Musical Symbols": range(118784, 119040),
|
| 309 |
+
"Musical Symbols": range(119040, 119296),
|
| 310 |
+
"Ancient Greek Musical Notation": range(119296, 119376),
|
| 311 |
+
"Kaktovik Numerals": range(119488, 119520),
|
| 312 |
+
"Mayan Numerals": range(119520, 119552),
|
| 313 |
+
"Tai Xuan Jing Symbols": range(119552, 119648),
|
| 314 |
+
"Counting Rod Numerals": range(119648, 119680),
|
| 315 |
+
"Mathematical Alphanumeric Symbols": range(119808, 120832),
|
| 316 |
+
"Sutton SignWriting": range(120832, 121520),
|
| 317 |
+
"Latin Extended-G": range(122624, 122880),
|
| 318 |
+
"Glagolitic Supplement": range(122880, 122928),
|
| 319 |
+
"Cyrillic Extended-D": range(122928, 123024),
|
| 320 |
+
"Nyiakeng Puachue Hmong": range(123136, 123216),
|
| 321 |
+
"Toto": range(123536, 123584),
|
| 322 |
+
"Wancho": range(123584, 123648),
|
| 323 |
+
"Nag Mundari": range(124112, 124160),
|
| 324 |
+
"Ethiopic Extended-B": range(124896, 124928),
|
| 325 |
+
"Mende Kikakui": range(124928, 125152),
|
| 326 |
+
"Adlam": range(125184, 125280),
|
| 327 |
+
"Indic Siyaq Numbers": range(126064, 126144),
|
| 328 |
+
"Ottoman Siyaq Numbers": range(126208, 126288),
|
| 329 |
+
"Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
|
| 330 |
+
"Mahjong Tiles": range(126976, 127024),
|
| 331 |
+
"Domino Tiles": range(127024, 127136),
|
| 332 |
+
"Playing Cards": range(127136, 127232),
|
| 333 |
+
"Enclosed Alphanumeric Supplement": range(127232, 127488),
|
| 334 |
+
"Enclosed Ideographic Supplement": range(127488, 127744),
|
| 335 |
+
"Miscellaneous Symbols and Pictographs": range(127744, 128512),
|
| 336 |
+
"Emoticons range(Emoji)": range(128512, 128592),
|
| 337 |
+
"Ornamental Dingbats": range(128592, 128640),
|
| 338 |
+
"Transport and Map Symbols": range(128640, 128768),
|
| 339 |
+
"Alchemical Symbols": range(128768, 128896),
|
| 340 |
+
"Geometric Shapes Extended": range(128896, 129024),
|
| 341 |
+
"Supplemental Arrows-C": range(129024, 129280),
|
| 342 |
+
"Supplemental Symbols and Pictographs": range(129280, 129536),
|
| 343 |
+
"Chess Symbols": range(129536, 129648),
|
| 344 |
+
"Symbols and Pictographs Extended-A": range(129648, 129792),
|
| 345 |
+
"Symbols for Legacy Computing": range(129792, 130048),
|
| 346 |
+
"CJK Unified Ideographs Extension B": range(131072, 173792),
|
| 347 |
+
"CJK Unified Ideographs Extension C": range(173824, 177984),
|
| 348 |
+
"CJK Unified Ideographs Extension D": range(177984, 178208),
|
| 349 |
+
"CJK Unified Ideographs Extension E": range(178208, 183984),
|
| 350 |
+
"CJK Unified Ideographs Extension F": range(183984, 191472),
|
| 351 |
+
"CJK Compatibility Ideographs Supplement": range(194560, 195104),
|
| 352 |
+
"CJK Unified Ideographs Extension G": range(196608, 201552),
|
| 353 |
+
"CJK Unified Ideographs Extension H": range(201552, 205744),
|
| 354 |
+
"Tags": range(917504, 917632),
|
| 355 |
+
"Variation Selectors Supplement": range(917760, 918000),
|
| 356 |
+
"Supplementary Private Use Area-A": range(983040, 1048576),
|
| 357 |
+
"Supplementary Private Use Area-B": range(1048576, 1114112),
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
|
| 362 |
+
"Supplement",
|
| 363 |
+
"Extended",
|
| 364 |
+
"Extensions",
|
| 365 |
+
"Modifier",
|
| 366 |
+
"Marks",
|
| 367 |
+
"Punctuation",
|
| 368 |
+
"Symbols",
|
| 369 |
+
"Forms",
|
| 370 |
+
"Operators",
|
| 371 |
+
"Miscellaneous",
|
| 372 |
+
"Drawing",
|
| 373 |
+
"Block",
|
| 374 |
+
"Shapes",
|
| 375 |
+
"Supplemental",
|
| 376 |
+
"Tags",
|
| 377 |
+
]
|
| 378 |
+
|
| 379 |
+
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
| 380 |
+
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
| 381 |
+
IGNORECASE,
|
| 382 |
+
)
|
| 383 |
+
|
| 384 |
+
IANA_NO_ALIASES = [
|
| 385 |
+
"cp720",
|
| 386 |
+
"cp737",
|
| 387 |
+
"cp856",
|
| 388 |
+
"cp874",
|
| 389 |
+
"cp875",
|
| 390 |
+
"cp1006",
|
| 391 |
+
"koi8_r",
|
| 392 |
+
"koi8_t",
|
| 393 |
+
"koi8_u",
|
| 394 |
+
]
|
| 395 |
+
|
| 396 |
+
IANA_SUPPORTED: list[str] = sorted(
|
| 397 |
+
filter(
|
| 398 |
+
lambda x: x.endswith("_codec") is False
|
| 399 |
+
and x not in {"rot_13", "tactis", "mbcs"},
|
| 400 |
+
list(set(aliases.values())) + IANA_NO_ALIASES,
|
| 401 |
+
)
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
| 405 |
+
|
| 406 |
+
# Pre-computed table of code pages that are similar to one another, as determined by the function cp_similarity.
|
| 407 |
+
IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
|
| 408 |
+
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
| 409 |
+
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
| 410 |
+
"cp1125": ["cp866"],
|
| 411 |
+
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
|
| 412 |
+
"cp1250": ["iso8859_2"],
|
| 413 |
+
"cp1251": ["kz1048", "ptcp154"],
|
| 414 |
+
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
|
| 415 |
+
"cp1253": ["iso8859_7"],
|
| 416 |
+
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
|
| 417 |
+
"cp1257": ["iso8859_13"],
|
| 418 |
+
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
|
| 419 |
+
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
|
| 420 |
+
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
|
| 421 |
+
"cp850": ["cp437", "cp857", "cp858", "cp865"],
|
| 422 |
+
"cp857": ["cp850", "cp858", "cp865"],
|
| 423 |
+
"cp858": ["cp437", "cp850", "cp857", "cp865"],
|
| 424 |
+
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
|
| 425 |
+
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
|
| 426 |
+
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
|
| 427 |
+
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
|
| 428 |
+
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
|
| 429 |
+
"cp866": ["cp1125"],
|
| 430 |
+
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
|
| 431 |
+
"iso8859_11": ["tis_620"],
|
| 432 |
+
"iso8859_13": ["cp1257"],
|
| 433 |
+
"iso8859_14": [
|
| 434 |
+
"iso8859_10",
|
| 435 |
+
"iso8859_15",
|
| 436 |
+
"iso8859_16",
|
| 437 |
+
"iso8859_3",
|
| 438 |
+
"iso8859_9",
|
| 439 |
+
"latin_1",
|
| 440 |
+
],
|
| 441 |
+
"iso8859_15": [
|
| 442 |
+
"cp1252",
|
| 443 |
+
"cp1254",
|
| 444 |
+
"iso8859_10",
|
| 445 |
+
"iso8859_14",
|
| 446 |
+
"iso8859_16",
|
| 447 |
+
"iso8859_3",
|
| 448 |
+
"iso8859_9",
|
| 449 |
+
"latin_1",
|
| 450 |
+
],
|
| 451 |
+
"iso8859_16": [
|
| 452 |
+
"iso8859_14",
|
| 453 |
+
"iso8859_15",
|
| 454 |
+
"iso8859_2",
|
| 455 |
+
"iso8859_3",
|
| 456 |
+
"iso8859_9",
|
| 457 |
+
"latin_1",
|
| 458 |
+
],
|
| 459 |
+
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
|
| 460 |
+
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
|
| 461 |
+
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
|
| 462 |
+
"iso8859_7": ["cp1253"],
|
| 463 |
+
"iso8859_9": [
|
| 464 |
+
"cp1252",
|
| 465 |
+
"cp1254",
|
| 466 |
+
"cp1258",
|
| 467 |
+
"iso8859_10",
|
| 468 |
+
"iso8859_14",
|
| 469 |
+
"iso8859_15",
|
| 470 |
+
"iso8859_16",
|
| 471 |
+
"iso8859_3",
|
| 472 |
+
"iso8859_4",
|
| 473 |
+
"latin_1",
|
| 474 |
+
],
|
| 475 |
+
"kz1048": ["cp1251", "ptcp154"],
|
| 476 |
+
"latin_1": [
|
| 477 |
+
"cp1252",
|
| 478 |
+
"cp1254",
|
| 479 |
+
"cp1258",
|
| 480 |
+
"iso8859_10",
|
| 481 |
+
"iso8859_14",
|
| 482 |
+
"iso8859_15",
|
| 483 |
+
"iso8859_16",
|
| 484 |
+
"iso8859_3",
|
| 485 |
+
"iso8859_4",
|
| 486 |
+
"iso8859_9",
|
| 487 |
+
],
|
| 488 |
+
"mac_iceland": ["mac_roman", "mac_turkish"],
|
| 489 |
+
"mac_roman": ["mac_iceland", "mac_turkish"],
|
| 490 |
+
"mac_turkish": ["mac_iceland", "mac_roman"],
|
| 491 |
+
"ptcp154": ["cp1251", "kz1048"],
|
| 492 |
+
"tis_620": ["iso8859_11"],
|
| 493 |
+
}
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
CHARDET_CORRESPONDENCE: dict[str, str] = {
|
| 497 |
+
"iso2022_kr": "ISO-2022-KR",
|
| 498 |
+
"iso2022_jp": "ISO-2022-JP",
|
| 499 |
+
"euc_kr": "EUC-KR",
|
| 500 |
+
"tis_620": "TIS-620",
|
| 501 |
+
"utf_32": "UTF-32",
|
| 502 |
+
"euc_jp": "EUC-JP",
|
| 503 |
+
"koi8_r": "KOI8-R",
|
| 504 |
+
"iso8859_1": "ISO-8859-1",
|
| 505 |
+
"iso8859_2": "ISO-8859-2",
|
| 506 |
+
"iso8859_5": "ISO-8859-5",
|
| 507 |
+
"iso8859_6": "ISO-8859-6",
|
| 508 |
+
"iso8859_7": "ISO-8859-7",
|
| 509 |
+
"iso8859_8": "ISO-8859-8",
|
| 510 |
+
"utf_16": "UTF-16",
|
| 511 |
+
"cp855": "IBM855",
|
| 512 |
+
"mac_cyrillic": "MacCyrillic",
|
| 513 |
+
"gb2312": "GB2312",
|
| 514 |
+
"gb18030": "GB18030",
|
| 515 |
+
"cp932": "CP932",
|
| 516 |
+
"cp866": "IBM866",
|
| 517 |
+
"utf_8": "utf-8",
|
| 518 |
+
"utf_8_sig": "UTF-8-SIG",
|
| 519 |
+
"shift_jis": "SHIFT_JIS",
|
| 520 |
+
"big5": "Big5",
|
| 521 |
+
"cp1250": "windows-1250",
|
| 522 |
+
"cp1251": "windows-1251",
|
| 523 |
+
"cp1252": "Windows-1252",
|
| 524 |
+
"cp1253": "windows-1253",
|
| 525 |
+
"cp1255": "windows-1255",
|
| 526 |
+
"cp1256": "windows-1256",
|
| 527 |
+
"cp1254": "Windows-1254",
|
| 528 |
+
"cp949": "CP949",
|
| 529 |
+
}
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
|
| 533 |
+
"<",
|
| 534 |
+
">",
|
| 535 |
+
"=",
|
| 536 |
+
":",
|
| 537 |
+
"/",
|
| 538 |
+
"&",
|
| 539 |
+
";",
|
| 540 |
+
"{",
|
| 541 |
+
"}",
|
| 542 |
+
"[",
|
| 543 |
+
"]",
|
| 544 |
+
",",
|
| 545 |
+
"|",
|
| 546 |
+
'"',
|
| 547 |
+
"-",
|
| 548 |
+
"(",
|
| 549 |
+
")",
|
| 550 |
+
}
|
| 551 |
+
|
| 552 |
+
# Sample character sets — replace with full lists if needed
|
| 553 |
+
COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"
|
| 554 |
+
|
| 555 |
+
COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"
|
| 556 |
+
|
| 557 |
+
COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"
|
| 558 |
+
|
| 559 |
+
# Combine all into a set
|
| 560 |
+
COMMON_CJK_CHARACTERS = set(
|
| 561 |
+
"".join(
|
| 562 |
+
[
|
| 563 |
+
COMMON_CHINESE_CHARACTERS,
|
| 564 |
+
COMMON_JAPANESE_CHARACTERS,
|
| 565 |
+
COMMON_KOREAN_CHARACTERS,
|
| 566 |
+
]
|
| 567 |
+
)
|
| 568 |
+
)
|
| 569 |
+
|
| 570 |
+
KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
|
| 571 |
+
ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
| 572 |
+
|
| 573 |
+
# Logging LEVEL below DEBUG
|
| 574 |
+
TRACE: int = 5
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
# Language labels that contain the em dash "—"
|
| 578 |
+
# character are to be treated as alternative sequences for the original language (e.g. "English—" for "English")
|
| 579 |
+
FREQUENCIES: dict[str, list[str]] = {
|
| 580 |
+
"English": [
|
| 581 |
+
"e",
|
| 582 |
+
"a",
|
| 583 |
+
"t",
|
| 584 |
+
"i",
|
| 585 |
+
"o",
|
| 586 |
+
"n",
|
| 587 |
+
"s",
|
| 588 |
+
"r",
|
| 589 |
+
"h",
|
| 590 |
+
"l",
|
| 591 |
+
"d",
|
| 592 |
+
"c",
|
| 593 |
+
"u",
|
| 594 |
+
"m",
|
| 595 |
+
"f",
|
| 596 |
+
"p",
|
| 597 |
+
"g",
|
| 598 |
+
"w",
|
| 599 |
+
"y",
|
| 600 |
+
"b",
|
| 601 |
+
"v",
|
| 602 |
+
"k",
|
| 603 |
+
"x",
|
| 604 |
+
"j",
|
| 605 |
+
"z",
|
| 606 |
+
"q",
|
| 607 |
+
],
|
| 608 |
+
"English—": [
|
| 609 |
+
"e",
|
| 610 |
+
"a",
|
| 611 |
+
"t",
|
| 612 |
+
"i",
|
| 613 |
+
"o",
|
| 614 |
+
"n",
|
| 615 |
+
"s",
|
| 616 |
+
"r",
|
| 617 |
+
"h",
|
| 618 |
+
"l",
|
| 619 |
+
"d",
|
| 620 |
+
"c",
|
| 621 |
+
"m",
|
| 622 |
+
"u",
|
| 623 |
+
"f",
|
| 624 |
+
"p",
|
| 625 |
+
"g",
|
| 626 |
+
"w",
|
| 627 |
+
"b",
|
| 628 |
+
"y",
|
| 629 |
+
"v",
|
| 630 |
+
"k",
|
| 631 |
+
"j",
|
| 632 |
+
"x",
|
| 633 |
+
"z",
|
| 634 |
+
"q",
|
| 635 |
+
],
|
| 636 |
+
"German": [
|
| 637 |
+
"e",
|
| 638 |
+
"n",
|
| 639 |
+
"i",
|
| 640 |
+
"r",
|
| 641 |
+
"s",
|
| 642 |
+
"t",
|
| 643 |
+
"a",
|
| 644 |
+
"d",
|
| 645 |
+
"h",
|
| 646 |
+
"u",
|
| 647 |
+
"l",
|
| 648 |
+
"g",
|
| 649 |
+
"o",
|
| 650 |
+
"c",
|
| 651 |
+
"m",
|
| 652 |
+
"b",
|
| 653 |
+
"f",
|
| 654 |
+
"k",
|
| 655 |
+
"w",
|
| 656 |
+
"z",
|
| 657 |
+
"p",
|
| 658 |
+
"v",
|
| 659 |
+
"ü",
|
| 660 |
+
"ä",
|
| 661 |
+
"ö",
|
| 662 |
+
"j",
|
| 663 |
+
],
|
| 664 |
+
"French": [
|
| 665 |
+
"e",
|
| 666 |
+
"a",
|
| 667 |
+
"s",
|
| 668 |
+
"n",
|
| 669 |
+
"i",
|
| 670 |
+
"t",
|
| 671 |
+
"r",
|
| 672 |
+
"l",
|
| 673 |
+
"u",
|
| 674 |
+
"o",
|
| 675 |
+
"d",
|
| 676 |
+
"c",
|
| 677 |
+
"p",
|
| 678 |
+
"m",
|
| 679 |
+
"é",
|
| 680 |
+
"v",
|
| 681 |
+
"g",
|
| 682 |
+
"f",
|
| 683 |
+
"b",
|
| 684 |
+
"h",
|
| 685 |
+
"q",
|
| 686 |
+
"à",
|
| 687 |
+
"x",
|
| 688 |
+
"è",
|
| 689 |
+
"y",
|
| 690 |
+
"j",
|
| 691 |
+
],
|
| 692 |
+
"Dutch": [
|
| 693 |
+
"e",
|
| 694 |
+
"n",
|
| 695 |
+
"a",
|
| 696 |
+
"i",
|
| 697 |
+
"r",
|
| 698 |
+
"t",
|
| 699 |
+
"o",
|
| 700 |
+
"d",
|
| 701 |
+
"s",
|
| 702 |
+
"l",
|
| 703 |
+
"g",
|
| 704 |
+
"h",
|
| 705 |
+
"v",
|
| 706 |
+
"m",
|
| 707 |
+
"u",
|
| 708 |
+
"k",
|
| 709 |
+
"c",
|
| 710 |
+
"p",
|
| 711 |
+
"b",
|
| 712 |
+
"w",
|
| 713 |
+
"j",
|
| 714 |
+
"z",
|
| 715 |
+
"f",
|
| 716 |
+
"y",
|
| 717 |
+
"x",
|
| 718 |
+
"ë",
|
| 719 |
+
],
|
| 720 |
+
"Italian": [
|
| 721 |
+
"e",
|
| 722 |
+
"i",
|
| 723 |
+
"a",
|
| 724 |
+
"o",
|
| 725 |
+
"n",
|
| 726 |
+
"l",
|
| 727 |
+
"t",
|
| 728 |
+
"r",
|
| 729 |
+
"s",
|
| 730 |
+
"c",
|
| 731 |
+
"d",
|
| 732 |
+
"u",
|
| 733 |
+
"p",
|
| 734 |
+
"m",
|
| 735 |
+
"g",
|
| 736 |
+
"v",
|
| 737 |
+
"f",
|
| 738 |
+
"b",
|
| 739 |
+
"z",
|
| 740 |
+
"h",
|
| 741 |
+
"q",
|
| 742 |
+
"è",
|
| 743 |
+
"à",
|
| 744 |
+
"k",
|
| 745 |
+
"y",
|
| 746 |
+
"ò",
|
| 747 |
+
],
|
| 748 |
+
"Polish": [
|
| 749 |
+
"a",
|
| 750 |
+
"i",
|
| 751 |
+
"o",
|
| 752 |
+
"e",
|
| 753 |
+
"n",
|
| 754 |
+
"r",
|
| 755 |
+
"z",
|
| 756 |
+
"w",
|
| 757 |
+
"s",
|
| 758 |
+
"c",
|
| 759 |
+
"t",
|
| 760 |
+
"k",
|
| 761 |
+
"y",
|
| 762 |
+
"d",
|
| 763 |
+
"p",
|
| 764 |
+
"m",
|
| 765 |
+
"u",
|
| 766 |
+
"l",
|
| 767 |
+
"j",
|
| 768 |
+
"ł",
|
| 769 |
+
"g",
|
| 770 |
+
"b",
|
| 771 |
+
"h",
|
| 772 |
+
"ą",
|
| 773 |
+
"ę",
|
| 774 |
+
"ó",
|
| 775 |
+
],
|
| 776 |
+
"Spanish": [
|
| 777 |
+
"e",
|
| 778 |
+
"a",
|
| 779 |
+
"o",
|
| 780 |
+
"n",
|
| 781 |
+
"s",
|
| 782 |
+
"r",
|
| 783 |
+
"i",
|
| 784 |
+
"l",
|
| 785 |
+
"d",
|
| 786 |
+
"t",
|
| 787 |
+
"c",
|
| 788 |
+
"u",
|
| 789 |
+
"m",
|
| 790 |
+
"p",
|
| 791 |
+
"b",
|
| 792 |
+
"g",
|
| 793 |
+
"v",
|
| 794 |
+
"f",
|
| 795 |
+
"y",
|
| 796 |
+
"ó",
|
| 797 |
+
"h",
|
| 798 |
+
"q",
|
| 799 |
+
"í",
|
| 800 |
+
"j",
|
| 801 |
+
"z",
|
| 802 |
+
"á",
|
| 803 |
+
],
|
| 804 |
+
"Russian": [
|
| 805 |
+
"о",
|
| 806 |
+
"а",
|
| 807 |
+
"е",
|
| 808 |
+
"и",
|
| 809 |
+
"н",
|
| 810 |
+
"с",
|
| 811 |
+
"т",
|
| 812 |
+
"р",
|
| 813 |
+
"в",
|
| 814 |
+
"л",
|
| 815 |
+
"к",
|
| 816 |
+
"м",
|
| 817 |
+
"д",
|
| 818 |
+
"п",
|
| 819 |
+
"у",
|
| 820 |
+
"г",
|
| 821 |
+
"я",
|
| 822 |
+
"ы",
|
| 823 |
+
"з",
|
| 824 |
+
"б",
|
| 825 |
+
"й",
|
| 826 |
+
"ь",
|
| 827 |
+
"ч",
|
| 828 |
+
"х",
|
| 829 |
+
"ж",
|
| 830 |
+
"ц",
|
| 831 |
+
],
|
| 832 |
+
# Jap-Kanji
|
| 833 |
+
"Japanese": [
|
| 834 |
+
"人",
|
| 835 |
+
"一",
|
| 836 |
+
"大",
|
| 837 |
+
"亅",
|
| 838 |
+
"丁",
|
| 839 |
+
"丨",
|
| 840 |
+
"竹",
|
| 841 |
+
"笑",
|
| 842 |
+
"口",
|
| 843 |
+
"日",
|
| 844 |
+
"今",
|
| 845 |
+
"二",
|
| 846 |
+
"彳",
|
| 847 |
+
"行",
|
| 848 |
+
"十",
|
| 849 |
+
"土",
|
| 850 |
+
"丶",
|
| 851 |
+
"寸",
|
| 852 |
+
"寺",
|
| 853 |
+
"時",
|
| 854 |
+
"乙",
|
| 855 |
+
"丿",
|
| 856 |
+
"乂",
|
| 857 |
+
"气",
|
| 858 |
+
"気",
|
| 859 |
+
"冂",
|
| 860 |
+
"巾",
|
| 861 |
+
"亠",
|
| 862 |
+
"市",
|
| 863 |
+
"目",
|
| 864 |
+
"儿",
|
| 865 |
+
"見",
|
| 866 |
+
"八",
|
| 867 |
+
"小",
|
| 868 |
+
"凵",
|
| 869 |
+
"県",
|
| 870 |
+
"月",
|
| 871 |
+
"彐",
|
| 872 |
+
"門",
|
| 873 |
+
"間",
|
| 874 |
+
"木",
|
| 875 |
+
"東",
|
| 876 |
+
"山",
|
| 877 |
+
"出",
|
| 878 |
+
"本",
|
| 879 |
+
"中",
|
| 880 |
+
"刀",
|
| 881 |
+
"分",
|
| 882 |
+
"耳",
|
| 883 |
+
"又",
|
| 884 |
+
"取",
|
| 885 |
+
"最",
|
| 886 |
+
"言",
|
| 887 |
+
"田",
|
| 888 |
+
"心",
|
| 889 |
+
"思",
|
| 890 |
+
"刂",
|
| 891 |
+
"前",
|
| 892 |
+
"京",
|
| 893 |
+
"尹",
|
| 894 |
+
"事",
|
| 895 |
+
"生",
|
| 896 |
+
"厶",
|
| 897 |
+
"云",
|
| 898 |
+
"会",
|
| 899 |
+
"未",
|
| 900 |
+
"来",
|
| 901 |
+
"白",
|
| 902 |
+
"冫",
|
| 903 |
+
"楽",
|
| 904 |
+
"灬",
|
| 905 |
+
"馬",
|
| 906 |
+
"尸",
|
| 907 |
+
"尺",
|
| 908 |
+
"駅",
|
| 909 |
+
"明",
|
| 910 |
+
"耂",
|
| 911 |
+
"者",
|
| 912 |
+
"了",
|
| 913 |
+
"阝",
|
| 914 |
+
"都",
|
| 915 |
+
"高",
|
| 916 |
+
"卜",
|
| 917 |
+
"占",
|
| 918 |
+
"厂",
|
| 919 |
+
"广",
|
| 920 |
+
"店",
|
| 921 |
+
"子",
|
| 922 |
+
"申",
|
| 923 |
+
"奄",
|
| 924 |
+
"亻",
|
| 925 |
+
"俺",
|
| 926 |
+
"上",
|
| 927 |
+
"方",
|
| 928 |
+
"冖",
|
| 929 |
+
"学",
|
| 930 |
+
"衣",
|
| 931 |
+
"艮",
|
| 932 |
+
"食",
|
| 933 |
+
"自",
|
| 934 |
+
],
|
| 935 |
+
# Jap-Katakana
|
| 936 |
+
"Japanese—": [
|
| 937 |
+
"ー",
|
| 938 |
+
"ン",
|
| 939 |
+
"ス",
|
| 940 |
+
"・",
|
| 941 |
+
"ル",
|
| 942 |
+
"ト",
|
| 943 |
+
"リ",
|
| 944 |
+
"イ",
|
| 945 |
+
"ア",
|
| 946 |
+
"ラ",
|
| 947 |
+
"ッ",
|
| 948 |
+
"ク",
|
| 949 |
+
"ド",
|
| 950 |
+
"シ",
|
| 951 |
+
"レ",
|
| 952 |
+
"ジ",
|
| 953 |
+
"タ",
|
| 954 |
+
"フ",
|
| 955 |
+
"ロ",
|
| 956 |
+
"カ",
|
| 957 |
+
"テ",
|
| 958 |
+
"マ",
|
| 959 |
+
"ィ",
|
| 960 |
+
"グ",
|
| 961 |
+
"バ",
|
| 962 |
+
"ム",
|
| 963 |
+
"プ",
|
| 964 |
+
"オ",
|
| 965 |
+
"コ",
|
| 966 |
+
"デ",
|
| 967 |
+
"ニ",
|
| 968 |
+
"ウ",
|
| 969 |
+
"メ",
|
| 970 |
+
"サ",
|
| 971 |
+
"ビ",
|
| 972 |
+
"ナ",
|
| 973 |
+
"ブ",
|
| 974 |
+
"ャ",
|
| 975 |
+
"エ",
|
| 976 |
+
"ュ",
|
| 977 |
+
"チ",
|
| 978 |
+
"キ",
|
| 979 |
+
"ズ",
|
| 980 |
+
"ダ",
|
| 981 |
+
"パ",
|
| 982 |
+
"ミ",
|
| 983 |
+
"ェ",
|
| 984 |
+
"ョ",
|
| 985 |
+
"ハ",
|
| 986 |
+
"セ",
|
| 987 |
+
"ベ",
|
| 988 |
+
"ガ",
|
| 989 |
+
"モ",
|
| 990 |
+
"ツ",
|
| 991 |
+
"ネ",
|
| 992 |
+
"ボ",
|
| 993 |
+
"ソ",
|
| 994 |
+
"ノ",
|
| 995 |
+
"ァ",
|
| 996 |
+
"ヴ",
|
| 997 |
+
"ワ",
|
| 998 |
+
"ポ",
|
| 999 |
+
"ペ",
|
| 1000 |
+
"ピ",
|
| 1001 |
+
"ケ",
|
| 1002 |
+
"ゴ",
|
| 1003 |
+
"ギ",
|
| 1004 |
+
"ザ",
|
| 1005 |
+
"ホ",
|
| 1006 |
+
"ゲ",
|
| 1007 |
+
"ォ",
|
| 1008 |
+
"ヤ",
|
| 1009 |
+
"ヒ",
|
| 1010 |
+
"ユ",
|
| 1011 |
+
"ヨ",
|
| 1012 |
+
"ヘ",
|
| 1013 |
+
"ゼ",
|
| 1014 |
+
"ヌ",
|
| 1015 |
+
"ゥ",
|
| 1016 |
+
"ゾ",
|
| 1017 |
+
"ヶ",
|
| 1018 |
+
"ヂ",
|
| 1019 |
+
"ヲ",
|
| 1020 |
+
"ヅ",
|
| 1021 |
+
"ヵ",
|
| 1022 |
+
"ヱ",
|
| 1023 |
+
"ヰ",
|
| 1024 |
+
"ヮ",
|
| 1025 |
+
"ヽ",
|
| 1026 |
+
"゠",
|
| 1027 |
+
"ヾ",
|
| 1028 |
+
"ヷ",
|
| 1029 |
+
"ヿ",
|
| 1030 |
+
"ヸ",
|
| 1031 |
+
"ヹ",
|
| 1032 |
+
"ヺ",
|
| 1033 |
+
],
|
| 1034 |
+
# Jap-Hiragana
|
| 1035 |
+
"Japanese——": [
|
| 1036 |
+
"の",
|
| 1037 |
+
"に",
|
| 1038 |
+
"る",
|
| 1039 |
+
"た",
|
| 1040 |
+
"と",
|
| 1041 |
+
"は",
|
| 1042 |
+
"し",
|
| 1043 |
+
"い",
|
| 1044 |
+
"を",
|
| 1045 |
+
"で",
|
| 1046 |
+
"て",
|
| 1047 |
+
"が",
|
| 1048 |
+
"な",
|
| 1049 |
+
"れ",
|
| 1050 |
+
"か",
|
| 1051 |
+
"ら",
|
| 1052 |
+
"さ",
|
| 1053 |
+
"っ",
|
| 1054 |
+
"り",
|
| 1055 |
+
"す",
|
| 1056 |
+
"あ",
|
| 1057 |
+
"も",
|
| 1058 |
+
"こ",
|
| 1059 |
+
"ま",
|
| 1060 |
+
"う",
|
| 1061 |
+
"く",
|
| 1062 |
+
"よ",
|
| 1063 |
+
"き",
|
| 1064 |
+
"ん",
|
| 1065 |
+
"め",
|
| 1066 |
+
"お",
|
| 1067 |
+
"け",
|
| 1068 |
+
"そ",
|
| 1069 |
+
"つ",
|
| 1070 |
+
"だ",
|
| 1071 |
+
"や",
|
| 1072 |
+
"え",
|
| 1073 |
+
"ど",
|
| 1074 |
+
"わ",
|
| 1075 |
+
"ち",
|
| 1076 |
+
"み",
|
| 1077 |
+
"せ",
|
| 1078 |
+
"じ",
|
| 1079 |
+
"ば",
|
| 1080 |
+
"へ",
|
| 1081 |
+
"び",
|
| 1082 |
+
"ず",
|
| 1083 |
+
"ろ",
|
| 1084 |
+
"ほ",
|
| 1085 |
+
"げ",
|
| 1086 |
+
"む",
|
| 1087 |
+
"べ",
|
| 1088 |
+
"ひ",
|
| 1089 |
+
"ょ",
|
| 1090 |
+
"ゆ",
|
| 1091 |
+
"ぶ",
|
| 1092 |
+
"ご",
|
| 1093 |
+
"ゃ",
|
| 1094 |
+
"ね",
|
| 1095 |
+
"ふ",
|
| 1096 |
+
"ぐ",
|
| 1097 |
+
"ぎ",
|
| 1098 |
+
"ぼ",
|
| 1099 |
+
"ゅ",
|
| 1100 |
+
"づ",
|
| 1101 |
+
"ざ",
|
| 1102 |
+
"ぞ",
|
| 1103 |
+
"ぬ",
|
| 1104 |
+
"ぜ",
|
| 1105 |
+
"ぱ",
|
| 1106 |
+
"ぽ",
|
| 1107 |
+
"ぷ",
|
| 1108 |
+
"ぴ",
|
| 1109 |
+
"ぃ",
|
| 1110 |
+
"ぁ",
|
| 1111 |
+
"ぇ",
|
| 1112 |
+
"ぺ",
|
| 1113 |
+
"ゞ",
|
| 1114 |
+
"ぢ",
|
| 1115 |
+
"ぉ",
|
| 1116 |
+
"ぅ",
|
| 1117 |
+
"ゐ",
|
| 1118 |
+
"ゝ",
|
| 1119 |
+
"ゑ",
|
| 1120 |
+
"゛",
|
| 1121 |
+
"゜",
|
| 1122 |
+
"ゎ",
|
| 1123 |
+
"ゔ",
|
| 1124 |
+
"゚",
|
| 1125 |
+
"ゟ",
|
| 1126 |
+
"゙",
|
| 1127 |
+
"ゕ",
|
| 1128 |
+
"ゖ",
|
| 1129 |
+
],
|
| 1130 |
+
"Portuguese": [
|
| 1131 |
+
"a",
|
| 1132 |
+
"e",
|
| 1133 |
+
"o",
|
| 1134 |
+
"s",
|
| 1135 |
+
"i",
|
| 1136 |
+
"r",
|
| 1137 |
+
"d",
|
| 1138 |
+
"n",
|
| 1139 |
+
"t",
|
| 1140 |
+
"m",
|
| 1141 |
+
"u",
|
| 1142 |
+
"c",
|
| 1143 |
+
"l",
|
| 1144 |
+
"p",
|
| 1145 |
+
"g",
|
| 1146 |
+
"v",
|
| 1147 |
+
"b",
|
| 1148 |
+
"f",
|
| 1149 |
+
"h",
|
| 1150 |
+
"ã",
|
| 1151 |
+
"q",
|
| 1152 |
+
"é",
|
| 1153 |
+
"ç",
|
| 1154 |
+
"á",
|
| 1155 |
+
"z",
|
| 1156 |
+
"í",
|
| 1157 |
+
],
|
| 1158 |
+
"Swedish": [
|
| 1159 |
+
"e",
|
| 1160 |
+
"a",
|
| 1161 |
+
"n",
|
| 1162 |
+
"r",
|
| 1163 |
+
"t",
|
| 1164 |
+
"s",
|
| 1165 |
+
"i",
|
| 1166 |
+
"l",
|
| 1167 |
+
"d",
|
| 1168 |
+
"o",
|
| 1169 |
+
"m",
|
| 1170 |
+
"k",
|
| 1171 |
+
"g",
|
| 1172 |
+
"v",
|
| 1173 |
+
"h",
|
| 1174 |
+
"f",
|
| 1175 |
+
"u",
|
| 1176 |
+
"p",
|
| 1177 |
+
"ä",
|
| 1178 |
+
"c",
|
| 1179 |
+
"b",
|
| 1180 |
+
"ö",
|
| 1181 |
+
"å",
|
| 1182 |
+
"y",
|
| 1183 |
+
"j",
|
| 1184 |
+
"x",
|
| 1185 |
+
],
|
| 1186 |
+
"Chinese": [
|
| 1187 |
+
"的",
|
| 1188 |
+
"一",
|
| 1189 |
+
"是",
|
| 1190 |
+
"不",
|
| 1191 |
+
"了",
|
| 1192 |
+
"在",
|
| 1193 |
+
"人",
|
| 1194 |
+
"有",
|
| 1195 |
+
"我",
|
| 1196 |
+
"他",
|
| 1197 |
+
"这",
|
| 1198 |
+
"个",
|
| 1199 |
+
"们",
|
| 1200 |
+
"中",
|
| 1201 |
+
"来",
|
| 1202 |
+
"上",
|
| 1203 |
+
"大",
|
| 1204 |
+
"为",
|
| 1205 |
+
"和",
|
| 1206 |
+
"国",
|
| 1207 |
+
"地",
|
| 1208 |
+
"到",
|
| 1209 |
+
"以",
|
| 1210 |
+
"说",
|
| 1211 |
+
"时",
|
| 1212 |
+
"要",
|
| 1213 |
+
"就",
|
| 1214 |
+
"出",
|
| 1215 |
+
"会",
|
| 1216 |
+
"可",
|
| 1217 |
+
"也",
|
| 1218 |
+
"你",
|
| 1219 |
+
"对",
|
| 1220 |
+
"生",
|
| 1221 |
+
"能",
|
| 1222 |
+
"而",
|
| 1223 |
+
"子",
|
| 1224 |
+
"那",
|
| 1225 |
+
"得",
|
| 1226 |
+
"于",
|
| 1227 |
+
"着",
|
| 1228 |
+
"下",
|
| 1229 |
+
"自",
|
| 1230 |
+
"之",
|
| 1231 |
+
"年",
|
| 1232 |
+
"过",
|
| 1233 |
+
"发",
|
| 1234 |
+
"后",
|
| 1235 |
+
"作",
|
| 1236 |
+
"里",
|
| 1237 |
+
"用",
|
| 1238 |
+
"道",
|
| 1239 |
+
"行",
|
| 1240 |
+
"所",
|
| 1241 |
+
"然",
|
| 1242 |
+
"家",
|
| 1243 |
+
"种",
|
| 1244 |
+
"事",
|
| 1245 |
+
"成",
|
| 1246 |
+
"方",
|
| 1247 |
+
"多",
|
| 1248 |
+
"经",
|
| 1249 |
+
"么",
|
| 1250 |
+
"去",
|
| 1251 |
+
"法",
|
| 1252 |
+
"学",
|
| 1253 |
+
"如",
|
| 1254 |
+
"都",
|
| 1255 |
+
"同",
|
| 1256 |
+
"现",
|
| 1257 |
+
"当",
|
| 1258 |
+
"没",
|
| 1259 |
+
"动",
|
| 1260 |
+
"面",
|
| 1261 |
+
"起",
|
| 1262 |
+
"看",
|
| 1263 |
+
"定",
|
| 1264 |
+
"天",
|
| 1265 |
+
"分",
|
| 1266 |
+
"还",
|
| 1267 |
+
"进",
|
| 1268 |
+
"好",
|
| 1269 |
+
"小",
|
| 1270 |
+
"部",
|
| 1271 |
+
"其",
|
| 1272 |
+
"些",
|
| 1273 |
+
"主",
|
| 1274 |
+
"样",
|
| 1275 |
+
"理",
|
| 1276 |
+
"心",
|
| 1277 |
+
"她",
|
| 1278 |
+
"本",
|
| 1279 |
+
"前",
|
| 1280 |
+
"开",
|
| 1281 |
+
"但",
|
| 1282 |
+
"因",
|
| 1283 |
+
"只",
|
| 1284 |
+
"从",
|
| 1285 |
+
"想",
|
| 1286 |
+
"实",
|
| 1287 |
+
],
|
| 1288 |
+
"Ukrainian": [
|
| 1289 |
+
"о",
|
| 1290 |
+
"а",
|
| 1291 |
+
"н",
|
| 1292 |
+
"і",
|
| 1293 |
+
"и",
|
| 1294 |
+
"р",
|
| 1295 |
+
"в",
|
| 1296 |
+
"т",
|
| 1297 |
+
"е",
|
| 1298 |
+
"с",
|
| 1299 |
+
"к",
|
| 1300 |
+
"л",
|
| 1301 |
+
"у",
|
| 1302 |
+
"д",
|
| 1303 |
+
"м",
|
| 1304 |
+
"п",
|
| 1305 |
+
"з",
|
| 1306 |
+
"я",
|
| 1307 |
+
"ь",
|
| 1308 |
+
"б",
|
| 1309 |
+
"г",
|
| 1310 |
+
"й",
|
| 1311 |
+
"ч",
|
| 1312 |
+
"х",
|
| 1313 |
+
"ц",
|
| 1314 |
+
"ї",
|
| 1315 |
+
],
|
| 1316 |
+
"Norwegian": [
|
| 1317 |
+
"e",
|
| 1318 |
+
"r",
|
| 1319 |
+
"n",
|
| 1320 |
+
"t",
|
| 1321 |
+
"a",
|
| 1322 |
+
"s",
|
| 1323 |
+
"i",
|
| 1324 |
+
"o",
|
| 1325 |
+
"l",
|
| 1326 |
+
"d",
|
| 1327 |
+
"g",
|
| 1328 |
+
"k",
|
| 1329 |
+
"m",
|
| 1330 |
+
"v",
|
| 1331 |
+
"f",
|
| 1332 |
+
"p",
|
| 1333 |
+
"u",
|
| 1334 |
+
"b",
|
| 1335 |
+
"h",
|
| 1336 |
+
"å",
|
| 1337 |
+
"y",
|
| 1338 |
+
"j",
|
| 1339 |
+
"ø",
|
| 1340 |
+
"c",
|
| 1341 |
+
"æ",
|
| 1342 |
+
"w",
|
| 1343 |
+
],
|
| 1344 |
+
"Finnish": [
|
| 1345 |
+
"a",
|
| 1346 |
+
"i",
|
| 1347 |
+
"n",
|
| 1348 |
+
"t",
|
| 1349 |
+
"e",
|
| 1350 |
+
"s",
|
| 1351 |
+
"l",
|
| 1352 |
+
"o",
|
| 1353 |
+
"u",
|
| 1354 |
+
"k",
|
| 1355 |
+
"ä",
|
| 1356 |
+
"m",
|
| 1357 |
+
"r",
|
| 1358 |
+
"v",
|
| 1359 |
+
"j",
|
| 1360 |
+
"h",
|
| 1361 |
+
"p",
|
| 1362 |
+
"y",
|
| 1363 |
+
"d",
|
| 1364 |
+
"ö",
|
| 1365 |
+
"g",
|
| 1366 |
+
"c",
|
| 1367 |
+
"b",
|
| 1368 |
+
"f",
|
| 1369 |
+
"w",
|
| 1370 |
+
"z",
|
| 1371 |
+
],
|
| 1372 |
+
"Vietnamese": [
|
| 1373 |
+
"n",
|
| 1374 |
+
"h",
|
| 1375 |
+
"t",
|
| 1376 |
+
"i",
|
| 1377 |
+
"c",
|
| 1378 |
+
"g",
|
| 1379 |
+
"a",
|
| 1380 |
+
"o",
|
| 1381 |
+
"u",
|
| 1382 |
+
"m",
|
| 1383 |
+
"l",
|
| 1384 |
+
"r",
|
| 1385 |
+
"à",
|
| 1386 |
+
"đ",
|
| 1387 |
+
"s",
|
| 1388 |
+
"e",
|
| 1389 |
+
"v",
|
| 1390 |
+
"p",
|
| 1391 |
+
"b",
|
| 1392 |
+
"y",
|
| 1393 |
+
"ư",
|
| 1394 |
+
"d",
|
| 1395 |
+
"á",
|
| 1396 |
+
"k",
|
| 1397 |
+
"ộ",
|
| 1398 |
+
"ế",
|
| 1399 |
+
],
|
| 1400 |
+
"Czech": [
|
| 1401 |
+
"o",
|
| 1402 |
+
"e",
|
| 1403 |
+
"a",
|
| 1404 |
+
"n",
|
| 1405 |
+
"t",
|
| 1406 |
+
"s",
|
| 1407 |
+
"i",
|
| 1408 |
+
"l",
|
| 1409 |
+
"v",
|
| 1410 |
+
"r",
|
| 1411 |
+
"k",
|
| 1412 |
+
"d",
|
| 1413 |
+
"u",
|
| 1414 |
+
"m",
|
| 1415 |
+
"p",
|
| 1416 |
+
"í",
|
| 1417 |
+
"c",
|
| 1418 |
+
"h",
|
| 1419 |
+
"z",
|
| 1420 |
+
"á",
|
| 1421 |
+
"y",
|
| 1422 |
+
"j",
|
| 1423 |
+
"b",
|
| 1424 |
+
"ě",
|
| 1425 |
+
"é",
|
| 1426 |
+
"ř",
|
| 1427 |
+
],
|
| 1428 |
+
"Hungarian": [
|
| 1429 |
+
"e",
|
| 1430 |
+
"a",
|
| 1431 |
+
"t",
|
| 1432 |
+
"l",
|
| 1433 |
+
"s",
|
| 1434 |
+
"n",
|
| 1435 |
+
"k",
|
| 1436 |
+
"r",
|
| 1437 |
+
"i",
|
| 1438 |
+
"o",
|
| 1439 |
+
"z",
|
| 1440 |
+
"á",
|
| 1441 |
+
"é",
|
| 1442 |
+
"g",
|
| 1443 |
+
"m",
|
| 1444 |
+
"b",
|
| 1445 |
+
"y",
|
| 1446 |
+
"v",
|
| 1447 |
+
"d",
|
| 1448 |
+
"h",
|
| 1449 |
+
"u",
|
| 1450 |
+
"p",
|
| 1451 |
+
"j",
|
| 1452 |
+
"ö",
|
| 1453 |
+
"f",
|
| 1454 |
+
"c",
|
| 1455 |
+
],
|
| 1456 |
+
"Korean": [
|
| 1457 |
+
"이",
|
| 1458 |
+
"다",
|
| 1459 |
+
"에",
|
| 1460 |
+
"의",
|
| 1461 |
+
"는",
|
| 1462 |
+
"로",
|
| 1463 |
+
"하",
|
| 1464 |
+
"을",
|
| 1465 |
+
"가",
|
| 1466 |
+
"고",
|
| 1467 |
+
"지",
|
| 1468 |
+
"서",
|
| 1469 |
+
"한",
|
| 1470 |
+
"은",
|
| 1471 |
+
"기",
|
| 1472 |
+
"으",
|
| 1473 |
+
"년",
|
| 1474 |
+
"대",
|
| 1475 |
+
"사",
|
| 1476 |
+
"시",
|
| 1477 |
+
"를",
|
| 1478 |
+
"리",
|
| 1479 |
+
"도",
|
| 1480 |
+
"인",
|
| 1481 |
+
"스",
|
| 1482 |
+
"일",
|
| 1483 |
+
],
|
| 1484 |
+
"Indonesian": [
|
| 1485 |
+
"a",
|
| 1486 |
+
"n",
|
| 1487 |
+
"e",
|
| 1488 |
+
"i",
|
| 1489 |
+
"r",
|
| 1490 |
+
"t",
|
| 1491 |
+
"u",
|
| 1492 |
+
"s",
|
| 1493 |
+
"d",
|
| 1494 |
+
"k",
|
| 1495 |
+
"m",
|
| 1496 |
+
"l",
|
| 1497 |
+
"g",
|
| 1498 |
+
"p",
|
| 1499 |
+
"b",
|
| 1500 |
+
"o",
|
| 1501 |
+
"h",
|
| 1502 |
+
"y",
|
| 1503 |
+
"j",
|
| 1504 |
+
"c",
|
| 1505 |
+
"w",
|
| 1506 |
+
"f",
|
| 1507 |
+
"v",
|
| 1508 |
+
"z",
|
| 1509 |
+
"x",
|
| 1510 |
+
"q",
|
| 1511 |
+
],
|
| 1512 |
+
"Turkish": [
|
| 1513 |
+
"a",
|
| 1514 |
+
"e",
|
| 1515 |
+
"i",
|
| 1516 |
+
"n",
|
| 1517 |
+
"r",
|
| 1518 |
+
"l",
|
| 1519 |
+
"ı",
|
| 1520 |
+
"k",
|
| 1521 |
+
"d",
|
| 1522 |
+
"t",
|
| 1523 |
+
"s",
|
| 1524 |
+
"m",
|
| 1525 |
+
"y",
|
| 1526 |
+
"u",
|
| 1527 |
+
"o",
|
| 1528 |
+
"b",
|
| 1529 |
+
"ü",
|
| 1530 |
+
"ş",
|
| 1531 |
+
"v",
|
| 1532 |
+
"g",
|
| 1533 |
+
"z",
|
| 1534 |
+
"h",
|
| 1535 |
+
"c",
|
| 1536 |
+
"p",
|
| 1537 |
+
"ç",
|
| 1538 |
+
"ğ",
|
| 1539 |
+
],
|
| 1540 |
+
"Romanian": [
|
| 1541 |
+
"e",
|
| 1542 |
+
"i",
|
| 1543 |
+
"a",
|
| 1544 |
+
"r",
|
| 1545 |
+
"n",
|
| 1546 |
+
"t",
|
| 1547 |
+
"u",
|
| 1548 |
+
"l",
|
| 1549 |
+
"o",
|
| 1550 |
+
"c",
|
| 1551 |
+
"s",
|
| 1552 |
+
"d",
|
| 1553 |
+
"p",
|
| 1554 |
+
"m",
|
| 1555 |
+
"ă",
|
| 1556 |
+
"f",
|
| 1557 |
+
"v",
|
| 1558 |
+
"î",
|
| 1559 |
+
"g",
|
| 1560 |
+
"b",
|
| 1561 |
+
"ș",
|
| 1562 |
+
"ț",
|
| 1563 |
+
"z",
|
| 1564 |
+
"h",
|
| 1565 |
+
"â",
|
| 1566 |
+
"j",
|
| 1567 |
+
],
|
| 1568 |
+
"Farsi": [
|
| 1569 |
+
"ا",
|
| 1570 |
+
"ی",
|
| 1571 |
+
"ر",
|
| 1572 |
+
"د",
|
| 1573 |
+
"ن",
|
| 1574 |
+
"ه",
|
| 1575 |
+
"و",
|
| 1576 |
+
"م",
|
| 1577 |
+
"ت",
|
| 1578 |
+
"ب",
|
| 1579 |
+
"س",
|
| 1580 |
+
"ل",
|
| 1581 |
+
"ک",
|
| 1582 |
+
"ش",
|
| 1583 |
+
"ز",
|
| 1584 |
+
"ف",
|
| 1585 |
+
"گ",
|
| 1586 |
+
"ع",
|
| 1587 |
+
"خ",
|
| 1588 |
+
"ق",
|
| 1589 |
+
"ج",
|
| 1590 |
+
"آ",
|
| 1591 |
+
"پ",
|
| 1592 |
+
"ح",
|
| 1593 |
+
"ط",
|
| 1594 |
+
"ص",
|
| 1595 |
+
],
|
| 1596 |
+
"Arabic": [
|
| 1597 |
+
"ا",
|
| 1598 |
+
"ل",
|
| 1599 |
+
"ي",
|
| 1600 |
+
"م",
|
| 1601 |
+
"و",
|
| 1602 |
+
"ن",
|
| 1603 |
+
"ر",
|
| 1604 |
+
"ت",
|
| 1605 |
+
"ب",
|
| 1606 |
+
"ة",
|
| 1607 |
+
"ع",
|
| 1608 |
+
"د",
|
| 1609 |
+
"س",
|
| 1610 |
+
"ف",
|
| 1611 |
+
"ه",
|
| 1612 |
+
"ك",
|
| 1613 |
+
"ق",
|
| 1614 |
+
"أ",
|
| 1615 |
+
"ح",
|
| 1616 |
+
"ج",
|
| 1617 |
+
"ش",
|
| 1618 |
+
"ط",
|
| 1619 |
+
"ص",
|
| 1620 |
+
"ى",
|
| 1621 |
+
"خ",
|
| 1622 |
+
"إ",
|
| 1623 |
+
],
|
| 1624 |
+
"Danish": [
|
| 1625 |
+
"e",
|
| 1626 |
+
"r",
|
| 1627 |
+
"n",
|
| 1628 |
+
"t",
|
| 1629 |
+
"a",
|
| 1630 |
+
"i",
|
| 1631 |
+
"s",
|
| 1632 |
+
"d",
|
| 1633 |
+
"l",
|
| 1634 |
+
"o",
|
| 1635 |
+
"g",
|
| 1636 |
+
"m",
|
| 1637 |
+
"k",
|
| 1638 |
+
"f",
|
| 1639 |
+
"v",
|
| 1640 |
+
"u",
|
| 1641 |
+
"b",
|
| 1642 |
+
"h",
|
| 1643 |
+
"p",
|
| 1644 |
+
"å",
|
| 1645 |
+
"y",
|
| 1646 |
+
"ø",
|
| 1647 |
+
"æ",
|
| 1648 |
+
"c",
|
| 1649 |
+
"j",
|
| 1650 |
+
"w",
|
| 1651 |
+
],
|
| 1652 |
+
"Serbian": [
|
| 1653 |
+
"а",
|
| 1654 |
+
"и",
|
| 1655 |
+
"о",
|
| 1656 |
+
"е",
|
| 1657 |
+
"н",
|
| 1658 |
+
"р",
|
| 1659 |
+
"с",
|
| 1660 |
+
"у",
|
| 1661 |
+
"т",
|
| 1662 |
+
"к",
|
| 1663 |
+
"ј",
|
| 1664 |
+
"в",
|
| 1665 |
+
"д",
|
| 1666 |
+
"м",
|
| 1667 |
+
"п",
|
| 1668 |
+
"л",
|
| 1669 |
+
"г",
|
| 1670 |
+
"з",
|
| 1671 |
+
"б",
|
| 1672 |
+
"a",
|
| 1673 |
+
"i",
|
| 1674 |
+
"e",
|
| 1675 |
+
"o",
|
| 1676 |
+
"n",
|
| 1677 |
+
"ц",
|
| 1678 |
+
"ш",
|
| 1679 |
+
],
|
| 1680 |
+
"Lithuanian": [
|
| 1681 |
+
"i",
|
| 1682 |
+
"a",
|
| 1683 |
+
"s",
|
| 1684 |
+
"o",
|
| 1685 |
+
"r",
|
| 1686 |
+
"e",
|
| 1687 |
+
"t",
|
| 1688 |
+
"n",
|
| 1689 |
+
"u",
|
| 1690 |
+
"k",
|
| 1691 |
+
"m",
|
| 1692 |
+
"l",
|
| 1693 |
+
"p",
|
| 1694 |
+
"v",
|
| 1695 |
+
"d",
|
| 1696 |
+
"j",
|
| 1697 |
+
"g",
|
| 1698 |
+
"ė",
|
| 1699 |
+
"b",
|
| 1700 |
+
"y",
|
| 1701 |
+
"ų",
|
| 1702 |
+
"š",
|
| 1703 |
+
"ž",
|
| 1704 |
+
"c",
|
| 1705 |
+
"ą",
|
| 1706 |
+
"į",
|
| 1707 |
+
],
|
| 1708 |
+
"Slovene": [
|
| 1709 |
+
"e",
|
| 1710 |
+
"a",
|
| 1711 |
+
"i",
|
| 1712 |
+
"o",
|
| 1713 |
+
"n",
|
| 1714 |
+
"r",
|
| 1715 |
+
"s",
|
| 1716 |
+
"l",
|
| 1717 |
+
"t",
|
| 1718 |
+
"j",
|
| 1719 |
+
"v",
|
| 1720 |
+
"k",
|
| 1721 |
+
"d",
|
| 1722 |
+
"p",
|
| 1723 |
+
"m",
|
| 1724 |
+
"u",
|
| 1725 |
+
"z",
|
| 1726 |
+
"b",
|
| 1727 |
+
"g",
|
| 1728 |
+
"h",
|
| 1729 |
+
"č",
|
| 1730 |
+
"c",
|
| 1731 |
+
"š",
|
| 1732 |
+
"ž",
|
| 1733 |
+
"f",
|
| 1734 |
+
"y",
|
| 1735 |
+
],
|
| 1736 |
+
"Slovak": [
|
| 1737 |
+
"o",
|
| 1738 |
+
"a",
|
| 1739 |
+
"e",
|
| 1740 |
+
"n",
|
| 1741 |
+
"i",
|
| 1742 |
+
"r",
|
| 1743 |
+
"v",
|
| 1744 |
+
"t",
|
| 1745 |
+
"s",
|
| 1746 |
+
"l",
|
| 1747 |
+
"k",
|
| 1748 |
+
"d",
|
| 1749 |
+
"m",
|
| 1750 |
+
"p",
|
| 1751 |
+
"u",
|
| 1752 |
+
"c",
|
| 1753 |
+
"h",
|
| 1754 |
+
"j",
|
| 1755 |
+
"b",
|
| 1756 |
+
"z",
|
| 1757 |
+
"á",
|
| 1758 |
+
"y",
|
| 1759 |
+
"ý",
|
| 1760 |
+
"í",
|
| 1761 |
+
"č",
|
| 1762 |
+
"é",
|
| 1763 |
+
],
|
| 1764 |
+
"Hebrew": [
|
| 1765 |
+
"י",
|
| 1766 |
+
"ו",
|
| 1767 |
+
"ה",
|
| 1768 |
+
"ל",
|
| 1769 |
+
"ר",
|
| 1770 |
+
"ב",
|
| 1771 |
+
"ת",
|
| 1772 |
+
"מ",
|
| 1773 |
+
"א",
|
| 1774 |
+
"ש",
|
| 1775 |
+
"נ",
|
| 1776 |
+
"ע",
|
| 1777 |
+
"ם",
|
| 1778 |
+
"ד",
|
| 1779 |
+
"ק",
|
| 1780 |
+
"ח",
|
| 1781 |
+
"פ",
|
| 1782 |
+
"ס",
|
| 1783 |
+
"כ",
|
| 1784 |
+
"ג",
|
| 1785 |
+
"ט",
|
| 1786 |
+
"צ",
|
| 1787 |
+
"ן",
|
| 1788 |
+
"ז",
|
| 1789 |
+
"ך",
|
| 1790 |
+
],
|
| 1791 |
+
"Bulgarian": [
|
| 1792 |
+
"а",
|
| 1793 |
+
"и",
|
| 1794 |
+
"о",
|
| 1795 |
+
"е",
|
| 1796 |
+
"н",
|
| 1797 |
+
"т",
|
| 1798 |
+
"р",
|
| 1799 |
+
"с",
|
| 1800 |
+
"в",
|
| 1801 |
+
"л",
|
| 1802 |
+
"к",
|
| 1803 |
+
"д",
|
| 1804 |
+
"п",
|
| 1805 |
+
"м",
|
| 1806 |
+
"з",
|
| 1807 |
+
"г",
|
| 1808 |
+
"я",
|
| 1809 |
+
"ъ",
|
| 1810 |
+
"у",
|
| 1811 |
+
"б",
|
| 1812 |
+
"ч",
|
| 1813 |
+
"ц",
|
| 1814 |
+
"й",
|
| 1815 |
+
"ж",
|
| 1816 |
+
"щ",
|
| 1817 |
+
"х",
|
| 1818 |
+
],
|
| 1819 |
+
"Croatian": [
|
| 1820 |
+
"a",
|
| 1821 |
+
"i",
|
| 1822 |
+
"o",
|
| 1823 |
+
"e",
|
| 1824 |
+
"n",
|
| 1825 |
+
"r",
|
| 1826 |
+
"j",
|
| 1827 |
+
"s",
|
| 1828 |
+
"t",
|
| 1829 |
+
"u",
|
| 1830 |
+
"k",
|
| 1831 |
+
"l",
|
| 1832 |
+
"v",
|
| 1833 |
+
"d",
|
| 1834 |
+
"m",
|
| 1835 |
+
"p",
|
| 1836 |
+
"g",
|
| 1837 |
+
"z",
|
| 1838 |
+
"b",
|
| 1839 |
+
"c",
|
| 1840 |
+
"č",
|
| 1841 |
+
"h",
|
| 1842 |
+
"š",
|
| 1843 |
+
"ž",
|
| 1844 |
+
"ć",
|
| 1845 |
+
"f",
|
| 1846 |
+
],
|
| 1847 |
+
"Hindi": [
|
| 1848 |
+
"क",
|
| 1849 |
+
"र",
|
| 1850 |
+
"स",
|
| 1851 |
+
"न",
|
| 1852 |
+
"त",
|
| 1853 |
+
"म",
|
| 1854 |
+
"ह",
|
| 1855 |
+
"प",
|
| 1856 |
+
"य",
|
| 1857 |
+
"ल",
|
| 1858 |
+
"व",
|
| 1859 |
+
"ज",
|
| 1860 |
+
"द",
|
| 1861 |
+
"ग",
|
| 1862 |
+
"ब",
|
| 1863 |
+
"श",
|
| 1864 |
+
"ट",
|
| 1865 |
+
"अ",
|
| 1866 |
+
"ए",
|
| 1867 |
+
"थ",
|
| 1868 |
+
"भ",
|
| 1869 |
+
"ड",
|
| 1870 |
+
"च",
|
| 1871 |
+
"ध",
|
| 1872 |
+
"ष",
|
| 1873 |
+
"इ",
|
| 1874 |
+
],
|
| 1875 |
+
"Estonian": [
|
| 1876 |
+
"a",
|
| 1877 |
+
"i",
|
| 1878 |
+
"e",
|
| 1879 |
+
"s",
|
| 1880 |
+
"t",
|
| 1881 |
+
"l",
|
| 1882 |
+
"u",
|
| 1883 |
+
"n",
|
| 1884 |
+
"o",
|
| 1885 |
+
"k",
|
| 1886 |
+
"r",
|
| 1887 |
+
"d",
|
| 1888 |
+
"m",
|
| 1889 |
+
"v",
|
| 1890 |
+
"g",
|
| 1891 |
+
"p",
|
| 1892 |
+
"j",
|
| 1893 |
+
"h",
|
| 1894 |
+
"ä",
|
| 1895 |
+
"b",
|
| 1896 |
+
"õ",
|
| 1897 |
+
"ü",
|
| 1898 |
+
"f",
|
| 1899 |
+
"c",
|
| 1900 |
+
"ö",
|
| 1901 |
+
"y",
|
| 1902 |
+
],
|
| 1903 |
+
"Thai": [
|
| 1904 |
+
"า",
|
| 1905 |
+
"น",
|
| 1906 |
+
"ร",
|
| 1907 |
+
"อ",
|
| 1908 |
+
"ก",
|
| 1909 |
+
"เ",
|
| 1910 |
+
"ง",
|
| 1911 |
+
"ม",
|
| 1912 |
+
"ย",
|
| 1913 |
+
"ล",
|
| 1914 |
+
"ว",
|
| 1915 |
+
"ด",
|
| 1916 |
+
"ท",
|
| 1917 |
+
"ส",
|
| 1918 |
+
"ต",
|
| 1919 |
+
"ะ",
|
| 1920 |
+
"ป",
|
| 1921 |
+
"บ",
|
| 1922 |
+
"ค",
|
| 1923 |
+
"ห",
|
| 1924 |
+
"แ",
|
| 1925 |
+
"จ",
|
| 1926 |
+
"พ",
|
| 1927 |
+
"ช",
|
| 1928 |
+
"ข",
|
| 1929 |
+
"ใ",
|
| 1930 |
+
],
|
| 1931 |
+
"Greek": [
|
| 1932 |
+
"α",
|
| 1933 |
+
"τ",
|
| 1934 |
+
"ο",
|
| 1935 |
+
"ι",
|
| 1936 |
+
"ε",
|
| 1937 |
+
"ν",
|
| 1938 |
+
"ρ",
|
| 1939 |
+
"σ",
|
| 1940 |
+
"κ",
|
| 1941 |
+
"η",
|
| 1942 |
+
"π",
|
| 1943 |
+
"ς",
|
| 1944 |
+
"υ",
|
| 1945 |
+
"μ",
|
| 1946 |
+
"λ",
|
| 1947 |
+
"ί",
|
| 1948 |
+
"ό",
|
| 1949 |
+
"ά",
|
| 1950 |
+
"γ",
|
| 1951 |
+
"έ",
|
| 1952 |
+
"δ",
|
| 1953 |
+
"ή",
|
| 1954 |
+
"ω",
|
| 1955 |
+
"χ",
|
| 1956 |
+
"θ",
|
| 1957 |
+
"ύ",
|
| 1958 |
+
],
|
| 1959 |
+
"Tamil": [
|
| 1960 |
+
"க",
|
| 1961 |
+
"த",
|
| 1962 |
+
"ப",
|
| 1963 |
+
"ட",
|
| 1964 |
+
"ர",
|
| 1965 |
+
"ம",
|
| 1966 |
+
"ல",
|
| 1967 |
+
"ன",
|
| 1968 |
+
"வ",
|
| 1969 |
+
"ற",
|
| 1970 |
+
"ய",
|
| 1971 |
+
"ள",
|
| 1972 |
+
"ச",
|
| 1973 |
+
"ந",
|
| 1974 |
+
"இ",
|
| 1975 |
+
"ண",
|
| 1976 |
+
"அ",
|
| 1977 |
+
"ஆ",
|
| 1978 |
+
"ழ",
|
| 1979 |
+
"ங",
|
| 1980 |
+
"எ",
|
| 1981 |
+
"உ",
|
| 1982 |
+
"ஒ",
|
| 1983 |
+
"ஸ",
|
| 1984 |
+
],
|
| 1985 |
+
"Kazakh": [
|
| 1986 |
+
"а",
|
| 1987 |
+
"ы",
|
| 1988 |
+
"е",
|
| 1989 |
+
"н",
|
| 1990 |
+
"т",
|
| 1991 |
+
"р",
|
| 1992 |
+
"л",
|
| 1993 |
+
"і",
|
| 1994 |
+
"д",
|
| 1995 |
+
"с",
|
| 1996 |
+
"м",
|
| 1997 |
+
"қ",
|
| 1998 |
+
"к",
|
| 1999 |
+
"о",
|
| 2000 |
+
"б",
|
| 2001 |
+
"и",
|
| 2002 |
+
"у",
|
| 2003 |
+
"ғ",
|
| 2004 |
+
"ж",
|
| 2005 |
+
"ң",
|
| 2006 |
+
"з",
|
| 2007 |
+
"ш",
|
| 2008 |
+
"й",
|
| 2009 |
+
"п",
|
| 2010 |
+
"г",
|
| 2011 |
+
"ө",
|
| 2012 |
+
],
|
| 2013 |
+
}
|
| 2014 |
+
|
| 2015 |
+
# Number of top-level keys in FREQUENCIES. This counts table entries, not distinct languages:
# "Japanese" and its two dash-suffixed variants (commented above as Kanji, Katakana and
# Hiragana) are separate keys and are therefore counted three times.
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
phivenv/Lib/site-packages/charset_normalizer/legacy.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import TYPE_CHECKING, Any
|
| 4 |
+
from warnings import warn
|
| 5 |
+
|
| 6 |
+
from .api import from_bytes
|
| 7 |
+
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
|
| 8 |
+
|
| 9 |
+
# TODO: remove this check when dropping Python 3.7 support
|
| 10 |
+
if TYPE_CHECKING:
    # Only static type checkers evaluate this branch, so typing_extensions
    # is never imported while the program is actually running.
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        """Static description of the dictionary handed back by detect()."""

        # Name of the guessed encoding, or None when nothing was guessed.
        encoding: str | None
        # Guessed language name; detect() uses "" when it has none to report.
        language: str
        # Confidence score, or None when nothing was guessed.
        confidence: float | None
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet-compatible legacy entry point.
    Guess the encoding of a byte sequence and report it as a chardet-style dictionary.
    Encoding names follow chardet's spelling whenever a correspondence is known.
    This function is deprecated and exists to ease migration; it is not planned for removal.

    :param byte_str: The byte sequence (bytes or bytearray) to examine.
    :param should_rename_legacy: When left to False, the encoding name is translated
        through CHARDET_CORRESPONDENCE if it has an entry there.
    :param kwargs: Any extra argument is ignored; a warning lists the ignored names.
    :return: A dictionary with the keys "encoding", "language" and "confidence".
    :raises TypeError: If byte_str is neither bytes nor bytearray.
    """
    if kwargs:
        ignored_names = ",".join(kwargs)
        warn(
            f"charset-normalizer disregard arguments '{ignored_names}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_guess = from_bytes(byte_str).best()

    if best_guess is None:
        # Nothing plausible was found: report an empty verdict.
        encoding, language, confidence = None, "", None
    else:
        encoding = best_guess.encoding
        language = "" if best_guess.language == "Unknown" else best_guess.language
        confidence = 1.0 - best_guess.chaos

        # A near-certain verdict on a very short, BOM-less sample that is
        # neither utf_8 nor ascii is not trustworthy: lower the confidence.
        # https://github.com/jawah/charset_normalizer/issues/391
        if (
            confidence >= 0.9
            and encoding not in {"utf_8", "ascii"}
            and best_guess.bom is False
            and len(byte_str) < TOO_SMALL_SEQUENCE
        ):
            confidence -= 0.2

        # The signature is stripped during detection, so 'utf_8' is what we get
        # back; chardet reports 'utf-8-sig' in that case and it is a valid codec.
        if encoding == "utf_8" and best_guess.bom:
            encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
|
phivenv/Lib/site-packages/charset_normalizer/md.cp39-win_amd64.pyd
ADDED
|
Binary file (10.8 kB). View file
|
|
|
phivenv/Lib/site-packages/charset_normalizer/md.py
ADDED
|
@@ -0,0 +1,635 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from logging import getLogger
|
| 5 |
+
|
| 6 |
+
from .constant import (
|
| 7 |
+
COMMON_SAFE_ASCII_CHARACTERS,
|
| 8 |
+
TRACE,
|
| 9 |
+
UNICODE_SECONDARY_RANGE_KEYWORD,
|
| 10 |
+
)
|
| 11 |
+
from .utils import (
|
| 12 |
+
is_accentuated,
|
| 13 |
+
is_arabic,
|
| 14 |
+
is_arabic_isolated_form,
|
| 15 |
+
is_case_variable,
|
| 16 |
+
is_cjk,
|
| 17 |
+
is_emoticon,
|
| 18 |
+
is_hangul,
|
| 19 |
+
is_hiragana,
|
| 20 |
+
is_katakana,
|
| 21 |
+
is_latin,
|
| 22 |
+
is_punctuation,
|
| 23 |
+
is_separator,
|
| 24 |
+
is_symbol,
|
| 25 |
+
is_thai,
|
| 26 |
+
is_unprintable,
|
| 27 |
+
remove_accent,
|
| 28 |
+
unicode_range,
|
| 29 |
+
is_cjk_uncommon,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class MessDetectorPlugin:
    """
    Common contract shared by every mess detection plugin.
    The base class holds no logic: each hook raises NotImplementedError
    until a concrete detector overrides it.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be handed to feed().
        """
        raise NotImplementedError()  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Process one character.
        This is where a detector implements the logic by which the text would be considered chaotic.
        """
        raise NotImplementedError()  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Put the plugin back into its initial state.
        """
        raise NotImplementedError()

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from what feed() has seen so far.
        Must never be below 0.; there is no upper bound.
        """
        raise NotImplementedError()  # pragma: nocover
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Detector reporting text where punctuation and symbols take too large a
    share of the printable characters.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previously fed character; an immediate repetition is not scored again.
        self._last_printable_char: str | None = None
        # Not read or updated by any method of this class; kept so the attribute stays available.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """
        Only printable characters (str.isprintable) are fed to this detector.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, then score it when it differs from the previously
        fed one and is not listed in COMMON_SAFE_ASCII_CHARACTERS.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # A non-digit, non-emoticon symbol weighs twice as much as a punctuation mark.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:
        """
        Restore the state set up by __init__.
        """
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Also forget the previous character: otherwise the first character fed
        # after a reset could be skipped as a "repetition" of stale input.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        (punctuation + symbol score) / fed characters, reported only once it
        reaches 0.3; 0.0 otherwise, including when nothing has been fed.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Detector reporting text where accentuated letters take too large a share
    of the alphabetic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic characters (str.isalpha) are fed to this detector.
        """
        return character.isalpha()

    def feed(self, character: str) -> None:
        """
        Count the character and, when is_accentuated() accepts it, count it as accentuated too.
        """
        self._character_count += 1
        self._accentuated_count += 1 if is_accentuated(character) else 0

    def reset(self) -> None:  # Abstract
        """
        Put both counters back to zero.
        """
        self._character_count = self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of accentuated characters, reported only once it reaches 0.35.
        Always 0.0 while fewer than 8 characters have been fed.
        """
        if self._character_count < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / self._character_count
        if accentuated_share >= 0.35:
            return accentuated_share
        return 0.0
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
class UnprintablePlugin(MessDetectorPlugin):
    """Score text by how many of its characters ``is_unprintable`` flags."""

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Count the character, and count it as unprintable when flagged."""
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters so the instance can be reused."""
        self._unprintable_count = 0
        # The character tally must be cleared too; otherwise a reused
        # instance divides new hits by a stale, ever-growing total.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Unprintable count weighted by 8, divided by the characters fed.

        Returns 0.0 when nothing was fed.
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """Count consecutive accentuated Latin letters that look suspicious."""

    def __init__(self) -> None:
        self._character_count: int = 0
        self._successive_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        """Only alphabetic characters that ``is_latin`` accepts are inspected."""
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """Compare the character with the previously fed one, then remember it."""
        self._character_count += 1
        previous = self._last_latin_character

        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            # Two accentuated upper case letters in a row.
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if it is the same base letter repeated with another accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        """Clear the counters and forget the previous character."""
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """Successive count weighted by 2 over the characters fed (0.0 if none)."""
        fed = self._character_count
        if fed == 0:
            return 0.0

        return (self._successive_count * 2) / fed
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
class SuspiciousRange(MessDetectorPlugin):
    """Count adjacent characters whose Unicode ranges make a suspicious pair."""

    def __init__(self) -> None:
        self._character_count: int = 0
        self._suspicious_successive_range_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        """Only printable characters are inspected."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """Compare the character's Unicode range with the previous one's."""
        self._character_count += 1

        breaks_the_pair = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_the_pair:
            # Whitespace, punctuation and safe ASCII restart the pairing.
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        if previous is None:
            # Nothing to compare against yet.
            self._last_printable_seen = character
            return

        previous_range: str | None = unicode_range(previous)
        current_range: str | None = unicode_range(character)

        if is_suspiciously_successive_range(previous_range, current_range):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        """Clear the counters and forget the previous character."""
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Suspicious pair count weighted by 2 over the characters fed.

        Returns 0.0 while 13 or fewer characters have been fed.
        """
        fed = self._character_count
        if fed <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / fed
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Buffer alphabetic characters into words and flag the odd-looking ones.

    A word is closed when a space, punctuation or separator character is fed.
    The ratio is the share of characters that belong to flagged words.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # State of the word currently being accumulated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Extend the current word, or close and judge it on a word boundary."""
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return

        # Non-alphabetic input with no word in progress has nothing to close.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Words ending with an upper case accentuated letter are so rare
                # that all of them are treated as suspicious, with the same
                # weight as a suspicious long foreign word.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1

            if buffer_length >= 24 and self._foreign_long_watch:
                upper_count = sum(1 for c in self._buffer if c.isupper())
                # A long word with a few capitals is probably camel-cased.
                probable_camel_cased: bool = bool(
                    upper_count and upper_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            # Start the next word from a clean state.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol inside a word taints the word and is kept in the buffer.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        """Clear every counter and the word being accumulated."""
        self._buffer = ""
        # The per-word tallies describe the buffer, so they are cleared with
        # it; otherwise the next word would inherit counts from a discarded one.
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of word characters that belong to flagged words.

        Returns 0.0 while 10 or fewer words were closed and no long/foreign
        suspicion was recorded.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Spot messy CJK text that most likely carries no meaning.
    """

    def __init__(self) -> None:
        self._uncommon_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only characters accepted by ``is_cjk`` are inspected."""
        return is_cjk(character)

    def feed(self, character: str) -> None:
        """Count the character, and count it as uncommon when flagged."""
        self._character_count += 1
        self._uncommon_count += 1 if is_cjk_uncommon(character) else 0

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._uncommon_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        A tenth of the uncommon share, but only when that share exceeds 0.5.

        Returns 0.0 with fewer than 8 characters fed.
        """
        fed = self._character_count
        if fed < 8:
            return 0.0

        uncommon_share: float = self._uncommon_count / fed

        # Widespread uncommon characters almost certainly mean garbage;
        # a smaller share could simply be traditional Chinese, for example.
        if uncommon_share > 0.5:
            return uncommon_share / 10
        return 0.0
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Count upper/lower case flip-flops inside runs of case-variable letters.

    A run's flips only reach the final tally when the run held a non-ASCII
    character, was at most 64 characters long, and was not closed by a digit.
    """

    def __init__(self) -> None:
        # True after one case flip, waiting for a second consecutive flip.
        self._buf: bool = False

        self._character_count: int = 0
        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Close the current run on a separator, otherwise track case flips."""
        is_concerned = character.isalpha() and is_case_variable(character)

        if is_concerned is False and self._character_count_since_last_sep > 0:
            run_counts = (
                self._character_count_since_last_sep <= 64
                and not character.isdigit()
                and not self._current_ascii_only
            )
            if run_counts:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start a fresh run; the separator itself is still counted.
            self._character_count += 1
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._current_ascii_only = True
            return

        if not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if not case_flipped:
                self._buf = False
            elif self._buf:
                # Second flip in a row: record the pair.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = True

        self._last_alpha_seen = character
        self._character_count_since_last_sep += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Return every counter and flag to its initial value."""
        self._buf = False
        self._current_ascii_only = True
        self._last_alpha_seen = None
        self._successive_upper_lower_count_final = 0
        self._successive_upper_lower_count = 0
        self._character_count_since_last_sep = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Final flip tally over the characters fed (0.0 if none)."""
        fed = self._character_count
        if fed == 0:
            return 0.0

        return self._successive_upper_lower_count_final / fed
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Report the share of Arabic characters that are isolated forms."""

    def __init__(self) -> None:
        self._isolated_form_count: int = 0
        self._character_count: int = 0

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._isolated_form_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        """Only characters accepted by ``is_arabic`` are inspected."""
        return is_arabic(character)

    def feed(self, character: str) -> None:
        """Count the character, and count it as isolated form when flagged."""
        self._character_count += 1
        self._isolated_form_count += 1 if is_arabic_isolated_form(character) else 0

    @property
    def ratio(self) -> float:
        """Isolated-form share of the fed characters (0.0 below 8 characters)."""
        fed = self._character_count
        if fed < 8:
            return 0.0

        return self._isolated_form_count / fed
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Tell whether two Unicode ranges seen side by side look suspicious.

    An unknown (None) range is always suspicious. Otherwise the pair is
    cleared by a series of exceptions and suspicious only if none applies.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    def in_either(keyword: str) -> bool:
        return keyword in unicode_range_a or keyword in unicode_range_b

    latin_in_a = "Latin" in unicode_range_a
    latin_in_b = "Latin" in unicode_range_b

    if latin_in_a and latin_in_b:
        return False

    if in_either("Emoticons"):
        return False

    # Latin letters may carry a combining diacritical mark, e.g. Vietnamese.
    if (latin_in_a or latin_in_b) and in_either("Combining"):
        return False

    # A shared primary keyword means the two ranges are related.
    words_b = unicode_range_b.split(" ")
    if any(
        word not in UNICODE_SECONDARY_RANGE_KEYWORD and word in words_b
        for word in unicode_range_a.split(" ")
    ):
        return False

    kana = ("Hiragana", "Katakana")
    a_is_kana = unicode_range_a in kana
    b_is_kana = unicode_range_b in kana
    cjk_involved = in_either("CJK")
    basic_latin_involved = (
        unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin"
    )

    # Japanese exception: kana next to CJK, or kana next to kana.
    if (a_is_kana or b_is_kana) and cjk_involved:
        return False
    if a_is_kana and b_is_kana:
        return False

    if in_either("Hangul") and (cjk_involved or basic_latin_involved):
        return False

    # Chinese/Japanese use dedicated ranges for punctuation and separators.
    if (cjk_involved or (a_is_kana and b_is_kana)) and (
        in_either("Punctuation") or in_either("Forms") or basic_latin_involved
    ):
        return False

    return True
|
| 580 |
+
|
| 581 |
+
|
| 582 |
+
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio for a decoded sequence.

    Every direct subclass of MessDetectorPlugin is instantiated and fed each
    character. The plugin ratios are summed at regular intervals, and the scan
    stops early once that sum reaches ``maximum_threshold``. The result is
    rounded to three decimals.
    """
    plugins: list[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # One extra slot for the newline appended below.
    length: int = len(decoded_sequence) + 1
    last_index = length - 1

    mean_mess_ratio: float = 0.0

    # How often (in characters) the intermediate ratio is recomputed.
    if length < 512:
        checkpoint_every: int = 32
    elif length <= 1024:
        checkpoint_every = 64
    else:
        checkpoint_every = 128

    for index, character in enumerate(decoded_sequence + "\n"):
        for plugin in plugins:
            if plugin.eligible(character):
                plugin.feed(character)

        at_checkpoint = index > 0 and index % checkpoint_every == 0
        if at_checkpoint or index == last_index:
            mean_mess_ratio = sum(plugin.ratio for plugin in plugins)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_every} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for plugin in plugins:
            logger.log(TRACE, f"{plugin.__class__}: {plugin.ratio}")

    return round(mean_mess_ratio, 3)
|
phivenv/Lib/site-packages/charset_normalizer/models.py
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from encodings.aliases import aliases
|
| 4 |
+
from hashlib import sha256
|
| 5 |
+
from json import dumps
|
| 6 |
+
from re import sub
|
| 7 |
+
from typing import Any, Iterator, List, Tuple
|
| 8 |
+
|
| 9 |
+
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
|
| 10 |
+
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class CharsetMatch:
    """One candidate decoding of a byte payload, with its quality metrics."""

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Computed lazily by the ``alphabets`` property.
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for the last ``output()`` call.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text; filled lazily by ``__str__`` when not supplied.
        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
    def __eq__(self, other: object) -> bool:
        """
        Compare with another match (encoding and fingerprint) or with an
        encoding name given as a string.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        Raises ValueError when ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        One minus the ratio of decoded characters to raw bytes.

        Returns 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """Return the decoded payload, decoding strictly on first use."""
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Register ``other`` as a submatch of this match.

        Raises ValueError when ``other`` is not a CharsetMatch or equals self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio of the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """Sorted, de-duplicated Unicode range names found in the decoded text."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the
        encoder's replacement marker (the "replace" error handler), not dropped.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Rewrite the first encoding declaration found in the leading
                # 8192 characters so that it names the target encoding.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon an out-of-range position, and KeyError when the
        encoding is not present in results or the key is neither int nor str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        # Carry the looked-up key so the failure is diagnosable.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        Raises ValueError when ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results.sort()

    def best(self) -> CharsetMatch | None:
        """
        Return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
# One coherence result: a (str, float) pair.
CoherenceMatch = Tuple[str, float]
# A list of coherence results.
CoherenceMatches = List[Tuple[str, float]]
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
class CliDetectionResult:
    """
    Plain record of one detection result; serializable to JSON through to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        # Every constructor argument is stored unchanged under the same name.
        self.path = path
        self.unicode_path = unicode_path
        self.encoding = encoding
        self.encoding_aliases = encoding_aliases
        self.alternative_encodings = alternative_encodings
        self.language = language
        self.alphabets = alphabets
        self.has_sig_or_bom = has_sig_or_bom
        self.chaos = chaos
        self.coherence = coherence
        self.is_preferred = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        # Key order defines the field order of the JSON document produced by to_json().
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """Serialize the record as ASCII-only JSON indented with 4 spaces."""
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)
|
phivenv/Lib/site-packages/charset_normalizer/py.typed
ADDED
|
File without changes
|
phivenv/Lib/site-packages/charset_normalizer/utils.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import importlib
|
| 4 |
+
import logging
|
| 5 |
+
import unicodedata
|
| 6 |
+
from codecs import IncrementalDecoder
|
| 7 |
+
from encodings.aliases import aliases
|
| 8 |
+
from functools import lru_cache
|
| 9 |
+
from re import findall
|
| 10 |
+
from typing import Generator
|
| 11 |
+
|
| 12 |
+
from _multibytecodec import ( # type: ignore[import-not-found,import]
|
| 13 |
+
MultibyteIncrementalDecoder,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
from .constant import (
|
| 17 |
+
ENCODING_MARKS,
|
| 18 |
+
IANA_SUPPORTED_SIMILAR,
|
| 19 |
+
RE_POSSIBLE_ENCODING_INDICATION,
|
| 20 |
+
UNICODE_RANGES_COMBINED,
|
| 21 |
+
UNICODE_SECONDARY_RANGE_KEYWORD,
|
| 22 |
+
UTF8_MAXIMAL_ALLOCATION,
|
| 23 |
+
COMMON_CJK_CHARACTERS,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """
    Tell whether the Unicode name of *character* mentions one of the recognised accent marks.
    Characters without a Unicode name are reported as not accentuated.
    """
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    accent_markers = (
        "WITH GRAVE",
        "WITH ACUTE",
        "WITH CEDILLA",
        "WITH DIAERESIS",
        "WITH CIRCUMFLEX",
        "WITH TILDE",
        "WITH MACRON",
        "WITH RING ABOVE",
    )
    return any(marker in unicode_name for marker in accent_markers)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the base character of *character* according to its Unicode decomposition.

    The first code point of the decomposition mapping is returned. The character is
    returned unchanged when it has no decomposition, or when the mapping starts with
    a formatting tag such as "<compat>" or "<super>". The tag is not a hexadecimal
    code point and would otherwise make the int() conversion raise ValueError.
    """
    decomposed: str = unicodedata.decomposition(character)
    # Tagged mappings look like "<compat> 0020 0308": leave those characters as they are.
    if not decomposed or decomposed.startswith("<"):
        return character

    codes: list[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> str | None:
    """
    Look up the name of the Unicode range containing a single character.
    Returns None when no entry of UNICODE_RANGES_COMBINED contains its code point.
    """
    code_point: int = ord(character)
    # The first matching entry wins, in the iteration order of UNICODE_RANGES_COMBINED.
    return next(
        (
            label
            for label, span in UNICODE_RANGES_COMBINED.items()
            if code_point in span
        ),
        None,
    )
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """Tell whether the Unicode name of *character* contains "LATIN" (False when it has no name)."""
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    return "LATIN" in unicode_name
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    Tell whether *character* is punctuation: either its Unicode category contains "P",
    or the name of its Unicode range contains "Punctuation".
    """
    if "P" in unicodedata.category(character):
        return True

    block_name: str | None = unicode_range(character)
    return block_name is not None and "Punctuation" in block_name
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    Tell whether *character* counts as a symbol: its Unicode category contains "S" or "N",
    or it sits in a range whose name contains "Forms" without being categorised "Lo".
    """
    category: str = unicodedata.category(character)
    if "S" in category or "N" in category:
        return True

    block_name: str | None = unicode_range(character)
    return block_name is not None and "Forms" in block_name and category != "Lo"
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """Tell whether the Unicode range name of *character* contains "Emoticons" or "Pictographs"."""
    block_name: str | None = unicode_range(character)
    return block_name is not None and (
        "Emoticons" in block_name or "Pictographs" in block_name
    )
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Tell whether *character* separates words: whitespace, one of | + < >,
    a Unicode category containing "Z", or one of the categories Po, Pd, Pc.
    """
    if character.isspace():
        return True
    if character in {"|", "+", "<", ">"}:
        return True

    category: str = unicodedata.category(character)
    if "Z" in category:
        return True
    return category in {"Po", "Pd", "Pc"}
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """Tell whether *character* is cased, i.e. islower() and isupper() disagree."""
    lowercase = character.islower()
    uppercase = character.isupper()
    return lowercase != uppercase
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """Tell whether the Unicode name of *character* contains "CJK" (False when it has no name)."""
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    return "CJK" in unicode_name
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """Tell whether the Unicode name of *character* contains "HIRAGANA" (False when it has no name)."""
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    return "HIRAGANA" in unicode_name
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """Tell whether the Unicode name of *character* contains "KATAKANA" (False when it has no name)."""
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    return "KATAKANA" in unicode_name
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """Tell whether the Unicode name of *character* contains "HANGUL" (False when it has no name)."""
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    return "HANGUL" in unicode_name
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """Tell whether the Unicode name of *character* contains "THAI" (False when it has no name)."""
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    return "THAI" in unicode_name
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """Tell whether the Unicode name of *character* contains "ARABIC" (False when it has no name)."""
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    return "ARABIC" in unicode_name
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """
    Tell whether the Unicode name of *character* contains both "ARABIC" and "ISOLATED FORM".
    Characters without a Unicode name yield False.
    """
    # An empty default name replaces the ValueError raised for unnamed characters.
    unicode_name: str = unicodedata.name(character, "")
    if "ARABIC" not in unicode_name:
        return False
    return "ISOLATED FORM" in unicode_name
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk_uncommon(character: str) -> bool:
    """Tell whether *character* is absent from COMMON_CJK_CHARACTERS."""
    is_common = character in COMMON_CJK_CHARACTERS
    return not is_common
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """Tell whether *range_name* contains at least one keyword of UNICODE_SECONDARY_RANGE_KEYWORD."""
    for secondary_keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if secondary_keyword in range_name:
            return True
    return False
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    Tell whether *character* is neither whitespace nor printable,
    excluding the two exceptions "\\x1a" and "\\ufeff".
    """
    # Whitespace (includes \n \t \r \v) and printable characters are never unprintable.
    if character.isspace() or character.isprintable():
        return False
    # "\x1a" is the ASCII substitute character. "\ufeff" is the Zero Width No-Break Space
    # (Arabic Presentation Forms-B, Unicode 1.1), which Python does not acknowledge as space.
    return character not in {"\x1a", "\ufeff"}
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
    """
    Search the first *search_zone* bytes, decoded as ASCII with undecodable bytes dropped,
    for a declared encoding, and return the matching Python codec name.

    Returns None when nothing is declared or no declaration matches a known alias.
    Raises TypeError when *sequence* is not a bytes instance.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    head: bytes = sequence[: min(len(sequence), search_zone)]
    declarations: list[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        head.decode("ascii", errors="ignore"),
    )

    for declaration in declarations:
        # Python codec names use underscores where declarations usually use hyphens.
        wanted = declaration.lower().replace("-", "_")

        for known_alias, codec_name in aliases.items():
            if wanted == known_alias or wanted == codec_name:
                return codec_name

    return None
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Tell whether the encoding identified by *name* (Python codec name) is a multi-byte one.
    """
    # The UTF family is listed explicitly.
    unicode_family = {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }
    if name in unicode_family:
        return True

    # Any other codec is multi-byte when its incremental decoder derives from the multibyte base class.
    decoder_class = importlib.import_module(f"encodings.{name}").IncrementalDecoder
    return issubclass(decoder_class, MultibyteIncrementalDecoder)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
    """
    Find which SIG/BOM from ENCODING_MARKS, if any, *sequence* starts with.

    Returns (encoding name, matched mark), or (None, b"") when no mark matches.
    """
    for encoding_name, declared_marks in ENCODING_MARKS.items():
        # An entry holds either a single mark or a list of marks.
        candidates: list[bytes] = (
            [declared_marks] if isinstance(declared_marks, bytes) else declared_marks
        )

        for candidate in candidates:
            if sequence.startswith(candidate):
                return encoding_name, candidate

    return None, b""
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """Return False only for "utf_16" and "utf_32"; True for every other encoding name."""
    keeps_its_mark = iana_encoding in {"utf_16", "utf_32"}
    return not keeps_its_mark
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Return the Python normalized encoding name (not the official IANA name).

    The name is lower-cased, with hyphens replaced by underscores, then searched in the
    standard alias table both as an alias and as a target name. When nothing matches,
    ValueError is raised if *strict* is true; otherwise the normalized name is returned.
    """
    normalized = cp_name.lower().replace("-", "_")

    resolved = next(
        (
            codec_name
            for known_alias, codec_name in aliases.items()
            if normalized in [known_alias, codec_name]
        ),
        None,
    )
    if resolved is not None:
        return resolved

    if strict:
        raise ValueError(f"Unable to retrieve IANA for '{normalized}'")

    return normalized
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Measure how similar two single-byte code pages are, as a ratio between 0.0 and 1.0.

    Each of the 256 possible byte values is decoded with both codecs (undecodable bytes
    are ignored). The result is the fraction of byte values for which both codecs give
    the same output. 0.0 is returned as soon as either encoding is a multi-byte one.

    NOTE(review): this previously compared only bytes 0..254 and divided by 254, so two
    identical code pages scored slightly above 1.0. Any table pre-computed with the
    previous formula may differ marginally from what this returns.
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    # Cover the complete single-byte space, 0x00 through 0xFF inclusive.
    byte_values = range(256)
    character_match_count: int = 0

    for i in byte_values:
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    # Divide by the number of comparisons actually made so the ratio cannot exceed 1.0.
    return character_match_count / len(byte_values)
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine whether two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the function cp_similarity.
    """
    if iana_name_a not in IANA_SUPPORTED_SIMILAR:
        return False
    return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Set *level* on the logger called *name* and attach a new stream handler
    that formats records with *format_string*.

    Each call adds one more handler to the logger.
    """
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_string))

    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)
    target_logger.addHandler(stream_handler)
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: str | None = None,
) -> Generator[str, None, None]:
    """
    Yield decoded text chunks of at most *chunk_size* units, one per offset in *offsets*.

    When *decoded_payload* is available and the decoder is not multi-byte, chunks are
    sliced directly from the decoded text, stopping at the first empty slice.
    Otherwise each chunk is cut from the raw bytes and decoded with *encoding_iana*,
    in strict mode for single-byte decoders and ignoring errors for multi-byte ones.
    *sig_payload* is prepended when a SIG/BOM is available and must not be stripped.
    """
    if decoded_payload and is_multi_byte_decoder is False:
        for start in offsets:
            piece = decoded_payload[start : start + chunk_size]
            if not piece:
                break
            yield piece
        return

    prepend_mark = bom_or_sig_available and strip_sig_or_bom is False
    error_policy = "ignore" if is_multi_byte_decoder else "strict"

    for start in offsets:
        stop = start + chunk_size
        # Skip offsets whose chunk would end more than 8 bytes past the payload.
        if stop > len(sequences) + 8:
            continue

        raw_slice = sequences[start:stop]
        if prepend_mark:
            raw_slice = sig_payload + raw_slice

        piece = raw_slice.decode(encoding_iana, errors=error_policy)

        # multi-byte bad cutting detector and adjustment
        # not the cleanest way to perform that fix but clever enough for now.
        if is_multi_byte_decoder and start > 0:
            probe_length: int = min(chunk_size, 16)

            if decoded_payload and piece[:probe_length] not in decoded_payload:
                # Move the cut point back one byte at a time (up to 3 bytes) until
                # the head of the chunk is found in the reference decoded text.
                for retry_start in range(start, start - 4, -1):
                    raw_slice = sequences[retry_start:stop]

                    if prepend_mark:
                        raw_slice = sig_payload + raw_slice

                    piece = raw_slice.decode(encoding_iana, errors="ignore")

                    if piece[:probe_length] in decoded_payload:
                        break

        yield piece
|
phivenv/Lib/site-packages/charset_normalizer/version.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Expose version
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
# Version string of the package.
__version__ = "3.4.3"
# The same version split on dots; the components stay strings.
VERSION = __version__.split(sep=".")
|
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/METADATA
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.1
|
| 2 |
+
Name: colorama
|
| 3 |
+
Version: 0.4.6
|
| 4 |
+
Summary: Cross-platform colored terminal text.
|
| 5 |
+
Project-URL: Homepage, https://github.com/tartley/colorama
|
| 6 |
+
Author-email: Jonathan Hartley <tartley@tartley.com>
|
| 7 |
+
License-File: LICENSE.txt
|
| 8 |
+
Keywords: ansi,color,colour,crossplatform,terminal,text,windows,xplatform
|
| 9 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 10 |
+
Classifier: Environment :: Console
|
| 11 |
+
Classifier: Intended Audience :: Developers
|
| 12 |
+
Classifier: License :: OSI Approved :: BSD License
|
| 13 |
+
Classifier: Operating System :: OS Independent
|
| 14 |
+
Classifier: Programming Language :: Python
|
| 15 |
+
Classifier: Programming Language :: Python :: 2
|
| 16 |
+
Classifier: Programming Language :: Python :: 2.7
|
| 17 |
+
Classifier: Programming Language :: Python :: 3
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.7
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 22 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
| 23 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
| 24 |
+
Classifier: Topic :: Terminals
|
| 25 |
+
Requires-Python: !=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7
|
| 26 |
+
Description-Content-Type: text/x-rst
|
| 27 |
+
|
| 28 |
+
.. image:: https://img.shields.io/pypi/v/colorama.svg
|
| 29 |
+
:target: https://pypi.org/project/colorama/
|
| 30 |
+
:alt: Latest Version
|
| 31 |
+
|
| 32 |
+
.. image:: https://img.shields.io/pypi/pyversions/colorama.svg
|
| 33 |
+
:target: https://pypi.org/project/colorama/
|
| 34 |
+
:alt: Supported Python versions
|
| 35 |
+
|
| 36 |
+
.. image:: https://github.com/tartley/colorama/actions/workflows/test.yml/badge.svg
|
| 37 |
+
:target: https://github.com/tartley/colorama/actions/workflows/test.yml
|
| 38 |
+
:alt: Build Status
|
| 39 |
+
|
| 40 |
+
Colorama
|
| 41 |
+
========
|
| 42 |
+
|
| 43 |
+
Makes ANSI escape character sequences (for producing colored terminal text and
|
| 44 |
+
cursor positioning) work under MS Windows.
|
| 45 |
+
|
| 46 |
+
.. |donate| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_SM.gif
|
| 47 |
+
:target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=2MZ9D2GMLYCUJ&item_name=Colorama&currency_code=USD
|
| 48 |
+
:alt: Donate with Paypal
|
| 49 |
+
|
| 50 |
+
`PyPI for releases <https://pypi.org/project/colorama/>`_ |
|
| 51 |
+
`Github for source <https://github.com/tartley/colorama>`_ |
|
| 52 |
+
`Colorama for enterprise on Tidelift <https://github.com/tartley/colorama/blob/master/ENTERPRISE.md>`_
|
| 53 |
+
|
| 54 |
+
If you find Colorama useful, please |donate| to the authors. Thank you!
|
| 55 |
+
|
| 56 |
+
Installation
|
| 57 |
+
------------
|
| 58 |
+
|
| 59 |
+
Tested on CPython 2.7, 3.7, 3.8, 3.9 and 3.10 and Pypy 2.7 and 3.8.
|
| 60 |
+
|
| 61 |
+
No requirements other than the standard library.
|
| 62 |
+
|
| 63 |
+
.. code-block:: bash
|
| 64 |
+
|
| 65 |
+
pip install colorama
|
| 66 |
+
# or
|
| 67 |
+
conda install -c anaconda colorama
|
| 68 |
+
|
| 69 |
+
Description
|
| 70 |
+
-----------
|
| 71 |
+
|
| 72 |
+
ANSI escape character sequences have long been used to produce colored terminal
|
| 73 |
+
text and cursor positioning on Unix and Macs. Colorama makes this work on
|
| 74 |
+
Windows, too, by wrapping ``stdout``, stripping ANSI sequences it finds (which
|
| 75 |
+
would appear as gobbledygook in the output), and converting them into the
|
| 76 |
+
appropriate win32 calls to modify the state of the terminal. On other platforms,
|
| 77 |
+
Colorama does nothing.
|
| 78 |
+
|
| 79 |
+
This has the upshot of providing a simple cross-platform API for printing
|
| 80 |
+
colored terminal text from Python, and has the happy side-effect that existing
|
| 81 |
+
applications or libraries which use ANSI sequences to produce colored output on
|
| 82 |
+
Linux or Macs can now also work on Windows, simply by calling
|
| 83 |
+
``colorama.just_fix_windows_console()`` (since v0.4.6) or ``colorama.init()``
|
| 84 |
+
(all versions, but may have other side-effects – see below).
|
| 85 |
+
|
| 86 |
+
An alternative approach is to install ``ansi.sys`` on Windows machines, which
|
| 87 |
+
provides the same behaviour for all applications running in terminals. Colorama
|
| 88 |
+
is intended for situations where that isn't easy (e.g., maybe your app doesn't
|
| 89 |
+
have an installer.)
|
| 90 |
+
|
| 91 |
+
Demo scripts in the source code repository print some colored text using
|
| 92 |
+
ANSI sequences. Compare their output under Gnome-terminal's built in ANSI
|
| 93 |
+
handling, versus on Windows Command-Prompt using Colorama:
|
| 94 |
+
|
| 95 |
+
.. image:: https://github.com/tartley/colorama/raw/master/screenshots/ubuntu-demo.png
|
| 96 |
+
:width: 661
|
| 97 |
+
:height: 357
|
| 98 |
+
:alt: ANSI sequences on Ubuntu under gnome-terminal.
|
| 99 |
+
|
| 100 |
+
.. image:: https://github.com/tartley/colorama/raw/master/screenshots/windows-demo.png
|
| 101 |
+
:width: 668
|
| 102 |
+
:height: 325
|
| 103 |
+
:alt: Same ANSI sequences on Windows, using Colorama.
|
| 104 |
+
|
| 105 |
+
These screenshots show that, on Windows, Colorama does not support ANSI 'dim
|
| 106 |
+
text'; it looks the same as 'normal text'.
|
| 107 |
+
|
| 108 |
+
Usage
|
| 109 |
+
-----
|
| 110 |
+
|
| 111 |
+
Initialisation
|
| 112 |
+
..............
|
| 113 |
+
|
| 114 |
+
If the only thing you want from Colorama is to get ANSI escapes to work on
|
| 115 |
+
Windows, then run:
|
| 116 |
+
|
| 117 |
+
.. code-block:: python
|
| 118 |
+
|
| 119 |
+
from colorama import just_fix_windows_console
|
| 120 |
+
just_fix_windows_console()
|
| 121 |
+
|
| 122 |
+
If you're on a recent version of Windows 10 or better, and your stdout/stderr
|
| 123 |
+
are pointing to a Windows console, then this will flip the magic configuration
|
| 124 |
+
switch to enable Windows' built-in ANSI support.
|
| 125 |
+
|
| 126 |
+
If you're on an older version of Windows, and your stdout/stderr are pointing to
|
| 127 |
+
a Windows console, then this will wrap ``sys.stdout`` and/or ``sys.stderr`` in a
|
| 128 |
+
magic file object that intercepts ANSI escape sequences and issues the
|
| 129 |
+
appropriate Win32 calls to emulate them.
|
| 130 |
+
|
| 131 |
+
In all other circumstances, it does nothing whatsoever. Basically the idea is
|
| 132 |
+
that this makes Windows act like Unix with respect to ANSI escape handling.
|
| 133 |
+
|
| 134 |
+
It's safe to call this function multiple times. It's safe to call this function
|
| 135 |
+
on non-Windows platforms, but it won't do anything. It's safe to call this
|
| 136 |
+
function when one or both of your stdout/stderr are redirected to a file – it
|
| 137 |
+
won't do anything to those streams.
|
| 138 |
+
|
| 139 |
+
Alternatively, you can use the older interface with more features (but also more
|
| 140 |
+
potential footguns):
|
| 141 |
+
|
| 142 |
+
.. code-block:: python
|
| 143 |
+
|
| 144 |
+
from colorama import init
|
| 145 |
+
init()
|
| 146 |
+
|
| 147 |
+
This does the same thing as ``just_fix_windows_console``, except for the
|
| 148 |
+
following differences:
|
| 149 |
+
|
| 150 |
+
- It's not safe to call ``init`` multiple times; you can end up with multiple
|
| 151 |
+
layers of wrapping and broken ANSI support.
|
| 152 |
+
|
| 153 |
+
- Colorama will apply a heuristic to guess whether stdout/stderr support ANSI,
|
| 154 |
+
and if it thinks they don't, then it will wrap ``sys.stdout`` and
|
| 155 |
+
``sys.stderr`` in a magic file object that strips out ANSI escape sequences
|
| 156 |
+
before printing them. This happens on all platforms, and can be convenient if
|
| 157 |
+
you want to write your code to emit ANSI escape sequences unconditionally, and
|
| 158 |
+
let Colorama decide whether they should actually be output. But note that
|
| 159 |
+
Colorama's heuristic is not particularly clever.
|
| 160 |
+
|
| 161 |
+
- ``init`` also accepts explicit keyword args to enable/disable various
|
| 162 |
+
functionality – see below.
|
| 163 |
+
|
| 164 |
+
To stop using Colorama before your program exits, simply call ``deinit()``.
|
| 165 |
+
This will restore ``stdout`` and ``stderr`` to their original values, so that
|
| 166 |
+
Colorama is disabled. To resume using Colorama again, call ``reinit()``; it is
|
| 167 |
+
cheaper than calling ``init()`` again (but does the same thing).
|
| 168 |
+
|
| 169 |
+
Most users should depend on ``colorama >= 0.4.6``, and use
|
| 170 |
+
``just_fix_windows_console``. The old ``init`` interface will be supported
|
| 171 |
+
indefinitely for backwards compatibility, but we don't plan to fix any issues
|
| 172 |
+
with it, also for backwards compatibility.
|
| 173 |
+
|
| 174 |
+
Colored Output
|
| 175 |
+
..............
|
| 176 |
+
|
| 177 |
+
Cross-platform printing of colored text can then be done using Colorama's
|
| 178 |
+
constant shorthand for ANSI escape sequences. These are deliberately
|
| 179 |
+
rudimentary, see below.
|
| 180 |
+
|
| 181 |
+
.. code-block:: python
|
| 182 |
+
|
| 183 |
+
from colorama import Fore, Back, Style
|
| 184 |
+
print(Fore.RED + 'some red text')
|
| 185 |
+
print(Back.GREEN + 'and with a green background')
|
| 186 |
+
print(Style.DIM + 'and in dim text')
|
| 187 |
+
print(Style.RESET_ALL)
|
| 188 |
+
print('back to normal now')
|
| 189 |
+
|
| 190 |
+
...or simply by manually printing ANSI sequences from your own code:
|
| 191 |
+
|
| 192 |
+
.. code-block:: python
|
| 193 |
+
|
| 194 |
+
print('\033[31m' + 'some red text')
|
| 195 |
+
print('\033[39m') # and reset to default color
|
| 196 |
+
|
| 197 |
+
...or, Colorama can be used in conjunction with existing ANSI libraries
|
| 198 |
+
such as the venerable `Termcolor <https://pypi.org/project/termcolor/>`_,
|
| 199 |
+
the fabulous `Blessings <https://pypi.org/project/blessings/>`_,
|
| 200 |
+
or the incredible `Rich <https://pypi.org/project/rich/>`_.
|
| 201 |
+
|
| 202 |
+
If you wish Colorama's Fore, Back and Style constants were more capable,
|
| 203 |
+
then consider using one of the above highly capable libraries to generate
|
| 204 |
+
colors, etc, and use Colorama just for its primary purpose: to convert
|
| 205 |
+
those ANSI sequences to also work on Windows:
|
| 206 |
+
|
| 207 |
+
SIMILARLY, do not send PRs adding the generation of new ANSI types to Colorama.
|
| 208 |
+
We are only interested in converting ANSI codes to win32 API calls, not
|
| 209 |
+
shortcuts like the above to generate ANSI characters.
|
| 210 |
+
|
| 211 |
+
.. code-block:: python
|
| 212 |
+
|
| 213 |
+
from colorama import just_fix_windows_console
|
| 214 |
+
from termcolor import colored
|
| 215 |
+
|
| 216 |
+
# use Colorama to make Termcolor work on Windows too
|
| 217 |
+
just_fix_windows_console()
|
| 218 |
+
|
| 219 |
+
# then use Termcolor for all colored text output
|
| 220 |
+
print(colored('Hello, World!', 'green', 'on_red'))
|
| 221 |
+
|
| 222 |
+
Available formatting constants are::
|
| 223 |
+
|
| 224 |
+
Fore: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
|
| 225 |
+
Back: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
|
| 226 |
+
Style: DIM, NORMAL, BRIGHT, RESET_ALL
|
| 227 |
+
|
| 228 |
+
``Style.RESET_ALL`` resets foreground, background, and brightness. Colorama will
|
| 229 |
+
perform this reset automatically on program exit.
|
| 230 |
+
|
| 231 |
+
These are fairly well supported, but not part of the standard::
|
| 232 |
+
|
| 233 |
+
Fore: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
|
| 234 |
+
Back: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
|
| 235 |
+
|
| 236 |
+
Cursor Positioning
|
| 237 |
+
..................
|
| 238 |
+
|
| 239 |
+
ANSI codes to reposition the cursor are supported. See ``demos/demo06.py`` for
|
| 240 |
+
an example of how to generate them.
|
| 241 |
+
|
| 242 |
+
Init Keyword Args
|
| 243 |
+
.................
|
| 244 |
+
|
| 245 |
+
``init()`` accepts some ``**kwargs`` to override default behaviour.
|
| 246 |
+
|
| 247 |
+
init(autoreset=False):
|
| 248 |
+
If you find yourself repeatedly sending reset sequences to turn off color
|
| 249 |
+
changes at the end of every print, then ``init(autoreset=True)`` will
|
| 250 |
+
automate that:
|
| 251 |
+
|
| 252 |
+
.. code-block:: python
|
| 253 |
+
|
| 254 |
+
from colorama import init
|
| 255 |
+
init(autoreset=True)
|
| 256 |
+
print(Fore.RED + 'some red text')
|
| 257 |
+
print('automatically back to default color again')
|
| 258 |
+
|
| 259 |
+
init(strip=None):
|
| 260 |
+
Pass ``True`` or ``False`` to override whether ANSI codes should be
|
| 261 |
+
stripped from the output. The default behaviour is to strip if on Windows
|
| 262 |
+
or if output is redirected (not a tty).
|
| 263 |
+
|
| 264 |
+
init(convert=None):
|
| 265 |
+
Pass ``True`` or ``False`` to override whether to convert ANSI codes in the
|
| 266 |
+
output into win32 calls. The default behaviour is to convert if on Windows
|
| 267 |
+
and output is to a tty (terminal).
|
| 268 |
+
|
| 269 |
+
init(wrap=True):
|
| 270 |
+
On Windows, Colorama works by replacing ``sys.stdout`` and ``sys.stderr``
|
| 271 |
+
with proxy objects, which override the ``.write()`` method to do their work.
|
| 272 |
+
If this wrapping causes you problems, then this can be disabled by passing
|
| 273 |
+
``init(wrap=False)``. The default behaviour is to wrap if ``autoreset`` or
|
| 274 |
+
``strip`` or ``convert`` are True.
|
| 275 |
+
|
| 276 |
+
When wrapping is disabled, colored printing on non-Windows platforms will
|
| 277 |
+
continue to work as normal. To do cross-platform colored output, you can
|
| 278 |
+
use Colorama's ``AnsiToWin32`` proxy directly:
|
| 279 |
+
|
| 280 |
+
.. code-block:: python
|
| 281 |
+
|
| 282 |
+
import sys
|
| 283 |
+
from colorama import init, AnsiToWin32
|
| 284 |
+
init(wrap=False)
|
| 285 |
+
stream = AnsiToWin32(sys.stderr).stream
|
| 286 |
+
|
| 287 |
+
# Python 2
|
| 288 |
+
print >>stream, Fore.BLUE + 'blue text on stderr'
|
| 289 |
+
|
| 290 |
+
# Python 3
|
| 291 |
+
print(Fore.BLUE + 'blue text on stderr', file=stream)
|
| 292 |
+
|
| 293 |
+
Recognised ANSI Sequences
|
| 294 |
+
.........................
|
| 295 |
+
|
| 296 |
+
ANSI sequences generally take the form::
|
| 297 |
+
|
| 298 |
+
ESC [ <param> ; <param> ... <command>
|
| 299 |
+
|
| 300 |
+
Where ``<param>`` is an integer, and ``<command>`` is a single letter. Zero or
|
| 301 |
+
more params are passed to a ``<command>``. If no params are passed, it is
|
| 302 |
+
generally synonymous with passing a single zero. No spaces exist in the
|
| 303 |
+
sequence; they have been inserted here simply to read more easily.
|
| 304 |
+
|
| 305 |
+
The only ANSI sequences that Colorama converts into win32 calls are::
|
| 306 |
+
|
| 307 |
+
ESC [ 0 m # reset all (colors and brightness)
|
| 308 |
+
ESC [ 1 m # bright
|
| 309 |
+
ESC [ 2 m # dim (looks same as normal brightness)
|
| 310 |
+
ESC [ 22 m # normal brightness
|
| 311 |
+
|
| 312 |
+
# FOREGROUND:
|
| 313 |
+
ESC [ 30 m # black
|
| 314 |
+
ESC [ 31 m # red
|
| 315 |
+
ESC [ 32 m # green
|
| 316 |
+
ESC [ 33 m # yellow
|
| 317 |
+
ESC [ 34 m # blue
|
| 318 |
+
ESC [ 35 m # magenta
|
| 319 |
+
ESC [ 36 m # cyan
|
| 320 |
+
ESC [ 37 m # white
|
| 321 |
+
ESC [ 39 m # reset
|
| 322 |
+
|
| 323 |
+
# BACKGROUND
|
| 324 |
+
ESC [ 40 m # black
|
| 325 |
+
ESC [ 41 m # red
|
| 326 |
+
ESC [ 42 m # green
|
| 327 |
+
ESC [ 43 m # yellow
|
| 328 |
+
ESC [ 44 m # blue
|
| 329 |
+
ESC [ 45 m # magenta
|
| 330 |
+
ESC [ 46 m # cyan
|
| 331 |
+
ESC [ 47 m # white
|
| 332 |
+
ESC [ 49 m # reset
|
| 333 |
+
|
| 334 |
+
# cursor positioning
|
| 335 |
+
ESC [ y;x H # position cursor at x across, y down
|
| 336 |
+
ESC [ y;x f # position cursor at x across, y down
|
| 337 |
+
ESC [ n A # move cursor n lines up
|
| 338 |
+
ESC [ n B # move cursor n lines down
|
| 339 |
+
ESC [ n C # move cursor n characters forward
|
| 340 |
+
ESC [ n D # move cursor n characters backward
|
| 341 |
+
|
| 342 |
+
# clear the screen
|
| 343 |
+
ESC [ mode J # clear the screen
|
| 344 |
+
|
| 345 |
+
# clear the line
|
| 346 |
+
ESC [ mode K # clear the line
|
| 347 |
+
|
| 348 |
+
Multiple numeric params to the ``'m'`` command can be combined into a single
|
| 349 |
+
sequence::
|
| 350 |
+
|
| 351 |
+
ESC [ 36 ; 45 ; 1 m # bright cyan text on magenta background
|
| 352 |
+
|
| 353 |
+
All other ANSI sequences of the form ``ESC [ <param> ; <param> ... <command>``
|
| 354 |
+
are silently stripped from the output on Windows.
|
| 355 |
+
|
| 356 |
+
Any other form of ANSI sequence, such as single-character codes or alternative
|
| 357 |
+
initial characters, is not recognised or stripped. It would be cool to add
|
| 358 |
+
them though. Let me know if it would be useful for you, via the Issues on
|
| 359 |
+
GitHub.
|
| 360 |
+
|
| 361 |
+
Status & Known Problems
|
| 362 |
+
-----------------------
|
| 363 |
+
|
| 364 |
+
I've personally only tested it on Windows XP (CMD, Console2), Ubuntu
|
| 365 |
+
(gnome-terminal, xterm), and OS X.
|
| 366 |
+
|
| 367 |
+
Some valid ANSI sequences aren't recognised.
|
| 368 |
+
|
| 369 |
+
If you're hacking on the code, see `README-hacking.md`_. ESPECIALLY, see the
|
| 370 |
+
explanation there of why we do not want PRs that allow Colorama to generate new
|
| 371 |
+
types of ANSI codes.
|
| 372 |
+
|
| 373 |
+
See outstanding issues and wish-list:
|
| 374 |
+
https://github.com/tartley/colorama/issues
|
| 375 |
+
|
| 376 |
+
If anything doesn't work for you, or doesn't do what you expected or hoped for,
|
| 377 |
+
I'd love to hear about it on that issues list, would be delighted by patches,
|
| 378 |
+
and would be happy to grant commit access to anyone who submits a working patch
|
| 379 |
+
or two.
|
| 380 |
+
|
| 381 |
+
.. _README-hacking.md: README-hacking.md
|
| 382 |
+
|
| 383 |
+
License
|
| 384 |
+
-------
|
| 385 |
+
|
| 386 |
+
Copyright Jonathan Hartley & Arnon Yaari, 2013-2020. BSD 3-Clause license; see
|
| 387 |
+
LICENSE file.
|
| 388 |
+
|
| 389 |
+
Professional support
|
| 390 |
+
--------------------
|
| 391 |
+
|
| 392 |
+
.. |tideliftlogo| image:: https://cdn2.hubspot.net/hubfs/4008838/website/logos/logos_for_download/Tidelift_primary-shorthand-logo.png
|
| 393 |
+
:alt: Tidelift
|
| 394 |
+
:target: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
|
| 395 |
+
|
| 396 |
+
.. list-table::
|
| 397 |
+
:widths: 10 100
|
| 398 |
+
|
| 399 |
+
* - |tideliftlogo|
|
| 400 |
+
- Professional support for colorama is available as part of the
|
| 401 |
+
`Tidelift Subscription`_.
|
| 402 |
+
Tidelift gives software development teams a single source for purchasing
|
| 403 |
+
and maintaining their software, with professional grade assurances from
|
| 404 |
+
the experts who know it best, while seamlessly integrating with existing
|
| 405 |
+
tools.
|
| 406 |
+
|
| 407 |
+
.. _Tidelift Subscription: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
|
| 408 |
+
|
| 409 |
+
Thanks
|
| 410 |
+
------
|
| 411 |
+
|
| 412 |
+
See the CHANGELOG for more thanks!
|
| 413 |
+
|
| 414 |
+
* Marc Schlaich (schlamar) for a ``setup.py`` fix for Python2.5.
|
| 415 |
+
* Marc Abramowitz, reported & fixed a crash on exit with closed ``stdout``,
|
| 416 |
+
providing a solution to issue #7's setuptools/distutils debate,
|
| 417 |
+
and other fixes.
|
| 418 |
+
* User 'eryksun', for guidance on correctly instantiating ``ctypes.windll``.
|
| 419 |
+
* Matthew McCormick for politely pointing out a longstanding crash on non-Win.
|
| 420 |
+
* Ben Hoyt, for a magnificent fix under 64-bit Windows.
|
| 421 |
+
* Jesse at Empty Square for submitting a fix for examples in the README.
|
| 422 |
+
* User 'jamessp', for an observant documentation fix for cursor positioning.
|
| 423 |
+
* User 'vaal1239', Dave Mckee & Lackner Kristof for a tiny but much-needed Win7
|
| 424 |
+
fix.
|
| 425 |
+
* Julien Stuyck, for wisely suggesting Python3 compatible updates to README.
|
| 426 |
+
* Daniel Griffith for multiple fabulous patches.
|
| 427 |
+
* Oscar Lesta for a valuable fix to stop ANSI chars being sent to non-tty
|
| 428 |
+
output.
|
| 429 |
+
* Roger Binns, for many suggestions, valuable feedback, & bug reports.
|
| 430 |
+
* Tim Golden for thought and much appreciated feedback on the initial idea.
|
| 431 |
+
* User 'Zearin' for updates to the README file.
|
| 432 |
+
* John Szakmeister for adding support for light colors
|
| 433 |
+
* Charles Merriam for adding documentation to demos
|
| 434 |
+
* Jurko for a fix on 64-bit Windows CPython2.5 w/o ctypes
|
| 435 |
+
* Florian Bruhin for a fix when stdout or stderr are None
|
| 436 |
+
* Thomas Weininger for fixing ValueError on Windows
|
| 437 |
+
* Remi Rampin for better Github integration and fixes to the README file
|
| 438 |
+
* Simeon Visser for closing a file handle using 'with' and updating classifiers
|
| 439 |
+
to include Python 3.3 and 3.4
|
| 440 |
+
* Andy Neff for fixing RESET of LIGHT_EX colors.
|
| 441 |
+
* Jonathan Hartley for the initial idea and implementation.
|
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/RECORD
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
colorama-0.4.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
colorama-0.4.6.dist-info/METADATA,sha256=e67SnrUMOym9sz_4TjF3vxvAV4T3aF7NyqRHHH3YEMw,17158
|
| 3 |
+
colorama-0.4.6.dist-info/RECORD,,
|
| 4 |
+
colorama-0.4.6.dist-info/WHEEL,sha256=cdcF4Fbd0FPtw2EMIOwH-3rSOTUdTCeOSXRMD1iLUb8,105
|
| 5 |
+
colorama-0.4.6.dist-info/licenses/LICENSE.txt,sha256=ysNcAmhuXQSlpxQL-zs25zrtSWZW6JEQLkKIhteTAxg,1491
|
| 6 |
+
colorama/__init__.py,sha256=wePQA4U20tKgYARySLEC047ucNX-g8pRLpYBuiHlLb8,266
|
| 7 |
+
colorama/__pycache__/__init__.cpython-39.pyc,,
|
| 8 |
+
colorama/__pycache__/ansi.cpython-39.pyc,,
|
| 9 |
+
colorama/__pycache__/ansitowin32.cpython-39.pyc,,
|
| 10 |
+
colorama/__pycache__/initialise.cpython-39.pyc,,
|
| 11 |
+
colorama/__pycache__/win32.cpython-39.pyc,,
|
| 12 |
+
colorama/__pycache__/winterm.cpython-39.pyc,,
|
| 13 |
+
colorama/ansi.py,sha256=Top4EeEuaQdBWdteKMEcGOTeKeF19Q-Wo_6_Cj5kOzQ,2522
|
| 14 |
+
colorama/ansitowin32.py,sha256=vPNYa3OZbxjbuFyaVo0Tmhmy1FZ1lKMWCnT7odXpItk,11128
|
| 15 |
+
colorama/initialise.py,sha256=-hIny86ClXo39ixh5iSCfUIa2f_h_bgKRDW7gqs-KLU,3325
|
| 16 |
+
colorama/tests/__init__.py,sha256=MkgPAEzGQd-Rq0w0PZXSX2LadRWhUECcisJY8lSrm4Q,75
|
| 17 |
+
colorama/tests/__pycache__/__init__.cpython-39.pyc,,
|
| 18 |
+
colorama/tests/__pycache__/ansi_test.cpython-39.pyc,,
|
| 19 |
+
colorama/tests/__pycache__/ansitowin32_test.cpython-39.pyc,,
|
| 20 |
+
colorama/tests/__pycache__/initialise_test.cpython-39.pyc,,
|
| 21 |
+
colorama/tests/__pycache__/isatty_test.cpython-39.pyc,,
|
| 22 |
+
colorama/tests/__pycache__/utils.cpython-39.pyc,,
|
| 23 |
+
colorama/tests/__pycache__/winterm_test.cpython-39.pyc,,
|
| 24 |
+
colorama/tests/ansi_test.py,sha256=FeViDrUINIZcr505PAxvU4AjXz1asEiALs9GXMhwRaE,2839
|
| 25 |
+
colorama/tests/ansitowin32_test.py,sha256=RN7AIhMJ5EqDsYaCjVo-o4u8JzDD4ukJbmevWKS70rY,10678
|
| 26 |
+
colorama/tests/initialise_test.py,sha256=BbPy-XfyHwJ6zKozuQOvNvQZzsx9vdb_0bYXn7hsBTc,6741
|
| 27 |
+
colorama/tests/isatty_test.py,sha256=Pg26LRpv0yQDB5Ac-sxgVXG7hsA1NYvapFgApZfYzZg,1866
|
| 28 |
+
colorama/tests/utils.py,sha256=1IIRylG39z5-dzq09R_ngufxyPZxgldNbrxKxUGwGKE,1079
|
| 29 |
+
colorama/tests/winterm_test.py,sha256=qoWFPEjym5gm2RuMwpf3pOis3a5r_PJZFCzK254JL8A,3709
|
| 30 |
+
colorama/win32.py,sha256=YQOKwMTwtGBbsY4dL5HYTvwTeP9wIQra5MvPNddpxZs,6181
|
| 31 |
+
colorama/winterm.py,sha256=XCQFDHjPi6AHYNdZwy0tA02H-Jh48Jp-HvCjeLeLp3U,7134
|
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: hatchling 1.11.1
|
| 3 |
+
Root-Is-Purelib: true
|
| 4 |
+
Tag: py2-none-any
|
| 5 |
+
Tag: py3-none-any
|
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/licenses/LICENSE.txt
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright (c) 2010 Jonathan Hartley
|
| 2 |
+
All rights reserved.
|
| 3 |
+
|
| 4 |
+
Redistribution and use in source and binary forms, with or without
|
| 5 |
+
modification, are permitted provided that the following conditions are met:
|
| 6 |
+
|
| 7 |
+
* Redistributions of source code must retain the above copyright notice, this
|
| 8 |
+
list of conditions and the following disclaimer.
|
| 9 |
+
|
| 10 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
| 11 |
+
this list of conditions and the following disclaimer in the documentation
|
| 12 |
+
and/or other materials provided with the distribution.
|
| 13 |
+
|
| 14 |
+
* Neither the name of the copyright holders, nor those of its contributors
|
| 15 |
+
may be used to endorse or promote products derived from this software without
|
| 16 |
+
specific prior written permission.
|
| 17 |
+
|
| 18 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
| 19 |
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
| 20 |
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
| 21 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
| 22 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
| 23 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 24 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
| 25 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
| 26 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 27 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
phivenv/Lib/site-packages/colorama/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
from .initialise import init, deinit, reinit, colorama_text, just_fix_windows_console
|
| 3 |
+
from .ansi import Fore, Back, Style, Cursor
|
| 4 |
+
from .ansitowin32 import AnsiToWin32
|
| 5 |
+
|
| 6 |
+
__version__ = '0.4.6'
|
| 7 |
+
|
phivenv/Lib/site-packages/colorama/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (438 Bytes). View file
|
|
|
phivenv/Lib/site-packages/colorama/__pycache__/ansi.cpython-39.pyc
ADDED
|
Binary file (3.19 kB). View file
|
|
|
phivenv/Lib/site-packages/colorama/__pycache__/ansitowin32.cpython-39.pyc
ADDED
|
Binary file (8.27 kB). View file
|
|
|
phivenv/Lib/site-packages/colorama/__pycache__/initialise.cpython-39.pyc
ADDED
|
Binary file (2.24 kB). View file
|
|
|
phivenv/Lib/site-packages/colorama/__pycache__/win32.cpython-39.pyc
ADDED
|
Binary file (4.42 kB). View file
|
|
|
phivenv/Lib/site-packages/colorama/__pycache__/winterm.cpython-39.pyc
ADDED
|
Binary file (5.22 kB). View file
|
|
|
phivenv/Lib/site-packages/colorama/ansi.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
'''
|
| 3 |
+
This module generates ANSI character codes for printing colors to terminals.
|
| 4 |
+
See: http://en.wikipedia.org/wiki/ANSI_escape_code
|
| 5 |
+
'''
|
| 6 |
+
|
| 7 |
+
CSI = '\033['
|
| 8 |
+
OSC = '\033]'
|
| 9 |
+
BEL = '\a'
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def code_to_chars(code):
|
| 13 |
+
return CSI + str(code) + 'm'
|
| 14 |
+
|
| 15 |
+
def set_title(title):
|
| 16 |
+
return OSC + '2;' + title + BEL
|
| 17 |
+
|
| 18 |
+
def clear_screen(mode=2):
|
| 19 |
+
return CSI + str(mode) + 'J'
|
| 20 |
+
|
| 21 |
+
def clear_line(mode=2):
|
| 22 |
+
return CSI + str(mode) + 'K'
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class AnsiCodes(object):
|
| 26 |
+
def __init__(self):
|
| 27 |
+
# the subclasses declare class attributes which are numbers.
|
| 28 |
+
# Upon instantiation we define instance attributes, which are the same
|
| 29 |
+
# as the class attributes but wrapped with the ANSI escape sequence
|
| 30 |
+
for name in dir(self):
|
| 31 |
+
if not name.startswith('_'):
|
| 32 |
+
value = getattr(self, name)
|
| 33 |
+
setattr(self, name, code_to_chars(value))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class AnsiCursor(object):
|
| 37 |
+
def UP(self, n=1):
|
| 38 |
+
return CSI + str(n) + 'A'
|
| 39 |
+
def DOWN(self, n=1):
|
| 40 |
+
return CSI + str(n) + 'B'
|
| 41 |
+
def FORWARD(self, n=1):
|
| 42 |
+
return CSI + str(n) + 'C'
|
| 43 |
+
def BACK(self, n=1):
|
| 44 |
+
return CSI + str(n) + 'D'
|
| 45 |
+
def POS(self, x=1, y=1):
|
| 46 |
+
return CSI + str(y) + ';' + str(x) + 'H'
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class AnsiFore(AnsiCodes):
|
| 50 |
+
BLACK = 30
|
| 51 |
+
RED = 31
|
| 52 |
+
GREEN = 32
|
| 53 |
+
YELLOW = 33
|
| 54 |
+
BLUE = 34
|
| 55 |
+
MAGENTA = 35
|
| 56 |
+
CYAN = 36
|
| 57 |
+
WHITE = 37
|
| 58 |
+
RESET = 39
|
| 59 |
+
|
| 60 |
+
# These are fairly well supported, but not part of the standard.
|
| 61 |
+
LIGHTBLACK_EX = 90
|
| 62 |
+
LIGHTRED_EX = 91
|
| 63 |
+
LIGHTGREEN_EX = 92
|
| 64 |
+
LIGHTYELLOW_EX = 93
|
| 65 |
+
LIGHTBLUE_EX = 94
|
| 66 |
+
LIGHTMAGENTA_EX = 95
|
| 67 |
+
LIGHTCYAN_EX = 96
|
| 68 |
+
LIGHTWHITE_EX = 97
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class AnsiBack(AnsiCodes):
|
| 72 |
+
BLACK = 40
|
| 73 |
+
RED = 41
|
| 74 |
+
GREEN = 42
|
| 75 |
+
YELLOW = 43
|
| 76 |
+
BLUE = 44
|
| 77 |
+
MAGENTA = 45
|
| 78 |
+
CYAN = 46
|
| 79 |
+
WHITE = 47
|
| 80 |
+
RESET = 49
|
| 81 |
+
|
| 82 |
+
# These are fairly well supported, but not part of the standard.
|
| 83 |
+
LIGHTBLACK_EX = 100
|
| 84 |
+
LIGHTRED_EX = 101
|
| 85 |
+
LIGHTGREEN_EX = 102
|
| 86 |
+
LIGHTYELLOW_EX = 103
|
| 87 |
+
LIGHTBLUE_EX = 104
|
| 88 |
+
LIGHTMAGENTA_EX = 105
|
| 89 |
+
LIGHTCYAN_EX = 106
|
| 90 |
+
LIGHTWHITE_EX = 107
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class AnsiStyle(AnsiCodes):
|
| 94 |
+
BRIGHT = 1
|
| 95 |
+
DIM = 2
|
| 96 |
+
NORMAL = 22
|
| 97 |
+
RESET_ALL = 0
|
| 98 |
+
|
| 99 |
+
Fore = AnsiFore()
|
| 100 |
+
Back = AnsiBack()
|
| 101 |
+
Style = AnsiStyle()
|
| 102 |
+
Cursor = AnsiCursor()
|
phivenv/Lib/site-packages/colorama/ansitowin32.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
import re
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style, BEL
|
| 7 |
+
from .winterm import enable_vt_processing, WinTerm, WinColor, WinStyle
|
| 8 |
+
from .win32 import windll, winapi_test
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
winterm = None
|
| 12 |
+
if windll is not None:
|
| 13 |
+
winterm = WinTerm()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class StreamWrapper(object):
|
| 17 |
+
'''
|
| 18 |
+
Wraps a stream (such as stdout), acting as a transparent proxy for all
|
| 19 |
+
attribute access apart from method 'write()', which is delegated to our
|
| 20 |
+
Converter instance.
|
| 21 |
+
'''
|
| 22 |
+
def __init__(self, wrapped, converter):
|
| 23 |
+
# double-underscore everything to prevent clashes with names of
|
| 24 |
+
# attributes on the wrapped stream object.
|
| 25 |
+
self.__wrapped = wrapped
|
| 26 |
+
self.__convertor = converter
|
| 27 |
+
|
| 28 |
+
def __getattr__(self, name):
|
| 29 |
+
return getattr(self.__wrapped, name)
|
| 30 |
+
|
| 31 |
+
def __enter__(self, *args, **kwargs):
|
| 32 |
+
# special method lookup bypasses __getattr__/__getattribute__, see
|
| 33 |
+
# https://stackoverflow.com/questions/12632894/why-doesnt-getattr-work-with-exit
|
| 34 |
+
# thus, contextlib magic methods are not proxied via __getattr__
|
| 35 |
+
return self.__wrapped.__enter__(*args, **kwargs)
|
| 36 |
+
|
| 37 |
+
def __exit__(self, *args, **kwargs):
|
| 38 |
+
return self.__wrapped.__exit__(*args, **kwargs)
|
| 39 |
+
|
| 40 |
+
def __setstate__(self, state):
|
| 41 |
+
self.__dict__ = state
|
| 42 |
+
|
| 43 |
+
def __getstate__(self):
|
| 44 |
+
return self.__dict__
|
| 45 |
+
|
| 46 |
+
def write(self, text):
|
| 47 |
+
self.__convertor.write(text)
|
| 48 |
+
|
| 49 |
+
def isatty(self):
|
| 50 |
+
stream = self.__wrapped
|
| 51 |
+
if 'PYCHARM_HOSTED' in os.environ:
|
| 52 |
+
if stream is not None and (stream is sys.__stdout__ or stream is sys.__stderr__):
|
| 53 |
+
return True
|
| 54 |
+
try:
|
| 55 |
+
stream_isatty = stream.isatty
|
| 56 |
+
except AttributeError:
|
| 57 |
+
return False
|
| 58 |
+
else:
|
| 59 |
+
return stream_isatty()
|
| 60 |
+
|
| 61 |
+
@property
|
| 62 |
+
def closed(self):
|
| 63 |
+
stream = self.__wrapped
|
| 64 |
+
try:
|
| 65 |
+
return stream.closed
|
| 66 |
+
# AttributeError in the case that the stream doesn't support being closed
|
| 67 |
+
# ValueError for the case that the stream has already been detached when atexit runs
|
| 68 |
+
except (AttributeError, ValueError):
|
| 69 |
+
return True
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class AnsiToWin32(object):
|
| 73 |
+
'''
|
| 74 |
+
Implements a 'write()' method which, on Windows, will strip ANSI character
|
| 75 |
+
sequences from the text, and if outputting to a tty, will convert them into
|
| 76 |
+
win32 function calls.
|
| 77 |
+
'''
|
| 78 |
+
ANSI_CSI_RE = re.compile('\001?\033\\[((?:\\d|;)*)([a-zA-Z])\002?') # Control Sequence Introducer
|
| 79 |
+
ANSI_OSC_RE = re.compile('\001?\033\\]([^\a]*)(\a)\002?') # Operating System Command
|
| 80 |
+
|
| 81 |
+
def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
|
| 82 |
+
# The wrapped stream (normally sys.stdout or sys.stderr)
|
| 83 |
+
self.wrapped = wrapped
|
| 84 |
+
|
| 85 |
+
# should we reset colors to defaults after every .write()
|
| 86 |
+
self.autoreset = autoreset
|
| 87 |
+
|
| 88 |
+
# create the proxy wrapping our output stream
|
| 89 |
+
self.stream = StreamWrapper(wrapped, self)
|
| 90 |
+
|
| 91 |
+
on_windows = os.name == 'nt'
|
| 92 |
+
# We test if the WinAPI works, because even if we are on Windows
|
| 93 |
+
# we may be using a terminal that doesn't support the WinAPI
|
| 94 |
+
# (e.g. Cygwin Terminal). In this case it's up to the terminal
|
| 95 |
+
# to support the ANSI codes.
|
| 96 |
+
conversion_supported = on_windows and winapi_test()
|
| 97 |
+
try:
|
| 98 |
+
fd = wrapped.fileno()
|
| 99 |
+
except Exception:
|
| 100 |
+
fd = -1
|
| 101 |
+
system_has_native_ansi = not on_windows or enable_vt_processing(fd)
|
| 102 |
+
have_tty = not self.stream.closed and self.stream.isatty()
|
| 103 |
+
need_conversion = conversion_supported and not system_has_native_ansi
|
| 104 |
+
|
| 105 |
+
# should we strip ANSI sequences from our output?
|
| 106 |
+
if strip is None:
|
| 107 |
+
strip = need_conversion or not have_tty
|
| 108 |
+
self.strip = strip
|
| 109 |
+
|
| 110 |
+
# should we convert ANSI sequences into win32 calls?
|
| 111 |
+
if convert is None:
|
| 112 |
+
convert = need_conversion and have_tty
|
| 113 |
+
self.convert = convert
|
| 114 |
+
|
| 115 |
+
# dict of ansi codes to win32 functions and parameters
|
| 116 |
+
self.win32_calls = self.get_win32_calls()
|
| 117 |
+
|
| 118 |
+
# are we wrapping stderr?
|
| 119 |
+
self.on_stderr = self.wrapped is sys.stderr
|
| 120 |
+
|
| 121 |
+
def should_wrap(self):
|
| 122 |
+
'''
|
| 123 |
+
True if this class is actually needed. If false, then the output
|
| 124 |
+
stream will not be affected, nor will win32 calls be issued, so
|
| 125 |
+
wrapping stdout is not actually required. This will generally be
|
| 126 |
+
False on non-Windows platforms, unless optional functionality like
|
| 127 |
+
autoreset has been requested using kwargs to init()
|
| 128 |
+
'''
|
| 129 |
+
return self.convert or self.strip or self.autoreset
|
| 130 |
+
|
| 131 |
+
def get_win32_calls(self):
|
| 132 |
+
if self.convert and winterm:
|
| 133 |
+
return {
|
| 134 |
+
AnsiStyle.RESET_ALL: (winterm.reset_all, ),
|
| 135 |
+
AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
|
| 136 |
+
AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
|
| 137 |
+
AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
|
| 138 |
+
AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
|
| 139 |
+
AnsiFore.RED: (winterm.fore, WinColor.RED),
|
| 140 |
+
AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
|
| 141 |
+
AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
|
| 142 |
+
AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
|
| 143 |
+
AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
|
| 144 |
+
AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
|
| 145 |
+
AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
|
| 146 |
+
AnsiFore.RESET: (winterm.fore, ),
|
| 147 |
+
AnsiFore.LIGHTBLACK_EX: (winterm.fore, WinColor.BLACK, True),
|
| 148 |
+
AnsiFore.LIGHTRED_EX: (winterm.fore, WinColor.RED, True),
|
| 149 |
+
AnsiFore.LIGHTGREEN_EX: (winterm.fore, WinColor.GREEN, True),
|
| 150 |
+
AnsiFore.LIGHTYELLOW_EX: (winterm.fore, WinColor.YELLOW, True),
|
| 151 |
+
AnsiFore.LIGHTBLUE_EX: (winterm.fore, WinColor.BLUE, True),
|
| 152 |
+
AnsiFore.LIGHTMAGENTA_EX: (winterm.fore, WinColor.MAGENTA, True),
|
| 153 |
+
AnsiFore.LIGHTCYAN_EX: (winterm.fore, WinColor.CYAN, True),
|
| 154 |
+
AnsiFore.LIGHTWHITE_EX: (winterm.fore, WinColor.GREY, True),
|
| 155 |
+
AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
|
| 156 |
+
AnsiBack.RED: (winterm.back, WinColor.RED),
|
| 157 |
+
AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
|
| 158 |
+
AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
|
| 159 |
+
AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
|
| 160 |
+
AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
|
| 161 |
+
AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
|
| 162 |
+
AnsiBack.WHITE: (winterm.back, WinColor.GREY),
|
| 163 |
+
AnsiBack.RESET: (winterm.back, ),
|
| 164 |
+
AnsiBack.LIGHTBLACK_EX: (winterm.back, WinColor.BLACK, True),
|
| 165 |
+
AnsiBack.LIGHTRED_EX: (winterm.back, WinColor.RED, True),
|
| 166 |
+
AnsiBack.LIGHTGREEN_EX: (winterm.back, WinColor.GREEN, True),
|
| 167 |
+
AnsiBack.LIGHTYELLOW_EX: (winterm.back, WinColor.YELLOW, True),
|
| 168 |
+
AnsiBack.LIGHTBLUE_EX: (winterm.back, WinColor.BLUE, True),
|
| 169 |
+
AnsiBack.LIGHTMAGENTA_EX: (winterm.back, WinColor.MAGENTA, True),
|
| 170 |
+
AnsiBack.LIGHTCYAN_EX: (winterm.back, WinColor.CYAN, True),
|
| 171 |
+
AnsiBack.LIGHTWHITE_EX: (winterm.back, WinColor.GREY, True),
|
| 172 |
+
}
|
| 173 |
+
return dict()
|
| 174 |
+
|
| 175 |
+
def write(self, text):
    '''
    Write text to the wrapped stream.

    When either strip or convert is set the text is routed through
    write_and_convert; otherwise it is written and flushed unchanged.
    If autoreset is set, reset_all is called after every write.
    '''
    passthrough = not (self.strip or self.convert)
    if passthrough:
        self.wrapped.write(text)
        self.wrapped.flush()
    else:
        self.write_and_convert(text)

    if self.autoreset:
        self.reset_all()
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def reset_all(self):
    '''
    Reset the terminal styling.

    In convert mode this issues the 'm' command with parameter 0 through
    call_win32. Otherwise, unless output is being stripped or the stream
    is closed, Style.RESET_ALL is written to the wrapped stream.
    '''
    if self.convert:
        self.call_win32('m', (0,))
        return
    if self.strip or self.stream.closed:
        # Nothing to emit: sequences are stripped or there is nowhere to write.
        return
    self.wrapped.write(Style.RESET_ALL)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def write_and_convert(self, text):
    '''
    Write text to the wrapped stream with its ANSI sequences removed.

    The text is first passed through convert_osc. Each ANSI_CSI_RE match
    is then handed to convert_ansi (which may turn it into win32 calls),
    and the plain text between matches is written via write_plain_text.
    '''
    text = self.convert_osc(text)
    position = 0
    for sequence in self.ANSI_CSI_RE.finditer(text):
        # Emit the plain text that precedes this sequence, then act on it.
        self.write_plain_text(text, position, sequence.start())
        self.convert_ansi(*sequence.groups())
        position = sequence.end()
    # Whatever follows the last sequence (or the whole text if none matched).
    self.write_plain_text(text, position, len(text))
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def write_plain_text(self, text, start, end):
    '''
    Write text[start:end] to the wrapped stream and flush it.

    Does nothing when the slice is empty (start is not below end).
    '''
    if not start < end:
        return
    segment = text[start:end]
    self.wrapped.write(segment)
    self.wrapped.flush()
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def convert_ansi(self, paramstring, command):
    '''
    Turn one parsed ANSI sequence into a win32 call.

    Only acts in convert mode; otherwise the sequence is simply dropped.
    '''
    if not self.convert:
        return
    self.call_win32(command, self.extract_params(command, paramstring))
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def extract_params(self, command, paramstring):
    '''
    Parse a ';'-separated parameter string into a tuple of ints.

    For the 'H' and 'f' commands, empty fields count as 1 and the result
    is padded with 1s to at least two values. For other commands empty
    fields are skipped; if nothing remains, 'J', 'K' and 'm' default to
    (0,), 'A'-'D' default to (1,), and anything else yields ().
    '''
    if command in 'Hf':
        coords = [int(field) if field else 1 for field in paramstring.split(';')]
        # defaults: pad a missing row/column with 1
        coords.extend([1] * (2 - len(coords)))
        return tuple(coords)

    values = tuple(int(field) for field in paramstring.split(';') if field)
    if values:
        return values
    # defaults:
    if command in 'JKm':
        return (0,)
    if command in 'ABCD':
        return (1,)
    return values
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def call_win32(self, command, params):
    '''
    Carry out one ANSI command through the win32 terminal helpers.

    'm' looks each parameter up in self.win32_calls and invokes the stored
    callable with its stored arguments; unknown parameters are ignored.
    'J' and 'K' erase the screen / line, 'H' and 'f' set the cursor
    position, and 'A'-'D' move the cursor relative to where it is.
    Every call is passed on_stderr=self.on_stderr. Other commands are
    ignored.
    '''
    if command == 'm':
        for code in params:
            if code not in self.win32_calls:
                continue
            entry = self.win32_calls[code]
            # entry is (callable, *positional_args)
            entry[0](*entry[1:], on_stderr=self.on_stderr)
        return

    if command in 'J':
        winterm.erase_screen(params[0], on_stderr=self.on_stderr)
        return

    if command in 'K':
        winterm.erase_line(params[0], on_stderr=self.on_stderr)
        return

    if command in 'Hf':     # cursor position - absolute
        winterm.set_cursor_position(params, on_stderr=self.on_stderr)
        return

    if command in 'ABCD':   # cursor position - relative
        distance = params[0]
        # A - up, B - down, C - forward, D - back
        offsets = {
            'A': (0, -distance),
            'B': (0, distance),
            'C': (distance, 0),
            'D': (-distance, 0),
        }
        dx, dy = offsets[command]
        winterm.cursor_adjust(dx, dy, on_stderr=self.on_stderr)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def convert_osc(self, text):
    '''
    Return text with every ANSI_OSC_RE match removed.

    For a match whose command is BEL and whose parameter string contains
    exactly one ';', the part before the ';' selects the action and the
    part after it is the argument:
      0 - change title and icon (we will only change title)
      1 - change icon (we don't support this)
      2 - change title
    Title changes are applied through winterm.set_title.
    '''
    kept = []
    cursor = 0
    for match in self.ANSI_OSC_RE.finditer(text):
        start, end = match.span()
        # Spans always index the string given to finditer, so slice that
        # original string with a running cursor. Cutting sequences out of a
        # shrinking copy would misplace every removal after the first.
        kept.append(text[cursor:start])
        cursor = end
        paramstring, command = match.groups()
        if command == BEL:
            if paramstring.count(";") == 1:
                params = paramstring.split(";")
                if params[0] in '02':
                    winterm.set_title(params[1])
    kept.append(text[cursor:])
    return ''.join(kept)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def flush(self):
    '''
    Flush the wrapped stream.
    '''
    target = self.wrapped
    target.flush()
|
phivenv/Lib/site-packages/colorama/initialise.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
import atexit
|
| 3 |
+
import contextlib
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
from .ansitowin32 import AnsiToWin32
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _wipe_internal_state_for_tests():
    '''
    Put every module-level state variable back to its initial value.

    Clears the saved original and wrapped streams, clears the atexit_done
    and fixed_windows_console flags, and unregisters reset_all from atexit
    where the interpreter supports that.
    '''
    global orig_stdout, orig_stderr
    global wrapped_stdout, wrapped_stderr
    global atexit_done, fixed_windows_console

    orig_stdout = orig_stderr = None
    wrapped_stdout = wrapped_stderr = None
    atexit_done = fixed_windows_console = False

    try:
        # no-op if it wasn't registered
        atexit.unregister(reset_all)
    except AttributeError:
        # python 2: no atexit.unregister. Oh well, we did our best.
        pass
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def reset_all():
    '''
    Reset terminal styling on the saved original stdout.

    Builds a throwaway AnsiToWin32 around orig_stdout and calls its
    reset_all method.
    '''
    if AnsiToWin32 is None:  # Issue #74: objects might become None at exit
        return
    AnsiToWin32(orig_stdout).reset_all()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def init(autoreset=False, convert=None, strip=None, wrap=True):
    '''
    Replace sys.stdout and sys.stderr with wrapped versions.

    The current streams are remembered in orig_stdout / orig_stderr and
    the results of wrap_stream in wrapped_stdout / wrapped_stderr. A
    stream that is None is left alone. On the first call reset_all is
    registered to run at interpreter exit.

    Raises ValueError if wrap is false while any of autoreset, convert or
    strip is truthy.
    '''
    global orig_stdout, orig_stderr
    global wrapped_stdout, wrapped_stderr
    global atexit_done

    if not wrap:
        if autoreset or convert or strip:
            raise ValueError('wrap=False conflicts with any other arg=True')

    orig_stdout, orig_stderr = sys.stdout, sys.stderr

    if orig_stdout is None:
        wrapped_stdout = None
    else:
        wrapped_stdout = wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
        sys.stdout = wrapped_stdout

    if orig_stderr is None:
        wrapped_stderr = None
    else:
        wrapped_stderr = wrap_stream(orig_stderr, convert, strip, autoreset, wrap)
        sys.stderr = wrapped_stderr

    if atexit_done:
        return
    atexit.register(reset_all)
    atexit_done = True
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def deinit():
    '''
    Put the saved original streams back on sys.stdout and sys.stderr.

    A saved stream that is None is skipped, leaving the current one alone.
    '''
    saved_streams = (('stdout', orig_stdout), ('stderr', orig_stderr))
    for attr, original in saved_streams:
        if original is not None:
            setattr(sys, attr, original)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def just_fix_windows_console():
    '''
    Make ANSI sequences work on the Windows console, at most once.

    Returns immediately when not on win32, when it has already run, or
    when init() has already stored a wrapped stream.
    '''
    global fixed_windows_console

    if sys.platform != "win32" or fixed_windows_console:
        return
    if not (wrapped_stdout is None and wrapped_stderr is None):
        # Someone already ran init() and it did stuff, so we won't second-guess them
        return

    # On newer versions of Windows, AnsiToWin32.__init__ will implicitly enable the
    # native ANSI support in the console as a side-effect. We only need to actually
    # replace sys.stdout/stderr if we're in the old-style conversion mode.
    for attr in ('stdout', 'stderr'):
        candidate = AnsiToWin32(
            getattr(sys, attr), convert=None, strip=None, autoreset=False)
        if candidate.convert:
            setattr(sys, attr, candidate)

    fixed_windows_console = True
|
| 94 |
+
|
| 95 |
+
@contextlib.contextmanager
def colorama_text(*args, **kwargs):
    '''
    Context manager that calls init() on entry and deinit() on exit.

    All arguments are forwarded to init(). deinit() runs even if the
    managed block raises.
    '''
    init(*args, **kwargs)
    try:
        yield
    finally:
        # Always restore the streams, whatever happened inside the block.
        deinit()
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def reinit():
    '''
    Re-install the wrapped streams saved by an earlier init() call.

    A wrapped stream that is None is skipped, leaving the current one alone.
    '''
    saved_wrappers = (('stdout', wrapped_stdout), ('stderr', wrapped_stderr))
    for attr, wrapped in saved_wrappers:
        if wrapped is not None:
            setattr(sys, attr, wrapped)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def wrap_stream(stream, convert, strip, autoreset, wrap):
    '''
    Return the stream to use in place of the given one.

    When wrap is false the stream is returned untouched. Otherwise an
    AnsiToWin32 wrapper is built with the given options; its .stream is
    returned if the wrapper reports should_wrap(), else the original.
    '''
    if not wrap:
        return stream
    candidate = AnsiToWin32(
        stream, convert=convert, strip=strip, autoreset=autoreset)
    return candidate.stream if candidate.should_wrap() else stream
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Run the reset once at import time: it also creates the module-level state variables, so the initial setup is not duplicated here.
|
| 121 |
+
_wipe_internal_state_for_tests()
|
phivenv/Lib/site-packages/colorama/tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
phivenv/Lib/site-packages/colorama/tests/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (158 Bytes). View file
|
|
|
phivenv/Lib/site-packages/colorama/tests/__pycache__/ansi_test.cpython-39.pyc
ADDED
|
Binary file (2.52 kB). View file
|
|
|