cranky-coder08 committed
Commit a91ea44 · verified · 1 Parent(s): b48a35b

Add files using upload-large-folder tool

Files changed (50). This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
  1. phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/INSTALLER +1 -0
  2. phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/METADATA +750 -0
  3. phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/RECORD +35 -0
  4. phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/WHEEL +5 -0
  5. phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/entry_points.txt +2 -0
  6. phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/licenses/LICENSE +21 -0
  7. phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/top_level.txt +1 -0
  8. phivenv/Lib/site-packages/charset_normalizer/__init__.py +48 -0
  9. phivenv/Lib/site-packages/charset_normalizer/__main__.py +6 -0
  10. phivenv/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-39.pyc +0 -0
  11. phivenv/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-39.pyc +0 -0
  12. phivenv/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-39.pyc +0 -0
  13. phivenv/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-39.pyc +0 -0
  14. phivenv/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-39.pyc +0 -0
  15. phivenv/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-39.pyc +0 -0
  16. phivenv/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-39.pyc +0 -0
  17. phivenv/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-39.pyc +0 -0
  18. phivenv/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-39.pyc +0 -0
  19. phivenv/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-39.pyc +0 -0
  20. phivenv/Lib/site-packages/charset_normalizer/cd.py +395 -0
  21. phivenv/Lib/site-packages/charset_normalizer/cli/__init__.py +8 -0
  22. phivenv/Lib/site-packages/charset_normalizer/cli/__main__.py +381 -0
  23. phivenv/Lib/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-39.pyc +0 -0
  24. phivenv/Lib/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-39.pyc +0 -0
  25. phivenv/Lib/site-packages/charset_normalizer/constant.py +2015 -0
  26. phivenv/Lib/site-packages/charset_normalizer/legacy.py +80 -0
  27. phivenv/Lib/site-packages/charset_normalizer/md.cp39-win_amd64.pyd +0 -0
  28. phivenv/Lib/site-packages/charset_normalizer/md.py +635 -0
  29. phivenv/Lib/site-packages/charset_normalizer/models.py +360 -0
  30. phivenv/Lib/site-packages/charset_normalizer/py.typed +0 -0
  31. phivenv/Lib/site-packages/charset_normalizer/utils.py +414 -0
  32. phivenv/Lib/site-packages/charset_normalizer/version.py +8 -0
  33. phivenv/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER +1 -0
  34. phivenv/Lib/site-packages/colorama-0.4.6.dist-info/METADATA +441 -0
  35. phivenv/Lib/site-packages/colorama-0.4.6.dist-info/RECORD +31 -0
  36. phivenv/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL +5 -0
  37. phivenv/Lib/site-packages/colorama-0.4.6.dist-info/licenses/LICENSE.txt +27 -0
  38. phivenv/Lib/site-packages/colorama/__init__.py +7 -0
  39. phivenv/Lib/site-packages/colorama/__pycache__/__init__.cpython-39.pyc +0 -0
  40. phivenv/Lib/site-packages/colorama/__pycache__/ansi.cpython-39.pyc +0 -0
  41. phivenv/Lib/site-packages/colorama/__pycache__/ansitowin32.cpython-39.pyc +0 -0
  42. phivenv/Lib/site-packages/colorama/__pycache__/initialise.cpython-39.pyc +0 -0
  43. phivenv/Lib/site-packages/colorama/__pycache__/win32.cpython-39.pyc +0 -0
  44. phivenv/Lib/site-packages/colorama/__pycache__/winterm.cpython-39.pyc +0 -0
  45. phivenv/Lib/site-packages/colorama/ansi.py +102 -0
  46. phivenv/Lib/site-packages/colorama/ansitowin32.py +277 -0
  47. phivenv/Lib/site-packages/colorama/initialise.py +121 -0
  48. phivenv/Lib/site-packages/colorama/tests/__init__.py +1 -0
  49. phivenv/Lib/site-packages/colorama/tests/__pycache__/__init__.cpython-39.pyc +0 -0
  50. phivenv/Lib/site-packages/colorama/tests/__pycache__/ansi_test.cpython-39.pyc +0 -0
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
1
+ pip
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/METADATA ADDED
@@ -0,0 +1,750 @@
1
+ Metadata-Version: 2.4
2
+ Name: charset-normalizer
3
+ Version: 3.4.3
4
+ Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
5
+ Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
6
+ Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
7
+ License: MIT
8
+ Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
9
+ Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
10
+ Project-URL: Code, https://github.com/jawah/charset_normalizer
11
+ Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
12
+ Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Programming Language :: Python :: 3.14
26
+ Classifier: Programming Language :: Python :: 3 :: Only
27
+ Classifier: Programming Language :: Python :: Implementation :: CPython
28
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
29
+ Classifier: Topic :: Text Processing :: Linguistic
30
+ Classifier: Topic :: Utilities
31
+ Classifier: Typing :: Typed
32
+ Requires-Python: >=3.7
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Provides-Extra: unicode-backport
36
+ Dynamic: license-file
37
+
38
+ <h1 align="center">Charset Detection, for Everyone 👋</h1>
39
+
40
+ <p align="center">
41
+ <sup>The Real First Universal Charset Detector</sup><br>
42
+ <a href="https://pypi.org/project/charset-normalizer">
43
+ <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
44
+ </a>
45
+ <a href="https://pepy.tech/project/charset-normalizer/">
46
+ <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
47
+ </a>
48
+ <a href="https://bestpractices.coreinfrastructure.org/projects/7297">
49
+ <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
50
+ </a>
51
+ </p>
52
+ <p align="center">
53
+ <sup><i>Featured Packages</i></sup><br>
54
+ <a href="https://github.com/jawah/niquests">
55
+ <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Most_Advanced_HTTP_Client-cyan">
56
+ </a>
57
+ <a href="https://github.com/jawah/wassima">
58
+ <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Replacement-cyan">
59
+ </a>
60
+ </p>
61
+ <p align="center">
62
+ <sup><i>In other languages (unofficial ports - by the community)</i></sup><br>
63
+ <a href="https://github.com/nickspring/charset-normalizer-rs">
64
+ <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
65
+ </a>
66
+ </p>
67
+
68
+ > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
69
+ > I'm trying to resolve the issue by taking a new approach.
70
+ > All IANA character set names for which the Python core library provides codecs are supported.
71
+
72
+ <p align="center">
73
+ >>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
74
+ </p>
75
+
76
+ This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
77
+
78
+ | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
79
+ |--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
80
+ | `Fast` | ❌ | ✅ | ✅ |
81
+ | `Universal**` | ❌ | ✅ | ❌ |
82
+ | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
83
+ | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
84
+ | `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
85
+ | `Native Python` | ✅ | ✅ | ❌ |
86
+ | `Detect spoken language` | ❌ | ✅ | N/A |
87
+ | `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
88
+ | `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
89
+ | `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
90
+
91
+ <p align="center">
92
+ <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
93
+ </p>
94
+
95
+ *\*\*: They clearly use encoding-specific code, even if it covers most of the encodings in common use.*<br>
96
+
97
+ ## ⚡ Performance
98
+
99
+ This package offers better performance than its counterpart Chardet. Here are some numbers.
100
+
101
+ | Package | Accuracy | Mean per file (ms) | File per sec (est) |
102
+ |-----------------------------------------------|:--------:|:------------------:|:------------------:|
103
+ | [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
104
+ | charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
105
+
106
+ | Package | 99th percentile | 95th percentile | 50th percentile |
107
+ |-----------------------------------------------|:---------------:|:---------------:|:---------------:|
108
+ | [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
109
+ | charset-normalizer | 100 ms | 50 ms | 5 ms |
110
+
111
+ _Updated as of December 2024 using CPython 3.12_
112
+
113
+ Chardet's performance on larger files (1 MB+) is very poor. Expect a huge difference on large payloads.
114
+
115
+ > Stats are generated from 400+ files using default parameters. For more details on the files used, see the GHA workflows.
116
+ > And yes, these results might change at any time. The dataset can be updated to include more files.
117
+ > The actual delays depend heavily on your CPU capabilities; the relative factors should remain the same.
118
+ > Keep in mind that the stats are generous and that Chardet's accuracy versus ours is measured using only Chardet's initial
119
+ > capabilities (i.e. its supported encodings). Challenge them if you want.
120
+
121
+ ## ✨ Installation
122
+
123
+ Using pip:
124
+
125
+ ```sh
126
+ pip install charset-normalizer -U
127
+ ```
128
+
129
+ ## 🚀 Basic Usage
130
+
131
+ ### CLI
132
+ This package comes with a CLI.
133
+
134
+ ```
135
+ usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
136
+ file [file ...]
137
+
138
+ The Real First Universal Charset Detector. Discover originating encoding used
139
+ on text file. Normalize text to unicode.
140
+
141
+ positional arguments:
142
+ files File(s) to be analysed
143
+
144
+ optional arguments:
145
+ -h, --help show this help message and exit
146
+ -v, --verbose Display complementary information about file if any.
147
+ Stdout will contain logs about the detection process.
148
+ -a, --with-alternative
149
+ Output complementary possibilities if any. Top-level
150
+ JSON WILL be a list.
151
+ -n, --normalize Permit to normalize input file. If not set, program
152
+ does not write anything.
153
+ -m, --minimal Only output the charset detected to STDOUT. Disabling
154
+ JSON output.
155
+ -r, --replace Replace file when trying to normalize it instead of
156
+ creating a new one.
157
+ -f, --force Replace file without asking if you are sure, use this
158
+ flag with caution.
159
+ -t THRESHOLD, --threshold THRESHOLD
160
+ Define a custom maximum amount of chaos allowed in
161
+ decoded content. 0. <= chaos <= 1.
162
+ --version Show version information and exit.
163
+ ```
164
+
165
+ ```bash
166
+ normalizer ./data/sample.1.fr.srt
167
+ ```
168
+
169
+ or
170
+
171
+ ```bash
172
+ python -m charset_normalizer ./data/sample.1.fr.srt
173
+ ```
174
+
175
+ 🎉 Since version 1.4.0 the CLI produces an easily consumable JSON result on stdout.
176
+
177
+ ```json
178
+ {
179
+ "path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
180
+ "encoding": "cp1252",
181
+ "encoding_aliases": [
182
+ "1252",
183
+ "windows_1252"
184
+ ],
185
+ "alternative_encodings": [
186
+ "cp1254",
187
+ "cp1256",
188
+ "cp1258",
189
+ "iso8859_14",
190
+ "iso8859_15",
191
+ "iso8859_16",
192
+ "iso8859_3",
193
+ "iso8859_9",
194
+ "latin_1",
195
+ "mbcs"
196
+ ],
197
+ "language": "French",
198
+ "alphabets": [
199
+ "Basic Latin",
200
+ "Latin-1 Supplement"
201
+ ],
202
+ "has_sig_or_bom": false,
203
+ "chaos": 0.149,
204
+ "coherence": 97.152,
205
+ "unicode_path": null,
206
+ "is_preferred": true
207
+ }
208
+ ```
209
+
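If you script around the CLI, that JSON report is straightforward to consume. A minimal sketch, reusing the sample path above; the keys match the report shown:

```python
import json
import subprocess

# Run the CLI and parse its JSON report from stdout.
raw = subprocess.run(
    ["normalizer", "./data/sample.1.fr.srt"],
    capture_output=True,
    text=True,
    check=True,
).stdout

report = json.loads(raw)
print(report["encoding"], report["language"])
```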
210
+ ### Python
211
+ *Just print out normalized text*
212
+ ```python
213
+ from charset_normalizer import from_path
214
+
215
+ results = from_path('./my_subtitle.srt')
216
+
217
+ print(str(results.best()))
218
+ ```
219
+
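The match returned by `best()` also exposes the metadata seen in the CLI report; for example (the printed values are illustrative):

```python
from charset_normalizer import from_path

results = from_path('./my_subtitle.srt')
best_guess = results.best()

if best_guess is not None:
    print(best_guess.encoding)  # e.g. 'cp1252'
    print(best_guess.language)  # e.g. 'French'
```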
220
+ *Upgrade your code without effort*
221
+ ```python
222
+ from charset_normalizer import detect
223
+ ```
224
+
225
+ The above code will behave the same as **chardet**'s `detect`. We ensure that we offer the best (reasonable) backward-compatible result possible.
226
+
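As a minimal sketch, the drop-in replacement can be exercised like this (the sample text is illustrative):

```python
from charset_normalizer import detect

# chardet-compatible output: a dict with 'encoding', 'language' and 'confidence'.
result = detect('Bсеки човек има право на образование.'.encode('utf_8'))
print(result['encoding'], result['confidence'])
```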
227
+ See the docs for advanced usage: [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
228
+
229
+ ## 😇 Why
230
+
231
+ When I started using Chardet, I noticed that it did not meet my expectations, and I wanted to propose a
232
+ reliable alternative using a completely different method. Also, I never back down from a good challenge!
233
+
234
+ I **don't care** about the **originating charset** encoding, because **two different tables** can
235
+ produce **two identical rendered strings.**
236
+ What I want is to get readable text, the best I can.
237
+
238
+ In a way, **I'm brute-forcing text decoding.** How cool is that? 😎
239
+
240
+ Don't confuse the **ftfy** package with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer converts a raw file in an unknown encoding to Unicode.
241
+
242
+ ## 🍰 How
243
+
244
+ - Discard all charset encoding tables that could not fit the binary content.
245
+ - Measure the noise, i.e. the mess, once the content is opened (in chunks) with a candidate charset encoding.
246
+ - Extract the matches with the lowest mess detected.
247
+ - Additionally, we measure coherence / probe for a language (a naive sketch of the whole pipeline follows below).
248
+
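To make those steps concrete, here is a deliberately naive sketch of the same idea. The candidate list and the mess metric are illustrative only, not the library's actual scoring:

```python
def naive_best_guess(payload: bytes, candidates=("utf_8", "cp1252", "latin_1")):
    """Try each candidate codec and keep the decoding with the least 'mess'."""
    best = None
    for codec in candidates:
        try:
            text = payload.decode(codec)
        except UnicodeDecodeError:
            continue  # discard tables that cannot fit the binary content
        # Toy noise measure: ratio of unprintable, non-space characters.
        mess = sum(not (c.isprintable() or c.isspace()) for c in text) / max(len(text), 1)
        if best is None or mess < best[0]:
            best = (mess, codec, text)
    return best  # (mess, codec, decoded text) or None
```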
249
+ **Wait a minute**, what are noise/mess and coherence according to **YOU?**
250
+
251
+ *Noise:* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
252
+ **I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
253
+ I know that my interpretation of what is noise is probably incomplete; feel free to contribute in order to
254
+ improve or rewrite it.
255
+
256
+ *Coherence:* For each language on Earth, we have computed ranked letter-occurrence frequencies (the best we can). I figured
257
+ that intel was worth something here, so I use those records against decoded text to check if I can detect intelligent design.
258
+
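A toy version of that coherence probe, assuming a made-up reference ranking (the real frequency tables live in `charset_normalizer.constant.FREQUENCIES`):

```python
from collections import Counter

# Hypothetical top-10 letter ranking for an imaginary language, for illustration only.
REFERENCE_RANKING = ["e", "a", "s", "t", "i", "n", "r", "u", "l", "o"]

def toy_coherence(decoded: str) -> float:
    """Share of the text's most frequent letters that appear in the reference ranking."""
    letters = [c for c in decoded.lower() if c.isalpha()]
    top = [c for c, _ in Counter(letters).most_common(len(REFERENCE_RANKING))]
    return len(set(top) & set(REFERENCE_RANKING)) / len(REFERENCE_RANKING)
```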
259
+ ## ⚡ Known limitations
260
+
261
+ - Language detection is unreliable when the text contains two or more languages sharing identical letters (e.g. HTML with English tags + Turkish content, both sharing Latin characters).
262
+ - Every charset detector heavily depends on having sufficient content. In common cases, do not bother running detection on very tiny content.
263
+
264
+ ## ⚠️ About Python EOLs
265
+
266
+ **If you are running:**
267
+
268
+ - Python >=2.7,<3.5: Unsupported
269
+ - Python 3.5: charset-normalizer < 2.1
270
+ - Python 3.6: charset-normalizer < 3.1
271
+ - Python 3.7: charset-normalizer < 4.0
272
+
273
+ Upgrade your Python interpreter as soon as possible.
274
+
275
+ ## 👤 Contributing
276
+
277
+ Contributions, issues and feature requests are very much welcome.<br />
278
+ Feel free to check the [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
279
+
280
+ ## 📝 License
281
+
282
+ Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
283
+ This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
284
+
285
+ Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
286
+
287
+ ## 💼 For Enterprise
288
+
289
+ Professional support for charset-normalizer is available as part of the [Tidelift
290
+ Subscription][1]. Tidelift gives software development teams a single source for
291
+ purchasing and maintaining their software, with professional grade assurances
292
+ from the experts who know it best, while seamlessly integrating with existing
293
+ tools.
294
+
295
+ [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
296
+
297
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
298
+
299
+ # Changelog
300
+ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
301
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
302
+
303
+ ## [3.4.3](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.3) (2025-08-09)
304
+
305
+ ### Changed
306
+ - mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
307
+ automatically lower the confidence on small byte samples that are not Unicode in the legacy `detect` function output. (#391)
308
+
309
+ ### Added
310
+ - Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
311
+ - Support for Python 3.14
312
+
313
+ ### Fixed
314
+ - sdist archive contained useless directories.
315
+ - automatically fallback on valid UTF-16 or UTF-32 even if the md says it's noisy. (#633)
316
+
317
+ ### Misc
318
+ SBOMs are automatically published to the relevant GitHub release to comply with regulatory changes.
319
+ Each published wheel comes with its SBOM. We chose CycloneDX as the format.
320
+ Prebuilt optimized wheels are no longer distributed by default for CPython 3.7 due to a change in cibuildwheel.
321
+
322
+ ## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
323
+
324
+ ### Fixed
325
+ - Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
326
+ - Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
327
+
328
+ ### Changed
329
+ - Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
330
+
331
+ ## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
332
+
333
+ ### Changed
334
+ - Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
335
+ Enforce delayed annotation loading for simpler and more consistent types in the project.
336
+ - Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
337
+
338
+ ### Added
339
+ - pre-commit configuration.
340
+ - noxfile.
341
+
342
+ ### Removed
343
+ - `build-requirements.txt` as per using `pyproject.toml` native build configuration.
344
+ - `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
345
+ - `setup.cfg` in favor of `pyproject.toml` metadata configuration.
346
+ - Unused `utils.range_scan` function.
347
+
348
+ ### Fixed
349
+ - Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
350
+ - Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
351
+
352
+ ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
353
+
354
+ ### Added
355
+ Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.
356
+ - Support for Python 3.13 (#512)
357
+
358
+ ### Fixed
359
+ - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
360
+ Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
361
+ - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
362
+
363
+ ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
364
+
365
+ ### Fixed
366
+ Unintentional memory usage regression when using large payloads that match several encodings (#376)
367
+ - Regression on some detection case showcased in the documentation (#371)
368
+
369
+ ### Added
370
+ Noise (md) probe that identifies malformed Arabic representations due to the presence of letters in isolated form (credit to my wife)
371
+
372
+ ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
373
+
374
+ ### Changed
375
+ - Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
376
+ - Improved the general detection reliability based on reports from the community
377
+
378
+ ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
379
+
380
+ ### Added
381
+ - Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
382
+ Support for 9 forgotten encodings that are supported by Python but unlisted in `encodings.aliases` as they have no alias (#323)
383
+
384
+ ### Removed
385
+ - (internal) Redundant utils.is_ascii function and unused function is_private_use_only
386
+ - (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
387
+
388
+ ### Changed
389
+ - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
390
+ - Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
391
+
392
+ ### Fixed
393
+ - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
394
+
395
+ ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
396
+
397
+ ### Changed
398
+ - Typehint for function `from_path` no longer enforce `PathLike` as its first argument
399
+ - Minor improvement over the global detection reliability
400
+
401
+ ### Added
402
+ Introduce function `is_binary` that relies on the main capabilities and is optimized to detect binaries
403
+ Propagate the `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp`, allowing deeper control over the detection (default True)
404
+ - Explicit support for Python 3.12
405
+
406
+ ### Fixed
407
+ - Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
408
+
409
+ ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
410
+
411
+ ### Added
412
+ - Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
413
+
414
+ ### Removed
415
+ - Support for Python 3.6 (PR #260)
416
+
417
+ ### Changed
418
+ - Optional speedup provided by mypy/c 1.0.1
419
+
420
+ ## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
421
+
422
+ ### Fixed
423
+ Multi-byte cutter/chunk generator did not always cut correctly (PR #233)
424
+
425
+ ### Changed
426
+ - Speedup provided by mypy/c 0.990 on Python >= 3.7
427
+
428
+ ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
429
+
430
+ ### Added
431
+ Extend the capability of explain=True when cp_isolation contains at most two entries (min. one); it will log the Mess-detector results in detail
432
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
433
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
434
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
435
+
436
+ ### Changed
437
+ - Build with static metadata using 'build' frontend
438
+ - Make the language detection stricter
439
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
440
+
441
+ ### Fixed
442
+ CLI with opt --normalize failed when using a full path for files
443
+ TooManyAccentuatedPlugin induced false positives on the mess detection when too few alpha characters had been fed to it
444
+ - Sphinx warnings when generating the documentation
445
+
446
+ ### Removed
447
+ Coherence detector no longer returns 'Simple English'; it returns 'English' instead
448
+ Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
449
+ - Breaking: Method `first()` and `best()` from CharsetMatch
450
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
451
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
452
+ - Breaking: Top-level function `normalize`
453
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
454
+ - Support for the backport `unicodedata2`
455
+
456
+ ## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
457
+
458
+ ### Added
459
+ Extend the capability of explain=True when cp_isolation contains at most two entries (min. one); it will log the Mess-detector results in detail
460
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
461
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
462
+
463
+ ### Changed
464
+ - Build with static metadata using 'build' frontend
465
+ - Make the language detection stricter
466
+
467
+ ### Fixed
468
+ CLI with opt --normalize failed when using a full path for files
469
+ TooManyAccentuatedPlugin induced false positives on the mess detection when too few alpha characters had been fed to it
470
+
471
+ ### Removed
472
+ Coherence detector no longer returns 'Simple English'; it returns 'English' instead
473
+ Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
474
+
475
+ ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
476
+
477
+ ### Added
478
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
479
+
480
+ ### Removed
481
+ - Breaking: Method `first()` and `best()` from CharsetMatch
482
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
483
+
484
+ ### Fixed
485
+ - Sphinx warnings when generating the documentation
486
+
487
+ ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
488
+
489
+ ### Changed
490
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
491
+
492
+ ### Removed
493
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
494
+ - Breaking: Top-level function `normalize`
495
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
496
+ - Support for the backport `unicodedata2`
497
+
498
+ ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
499
+
500
+ ### Deprecated
501
+ - Function `normalize` scheduled for removal in 3.0
502
+
503
+ ### Changed
504
+ - Removed useless call to decode in fn is_unprintable (#206)
505
+
506
+ ### Fixed
507
+ - Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
508
+
509
+ ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
510
+
511
+ ### Added
512
+ - Output the Unicode table version when running the CLI with `--version` (PR #194)
513
+
514
+ ### Changed
515
+ - Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
516
+ - Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
517
+
518
+ ### Fixed
519
+ - Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
520
+ - CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
521
+
522
+ ### Removed
523
+ - Support for Python 3.5 (PR #192)
524
+
525
+ ### Deprecated
526
+ - Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
527
+
528
+ ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
529
+
530
+ ### Fixed
531
+ ASCII mis-detection in rare cases (PR #170)
532
+
533
+ ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
534
+
535
+ ### Added
536
+ - Explicit support for Python 3.11 (PR #164)
537
+
538
+ ### Changed
539
+ The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
540
+
541
+ ## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
542
+
543
+ ### Fixed
544
+ - Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
545
+
546
+ ### Changed
547
+ - Skipping the language-detection (CD) on ASCII (PR #155)
548
+
549
+ ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
550
+
551
+ ### Changed
552
+ - Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
553
+
554
+ ### Fixed
555
+ - Wrong logging level applied when setting kwarg `explain` to True (PR #146)
556
+
557
+ ## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
558
+ ### Changed
559
+ - Improvement over Vietnamese detection (PR #126)
560
+ - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
561
+ - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
562
+ - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
563
+ - Code style as refactored by Sourcery-AI (PR #131)
564
+ - Minor adjustment on the MD around european words (PR #133)
565
+ - Remove and replace SRTs from assets / tests (PR #139)
566
+ - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
567
+ - Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
568
+
569
+ ### Fixed
570
+ - Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
571
+ - Avoid using too insignificant chunk (PR #137)
572
+
573
+ ### Added
574
+ - Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
575
+ - Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
576
+
577
+ ## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
578
+ ### Added
579
+ - Add support for Kazakh (Cyrillic) language detection (PR #109)
580
+
581
+ ### Changed
582
+ Further improved inferring the language from a given single-byte code page (PR #112)
583
+ - Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
584
+ - Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
585
+ - Various detection improvement (MD+CD) (PR #117)
586
+
587
+ ### Removed
588
+ - Remove redundant logging entry about detected language(s) (PR #115)
589
+
590
+ ### Fixed
591
+ - Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
592
+
593
+ ## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
594
+ ### Fixed
595
+ Unforeseen regression causing the loss of backward compatibility with some older minor versions of Python 3.5.x (PR #100)
596
+ - Fix CLI crash when using --minimal output in certain cases (PR #103)
597
+
598
+ ### Changed
599
+ - Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
600
+
601
+ ## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
602
+ ### Changed
603
+ The project now complies with: flake8, mypy, isort and black to ensure better overall quality (PR #81)
604
+ - The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
605
+ - The Unicode detection is slightly improved (PR #93)
606
+ - Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
607
+
608
+ ### Removed
609
+ The project no longer raises a warning on tiny content given for detection; it is simply logged as a warning instead (PR #92)
610
+
611
+ ### Fixed
612
+ In some rare cases, the chunk extractor could cut in the middle of a multi-byte character and mislead the mess detection (PR #95)
613
+ - Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
614
+ - The MANIFEST.in was not exhaustive (PR #78)
615
+
616
+ ## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
617
+ ### Fixed
618
+ The CLI no longer raises an unexpected exception when no encoding has been found (PR #70)
619
+ - Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
620
+ - The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
621
+ - Submatch factoring could be wrong in rare edge cases (PR #72)
622
+ - Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
623
+ - Fix line endings from CRLF to LF for certain project files (PR #67)
624
+
625
+ ### Changed
626
+ - Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
627
+ - Allow fallback on specified encoding if any (PR #71)
628
+
629
+ ## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
630
+ ### Changed
631
+ - Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
632
+ - According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
633
+
634
+ ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
635
+ ### Fixed
636
+ Empty/too-small JSON payload mis-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
637
+
638
+ ### Changed
639
+ - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
640
+
641
+ ## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
642
+ ### Fixed
643
+ - Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
644
+ Using explain=False permanently disabled the verbose output in the current runtime (PR #47)
645
+ One log entry (language target preemptive) was not shown in the logs when using explain=True (PR #47)
646
+ - Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
647
+
648
+ ### Changed
649
+ - Public function normalize default args values were not aligned with from_bytes (PR #53)
650
+
651
+ ### Added
652
+ - You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
653
+
654
+ ## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
655
+ ### Changed
656
+ 4 to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
657
+ Emphasis has been placed on UTF-8 detection, which should now perform near-instantaneously.
658
+ - The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
659
+ - The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
660
+ The program has been rewritten to ease readability and maintainability (now using static typing).
661
+ - utf_7 detection has been reinstated.
662
+
663
+ ### Removed
664
+ This package no longer requires anything when used with Python 3.5 (dropped cached_property)
665
+ Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
666
+ - The exception hook on UnicodeDecodeError has been removed.
667
+
668
+ ### Deprecated
669
+ - Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
670
+
671
+ ### Fixed
672
+ The CLI output used the relative path of the file(s); it should be absolute.
673
+
674
+ ## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
675
+ ### Fixed
676
+ - Logger configuration/usage no longer conflict with others (PR #44)
677
+
678
+ ## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
679
+ ### Removed
680
+ - Using standard logging instead of using the package loguru.
681
+ - Dropping nose test framework in favor of the maintained pytest.
682
+ - Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
683
+ - Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
684
+ - Stop support for UTF-7 that does not contain a SIG.
685
+ - Dropping PrettyTable, replaced with pure JSON output in CLI.
686
+
687
+ ### Fixed
688
+ - BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
689
+ - Not searching properly for the BOM when trying utf32/16 parent codec.
690
+
691
+ ### Changed
692
+ - Improving the package final size by compressing frequencies.json.
693
+ Huge improvement for the largest payloads.
694
+
695
+ ### Added
696
+ - CLI now produces JSON consumable output.
697
+ - Return ASCII if given sequences fit. Given reasonable confidence.
698
+
699
+ ## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
700
+
701
+ ### Fixed
702
+ - In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
703
+
704
+ ## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
705
+
706
+ ### Fixed
707
+ - Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
708
+
709
+ ## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
710
+
711
+ ### Fixed
712
+ - The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
713
+
714
+ ## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
715
+
716
+ ### Changed
717
+ - Amend the previous release to allow prettytable 2.0 (PR #35)
718
+
719
+ ## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
720
+
721
+ ### Fixed
722
+ - Fix error while using the package with a python pre-release interpreter (PR #33)
723
+
724
+ ### Changed
725
+ - Dependencies refactoring, constraints revised.
726
+
727
+ ### Added
728
+ - Add python 3.9 and 3.10 to the supported interpreters
729
+
730
+ MIT License
731
+
732
+ Copyright (c) 2025 TAHRI Ahmed R.
733
+
734
+ Permission is hereby granted, free of charge, to any person obtaining a copy
735
+ of this software and associated documentation files (the "Software"), to deal
736
+ in the Software without restriction, including without limitation the rights
737
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
738
+ copies of the Software, and to permit persons to whom the Software is
739
+ furnished to do so, subject to the following conditions:
740
+
741
+ The above copyright notice and this permission notice shall be included in all
742
+ copies or substantial portions of the Software.
743
+
744
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
745
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
746
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
747
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
748
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
749
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
750
+ SOFTWARE.
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/RECORD ADDED
@@ -0,0 +1,35 @@
1
+ ../../Scripts/normalizer.exe,sha256=IvfL1xIwLcN8AdjczrodNaXLaHBUERWBgh7YbfqJYUw,106364
2
+ charset_normalizer-3.4.3.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
3
+ charset_normalizer-3.4.3.dist-info/METADATA,sha256=tqX3UoI-UkqIN99aZsk646yI4NgMbu1MjlKr6BbITG4,37450
4
+ charset_normalizer-3.4.3.dist-info/RECORD,,
5
+ charset_normalizer-3.4.3.dist-info/WHEEL,sha256=XkFE14KmFh7mutkkb-qn_ueuH2lwfT8rLdfc5xpQ7wE,99
6
+ charset_normalizer-3.4.3.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
7
+ charset_normalizer-3.4.3.dist-info/licenses/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
8
+ charset_normalizer-3.4.3.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
9
+ charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
10
+ charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
11
+ charset_normalizer/__pycache__/__init__.cpython-39.pyc,,
12
+ charset_normalizer/__pycache__/__main__.cpython-39.pyc,,
13
+ charset_normalizer/__pycache__/api.cpython-39.pyc,,
14
+ charset_normalizer/__pycache__/cd.cpython-39.pyc,,
15
+ charset_normalizer/__pycache__/constant.cpython-39.pyc,,
16
+ charset_normalizer/__pycache__/legacy.cpython-39.pyc,,
17
+ charset_normalizer/__pycache__/md.cpython-39.pyc,,
18
+ charset_normalizer/__pycache__/models.cpython-39.pyc,,
19
+ charset_normalizer/__pycache__/utils.cpython-39.pyc,,
20
+ charset_normalizer/__pycache__/version.cpython-39.pyc,,
21
+ charset_normalizer/api.py,sha256=ODy4hX78b3ldTl5sViYPU1yzQ5qkclfgSIFE8BtNrTI,23337
22
+ charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
23
+ charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
24
+ charset_normalizer/cli/__main__.py,sha256=-pdJCyPywouPyFsC8_eTSgTmvh1YEvgjsvy1WZ0XjaA,13027
25
+ charset_normalizer/cli/__pycache__/__init__.cpython-39.pyc,,
26
+ charset_normalizer/cli/__pycache__/__main__.cpython-39.pyc,,
27
+ charset_normalizer/constant.py,sha256=mCJmYzpBU27Ut9kiNWWoBbhhxQ-aRVw3K7LSwoFwBGI,44728
28
+ charset_normalizer/legacy.py,sha256=ui08NlKqAXU3Y7smK-NFJjEgRRQz9ruM7aNCbT0OOrE,2811
29
+ charset_normalizer/md.cp39-win_amd64.pyd,sha256=GBRkMtCJSwm_0H_fJ-Jus0DdpkxHcWVC4XcSnC_seLk,10752
30
+ charset_normalizer/md.py,sha256=LSuW2hNgXSgF7JGdRapLAHLuj6pABHiP85LTNAYmu7c,20780
31
+ charset_normalizer/md__mypyc.cp39-win_amd64.pyd,sha256=CZOPvYPp7PJ4wdp_LKOtla0M0e856CwbTsusjGtnb_k,125440
32
+ charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
33
+ charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ charset_normalizer/utils.py,sha256=XtWIQeOuz7cnGebMzyi4Vvi1JtA84QBSIeR9PDzF7pw,12584
35
+ charset_normalizer/version.py,sha256=laniWEeVCCfwRgYLf_rZ2f0qWaNwWTEXQEfUUL_MMvw,123
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp39-cp39-win_amd64
5
+
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ normalizer = charset_normalizer.cli:cli_detect
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 TAHRI Ahmed R.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
phivenv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ charset_normalizer
phivenv/Lib/site-packages/charset_normalizer/__init__.py ADDED
@@ -0,0 +1,48 @@
1
+ """
2
+ Charset-Normalizer
3
+ ~~~~~~~~~~~~~~
4
+ The Real First Universal Charset Detector.
5
+ A library that helps you read text from an unknown charset encoding.
6
+ Motivated by chardet, this package tries to resolve the issue by taking a new approach.
7
+ All IANA character set names for which the Python core library provides codecs are supported.
8
+
9
+ Basic usage:
10
+ >>> from charset_normalizer import from_bytes
11
+ >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
12
+ >>> best_guess = results.best()
13
+ >>> str(best_guess)
14
+ 'Bсеки човек има право на образование. Oбразованието!'
15
+
16
+ Other methods and usages are available - see the full documentation
17
+ at <https://github.com/Ousret/charset_normalizer>.
18
+ :copyright: (c) 2021 by Ahmed TAHRI
19
+ :license: MIT, see LICENSE for more details.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+
26
+ from .api import from_bytes, from_fp, from_path, is_binary
27
+ from .legacy import detect
28
+ from .models import CharsetMatch, CharsetMatches
29
+ from .utils import set_logging_handler
30
+ from .version import VERSION, __version__
31
+
32
+ __all__ = (
33
+ "from_fp",
34
+ "from_path",
35
+ "from_bytes",
36
+ "is_binary",
37
+ "detect",
38
+ "CharsetMatch",
39
+ "CharsetMatches",
40
+ "__version__",
41
+ "VERSION",
42
+ "set_logging_handler",
43
+ )
44
+
45
+ # Attach a NullHandler to the top level logger by default
46
+ # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
47
+
48
+ logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
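Because only a `NullHandler` is attached, applications that want to see the detection logs must opt in themselves. A minimal sketch using the exported helper, relying on its default arguments:

```python
import logging

from charset_normalizer import from_bytes, set_logging_handler

# Attach a real handler and lower the level to surface the detector's reasoning.
set_logging_handler(level=logging.DEBUG)
from_bytes(b"Hello, world!")
```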
phivenv/Lib/site-packages/charset_normalizer/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ from .cli import cli_detect
4
+
5
+ if __name__ == "__main__":
6
+ cli_detect()
phivenv/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (1.61 kB).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-39.pyc ADDED
Binary file (289 Bytes).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-39.pyc ADDED
Binary file (11.6 kB).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-39.pyc ADDED
Binary file (9.59 kB).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-39.pyc ADDED
Binary file (27.3 kB).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-39.pyc ADDED
Binary file (2.25 kB).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-39.pyc ADDED
Binary file (16.8 kB).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-39.pyc ADDED
Binary file (11.9 kB).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-39.pyc ADDED
Binary file (9.1 kB).
 
phivenv/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-39.pyc ADDED
Binary file (297 Bytes).
 
phivenv/Lib/site-packages/charset_normalizer/cd.py ADDED
@@ -0,0 +1,395 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ from codecs import IncrementalDecoder
5
+ from collections import Counter
6
+ from functools import lru_cache
7
+ from typing import Counter as TypeCounter
8
+
9
+ from .constant import (
10
+ FREQUENCIES,
11
+ KO_NAMES,
12
+ LANGUAGE_SUPPORTED_COUNT,
13
+ TOO_SMALL_SEQUENCE,
14
+ ZH_NAMES,
15
+ )
16
+ from .md import is_suspiciously_successive_range
17
+ from .models import CoherenceMatches
18
+ from .utils import (
19
+ is_accentuated,
20
+ is_latin,
21
+ is_multi_byte_encoding,
22
+ is_unicode_range_secondary,
23
+ unicode_range,
24
+ )
25
+
26
+
27
+ def encoding_unicode_range(iana_name: str) -> list[str]:
28
+ """
29
+ Return associated unicode ranges in a single byte code page.
30
+ """
31
+ if is_multi_byte_encoding(iana_name):
32
+ raise OSError("Function not supported on multi-byte code page")
33
+
34
+ decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
35
+
36
+ p: IncrementalDecoder = decoder(errors="ignore")
37
+ seen_ranges: dict[str, int] = {}
38
+ character_count: int = 0
39
+
40
+ for i in range(0x40, 0xFF):
41
+ chunk: str = p.decode(bytes([i]))
42
+
43
+ if chunk:
44
+ character_range: str | None = unicode_range(chunk)
45
+
46
+ if character_range is None:
47
+ continue
48
+
49
+ if is_unicode_range_secondary(character_range) is False:
50
+ if character_range not in seen_ranges:
51
+ seen_ranges[character_range] = 0
52
+ seen_ranges[character_range] += 1
53
+ character_count += 1
54
+
55
+ return sorted(
56
+ [
57
+ character_range
58
+ for character_range in seen_ranges
59
+ if seen_ranges[character_range] / character_count >= 0.15
60
+ ]
61
+ )
62
+
63
+
64
+ def unicode_range_languages(primary_range: str) -> list[str]:
65
+ """
66
+ Return inferred languages used with a unicode range.
67
+ """
68
+ languages: list[str] = []
69
+
70
+ for language, characters in FREQUENCIES.items():
71
+ for character in characters:
72
+ if unicode_range(character) == primary_range:
73
+ languages.append(language)
74
+ break
75
+
76
+ return languages
77
+
78
+
79
+ @lru_cache()
80
+ def encoding_languages(iana_name: str) -> list[str]:
81
+ """
82
+ Single-byte encoding language association. Some code page are heavily linked to particular language(s).
83
+ This function does the correspondence.
84
+ """
85
+ unicode_ranges: list[str] = encoding_unicode_range(iana_name)
86
+ primary_range: str | None = None
87
+
88
+ for specified_range in unicode_ranges:
89
+ if "Latin" not in specified_range:
90
+ primary_range = specified_range
91
+ break
92
+
93
+ if primary_range is None:
94
+ return ["Latin Based"]
95
+
96
+ return unicode_range_languages(primary_range)
97
+
98
+
99
+ @lru_cache()
100
+ def mb_encoding_languages(iana_name: str) -> list[str]:
101
+ """
102
+ Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
103
+ This function does the correspondence.
104
+ """
105
+ if (
106
+ iana_name.startswith("shift_")
107
+ or iana_name.startswith("iso2022_jp")
108
+ or iana_name.startswith("euc_j")
109
+ or iana_name == "cp932"
110
+ ):
111
+ return ["Japanese"]
112
+ if iana_name.startswith("gb") or iana_name in ZH_NAMES:
113
+ return ["Chinese"]
114
+ if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
115
+ return ["Korean"]
116
+
117
+ return []
118
+
119
+
120
+ @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
121
+ def get_target_features(language: str) -> tuple[bool, bool]:
122
+ """
123
+ Determine main aspects from a supported language if it contains accents and if is pure Latin.
124
+ """
125
+ target_have_accents: bool = False
126
+ target_pure_latin: bool = True
127
+
128
+ for character in FREQUENCIES[language]:
129
+ if not target_have_accents and is_accentuated(character):
130
+ target_have_accents = True
131
+ if target_pure_latin and is_latin(character) is False:
132
+ target_pure_latin = False
133
+
134
+ return target_have_accents, target_pure_latin
135
+
136
+
137
+ def alphabet_languages(
138
+ characters: list[str], ignore_non_latin: bool = False
139
+ ) -> list[str]:
140
+ """
141
+ Return the languages associated with the given characters.
142
+ """
143
+ languages: list[tuple[str, float]] = []
144
+
145
+ source_have_accents = any(is_accentuated(character) for character in characters)
146
+
147
+ for language, language_characters in FREQUENCIES.items():
148
+ target_have_accents, target_pure_latin = get_target_features(language)
149
+
150
+ if ignore_non_latin and target_pure_latin is False:
151
+ continue
152
+
153
+ if target_have_accents is False and source_have_accents:
154
+ continue
155
+
156
+ character_count: int = len(language_characters)
157
+
158
+ character_match_count: int = len(
159
+ [c for c in language_characters if c in characters]
160
+ )
161
+
162
+ ratio: float = character_match_count / character_count
163
+
164
+ if ratio >= 0.2:
165
+ languages.append((language, ratio))
166
+
167
+ languages = sorted(languages, key=lambda x: x[1], reverse=True)
168
+
169
+ return [compatible_language[0] for compatible_language in languages]
170
+
171
+
172
+ def characters_popularity_compare(
173
+ language: str, ordered_characters: list[str]
174
+ ) -> float:
175
+ """
176
+ Determine whether an ordered list of characters (by occurrence, from most frequent to rarest) matches a particular language.
177
+ The result is a ratio between 0. (absolutely no correspondence) and 1. (near-perfect fit).
178
+ Beware that this function is not strict on the match, in order to ease detection. (Meaning a close match counts as 1.)
179
+ """
180
+ if language not in FREQUENCIES:
181
+ raise ValueError(f"{language} not available")
182
+
183
+ character_approved_count: int = 0
184
+ FREQUENCIES_language_set = set(FREQUENCIES[language])
185
+
186
+ ordered_characters_count: int = len(ordered_characters)
187
+ target_language_characters_count: int = len(FREQUENCIES[language])
188
+
189
+ large_alphabet: bool = target_language_characters_count > 26
190
+
191
+ for character, character_rank in zip(
192
+ ordered_characters, range(0, ordered_characters_count)
193
+ ):
194
+ if character not in FREQUENCIES_language_set:
195
+ continue
196
+
197
+ character_rank_in_language: int = FREQUENCIES[language].index(character)
198
+ expected_projection_ratio: float = (
199
+ target_language_characters_count / ordered_characters_count
200
+ )
201
+ character_rank_projection: int = int(character_rank * expected_projection_ratio)
202
+
203
+ if (
204
+ large_alphabet is False
205
+ and abs(character_rank_projection - character_rank_in_language) > 4
206
+ ):
207
+ continue
208
+
209
+ if (
210
+ large_alphabet is True
211
+ and abs(character_rank_projection - character_rank_in_language)
212
+ < target_language_characters_count / 3
213
+ ):
214
+ character_approved_count += 1
215
+ continue
216
+
217
+ characters_before_source: list[str] = FREQUENCIES[language][
218
+ 0:character_rank_in_language
219
+ ]
220
+ characters_after_source: list[str] = FREQUENCIES[language][
221
+ character_rank_in_language:
222
+ ]
223
+ characters_before: list[str] = ordered_characters[0:character_rank]
224
+ characters_after: list[str] = ordered_characters[character_rank:]
225
+
226
+ before_match_count: int = len(
227
+ set(characters_before) & set(characters_before_source)
228
+ )
229
+
230
+ after_match_count: int = len(
231
+ set(characters_after) & set(characters_after_source)
232
+ )
233
+
234
+ if len(characters_before_source) == 0 and before_match_count <= 4:
235
+ character_approved_count += 1
236
+ continue
237
+
238
+ if len(characters_after_source) == 0 and after_match_count <= 4:
239
+ character_approved_count += 1
240
+ continue
241
+
242
+ if (
243
+ before_match_count / len(characters_before_source) >= 0.4
244
+ or after_match_count / len(characters_after_source) >= 0.4
245
+ ):
246
+ character_approved_count += 1
247
+ continue
248
+
249
+ return character_approved_count / len(ordered_characters)
250
+
251
+
252
+ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
253
+ """
254
+ Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
255
+ Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
256
+ one containing the Latin letters and the other the Hebrew ones.
257
+ """
258
+ layers: dict[str, str] = {}
259
+
260
+ for character in decoded_sequence:
261
+ if character.isalpha() is False:
262
+ continue
263
+
264
+ character_range: str | None = unicode_range(character)
265
+
266
+ if character_range is None:
267
+ continue
268
+
269
+ layer_target_range: str | None = None
270
+
271
+ for discovered_range in layers:
272
+ if (
273
+ is_suspiciously_successive_range(discovered_range, character_range)
274
+ is False
275
+ ):
276
+ layer_target_range = discovered_range
277
+ break
278
+
279
+ if layer_target_range is None:
280
+ layer_target_range = character_range
281
+
282
+ if layer_target_range not in layers:
283
+ layers[layer_target_range] = character.lower()
284
+ continue
285
+
286
+ layers[layer_target_range] += character.lower()
287
+
288
+ return list(layers.values())
289
+
290
+
291
+ def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
292
+ """
293
+ This function merges results previously produced by the function coherence_ratio.
294
+ The return type is the same as coherence_ratio.
295
+ """
296
+ per_language_ratios: dict[str, list[float]] = {}
297
+ for result in results:
298
+ for sub_result in result:
299
+ language, ratio = sub_result
300
+ if language not in per_language_ratios:
301
+ per_language_ratios[language] = [ratio]
302
+ continue
303
+ per_language_ratios[language].append(ratio)
304
+
305
+ merge = [
306
+ (
307
+ language,
308
+ round(
309
+ sum(per_language_ratios[language]) / len(per_language_ratios[language]),
310
+ 4,
311
+ ),
312
+ )
313
+ for language in per_language_ratios
314
+ ]
315
+
316
+ return sorted(merge, key=lambda x: x[1], reverse=True)
317
+
318
+
319
+ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
320
+ """
321
+ We shall NOT return "English—" in CoherenceMatches because it is an alternative
322
+ of "English". This function only keeps the best match and remove the em-dash in it.
323
+ """
324
+ index_results: dict[str, list[float]] = dict()
325
+
326
+ for result in results:
327
+ language, ratio = result
328
+ no_em_name: str = language.replace("—", "")
329
+
330
+ if no_em_name not in index_results:
331
+ index_results[no_em_name] = []
332
+
333
+ index_results[no_em_name].append(ratio)
334
+
335
+ if any(len(index_results[e]) > 1 for e in index_results):
336
+ filtered_results: CoherenceMatches = []
337
+
338
+ for language in index_results:
339
+ filtered_results.append((language, max(index_results[language])))
340
+
341
+ return filtered_results
342
+
343
+ return results
344
+
345
+
346
+ @lru_cache(maxsize=2048)
347
+ def coherence_ratio(
348
+ decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
349
+ ) -> CoherenceMatches:
350
+ """
351
+ Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
352
+ A layer = Character extraction by alphabets/ranges.
353
+ """
354
+
355
+ results: list[tuple[str, float]] = []
356
+ ignore_non_latin: bool = False
357
+
358
+ sufficient_match_count: int = 0
359
+
360
+ lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
361
+ if "Latin Based" in lg_inclusion_list:
362
+ ignore_non_latin = True
363
+ lg_inclusion_list.remove("Latin Based")
364
+
365
+ for layer in alpha_unicode_split(decoded_sequence):
366
+ sequence_frequencies: TypeCounter[str] = Counter(layer)
367
+ most_common = sequence_frequencies.most_common()
368
+
369
+ character_count: int = sum(o for c, o in most_common)
370
+
371
+ if character_count <= TOO_SMALL_SEQUENCE:
372
+ continue
373
+
374
+ popular_character_ordered: list[str] = [c for c, o in most_common]
375
+
376
+ for language in lg_inclusion_list or alphabet_languages(
377
+ popular_character_ordered, ignore_non_latin
378
+ ):
379
+ ratio: float = characters_popularity_compare(
380
+ language, popular_character_ordered
381
+ )
382
+
383
+ if ratio < threshold:
384
+ continue
385
+ elif ratio >= 0.8:
386
+ sufficient_match_count += 1
387
+
388
+ results.append((language, round(ratio, 4)))
389
+
390
+ if sufficient_match_count >= 3:
391
+ break
392
+
393
+ return sorted(
394
+ filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
395
+ )
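Taken together, the functions above form the language-coherence layer: alpha_unicode_split() slices decoded text into one layer per alphabet, alphabet_languages() shortlists candidate languages for a layer, characters_popularity_compare() scores each candidate by frequency-rank agreement, and coherence_ratio() ties it all together. Below is a minimal sketch of exercising this module directly; it assumes charset_normalizer is installed, and the sample strings are illustrative only.

    # Not part of the diff above: a hedged usage sketch of charset_normalizer.cd.
    from charset_normalizer.cd import (
        alpha_unicode_split,
        coherence_ratio,
        encoding_languages,
        mb_encoding_languages,
    )

    # Split a mixed-script string into one lowercase layer per alphabet.
    print(alpha_unicode_split("Hello שלום world"))   # e.g. ['helloworld', 'שלום']

    # Score plausible languages for a decoded payload (best match first).
    print(coherence_ratio("ceci est un petit exemple de texte en francais"))

    # Ask which languages a code page is commonly tied to.
    print(encoding_languages("cp1251"))    # single-byte: inferred via Unicode ranges
    print(mb_encoding_languages("cp932"))  # multi-byte: ['Japanese']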
phivenv/Lib/site-packages/charset_normalizer/cli/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+ from .__main__ import cli_detect, query_yes_no
4
+
5
+ __all__ = (
6
+ "cli_detect",
7
+ "query_yes_no",
8
+ )
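This __init__.py only re-exports the two public helpers defined in __main__.py (shown next), so they can be imported without reaching into module internals. A small sketch, assuming an interactive terminal; the prompt text is illustrative:

    # Not part of the diff above: reusing the re-exported confirmation helper.
    from charset_normalizer.cli import query_yes_no

    # Blocks on input(); returns True for "y"/"yes" and False for "n"/"no".
    if query_yes_no("Overwrite the existing report?", default="no"):
        print("overwriting...")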
phivenv/Lib/site-packages/charset_normalizer/cli/__main__.py ADDED
@@ -0,0 +1,381 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ import typing
6
+ from json import dumps
7
+ from os.path import abspath, basename, dirname, join, realpath
8
+ from platform import python_version
9
+ from unicodedata import unidata_version
10
+
11
+ import charset_normalizer.md as md_module
12
+ from charset_normalizer import from_fp
13
+ from charset_normalizer.models import CliDetectionResult
14
+ from charset_normalizer.version import __version__
15
+
16
+
17
+ def query_yes_no(question: str, default: str = "yes") -> bool:
18
+ """Ask a yes/no question via input() and return the answer.
19
+
20
+ "question" is a string that is presented to the user.
21
+ "default" is the presumed answer if the user just hits <Enter>.
22
+ It must be "yes" (the default), "no" or None (meaning
23
+ an answer is required of the user).
24
+
25
+ The "answer" return value is True for "yes" or False for "no".
26
+
27
+ Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
28
+ """
29
+ valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
30
+ if default is None:
31
+ prompt = " [y/n] "
32
+ elif default == "yes":
33
+ prompt = " [Y/n] "
34
+ elif default == "no":
35
+ prompt = " [y/N] "
36
+ else:
37
+ raise ValueError("invalid default answer: '%s'" % default)
38
+
39
+ while True:
40
+ sys.stdout.write(question + prompt)
41
+ choice = input().lower()
42
+ if default is not None and choice == "":
43
+ return valid[default]
44
+ elif choice in valid:
45
+ return valid[choice]
46
+ else:
47
+ sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
48
+
49
+
50
+ class FileType:
51
+ """Factory for creating file object types
52
+
53
+ Instances of FileType are typically passed as type= arguments to the
54
+ ArgumentParser add_argument() method.
55
+
56
+ Keyword Arguments:
57
+ - mode -- A string indicating how the file is to be opened. Accepts the
58
+ same values as the builtin open() function.
59
+ - bufsize -- The file's desired buffer size. Accepts the same values as
60
+ the builtin open() function.
61
+ - encoding -- The file's encoding. Accepts the same values as the
62
+ builtin open() function.
63
+ - errors -- A string indicating how encoding and decoding errors are to
64
+ be handled. Accepts the same value as the builtin open() function.
65
+
66
+ Backported from CPython 3.12
67
+ """
68
+
69
+ def __init__(
70
+ self,
71
+ mode: str = "r",
72
+ bufsize: int = -1,
73
+ encoding: str | None = None,
74
+ errors: str | None = None,
75
+ ):
76
+ self._mode = mode
77
+ self._bufsize = bufsize
78
+ self._encoding = encoding
79
+ self._errors = errors
80
+
81
+ def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
82
+ # the special argument "-" means sys.std{in,out}
83
+ if string == "-":
84
+ if "r" in self._mode:
85
+ return sys.stdin.buffer if "b" in self._mode else sys.stdin
86
+ elif any(c in self._mode for c in "wax"):
87
+ return sys.stdout.buffer if "b" in self._mode else sys.stdout
88
+ else:
89
+ msg = f'argument "-" with mode {self._mode}'
90
+ raise ValueError(msg)
91
+
92
+ # all other arguments are used as file names
93
+ try:
94
+ return open(string, self._mode, self._bufsize, self._encoding, self._errors)
95
+ except OSError as e:
96
+ message = f"can't open '{string}': {e}"
97
+ raise argparse.ArgumentTypeError(message)
98
+
99
+ def __repr__(self) -> str:
100
+ args = self._mode, self._bufsize
101
+ kwargs = [("encoding", self._encoding), ("errors", self._errors)]
102
+ args_str = ", ".join(
103
+ [repr(arg) for arg in args if arg != -1]
104
+ + [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
105
+ )
106
+ return f"{type(self).__name__}({args_str})"
107
+
108
+
109
+ def cli_detect(argv: list[str] | None = None) -> int:
110
+ """
111
+ CLI assistant using ARGV and ArgumentParser
112
+ :param argv:
113
+ :return: 0 if everything is fine, anything else equal trouble
114
+ """
115
+ parser = argparse.ArgumentParser(
116
+ description="The Real First Universal Charset Detector. "
117
+ "Discover originating encoding used on text file. "
118
+ "Normalize text to unicode."
119
+ )
120
+
121
+ parser.add_argument(
122
+ "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
123
+ )
124
+ parser.add_argument(
125
+ "-v",
126
+ "--verbose",
127
+ action="store_true",
128
+ default=False,
129
+ dest="verbose",
130
+ help="Display complementary information about file if any. "
131
+ "Stdout will contain logs about the detection process.",
132
+ )
133
+ parser.add_argument(
134
+ "-a",
135
+ "--with-alternative",
136
+ action="store_true",
137
+ default=False,
138
+ dest="alternatives",
139
+ help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
140
+ )
141
+ parser.add_argument(
142
+ "-n",
143
+ "--normalize",
144
+ action="store_true",
145
+ default=False,
146
+ dest="normalize",
147
+ help="Permit normalizing the input file. If not set, the program does not write anything.",
148
+ )
149
+ parser.add_argument(
150
+ "-m",
151
+ "--minimal",
152
+ action="store_true",
153
+ default=False,
154
+ dest="minimal",
155
+ help="Only output the charset detected to STDOUT. Disabling JSON output.",
156
+ )
157
+ parser.add_argument(
158
+ "-r",
159
+ "--replace",
160
+ action="store_true",
161
+ default=False,
162
+ dest="replace",
163
+ help="Replace file when trying to normalize it instead of creating a new one.",
164
+ )
165
+ parser.add_argument(
166
+ "-f",
167
+ "--force",
168
+ action="store_true",
169
+ default=False,
170
+ dest="force",
171
+ help="Replace file without asking if you are sure; use this flag with caution.",
172
+ )
173
+ parser.add_argument(
174
+ "-i",
175
+ "--no-preemptive",
176
+ action="store_true",
177
+ default=False,
178
+ dest="no_preemptive",
179
+ help="Disable looking at a charset declaration to hint the detector.",
180
+ )
181
+ parser.add_argument(
182
+ "-t",
183
+ "--threshold",
184
+ action="store",
185
+ default=0.2,
186
+ type=float,
187
+ dest="threshold",
188
+ help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
189
+ )
190
+ parser.add_argument(
191
+ "--version",
192
+ action="version",
193
+ version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
194
+ __version__,
195
+ python_version(),
196
+ unidata_version,
197
+ "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
198
+ ),
199
+ help="Show version information and exit.",
200
+ )
201
+
202
+ args = parser.parse_args(argv)
203
+
204
+ if args.replace is True and args.normalize is False:
205
+ if args.files:
206
+ for my_file in args.files:
207
+ my_file.close()
208
+ print("Use --replace only in addition to --normalize.", file=sys.stderr)
209
+ return 1
210
+
211
+ if args.force is True and args.replace is False:
212
+ if args.files:
213
+ for my_file in args.files:
214
+ my_file.close()
215
+ print("Use --force only in addition to --replace.", file=sys.stderr)
216
+ return 1
217
+
218
+ if args.threshold < 0.0 or args.threshold > 1.0:
219
+ if args.files:
220
+ for my_file in args.files:
221
+ my_file.close()
222
+ print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
223
+ return 1
224
+
225
+ x_ = []
226
+
227
+ for my_file in args.files:
228
+ matches = from_fp(
229
+ my_file,
230
+ threshold=args.threshold,
231
+ explain=args.verbose,
232
+ preemptive_behaviour=args.no_preemptive is False,
233
+ )
234
+
235
+ best_guess = matches.best()
236
+
237
+ if best_guess is None:
238
+ print(
239
+ 'Unable to identify originating encoding for "{}". {}'.format(
240
+ my_file.name,
241
+ (
242
+ "Maybe try increasing maximum amount of chaos."
243
+ if args.threshold < 1.0
244
+ else ""
245
+ ),
246
+ ),
247
+ file=sys.stderr,
248
+ )
249
+ x_.append(
250
+ CliDetectionResult(
251
+ abspath(my_file.name),
252
+ None,
253
+ [],
254
+ [],
255
+ "Unknown",
256
+ [],
257
+ False,
258
+ 1.0,
259
+ 0.0,
260
+ None,
261
+ True,
262
+ )
263
+ )
264
+ else:
265
+ x_.append(
266
+ CliDetectionResult(
267
+ abspath(my_file.name),
268
+ best_guess.encoding,
269
+ best_guess.encoding_aliases,
270
+ [
271
+ cp
272
+ for cp in best_guess.could_be_from_charset
273
+ if cp != best_guess.encoding
274
+ ],
275
+ best_guess.language,
276
+ best_guess.alphabets,
277
+ best_guess.bom,
278
+ best_guess.percent_chaos,
279
+ best_guess.percent_coherence,
280
+ None,
281
+ True,
282
+ )
283
+ )
284
+
285
+ if len(matches) > 1 and args.alternatives:
286
+ for el in matches:
287
+ if el != best_guess:
288
+ x_.append(
289
+ CliDetectionResult(
290
+ abspath(my_file.name),
291
+ el.encoding,
292
+ el.encoding_aliases,
293
+ [
294
+ cp
295
+ for cp in el.could_be_from_charset
296
+ if cp != el.encoding
297
+ ],
298
+ el.language,
299
+ el.alphabets,
300
+ el.bom,
301
+ el.percent_chaos,
302
+ el.percent_coherence,
303
+ None,
304
+ False,
305
+ )
306
+ )
307
+
308
+ if args.normalize is True:
309
+ if best_guess.encoding.startswith("utf") is True:
310
+ print(
311
+ '"{}" file does not need to be normalized, as it already came from Unicode.'.format(
312
+ my_file.name
313
+ ),
314
+ file=sys.stderr,
315
+ )
316
+ if my_file.closed is False:
317
+ my_file.close()
318
+ continue
319
+
320
+ dir_path = dirname(realpath(my_file.name))
321
+ file_name = basename(realpath(my_file.name))
322
+
323
+ o_: list[str] = file_name.split(".")
324
+
325
+ if args.replace is False:
326
+ o_.insert(-1, best_guess.encoding)
327
+ if my_file.closed is False:
328
+ my_file.close()
329
+ elif (
330
+ args.force is False
331
+ and query_yes_no(
332
+ 'Are you sure you want to normalize "{}" by replacing it?'.format(
333
+ my_file.name
334
+ ),
335
+ "no",
336
+ )
337
+ is False
338
+ ):
339
+ if my_file.closed is False:
340
+ my_file.close()
341
+ continue
342
+
343
+ try:
344
+ x_[0].unicode_path = join(dir_path, ".".join(o_))
345
+
346
+ with open(x_[0].unicode_path, "wb") as fp:
347
+ fp.write(best_guess.output())
348
+ except OSError as e:
349
+ print(str(e), file=sys.stderr)
350
+ if my_file.closed is False:
351
+ my_file.close()
352
+ return 2
353
+
354
+ if my_file.closed is False:
355
+ my_file.close()
356
+
357
+ if args.minimal is False:
358
+ print(
359
+ dumps(
360
+ [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
361
+ ensure_ascii=True,
362
+ indent=4,
363
+ )
364
+ )
365
+ else:
366
+ for my_file in args.files:
367
+ print(
368
+ ", ".join(
369
+ [
370
+ el.encoding or "undefined"
371
+ for el in x_
372
+ if el.path == abspath(my_file.name)
373
+ ]
374
+ )
375
+ )
376
+
377
+ return 0
378
+
379
+
380
+ if __name__ == "__main__":
381
+ cli_detect()
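Because cli_detect() accepts an optional argv list and returns an exit code, the entry point can be driven programmatically as well as from a console script. A hedged sketch follows; "target.txt" is a placeholder path, not a file from this repository.

    # Not part of the diff above: driving the CLI entry point from Python.
    from charset_normalizer.cli import cli_detect

    # Equivalent to passing these flags on the command line: prints a JSON
    # report to stdout and returns 0 on success, non-zero on failure.
    exit_code = cli_detect(["target.txt", "--with-alternative"])

    # Minimal mode: prints only the detected charset name.
    exit_code = cli_detect(["target.txt", "--minimal"])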
phivenv/Lib/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (298 Bytes).
 
phivenv/Lib/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-39.pyc ADDED
Binary file (9.25 kB).
 
phivenv/Lib/site-packages/charset_normalizer/constant.py ADDED
@@ -0,0 +1,2015 @@
1
+ from __future__ import annotations
2
+
3
+ from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
4
+ from encodings.aliases import aliases
5
+ from re import IGNORECASE
6
+ from re import compile as re_compile
7
+
8
+ # Contains, for each eligible encoding, its SIG/BOM byte marker(s): a single bytes value or a list of them
9
+ ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
10
+ "utf_8": BOM_UTF8,
11
+ "utf_7": [
12
+ b"\x2b\x2f\x76\x38",
13
+ b"\x2b\x2f\x76\x39",
14
+ b"\x2b\x2f\x76\x2b",
15
+ b"\x2b\x2f\x76\x2f",
16
+ b"\x2b\x2f\x76\x38\x2d",
17
+ ],
18
+ "gb18030": b"\x84\x31\x95\x33",
19
+ "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
20
+ "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
21
+ }
22
+
23
+ TOO_SMALL_SEQUENCE: int = 32
24
+ TOO_BIG_SEQUENCE: int = int(10e6)
25
+
26
+ UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
27
+
28
+ # Up-to-date Unicode ucd/15.0.0
29
+ UNICODE_RANGES_COMBINED: dict[str, range] = {
30
+ "Control character": range(32),
31
+ "Basic Latin": range(32, 128),
32
+ "Latin-1 Supplement": range(128, 256),
33
+ "Latin Extended-A": range(256, 384),
34
+ "Latin Extended-B": range(384, 592),
35
+ "IPA Extensions": range(592, 688),
36
+ "Spacing Modifier Letters": range(688, 768),
37
+ "Combining Diacritical Marks": range(768, 880),
38
+ "Greek and Coptic": range(880, 1024),
39
+ "Cyrillic": range(1024, 1280),
40
+ "Cyrillic Supplement": range(1280, 1328),
41
+ "Armenian": range(1328, 1424),
42
+ "Hebrew": range(1424, 1536),
43
+ "Arabic": range(1536, 1792),
44
+ "Syriac": range(1792, 1872),
45
+ "Arabic Supplement": range(1872, 1920),
46
+ "Thaana": range(1920, 1984),
47
+ "NKo": range(1984, 2048),
48
+ "Samaritan": range(2048, 2112),
49
+ "Mandaic": range(2112, 2144),
50
+ "Syriac Supplement": range(2144, 2160),
51
+ "Arabic Extended-B": range(2160, 2208),
52
+ "Arabic Extended-A": range(2208, 2304),
53
+ "Devanagari": range(2304, 2432),
54
+ "Bengali": range(2432, 2560),
55
+ "Gurmukhi": range(2560, 2688),
56
+ "Gujarati": range(2688, 2816),
57
+ "Oriya": range(2816, 2944),
58
+ "Tamil": range(2944, 3072),
59
+ "Telugu": range(3072, 3200),
60
+ "Kannada": range(3200, 3328),
61
+ "Malayalam": range(3328, 3456),
62
+ "Sinhala": range(3456, 3584),
63
+ "Thai": range(3584, 3712),
64
+ "Lao": range(3712, 3840),
65
+ "Tibetan": range(3840, 4096),
66
+ "Myanmar": range(4096, 4256),
67
+ "Georgian": range(4256, 4352),
68
+ "Hangul Jamo": range(4352, 4608),
69
+ "Ethiopic": range(4608, 4992),
70
+ "Ethiopic Supplement": range(4992, 5024),
71
+ "Cherokee": range(5024, 5120),
72
+ "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
73
+ "Ogham": range(5760, 5792),
74
+ "Runic": range(5792, 5888),
75
+ "Tagalog": range(5888, 5920),
76
+ "Hanunoo": range(5920, 5952),
77
+ "Buhid": range(5952, 5984),
78
+ "Tagbanwa": range(5984, 6016),
79
+ "Khmer": range(6016, 6144),
80
+ "Mongolian": range(6144, 6320),
81
+ "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
82
+ "Limbu": range(6400, 6480),
83
+ "Tai Le": range(6480, 6528),
84
+ "New Tai Lue": range(6528, 6624),
85
+ "Khmer Symbols": range(6624, 6656),
86
+ "Buginese": range(6656, 6688),
87
+ "Tai Tham": range(6688, 6832),
88
+ "Combining Diacritical Marks Extended": range(6832, 6912),
89
+ "Balinese": range(6912, 7040),
90
+ "Sundanese": range(7040, 7104),
91
+ "Batak": range(7104, 7168),
92
+ "Lepcha": range(7168, 7248),
93
+ "Ol Chiki": range(7248, 7296),
94
+ "Cyrillic Extended-C": range(7296, 7312),
95
+ "Georgian Extended": range(7312, 7360),
96
+ "Sundanese Supplement": range(7360, 7376),
97
+ "Vedic Extensions": range(7376, 7424),
98
+ "Phonetic Extensions": range(7424, 7552),
99
+ "Phonetic Extensions Supplement": range(7552, 7616),
100
+ "Combining Diacritical Marks Supplement": range(7616, 7680),
101
+ "Latin Extended Additional": range(7680, 7936),
102
+ "Greek Extended": range(7936, 8192),
103
+ "General Punctuation": range(8192, 8304),
104
+ "Superscripts and Subscripts": range(8304, 8352),
105
+ "Currency Symbols": range(8352, 8400),
106
+ "Combining Diacritical Marks for Symbols": range(8400, 8448),
107
+ "Letterlike Symbols": range(8448, 8528),
108
+ "Number Forms": range(8528, 8592),
109
+ "Arrows": range(8592, 8704),
110
+ "Mathematical Operators": range(8704, 8960),
111
+ "Miscellaneous Technical": range(8960, 9216),
112
+ "Control Pictures": range(9216, 9280),
113
+ "Optical Character Recognition": range(9280, 9312),
114
+ "Enclosed Alphanumerics": range(9312, 9472),
115
+ "Box Drawing": range(9472, 9600),
116
+ "Block Elements": range(9600, 9632),
117
+ "Geometric Shapes": range(9632, 9728),
118
+ "Miscellaneous Symbols": range(9728, 9984),
119
+ "Dingbats": range(9984, 10176),
120
+ "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
121
+ "Supplemental Arrows-A": range(10224, 10240),
122
+ "Braille Patterns": range(10240, 10496),
123
+ "Supplemental Arrows-B": range(10496, 10624),
124
+ "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
125
+ "Supplemental Mathematical Operators": range(10752, 11008),
126
+ "Miscellaneous Symbols and Arrows": range(11008, 11264),
127
+ "Glagolitic": range(11264, 11360),
128
+ "Latin Extended-C": range(11360, 11392),
129
+ "Coptic": range(11392, 11520),
130
+ "Georgian Supplement": range(11520, 11568),
131
+ "Tifinagh": range(11568, 11648),
132
+ "Ethiopic Extended": range(11648, 11744),
133
+ "Cyrillic Extended-A": range(11744, 11776),
134
+ "Supplemental Punctuation": range(11776, 11904),
135
+ "CJK Radicals Supplement": range(11904, 12032),
136
+ "Kangxi Radicals": range(12032, 12256),
137
+ "Ideographic Description Characters": range(12272, 12288),
138
+ "CJK Symbols and Punctuation": range(12288, 12352),
139
+ "Hiragana": range(12352, 12448),
140
+ "Katakana": range(12448, 12544),
141
+ "Bopomofo": range(12544, 12592),
142
+ "Hangul Compatibility Jamo": range(12592, 12688),
143
+ "Kanbun": range(12688, 12704),
144
+ "Bopomofo Extended": range(12704, 12736),
145
+ "CJK Strokes": range(12736, 12784),
146
+ "Katakana Phonetic Extensions": range(12784, 12800),
147
+ "Enclosed CJK Letters and Months": range(12800, 13056),
148
+ "CJK Compatibility": range(13056, 13312),
149
+ "CJK Unified Ideographs Extension A": range(13312, 19904),
150
+ "Yijing Hexagram Symbols": range(19904, 19968),
151
+ "CJK Unified Ideographs": range(19968, 40960),
152
+ "Yi Syllables": range(40960, 42128),
153
+ "Yi Radicals": range(42128, 42192),
154
+ "Lisu": range(42192, 42240),
155
+ "Vai": range(42240, 42560),
156
+ "Cyrillic Extended-B": range(42560, 42656),
157
+ "Bamum": range(42656, 42752),
158
+ "Modifier Tone Letters": range(42752, 42784),
159
+ "Latin Extended-D": range(42784, 43008),
160
+ "Syloti Nagri": range(43008, 43056),
161
+ "Common Indic Number Forms": range(43056, 43072),
162
+ "Phags-pa": range(43072, 43136),
163
+ "Saurashtra": range(43136, 43232),
164
+ "Devanagari Extended": range(43232, 43264),
165
+ "Kayah Li": range(43264, 43312),
166
+ "Rejang": range(43312, 43360),
167
+ "Hangul Jamo Extended-A": range(43360, 43392),
168
+ "Javanese": range(43392, 43488),
169
+ "Myanmar Extended-B": range(43488, 43520),
170
+ "Cham": range(43520, 43616),
171
+ "Myanmar Extended-A": range(43616, 43648),
172
+ "Tai Viet": range(43648, 43744),
173
+ "Meetei Mayek Extensions": range(43744, 43776),
174
+ "Ethiopic Extended-A": range(43776, 43824),
175
+ "Latin Extended-E": range(43824, 43888),
176
+ "Cherokee Supplement": range(43888, 43968),
177
+ "Meetei Mayek": range(43968, 44032),
178
+ "Hangul Syllables": range(44032, 55216),
179
+ "Hangul Jamo Extended-B": range(55216, 55296),
180
+ "High Surrogates": range(55296, 56192),
181
+ "High Private Use Surrogates": range(56192, 56320),
182
+ "Low Surrogates": range(56320, 57344),
183
+ "Private Use Area": range(57344, 63744),
184
+ "CJK Compatibility Ideographs": range(63744, 64256),
185
+ "Alphabetic Presentation Forms": range(64256, 64336),
186
+ "Arabic Presentation Forms-A": range(64336, 65024),
187
+ "Variation Selectors": range(65024, 65040),
188
+ "Vertical Forms": range(65040, 65056),
189
+ "Combining Half Marks": range(65056, 65072),
190
+ "CJK Compatibility Forms": range(65072, 65104),
191
+ "Small Form Variants": range(65104, 65136),
192
+ "Arabic Presentation Forms-B": range(65136, 65280),
193
+ "Halfwidth and Fullwidth Forms": range(65280, 65520),
194
+ "Specials": range(65520, 65536),
195
+ "Linear B Syllabary": range(65536, 65664),
196
+ "Linear B Ideograms": range(65664, 65792),
197
+ "Aegean Numbers": range(65792, 65856),
198
+ "Ancient Greek Numbers": range(65856, 65936),
199
+ "Ancient Symbols": range(65936, 66000),
200
+ "Phaistos Disc": range(66000, 66048),
201
+ "Lycian": range(66176, 66208),
202
+ "Carian": range(66208, 66272),
203
+ "Coptic Epact Numbers": range(66272, 66304),
204
+ "Old Italic": range(66304, 66352),
205
+ "Gothic": range(66352, 66384),
206
+ "Old Permic": range(66384, 66432),
207
+ "Ugaritic": range(66432, 66464),
208
+ "Old Persian": range(66464, 66528),
209
+ "Deseret": range(66560, 66640),
210
+ "Shavian": range(66640, 66688),
211
+ "Osmanya": range(66688, 66736),
212
+ "Osage": range(66736, 66816),
213
+ "Elbasan": range(66816, 66864),
214
+ "Caucasian Albanian": range(66864, 66928),
215
+ "Vithkuqi": range(66928, 67008),
216
+ "Linear A": range(67072, 67456),
217
+ "Latin Extended-F": range(67456, 67520),
218
+ "Cypriot Syllabary": range(67584, 67648),
219
+ "Imperial Aramaic": range(67648, 67680),
220
+ "Palmyrene": range(67680, 67712),
221
+ "Nabataean": range(67712, 67760),
222
+ "Hatran": range(67808, 67840),
223
+ "Phoenician": range(67840, 67872),
224
+ "Lydian": range(67872, 67904),
225
+ "Meroitic Hieroglyphs": range(67968, 68000),
226
+ "Meroitic Cursive": range(68000, 68096),
227
+ "Kharoshthi": range(68096, 68192),
228
+ "Old South Arabian": range(68192, 68224),
229
+ "Old North Arabian": range(68224, 68256),
230
+ "Manichaean": range(68288, 68352),
231
+ "Avestan": range(68352, 68416),
232
+ "Inscriptional Parthian": range(68416, 68448),
233
+ "Inscriptional Pahlavi": range(68448, 68480),
234
+ "Psalter Pahlavi": range(68480, 68528),
235
+ "Old Turkic": range(68608, 68688),
236
+ "Old Hungarian": range(68736, 68864),
237
+ "Hanifi Rohingya": range(68864, 68928),
238
+ "Rumi Numeral Symbols": range(69216, 69248),
239
+ "Yezidi": range(69248, 69312),
240
+ "Arabic Extended-C": range(69312, 69376),
241
+ "Old Sogdian": range(69376, 69424),
242
+ "Sogdian": range(69424, 69488),
243
+ "Old Uyghur": range(69488, 69552),
244
+ "Chorasmian": range(69552, 69600),
245
+ "Elymaic": range(69600, 69632),
246
+ "Brahmi": range(69632, 69760),
247
+ "Kaithi": range(69760, 69840),
248
+ "Sora Sompeng": range(69840, 69888),
249
+ "Chakma": range(69888, 69968),
250
+ "Mahajani": range(69968, 70016),
251
+ "Sharada": range(70016, 70112),
252
+ "Sinhala Archaic Numbers": range(70112, 70144),
253
+ "Khojki": range(70144, 70224),
254
+ "Multani": range(70272, 70320),
255
+ "Khudawadi": range(70320, 70400),
256
+ "Grantha": range(70400, 70528),
257
+ "Newa": range(70656, 70784),
258
+ "Tirhuta": range(70784, 70880),
259
+ "Siddham": range(71040, 71168),
260
+ "Modi": range(71168, 71264),
261
+ "Mongolian Supplement": range(71264, 71296),
262
+ "Takri": range(71296, 71376),
263
+ "Ahom": range(71424, 71504),
264
+ "Dogra": range(71680, 71760),
265
+ "Warang Citi": range(71840, 71936),
266
+ "Dives Akuru": range(71936, 72032),
267
+ "Nandinagari": range(72096, 72192),
268
+ "Zanabazar Square": range(72192, 72272),
269
+ "Soyombo": range(72272, 72368),
270
+ "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
271
+ "Pau Cin Hau": range(72384, 72448),
272
+ "Devanagari Extended-A": range(72448, 72544),
273
+ "Bhaiksuki": range(72704, 72816),
274
+ "Marchen": range(72816, 72896),
275
+ "Masaram Gondi": range(72960, 73056),
276
+ "Gunjala Gondi": range(73056, 73136),
277
+ "Makasar": range(73440, 73472),
278
+ "Kawi": range(73472, 73568),
279
+ "Lisu Supplement": range(73648, 73664),
280
+ "Tamil Supplement": range(73664, 73728),
281
+ "Cuneiform": range(73728, 74752),
282
+ "Cuneiform Numbers and Punctuation": range(74752, 74880),
283
+ "Early Dynastic Cuneiform": range(74880, 75088),
284
+ "Cypro-Minoan": range(77712, 77824),
285
+ "Egyptian Hieroglyphs": range(77824, 78896),
286
+ "Egyptian Hieroglyph Format Controls": range(78896, 78944),
287
+ "Anatolian Hieroglyphs": range(82944, 83584),
288
+ "Bamum Supplement": range(92160, 92736),
289
+ "Mro": range(92736, 92784),
290
+ "Tangsa": range(92784, 92880),
291
+ "Bassa Vah": range(92880, 92928),
292
+ "Pahawh Hmong": range(92928, 93072),
293
+ "Medefaidrin": range(93760, 93856),
294
+ "Miao": range(93952, 94112),
295
+ "Ideographic Symbols and Punctuation": range(94176, 94208),
296
+ "Tangut": range(94208, 100352),
297
+ "Tangut Components": range(100352, 101120),
298
+ "Khitan Small Script": range(101120, 101632),
299
+ "Tangut Supplement": range(101632, 101760),
300
+ "Kana Extended-B": range(110576, 110592),
301
+ "Kana Supplement": range(110592, 110848),
302
+ "Kana Extended-A": range(110848, 110896),
303
+ "Small Kana Extension": range(110896, 110960),
304
+ "Nushu": range(110960, 111360),
305
+ "Duployan": range(113664, 113824),
306
+ "Shorthand Format Controls": range(113824, 113840),
307
+ "Znamenny Musical Notation": range(118528, 118736),
308
+ "Byzantine Musical Symbols": range(118784, 119040),
309
+ "Musical Symbols": range(119040, 119296),
310
+ "Ancient Greek Musical Notation": range(119296, 119376),
311
+ "Kaktovik Numerals": range(119488, 119520),
312
+ "Mayan Numerals": range(119520, 119552),
313
+ "Tai Xuan Jing Symbols": range(119552, 119648),
314
+ "Counting Rod Numerals": range(119648, 119680),
315
+ "Mathematical Alphanumeric Symbols": range(119808, 120832),
316
+ "Sutton SignWriting": range(120832, 121520),
317
+ "Latin Extended-G": range(122624, 122880),
318
+ "Glagolitic Supplement": range(122880, 122928),
319
+ "Cyrillic Extended-D": range(122928, 123024),
320
+ "Nyiakeng Puachue Hmong": range(123136, 123216),
321
+ "Toto": range(123536, 123584),
322
+ "Wancho": range(123584, 123648),
323
+ "Nag Mundari": range(124112, 124160),
324
+ "Ethiopic Extended-B": range(124896, 124928),
325
+ "Mende Kikakui": range(124928, 125152),
326
+ "Adlam": range(125184, 125280),
327
+ "Indic Siyaq Numbers": range(126064, 126144),
328
+ "Ottoman Siyaq Numbers": range(126208, 126288),
329
+ "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
330
+ "Mahjong Tiles": range(126976, 127024),
331
+ "Domino Tiles": range(127024, 127136),
332
+ "Playing Cards": range(127136, 127232),
333
+ "Enclosed Alphanumeric Supplement": range(127232, 127488),
334
+ "Enclosed Ideographic Supplement": range(127488, 127744),
335
+ "Miscellaneous Symbols and Pictographs": range(127744, 128512),
336
+ "Emoticons range(Emoji)": range(128512, 128592),
337
+ "Ornamental Dingbats": range(128592, 128640),
338
+ "Transport and Map Symbols": range(128640, 128768),
339
+ "Alchemical Symbols": range(128768, 128896),
340
+ "Geometric Shapes Extended": range(128896, 129024),
341
+ "Supplemental Arrows-C": range(129024, 129280),
342
+ "Supplemental Symbols and Pictographs": range(129280, 129536),
343
+ "Chess Symbols": range(129536, 129648),
344
+ "Symbols and Pictographs Extended-A": range(129648, 129792),
345
+ "Symbols for Legacy Computing": range(129792, 130048),
346
+ "CJK Unified Ideographs Extension B": range(131072, 173792),
347
+ "CJK Unified Ideographs Extension C": range(173824, 177984),
348
+ "CJK Unified Ideographs Extension D": range(177984, 178208),
349
+ "CJK Unified Ideographs Extension E": range(178208, 183984),
350
+ "CJK Unified Ideographs Extension F": range(183984, 191472),
351
+ "CJK Compatibility Ideographs Supplement": range(194560, 195104),
352
+ "CJK Unified Ideographs Extension G": range(196608, 201552),
353
+ "CJK Unified Ideographs Extension H": range(201552, 205744),
354
+ "Tags": range(917504, 917632),
355
+ "Variation Selectors Supplement": range(917760, 918000),
356
+ "Supplementary Private Use Area-A": range(983040, 1048576),
357
+ "Supplementary Private Use Area-B": range(1048576, 1114112),
358
+ }
359
+
360
+
361
+ UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
362
+ "Supplement",
363
+ "Extended",
364
+ "Extensions",
365
+ "Modifier",
366
+ "Marks",
367
+ "Punctuation",
368
+ "Symbols",
369
+ "Forms",
370
+ "Operators",
371
+ "Miscellaneous",
372
+ "Drawing",
373
+ "Block",
374
+ "Shapes",
375
+ "Supplemental",
376
+ "Tags",
377
+ ]
378
+
379
+ RE_POSSIBLE_ENCODING_INDICATION = re_compile(
380
+ r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
381
+ IGNORECASE,
382
+ )
383
+
384
+ IANA_NO_ALIASES = [
385
+ "cp720",
386
+ "cp737",
387
+ "cp856",
388
+ "cp874",
389
+ "cp875",
390
+ "cp1006",
391
+ "koi8_r",
392
+ "koi8_t",
393
+ "koi8_u",
394
+ ]
395
+
396
+ IANA_SUPPORTED: list[str] = sorted(
397
+ filter(
398
+ lambda x: x.endswith("_codec") is False
399
+ and x not in {"rot_13", "tactis", "mbcs"},
400
+ list(set(aliases.values())) + IANA_NO_ALIASES,
401
+ )
402
+ )
403
+
404
+ IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
405
+
406
+ # Pre-computed code pages that are similar, as measured by the function cp_similarity.
407
+ IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
408
+ "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
409
+ "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
410
+ "cp1125": ["cp866"],
411
+ "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
412
+ "cp1250": ["iso8859_2"],
413
+ "cp1251": ["kz1048", "ptcp154"],
414
+ "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
415
+ "cp1253": ["iso8859_7"],
416
+ "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
417
+ "cp1257": ["iso8859_13"],
418
+ "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
419
+ "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
420
+ "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
421
+ "cp850": ["cp437", "cp857", "cp858", "cp865"],
422
+ "cp857": ["cp850", "cp858", "cp865"],
423
+ "cp858": ["cp437", "cp850", "cp857", "cp865"],
424
+ "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
425
+ "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
426
+ "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
427
+ "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
428
+ "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
429
+ "cp866": ["cp1125"],
430
+ "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
431
+ "iso8859_11": ["tis_620"],
432
+ "iso8859_13": ["cp1257"],
433
+ "iso8859_14": [
434
+ "iso8859_10",
435
+ "iso8859_15",
436
+ "iso8859_16",
437
+ "iso8859_3",
438
+ "iso8859_9",
439
+ "latin_1",
440
+ ],
441
+ "iso8859_15": [
442
+ "cp1252",
443
+ "cp1254",
444
+ "iso8859_10",
445
+ "iso8859_14",
446
+ "iso8859_16",
447
+ "iso8859_3",
448
+ "iso8859_9",
449
+ "latin_1",
450
+ ],
451
+ "iso8859_16": [
452
+ "iso8859_14",
453
+ "iso8859_15",
454
+ "iso8859_2",
455
+ "iso8859_3",
456
+ "iso8859_9",
457
+ "latin_1",
458
+ ],
459
+ "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
460
+ "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
461
+ "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
462
+ "iso8859_7": ["cp1253"],
463
+ "iso8859_9": [
464
+ "cp1252",
465
+ "cp1254",
466
+ "cp1258",
467
+ "iso8859_10",
468
+ "iso8859_14",
469
+ "iso8859_15",
470
+ "iso8859_16",
471
+ "iso8859_3",
472
+ "iso8859_4",
473
+ "latin_1",
474
+ ],
475
+ "kz1048": ["cp1251", "ptcp154"],
476
+ "latin_1": [
477
+ "cp1252",
478
+ "cp1254",
479
+ "cp1258",
480
+ "iso8859_10",
481
+ "iso8859_14",
482
+ "iso8859_15",
483
+ "iso8859_16",
484
+ "iso8859_3",
485
+ "iso8859_4",
486
+ "iso8859_9",
487
+ ],
488
+ "mac_iceland": ["mac_roman", "mac_turkish"],
489
+ "mac_roman": ["mac_iceland", "mac_turkish"],
490
+ "mac_turkish": ["mac_iceland", "mac_roman"],
491
+ "ptcp154": ["cp1251", "kz1048"],
492
+ "tis_620": ["iso8859_11"],
493
+ }
494
+
495
+
496
+ CHARDET_CORRESPONDENCE: dict[str, str] = {
497
+ "iso2022_kr": "ISO-2022-KR",
498
+ "iso2022_jp": "ISO-2022-JP",
499
+ "euc_kr": "EUC-KR",
500
+ "tis_620": "TIS-620",
501
+ "utf_32": "UTF-32",
502
+ "euc_jp": "EUC-JP",
503
+ "koi8_r": "KOI8-R",
504
+ "iso8859_1": "ISO-8859-1",
505
+ "iso8859_2": "ISO-8859-2",
506
+ "iso8859_5": "ISO-8859-5",
507
+ "iso8859_6": "ISO-8859-6",
508
+ "iso8859_7": "ISO-8859-7",
509
+ "iso8859_8": "ISO-8859-8",
510
+ "utf_16": "UTF-16",
511
+ "cp855": "IBM855",
512
+ "mac_cyrillic": "MacCyrillic",
513
+ "gb2312": "GB2312",
514
+ "gb18030": "GB18030",
515
+ "cp932": "CP932",
516
+ "cp866": "IBM866",
517
+ "utf_8": "utf-8",
518
+ "utf_8_sig": "UTF-8-SIG",
519
+ "shift_jis": "SHIFT_JIS",
520
+ "big5": "Big5",
521
+ "cp1250": "windows-1250",
522
+ "cp1251": "windows-1251",
523
+ "cp1252": "Windows-1252",
524
+ "cp1253": "windows-1253",
525
+ "cp1255": "windows-1255",
526
+ "cp1256": "windows-1256",
527
+ "cp1254": "Windows-1254",
528
+ "cp949": "CP949",
529
+ }
530
+
531
+
532
+ COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
533
+ "<",
534
+ ">",
535
+ "=",
536
+ ":",
537
+ "/",
538
+ "&",
539
+ ";",
540
+ "{",
541
+ "}",
542
+ "[",
543
+ "]",
544
+ ",",
545
+ "|",
546
+ '"',
547
+ "-",
548
+ "(",
549
+ ")",
550
+ }
551
+
552
+ # Sample character sets — replace with full lists if needed
553
+ COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"
554
+
555
+ COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"
556
+
557
+ COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"
558
+
559
+ # Combine all into a set
560
+ COMMON_CJK_CHARACTERS = set(
561
+ "".join(
562
+ [
563
+ COMMON_CHINESE_CHARACTERS,
564
+ COMMON_JAPANESE_CHARACTERS,
565
+ COMMON_KOREAN_CHARACTERS,
566
+ ]
567
+ )
568
+ )
569
+
570
+ KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
571
+ ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
572
+
573
+ # Logging LEVEL below DEBUG
574
+ TRACE: int = 5
575
+
576
+
577
+ # Language labels that contain the em dash "—"
578
+ # character are to be considered alternative sequences of the original.
579
+ FREQUENCIES: dict[str, list[str]] = {
580
+ "English": [
581
+ "e",
582
+ "a",
583
+ "t",
584
+ "i",
585
+ "o",
586
+ "n",
587
+ "s",
588
+ "r",
589
+ "h",
590
+ "l",
591
+ "d",
592
+ "c",
593
+ "u",
594
+ "m",
595
+ "f",
596
+ "p",
597
+ "g",
598
+ "w",
599
+ "y",
600
+ "b",
601
+ "v",
602
+ "k",
603
+ "x",
604
+ "j",
605
+ "z",
606
+ "q",
607
+ ],
608
+ "English—": [
609
+ "e",
610
+ "a",
611
+ "t",
612
+ "i",
613
+ "o",
614
+ "n",
615
+ "s",
616
+ "r",
617
+ "h",
618
+ "l",
619
+ "d",
620
+ "c",
621
+ "m",
622
+ "u",
623
+ "f",
624
+ "p",
625
+ "g",
626
+ "w",
627
+ "b",
628
+ "y",
629
+ "v",
630
+ "k",
631
+ "j",
632
+ "x",
633
+ "z",
634
+ "q",
635
+ ],
636
+ "German": [
637
+ "e",
638
+ "n",
639
+ "i",
640
+ "r",
641
+ "s",
642
+ "t",
643
+ "a",
644
+ "d",
645
+ "h",
646
+ "u",
647
+ "l",
648
+ "g",
649
+ "o",
650
+ "c",
651
+ "m",
652
+ "b",
653
+ "f",
654
+ "k",
655
+ "w",
656
+ "z",
657
+ "p",
658
+ "v",
659
+ "ü",
660
+ "ä",
661
+ "ö",
662
+ "j",
663
+ ],
664
+ "French": [
665
+ "e",
666
+ "a",
667
+ "s",
668
+ "n",
669
+ "i",
670
+ "t",
671
+ "r",
672
+ "l",
673
+ "u",
674
+ "o",
675
+ "d",
676
+ "c",
677
+ "p",
678
+ "m",
679
+ "é",
680
+ "v",
681
+ "g",
682
+ "f",
683
+ "b",
684
+ "h",
685
+ "q",
686
+ "à",
687
+ "x",
688
+ "è",
689
+ "y",
690
+ "j",
691
+ ],
692
+ "Dutch": [
693
+ "e",
694
+ "n",
695
+ "a",
696
+ "i",
697
+ "r",
698
+ "t",
699
+ "o",
700
+ "d",
701
+ "s",
702
+ "l",
703
+ "g",
704
+ "h",
705
+ "v",
706
+ "m",
707
+ "u",
708
+ "k",
709
+ "c",
710
+ "p",
711
+ "b",
712
+ "w",
713
+ "j",
714
+ "z",
715
+ "f",
716
+ "y",
717
+ "x",
718
+ "ë",
719
+ ],
720
+ "Italian": [
721
+ "e",
722
+ "i",
723
+ "a",
724
+ "o",
725
+ "n",
726
+ "l",
727
+ "t",
728
+ "r",
729
+ "s",
730
+ "c",
731
+ "d",
732
+ "u",
733
+ "p",
734
+ "m",
735
+ "g",
736
+ "v",
737
+ "f",
738
+ "b",
739
+ "z",
740
+ "h",
741
+ "q",
742
+ "è",
743
+ "à",
744
+ "k",
745
+ "y",
746
+ "ò",
747
+ ],
748
+ "Polish": [
749
+ "a",
750
+ "i",
751
+ "o",
752
+ "e",
753
+ "n",
754
+ "r",
755
+ "z",
756
+ "w",
757
+ "s",
758
+ "c",
759
+ "t",
760
+ "k",
761
+ "y",
762
+ "d",
763
+ "p",
764
+ "m",
765
+ "u",
766
+ "l",
767
+ "j",
768
+ "ł",
769
+ "g",
770
+ "b",
771
+ "h",
772
+ "ą",
773
+ "ę",
774
+ "ó",
775
+ ],
776
+ "Spanish": [
777
+ "e",
778
+ "a",
779
+ "o",
780
+ "n",
781
+ "s",
782
+ "r",
783
+ "i",
784
+ "l",
785
+ "d",
786
+ "t",
787
+ "c",
788
+ "u",
789
+ "m",
790
+ "p",
791
+ "b",
792
+ "g",
793
+ "v",
794
+ "f",
795
+ "y",
796
+ "ó",
797
+ "h",
798
+ "q",
799
+ "í",
800
+ "j",
801
+ "z",
802
+ "á",
803
+ ],
804
+ "Russian": [
805
+ "о",
806
+ "а",
807
+ "е",
808
+ "и",
809
+ "н",
810
+ "с",
811
+ "т",
812
+ "р",
813
+ "в",
814
+ "л",
815
+ "к",
816
+ "м",
817
+ "д",
818
+ "п",
819
+ "у",
820
+ "г",
821
+ "я",
822
+ "ы",
823
+ "з",
824
+ "б",
825
+ "й",
826
+ "ь",
827
+ "ч",
828
+ "х",
829
+ "ж",
830
+ "ц",
831
+ ],
832
+ # Jap-Kanji
833
+ "Japanese": [
834
+ "人",
835
+ "一",
836
+ "大",
837
+ "亅",
838
+ "丁",
839
+ "丨",
840
+ "竹",
841
+ "笑",
842
+ "口",
843
+ "日",
844
+ "今",
845
+ "二",
846
+ "彳",
847
+ "行",
848
+ "十",
849
+ "土",
850
+ "丶",
851
+ "寸",
852
+ "寺",
853
+ "時",
854
+ "乙",
855
+ "丿",
856
+ "乂",
857
+ "气",
858
+ "気",
859
+ "冂",
860
+ "巾",
861
+ "亠",
862
+ "市",
863
+ "目",
864
+ "儿",
865
+ "見",
866
+ "八",
867
+ "小",
868
+ "凵",
869
+ "県",
870
+ "月",
871
+ "彐",
872
+ "門",
873
+ "間",
874
+ "木",
875
+ "東",
876
+ "山",
877
+ "出",
878
+ "本",
879
+ "中",
880
+ "刀",
881
+ "分",
882
+ "耳",
883
+ "又",
884
+ "取",
885
+ "最",
886
+ "言",
887
+ "田",
888
+ "心",
889
+ "思",
890
+ "刂",
891
+ "前",
892
+ "京",
893
+ "尹",
894
+ "事",
895
+ "生",
896
+ "厶",
897
+ "云",
898
+ "会",
899
+ "未",
900
+ "来",
901
+ "白",
902
+ "冫",
903
+ "楽",
904
+ "灬",
905
+ "馬",
906
+ "尸",
907
+ "尺",
908
+ "駅",
909
+ "明",
910
+ "耂",
911
+ "者",
912
+ "了",
913
+ "阝",
914
+ "都",
915
+ "高",
916
+ "卜",
917
+ "占",
918
+ "厂",
919
+ "广",
920
+ "店",
921
+ "子",
922
+ "申",
923
+ "奄",
924
+ "亻",
925
+ "俺",
926
+ "上",
927
+ "方",
928
+ "冖",
929
+ "学",
930
+ "衣",
931
+ "艮",
932
+ "食",
933
+ "自",
934
+ ],
935
+ # Jap-Katakana
936
+ "Japanese—": [
937
+ "ー",
938
+ "ン",
939
+ "ス",
940
+ "・",
941
+ "ル",
942
+ "ト",
943
+ "リ",
944
+ "イ",
945
+ "ア",
946
+ "ラ",
947
+ "ッ",
948
+ "ク",
949
+ "ド",
950
+ "シ",
951
+ "レ",
952
+ "ジ",
953
+ "タ",
954
+ "フ",
955
+ "ロ",
956
+ "カ",
957
+ "テ",
958
+ "マ",
959
+ "ィ",
960
+ "グ",
961
+ "バ",
962
+ "ム",
963
+ "プ",
964
+ "オ",
965
+ "コ",
966
+ "デ",
967
+ "ニ",
968
+ "ウ",
969
+ "メ",
970
+ "サ",
971
+ "ビ",
972
+ "ナ",
973
+ "ブ",
974
+ "ャ",
975
+ "エ",
976
+ "ュ",
977
+ "チ",
978
+ "キ",
979
+ "ズ",
980
+ "ダ",
981
+ "パ",
982
+ "ミ",
983
+ "ェ",
984
+ "ョ",
985
+ "ハ",
986
+ "セ",
987
+ "ベ",
988
+ "ガ",
989
+ "モ",
990
+ "ツ",
991
+ "ネ",
992
+ "ボ",
993
+ "ソ",
994
+ "ノ",
995
+ "ァ",
996
+ "ヴ",
997
+ "ワ",
998
+ "ポ",
999
+ "ペ",
1000
+ "ピ",
1001
+ "ケ",
1002
+ "ゴ",
1003
+ "ギ",
1004
+ "ザ",
1005
+ "ホ",
1006
+ "ゲ",
1007
+ "ォ",
1008
+ "ヤ",
1009
+ "ヒ",
1010
+ "ユ",
1011
+ "ヨ",
1012
+ "ヘ",
1013
+ "ゼ",
1014
+ "ヌ",
1015
+ "ゥ",
1016
+ "ゾ",
1017
+ "ヶ",
1018
+ "ヂ",
1019
+ "ヲ",
1020
+ "ヅ",
1021
+ "ヵ",
1022
+ "ヱ",
1023
+ "ヰ",
1024
+ "ヮ",
1025
+ "ヽ",
1026
+ "゠",
1027
+ "ヾ",
1028
+ "ヷ",
1029
+ "ヿ",
1030
+ "ヸ",
1031
+ "ヹ",
1032
+ "ヺ",
1033
+ ],
1034
+ # Jap-Hiragana
1035
+ "Japanese——": [
1036
+ "の",
1037
+ "に",
1038
+ "る",
1039
+ "た",
1040
+ "と",
1041
+ "は",
1042
+ "し",
1043
+ "い",
1044
+ "を",
1045
+ "で",
1046
+ "て",
1047
+ "が",
1048
+ "な",
1049
+ "れ",
1050
+ "か",
1051
+ "ら",
1052
+ "さ",
1053
+ "っ",
1054
+ "り",
1055
+ "す",
1056
+ "あ",
1057
+ "も",
1058
+ "こ",
1059
+ "ま",
1060
+ "う",
1061
+ "く",
1062
+ "よ",
1063
+ "き",
1064
+ "ん",
1065
+ "め",
1066
+ "お",
1067
+ "け",
1068
+ "そ",
1069
+ "つ",
1070
+ "だ",
1071
+ "や",
1072
+ "え",
1073
+ "ど",
1074
+ "わ",
1075
+ "ち",
1076
+ "み",
1077
+ "せ",
1078
+ "じ",
1079
+ "ば",
1080
+ "へ",
1081
+ "び",
1082
+ "ず",
1083
+ "ろ",
1084
+ "ほ",
1085
+ "げ",
1086
+ "む",
1087
+ "べ",
1088
+ "ひ",
1089
+ "ょ",
1090
+ "ゆ",
1091
+ "ぶ",
1092
+ "ご",
1093
+ "ゃ",
1094
+ "ね",
1095
+ "ふ",
1096
+ "ぐ",
1097
+ "ぎ",
1098
+ "ぼ",
1099
+ "ゅ",
1100
+ "づ",
1101
+ "ざ",
1102
+ "ぞ",
1103
+ "ぬ",
1104
+ "ぜ",
1105
+ "ぱ",
1106
+ "ぽ",
1107
+ "ぷ",
1108
+ "ぴ",
1109
+ "ぃ",
1110
+ "ぁ",
1111
+ "ぇ",
1112
+ "ぺ",
1113
+ "ゞ",
1114
+ "ぢ",
1115
+ "ぉ",
1116
+ "ぅ",
1117
+ "ゐ",
1118
+ "ゝ",
1119
+ "ゑ",
1120
+ "゛",
1121
+ "゜",
1122
+ "ゎ",
1123
+ "ゔ",
1124
+ "゚",
1125
+ "ゟ",
1126
+ "゙",
1127
+ "ゕ",
1128
+ "ゖ",
1129
+ ],
1130
+ "Portuguese": [
1131
+ "a",
1132
+ "e",
1133
+ "o",
1134
+ "s",
1135
+ "i",
1136
+ "r",
1137
+ "d",
1138
+ "n",
1139
+ "t",
1140
+ "m",
1141
+ "u",
1142
+ "c",
1143
+ "l",
1144
+ "p",
1145
+ "g",
1146
+ "v",
1147
+ "b",
1148
+ "f",
1149
+ "h",
1150
+ "ã",
1151
+ "q",
1152
+ "é",
1153
+ "ç",
1154
+ "á",
1155
+ "z",
1156
+ "í",
1157
+ ],
1158
+ "Swedish": [
1159
+ "e",
1160
+ "a",
1161
+ "n",
1162
+ "r",
1163
+ "t",
1164
+ "s",
1165
+ "i",
1166
+ "l",
1167
+ "d",
1168
+ "o",
1169
+ "m",
1170
+ "k",
1171
+ "g",
1172
+ "v",
1173
+ "h",
1174
+ "f",
1175
+ "u",
1176
+ "p",
1177
+ "ä",
1178
+ "c",
1179
+ "b",
1180
+ "ö",
1181
+ "å",
1182
+ "y",
1183
+ "j",
1184
+ "x",
1185
+ ],
1186
+ "Chinese": [
1187
+ "的",
1188
+ "一",
1189
+ "是",
1190
+ "不",
1191
+ "了",
1192
+ "在",
1193
+ "人",
1194
+ "有",
1195
+ "我",
1196
+ "他",
1197
+ "这",
1198
+ "个",
1199
+ "们",
1200
+ "中",
1201
+ "来",
1202
+ "上",
1203
+ "大",
1204
+ "为",
1205
+ "和",
1206
+ "国",
1207
+ "地",
1208
+ "到",
1209
+ "以",
1210
+ "说",
1211
+ "时",
1212
+ "要",
1213
+ "就",
1214
+ "出",
1215
+ "会",
1216
+ "可",
1217
+ "也",
1218
+ "你",
1219
+ "对",
1220
+ "生",
1221
+ "能",
1222
+ "而",
1223
+ "子",
1224
+ "那",
1225
+ "得",
1226
+ "于",
1227
+ "着",
1228
+ "下",
1229
+ "自",
1230
+ "之",
1231
+ "年",
1232
+ "过",
1233
+ "发",
1234
+ "后",
1235
+ "作",
1236
+ "里",
1237
+ "用",
1238
+ "道",
1239
+ "行",
1240
+ "所",
1241
+ "然",
1242
+ "家",
1243
+ "种",
1244
+ "事",
1245
+ "成",
1246
+ "方",
1247
+ "多",
1248
+ "经",
1249
+ "么",
1250
+ "去",
1251
+ "法",
1252
+ "学",
1253
+ "如",
1254
+ "都",
1255
+ "同",
1256
+ "现",
1257
+ "当",
1258
+ "没",
1259
+ "动",
1260
+ "面",
1261
+ "起",
1262
+ "看",
1263
+ "定",
1264
+ "天",
1265
+ "分",
1266
+ "还",
1267
+ "进",
1268
+ "好",
1269
+ "小",
1270
+ "部",
1271
+ "其",
1272
+ "些",
1273
+ "主",
1274
+ "样",
1275
+ "理",
1276
+ "心",
1277
+ "她",
1278
+ "本",
1279
+ "前",
1280
+ "开",
1281
+ "但",
1282
+ "因",
1283
+ "只",
1284
+ "从",
1285
+ "想",
1286
+ "实",
1287
+ ],
1288
+ "Ukrainian": [
1289
+ "о",
1290
+ "а",
1291
+ "н",
1292
+ "і",
1293
+ "и",
1294
+ "р",
1295
+ "в",
1296
+ "т",
1297
+ "е",
1298
+ "с",
1299
+ "к",
1300
+ "л",
1301
+ "у",
1302
+ "д",
1303
+ "м",
1304
+ "п",
1305
+ "з",
1306
+ "я",
1307
+ "ь",
1308
+ "б",
1309
+ "г",
1310
+ "й",
1311
+ "ч",
1312
+ "х",
1313
+ "ц",
1314
+ "ї",
1315
+ ],
1316
+ "Norwegian": [
1317
+ "e",
1318
+ "r",
1319
+ "n",
1320
+ "t",
1321
+ "a",
1322
+ "s",
1323
+ "i",
1324
+ "o",
1325
+ "l",
1326
+ "d",
1327
+ "g",
1328
+ "k",
1329
+ "m",
1330
+ "v",
1331
+ "f",
1332
+ "p",
1333
+ "u",
1334
+ "b",
1335
+ "h",
1336
+ "å",
1337
+ "y",
1338
+ "j",
1339
+ "ø",
1340
+ "c",
1341
+ "æ",
1342
+ "w",
1343
+ ],
1344
+ "Finnish": [
1345
+ "a",
1346
+ "i",
1347
+ "n",
1348
+ "t",
1349
+ "e",
1350
+ "s",
1351
+ "l",
1352
+ "o",
1353
+ "u",
1354
+ "k",
1355
+ "ä",
1356
+ "m",
1357
+ "r",
1358
+ "v",
1359
+ "j",
1360
+ "h",
1361
+ "p",
1362
+ "y",
1363
+ "d",
1364
+ "ö",
1365
+ "g",
1366
+ "c",
1367
+ "b",
1368
+ "f",
1369
+ "w",
1370
+ "z",
1371
+ ],
1372
+ "Vietnamese": [
1373
+ "n",
1374
+ "h",
1375
+ "t",
1376
+ "i",
1377
+ "c",
1378
+ "g",
1379
+ "a",
1380
+ "o",
1381
+ "u",
1382
+ "m",
1383
+ "l",
1384
+ "r",
1385
+ "à",
1386
+ "đ",
1387
+ "s",
1388
+ "e",
1389
+ "v",
1390
+ "p",
1391
+ "b",
1392
+ "y",
1393
+ "ư",
1394
+ "d",
1395
+ "á",
1396
+ "k",
1397
+ "ộ",
1398
+ "ế",
1399
+ ],
1400
+ "Czech": [
1401
+ "o",
1402
+ "e",
1403
+ "a",
1404
+ "n",
1405
+ "t",
1406
+ "s",
1407
+ "i",
1408
+ "l",
1409
+ "v",
1410
+ "r",
1411
+ "k",
1412
+ "d",
1413
+ "u",
1414
+ "m",
1415
+ "p",
1416
+ "í",
1417
+ "c",
1418
+ "h",
1419
+ "z",
1420
+ "á",
1421
+ "y",
1422
+ "j",
1423
+ "b",
1424
+ "ě",
1425
+ "é",
1426
+ "ř",
1427
+ ],
1428
+ "Hungarian": [
1429
+ "e",
1430
+ "a",
1431
+ "t",
1432
+ "l",
1433
+ "s",
1434
+ "n",
1435
+ "k",
1436
+ "r",
1437
+ "i",
1438
+ "o",
1439
+ "z",
1440
+ "á",
1441
+ "é",
1442
+ "g",
1443
+ "m",
1444
+ "b",
1445
+ "y",
1446
+ "v",
1447
+ "d",
1448
+ "h",
1449
+ "u",
1450
+ "p",
1451
+ "j",
1452
+ "ö",
1453
+ "f",
1454
+ "c",
1455
+ ],
1456
+ "Korean": [
1457
+ "이",
1458
+ "다",
1459
+ "에",
1460
+ "의",
1461
+ "는",
1462
+ "로",
1463
+ "하",
1464
+ "을",
1465
+ "가",
1466
+ "고",
1467
+ "지",
1468
+ "서",
1469
+ "한",
1470
+ "은",
1471
+ "기",
1472
+ "으",
1473
+ "년",
1474
+ "대",
1475
+ "사",
1476
+ "시",
1477
+ "를",
1478
+ "리",
1479
+ "도",
1480
+ "인",
1481
+ "스",
1482
+ "일",
1483
+ ],
1484
+ "Indonesian": [
1485
+ "a",
1486
+ "n",
1487
+ "e",
1488
+ "i",
1489
+ "r",
1490
+ "t",
1491
+ "u",
1492
+ "s",
1493
+ "d",
1494
+ "k",
1495
+ "m",
1496
+ "l",
1497
+ "g",
1498
+ "p",
1499
+ "b",
1500
+ "o",
1501
+ "h",
1502
+ "y",
1503
+ "j",
1504
+ "c",
1505
+ "w",
1506
+ "f",
1507
+ "v",
1508
+ "z",
1509
+ "x",
1510
+ "q",
1511
+ ],
1512
+ "Turkish": [
1513
+ "a",
1514
+ "e",
1515
+ "i",
1516
+ "n",
1517
+ "r",
1518
+ "l",
1519
+ "ı",
1520
+ "k",
1521
+ "d",
1522
+ "t",
1523
+ "s",
1524
+ "m",
1525
+ "y",
1526
+ "u",
1527
+ "o",
1528
+ "b",
1529
+ "ü",
1530
+ "ş",
1531
+ "v",
1532
+ "g",
1533
+ "z",
1534
+ "h",
1535
+ "c",
1536
+ "p",
1537
+ "ç",
1538
+ "ğ",
1539
+ ],
1540
+ "Romanian": [
1541
+ "e",
1542
+ "i",
1543
+ "a",
1544
+ "r",
1545
+ "n",
1546
+ "t",
1547
+ "u",
1548
+ "l",
1549
+ "o",
1550
+ "c",
1551
+ "s",
1552
+ "d",
1553
+ "p",
1554
+ "m",
1555
+ "ă",
1556
+ "f",
1557
+ "v",
1558
+ "î",
1559
+ "g",
1560
+ "b",
1561
+ "ș",
1562
+ "ț",
1563
+ "z",
1564
+ "h",
1565
+ "â",
1566
+ "j",
1567
+ ],
1568
+ "Farsi": [
1569
+ "ا",
1570
+ "ی",
1571
+ "ر",
1572
+ "د",
1573
+ "ن",
1574
+ "ه",
1575
+ "و",
1576
+ "م",
1577
+ "ت",
1578
+ "ب",
1579
+ "س",
1580
+ "ل",
1581
+ "ک",
1582
+ "ش",
1583
+ "ز",
1584
+ "ف",
1585
+ "گ",
1586
+ "ع",
1587
+ "خ",
1588
+ "ق",
1589
+ "ج",
1590
+ "آ",
1591
+ "پ",
1592
+ "ح",
1593
+ "ط",
1594
+ "ص",
1595
+ ],
1596
+ "Arabic": [
1597
+ "ا",
1598
+ "ل",
1599
+ "ي",
1600
+ "م",
1601
+ "و",
1602
+ "ن",
1603
+ "ر",
1604
+ "ت",
1605
+ "ب",
1606
+ "ة",
1607
+ "ع",
1608
+ "د",
1609
+ "س",
1610
+ "ف",
1611
+ "ه",
1612
+ "ك",
1613
+ "ق",
1614
+ "أ",
1615
+ "ح",
1616
+ "ج",
1617
+ "ش",
1618
+ "ط",
1619
+ "ص",
1620
+ "ى",
1621
+ "خ",
1622
+ "إ",
1623
+ ],
1624
+ "Danish": [
1625
+ "e",
1626
+ "r",
1627
+ "n",
1628
+ "t",
1629
+ "a",
1630
+ "i",
1631
+ "s",
1632
+ "d",
1633
+ "l",
1634
+ "o",
1635
+ "g",
1636
+ "m",
1637
+ "k",
1638
+ "f",
1639
+ "v",
1640
+ "u",
1641
+ "b",
1642
+ "h",
1643
+ "p",
1644
+ "å",
1645
+ "y",
1646
+ "ø",
1647
+ "æ",
1648
+ "c",
1649
+ "j",
1650
+ "w",
1651
+ ],
1652
+ "Serbian": [
1653
+ "а",
1654
+ "и",
1655
+ "о",
1656
+ "е",
1657
+ "н",
1658
+ "р",
1659
+ "с",
1660
+ "у",
1661
+ "т",
1662
+ "к",
1663
+ "ј",
1664
+ "в",
1665
+ "д",
1666
+ "м",
1667
+ "п",
1668
+ "л",
1669
+ "г",
1670
+ "з",
1671
+ "б",
1672
+ "a",
1673
+ "i",
1674
+ "e",
1675
+ "o",
1676
+ "n",
1677
+ "ц",
1678
+ "ш",
1679
+ ],
1680
+ "Lithuanian": [
1681
+ "i",
1682
+ "a",
1683
+ "s",
1684
+ "o",
1685
+ "r",
1686
+ "e",
1687
+ "t",
1688
+ "n",
1689
+ "u",
1690
+ "k",
1691
+ "m",
1692
+ "l",
1693
+ "p",
1694
+ "v",
1695
+ "d",
1696
+ "j",
1697
+ "g",
1698
+ "ė",
1699
+ "b",
1700
+ "y",
1701
+ "ų",
1702
+ "š",
1703
+ "ž",
1704
+ "c",
1705
+ "ą",
1706
+ "į",
1707
+ ],
1708
+ "Slovene": [
1709
+ "e",
1710
+ "a",
1711
+ "i",
1712
+ "o",
1713
+ "n",
1714
+ "r",
1715
+ "s",
1716
+ "l",
1717
+ "t",
1718
+ "j",
1719
+ "v",
1720
+ "k",
1721
+ "d",
1722
+ "p",
1723
+ "m",
1724
+ "u",
1725
+ "z",
1726
+ "b",
1727
+ "g",
1728
+ "h",
1729
+ "č",
1730
+ "c",
1731
+ "š",
1732
+ "ž",
1733
+ "f",
1734
+ "y",
1735
+ ],
1736
+ "Slovak": [
1737
+ "o",
1738
+ "a",
1739
+ "e",
1740
+ "n",
1741
+ "i",
1742
+ "r",
1743
+ "v",
1744
+ "t",
1745
+ "s",
1746
+ "l",
1747
+ "k",
1748
+ "d",
1749
+ "m",
1750
+ "p",
1751
+ "u",
1752
+ "c",
1753
+ "h",
1754
+ "j",
1755
+ "b",
1756
+ "z",
1757
+ "á",
1758
+ "y",
1759
+ "ý",
1760
+ "í",
1761
+ "č",
1762
+ "é",
1763
+ ],
1764
+ "Hebrew": [
1765
+ "י",
1766
+ "ו",
1767
+ "ה",
1768
+ "ל",
1769
+ "ר",
1770
+ "ב",
1771
+ "ת",
1772
+ "מ",
1773
+ "א",
1774
+ "ש",
1775
+ "נ",
1776
+ "ע",
1777
+ "ם",
1778
+ "ד",
1779
+ "ק",
1780
+ "ח",
1781
+ "פ",
1782
+ "ס",
1783
+ "כ",
1784
+ "ג",
1785
+ "ט",
1786
+ "צ",
1787
+ "ן",
1788
+ "ז",
1789
+ "ך",
1790
+ ],
1791
+ "Bulgarian": [
1792
+ "а",
1793
+ "и",
1794
+ "о",
1795
+ "е",
1796
+ "н",
1797
+ "т",
1798
+ "р",
1799
+ "с",
1800
+ "в",
1801
+ "л",
1802
+ "к",
1803
+ "д",
1804
+ "п",
1805
+ "м",
1806
+ "з",
1807
+ "г",
1808
+ "я",
1809
+ "ъ",
1810
+ "у",
1811
+ "б",
1812
+ "ч",
1813
+ "ц",
1814
+ "й",
1815
+ "ж",
1816
+ "щ",
1817
+ "х",
1818
+ ],
1819
+ "Croatian": [
1820
+ "a",
1821
+ "i",
1822
+ "o",
1823
+ "e",
1824
+ "n",
1825
+ "r",
1826
+ "j",
1827
+ "s",
1828
+ "t",
1829
+ "u",
1830
+ "k",
1831
+ "l",
1832
+ "v",
1833
+ "d",
1834
+ "m",
1835
+ "p",
1836
+ "g",
1837
+ "z",
1838
+ "b",
1839
+ "c",
1840
+ "č",
1841
+ "h",
1842
+ "š",
1843
+ "ž",
1844
+ "ć",
1845
+ "f",
1846
+ ],
1847
+ "Hindi": [
1848
+ "क",
1849
+ "र",
1850
+ "स",
1851
+ "न",
1852
+ "त",
1853
+ "म",
1854
+ "ह",
1855
+ "प",
1856
+ "य",
1857
+ "ल",
1858
+ "व",
1859
+ "ज",
1860
+ "द",
1861
+ "ग",
1862
+ "ब",
1863
+ "श",
1864
+ "ट",
1865
+ "अ",
1866
+ "ए",
1867
+ "थ",
1868
+ "भ",
1869
+ "ड",
1870
+ "च",
1871
+ "ध",
1872
+ "ष",
1873
+ "इ",
1874
+ ],
1875
+ "Estonian": [
1876
+ "a",
1877
+ "i",
1878
+ "e",
1879
+ "s",
1880
+ "t",
1881
+ "l",
1882
+ "u",
1883
+ "n",
1884
+ "o",
1885
+ "k",
1886
+ "r",
1887
+ "d",
1888
+ "m",
1889
+ "v",
1890
+ "g",
1891
+ "p",
1892
+ "j",
1893
+ "h",
1894
+ "ä",
1895
+ "b",
1896
+ "õ",
1897
+ "ü",
1898
+ "f",
1899
+ "c",
1900
+ "ö",
1901
+ "y",
1902
+ ],
1903
+ "Thai": [
1904
+ "า",
1905
+ "น",
1906
+ "ร",
1907
+ "อ",
1908
+ "ก",
1909
+ "เ",
1910
+ "ง",
1911
+ "ม",
1912
+ "ย",
1913
+ "ล",
1914
+ "ว",
1915
+ "ด",
1916
+ "ท",
1917
+ "ส",
1918
+ "ต",
1919
+ "ะ",
1920
+ "ป",
1921
+ "บ",
1922
+ "ค",
1923
+ "ห",
1924
+ "แ",
1925
+ "จ",
1926
+ "พ",
1927
+ "ช",
1928
+ "ข",
1929
+ "ใ",
1930
+ ],
1931
+ "Greek": [
1932
+ "α",
1933
+ "τ",
1934
+ "ο",
1935
+ "ι",
1936
+ "ε",
1937
+ "ν",
1938
+ "ρ",
1939
+ "σ",
1940
+ "κ",
1941
+ "η",
1942
+ "π",
1943
+ "ς",
1944
+ "υ",
1945
+ "μ",
1946
+ "λ",
1947
+ "ί",
1948
+ "ό",
1949
+ "ά",
1950
+ "γ",
1951
+ "έ",
1952
+ "δ",
1953
+ "ή",
1954
+ "ω",
1955
+ "χ",
1956
+ "θ",
1957
+ "ύ",
1958
+ ],
1959
+ "Tamil": [
1960
+ "க",
1961
+ "த",
1962
+ "ப",
1963
+ "ட",
1964
+ "ர",
1965
+ "ம",
1966
+ "ல",
1967
+ "ன",
1968
+ "வ",
1969
+ "ற",
1970
+ "ய",
1971
+ "ள",
1972
+ "ச",
1973
+ "ந",
1974
+ "இ",
1975
+ "ண",
1976
+ "அ",
1977
+ "ஆ",
1978
+ "ழ",
1979
+ "ங",
1980
+ "எ",
1981
+ "உ",
1982
+ "ஒ",
1983
+ "ஸ",
1984
+ ],
1985
+ "Kazakh": [
1986
+ "а",
1987
+ "ы",
1988
+ "е",
1989
+ "н",
1990
+ "т",
1991
+ "р",
1992
+ "л",
1993
+ "і",
1994
+ "д",
1995
+ "с",
1996
+ "м",
1997
+ "қ",
1998
+ "к",
1999
+ "о",
2000
+ "б",
2001
+ "и",
2002
+ "у",
2003
+ "ғ",
2004
+ "ж",
2005
+ "ң",
2006
+ "з",
2007
+ "ш",
2008
+ "й",
2009
+ "п",
2010
+ "г",
2011
+ "ө",
2012
+ ],
2013
+ }
2014
+
2015
+ LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
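
For context: FREQUENCIES maps each supported language to its most frequent characters, most frequent first, and the coherence detector (cd.py) scores decoded text against these lists. A minimal inspection sketch; the printed list follows this 3.4.3 snapshot of the table:

from charset_normalizer.constant import FREQUENCIES, LANGUAGE_SUPPORTED_COUNT

# Each entry is an ordered, most-frequent-first character list.
print(LANGUAGE_SUPPORTED_COUNT)     # number of languages in the table
print(FREQUENCIES["Turkish"][:5])   # ['a', 'e', 'i', 'n', 'r'] per the data above
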
phivenv/Lib/site-packages/charset_normalizer/legacy.py ADDED
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+ from warnings import warn
5
+
6
+ from .api import from_bytes
7
+ from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
8
+
9
+ # TODO: remove this check when dropping Python 3.7 support
10
+ if TYPE_CHECKING:
11
+ from typing_extensions import TypedDict
12
+
13
+ class ResultDict(TypedDict):
14
+ encoding: str | None
15
+ language: str
16
+ confidence: float | None
17
+
18
+
19
+ def detect(
20
+ byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
21
+ ) -> ResultDict:
22
+ """
23
+ chardet legacy method
24
+ Detect the encoding of the given byte string. It should be mostly backward-compatible.
25
+ Encoding name will match Chardet's own spelling whenever possible. (Not for encoding names it does not support)
26
+ This function is deprecated; it exists to ease migration away from chardet. Consult the documentation for
27
+ further information. Not planned for removal.
28
+
29
+ :param byte_str: The byte sequence to examine.
30
+ :param should_rename_legacy: Should we rename legacy encodings
31
+ to their more modern equivalents?
32
+ """
33
+ if len(kwargs):
34
+ warn(
35
+ f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
36
+ )
37
+
38
+ if not isinstance(byte_str, (bytearray, bytes)):
39
+ raise TypeError( # pragma: nocover
40
+ f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
41
+ )
42
+
43
+ if isinstance(byte_str, bytearray):
44
+ byte_str = bytes(byte_str)
45
+
46
+ r = from_bytes(byte_str).best()
47
+
48
+ encoding = r.encoding if r is not None else None
49
+ language = r.language if r is not None and r.language != "Unknown" else ""
50
+ confidence = 1.0 - r.chaos if r is not None else None
51
+
52
+ # automatically lower confidence
53
+ # on small bytes samples.
54
+ # https://github.com/jawah/charset_normalizer/issues/391
55
+ if (
56
+ confidence is not None
57
+ and confidence >= 0.9
58
+ and encoding
59
+ not in {
60
+ "utf_8",
61
+ "ascii",
62
+ }
63
+ and r.bom is False # type: ignore[union-attr]
64
+ and len(byte_str) < TOO_SMALL_SEQUENCE
65
+ ):
66
+ confidence -= 0.2
67
+
68
+ # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped in the detection/normalization process
69
+ # but chardet does return 'utf-8-sig' and it is a valid codec name.
70
+ if r is not None and encoding == "utf_8" and r.bom:
71
+ encoding += "_sig"
72
+
73
+ if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
74
+ encoding = CHARDET_CORRESPONDENCE[encoding]
75
+
76
+ return {
77
+ "encoding": encoding,
78
+ "language": language,
79
+ "confidence": confidence,
80
+ }
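
A minimal usage sketch for the shim above; detect() is re-exported at the package top level, and the sample bytes are an illustrative assumption (the concrete guess depends on the detector):

from charset_normalizer import detect

result = detect("Comment ça va ?".encode("cp1252"))
# A ResultDict with the three keys defined above.
print(result["encoding"], result["language"], result["confidence"])
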
phivenv/Lib/site-packages/charset_normalizer/md.cp39-win_amd64.pyd ADDED
Binary file (10.8 kB).
 
phivenv/Lib/site-packages/charset_normalizer/md.py ADDED
@@ -0,0 +1,635 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from logging import getLogger
5
+
6
+ from .constant import (
7
+ COMMON_SAFE_ASCII_CHARACTERS,
8
+ TRACE,
9
+ UNICODE_SECONDARY_RANGE_KEYWORD,
10
+ )
11
+ from .utils import (
12
+ is_accentuated,
13
+ is_arabic,
14
+ is_arabic_isolated_form,
15
+ is_case_variable,
16
+ is_cjk,
17
+ is_emoticon,
18
+ is_hangul,
19
+ is_hiragana,
20
+ is_katakana,
21
+ is_latin,
22
+ is_punctuation,
23
+ is_separator,
24
+ is_symbol,
25
+ is_thai,
26
+ is_unprintable,
27
+ remove_accent,
28
+ unicode_range,
29
+ is_cjk_uncommon,
30
+ )
31
+
32
+
33
+ class MessDetectorPlugin:
34
+ """
35
+ Base abstract class used for mess detection plugins.
36
+ All detectors MUST extend and implement given methods.
37
+ """
38
+
39
+ def eligible(self, character: str) -> bool:
40
+ """
41
+ Determine if the given character should be fed in.
42
+ """
43
+ raise NotImplementedError # pragma: nocover
44
+
45
+ def feed(self, character: str) -> None:
46
+ """
47
+ The main routine to be executed upon each character.
48
+ Insert the logic by which the text would be considered chaotic.
49
+ """
50
+ raise NotImplementedError # pragma: nocover
51
+
52
+ def reset(self) -> None: # pragma: no cover
53
+ """
54
+ Permit to reset the plugin to the initial state.
55
+ """
56
+ raise NotImplementedError
57
+
58
+ @property
59
+ def ratio(self) -> float:
60
+ """
61
+ Compute the chaos ratio based on what your feed() has seen.
62
+ Must NOT be lower than 0.0; there is no upper bound.
63
+ """
64
+ raise NotImplementedError # pragma: nocover
65
+
66
+
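
To make the contract above concrete, here is a hypothetical detector sketch; the class name and the 0.8 threshold are illustrative assumptions, not part of the library. Because mess_ratio() instantiates every MessDetectorPlugin.__subclasses__(), defining a subclass is enough to register it:

from charset_normalizer.md import MessDetectorPlugin

class TooManyDigitsPlugin(MessDetectorPlugin):  # hypothetical example
    def __init__(self) -> None:
        self._digit_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if character.isdigit():
            self._digit_count += 1

    def reset(self) -> None:
        self._digit_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        digit_ratio = self._digit_count / self._character_count
        # 0.8 is an arbitrary illustrative threshold, not a library default.
        return digit_ratio if digit_ratio >= 0.8 else 0.0
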
67
+ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
68
+ def __init__(self) -> None:
69
+ self._punctuation_count: int = 0
70
+ self._symbol_count: int = 0
71
+ self._character_count: int = 0
72
+
73
+ self._last_printable_char: str | None = None
74
+ self._frenzy_symbol_in_word: bool = False
75
+
76
+ def eligible(self, character: str) -> bool:
77
+ return character.isprintable()
78
+
79
+ def feed(self, character: str) -> None:
80
+ self._character_count += 1
81
+
82
+ if (
83
+ character != self._last_printable_char
84
+ and character not in COMMON_SAFE_ASCII_CHARACTERS
85
+ ):
86
+ if is_punctuation(character):
87
+ self._punctuation_count += 1
88
+ elif (
89
+ character.isdigit() is False
90
+ and is_symbol(character)
91
+ and is_emoticon(character) is False
92
+ ):
93
+ self._symbol_count += 2
94
+
95
+ self._last_printable_char = character
96
+
97
+ def reset(self) -> None: # Abstract
98
+ self._punctuation_count = 0
99
+ self._character_count = 0
100
+ self._symbol_count = 0
101
+
102
+ @property
103
+ def ratio(self) -> float:
104
+ if self._character_count == 0:
105
+ return 0.0
106
+
107
+ ratio_of_punctuation: float = (
108
+ self._punctuation_count + self._symbol_count
109
+ ) / self._character_count
110
+
111
+ return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
112
+
113
+
114
+ class TooManyAccentuatedPlugin(MessDetectorPlugin):
115
+ def __init__(self) -> None:
116
+ self._character_count: int = 0
117
+ self._accentuated_count: int = 0
118
+
119
+ def eligible(self, character: str) -> bool:
120
+ return character.isalpha()
121
+
122
+ def feed(self, character: str) -> None:
123
+ self._character_count += 1
124
+
125
+ if is_accentuated(character):
126
+ self._accentuated_count += 1
127
+
128
+ def reset(self) -> None: # Abstract
129
+ self._character_count = 0
130
+ self._accentuated_count = 0
131
+
132
+ @property
133
+ def ratio(self) -> float:
134
+ if self._character_count < 8:
135
+ return 0.0
136
+
137
+ ratio_of_accentuation: float = self._accentuated_count / self._character_count
138
+ return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
139
+
140
+
141
+ class UnprintablePlugin(MessDetectorPlugin):
142
+ def __init__(self) -> None:
143
+ self._unprintable_count: int = 0
144
+ self._character_count: int = 0
145
+
146
+ def eligible(self, character: str) -> bool:
147
+ return True
148
+
149
+ def feed(self, character: str) -> None:
150
+ if is_unprintable(character):
151
+ self._unprintable_count += 1
152
+ self._character_count += 1
153
+
154
+ def reset(self) -> None: # Abstract
155
+ self._unprintable_count = 0
156
+
157
+ @property
158
+ def ratio(self) -> float:
159
+ if self._character_count == 0:
160
+ return 0.0
161
+
162
+ return (self._unprintable_count * 8) / self._character_count
163
+
164
+
165
+ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
166
+ def __init__(self) -> None:
167
+ self._successive_count: int = 0
168
+ self._character_count: int = 0
169
+
170
+ self._last_latin_character: str | None = None
171
+
172
+ def eligible(self, character: str) -> bool:
173
+ return character.isalpha() and is_latin(character)
174
+
175
+ def feed(self, character: str) -> None:
176
+ self._character_count += 1
177
+ if (
178
+ self._last_latin_character is not None
179
+ and is_accentuated(character)
180
+ and is_accentuated(self._last_latin_character)
181
+ ):
182
+ if character.isupper() and self._last_latin_character.isupper():
183
+ self._successive_count += 1
184
+ # Worse if it's the same char duplicated with a different accent.
185
+ if remove_accent(character) == remove_accent(self._last_latin_character):
186
+ self._successive_count += 1
187
+ self._last_latin_character = character
188
+
189
+ def reset(self) -> None: # Abstract
190
+ self._successive_count = 0
191
+ self._character_count = 0
192
+ self._last_latin_character = None
193
+
194
+ @property
195
+ def ratio(self) -> float:
196
+ if self._character_count == 0:
197
+ return 0.0
198
+
199
+ return (self._successive_count * 2) / self._character_count
200
+
201
+
202
+ class SuspiciousRange(MessDetectorPlugin):
203
+ def __init__(self) -> None:
204
+ self._suspicious_successive_range_count: int = 0
205
+ self._character_count: int = 0
206
+ self._last_printable_seen: str | None = None
207
+
208
+ def eligible(self, character: str) -> bool:
209
+ return character.isprintable()
210
+
211
+ def feed(self, character: str) -> None:
212
+ self._character_count += 1
213
+
214
+ if (
215
+ character.isspace()
216
+ or is_punctuation(character)
217
+ or character in COMMON_SAFE_ASCII_CHARACTERS
218
+ ):
219
+ self._last_printable_seen = None
220
+ return
221
+
222
+ if self._last_printable_seen is None:
223
+ self._last_printable_seen = character
224
+ return
225
+
226
+ unicode_range_a: str | None = unicode_range(self._last_printable_seen)
227
+ unicode_range_b: str | None = unicode_range(character)
228
+
229
+ if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
230
+ self._suspicious_successive_range_count += 1
231
+
232
+ self._last_printable_seen = character
233
+
234
+ def reset(self) -> None: # Abstract
235
+ self._character_count = 0
236
+ self._suspicious_successive_range_count = 0
237
+ self._last_printable_seen = None
238
+
239
+ @property
240
+ def ratio(self) -> float:
241
+ if self._character_count <= 13:
242
+ return 0.0
243
+
244
+ ratio_of_suspicious_range_usage: float = (
245
+ self._suspicious_successive_range_count * 2
246
+ ) / self._character_count
247
+
248
+ return ratio_of_suspicious_range_usage
249
+
250
+
251
+ class SuperWeirdWordPlugin(MessDetectorPlugin):
252
+ def __init__(self) -> None:
253
+ self._word_count: int = 0
254
+ self._bad_word_count: int = 0
255
+ self._foreign_long_count: int = 0
256
+
257
+ self._is_current_word_bad: bool = False
258
+ self._foreign_long_watch: bool = False
259
+
260
+ self._character_count: int = 0
261
+ self._bad_character_count: int = 0
262
+
263
+ self._buffer: str = ""
264
+ self._buffer_accent_count: int = 0
265
+ self._buffer_glyph_count: int = 0
266
+
267
+ def eligible(self, character: str) -> bool:
268
+ return True
269
+
270
+ def feed(self, character: str) -> None:
271
+ if character.isalpha():
272
+ self._buffer += character
273
+ if is_accentuated(character):
274
+ self._buffer_accent_count += 1
275
+ if (
276
+ self._foreign_long_watch is False
277
+ and (is_latin(character) is False or is_accentuated(character))
278
+ and is_cjk(character) is False
279
+ and is_hangul(character) is False
280
+ and is_katakana(character) is False
281
+ and is_hiragana(character) is False
282
+ and is_thai(character) is False
283
+ ):
284
+ self._foreign_long_watch = True
285
+ if (
286
+ is_cjk(character)
287
+ or is_hangul(character)
288
+ or is_katakana(character)
289
+ or is_hiragana(character)
290
+ or is_thai(character)
291
+ ):
292
+ self._buffer_glyph_count += 1
293
+ return
294
+ if not self._buffer:
295
+ return
296
+ if (
297
+ character.isspace() or is_punctuation(character) or is_separator(character)
298
+ ) and self._buffer:
299
+ self._word_count += 1
300
+ buffer_length: int = len(self._buffer)
301
+
302
+ self._character_count += buffer_length
303
+
304
+ if buffer_length >= 4:
305
+ if self._buffer_accent_count / buffer_length >= 0.5:
306
+ self._is_current_word_bad = True
307
+ # Words/buffers ending with an uppercase accentuated letter are so rare
308
+ # that we will consider them all suspicious. Same weight as the foreign_long suspicion.
309
+ elif (
310
+ is_accentuated(self._buffer[-1])
311
+ and self._buffer[-1].isupper()
312
+ and all(_.isupper() for _ in self._buffer) is False
313
+ ):
314
+ self._foreign_long_count += 1
315
+ self._is_current_word_bad = True
316
+ elif self._buffer_glyph_count == 1:
317
+ self._is_current_word_bad = True
318
+ self._foreign_long_count += 1
319
+ if buffer_length >= 24 and self._foreign_long_watch:
320
+ camel_case_dst = [
321
+ i
322
+ for c, i in zip(self._buffer, range(0, buffer_length))
323
+ if c.isupper()
324
+ ]
325
+ probable_camel_cased: bool = False
326
+
327
+ if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
328
+ probable_camel_cased = True
329
+
330
+ if not probable_camel_cased:
331
+ self._foreign_long_count += 1
332
+ self._is_current_word_bad = True
333
+
334
+ if self._is_current_word_bad:
335
+ self._bad_word_count += 1
336
+ self._bad_character_count += len(self._buffer)
337
+ self._is_current_word_bad = False
338
+
339
+ self._foreign_long_watch = False
340
+ self._buffer = ""
341
+ self._buffer_accent_count = 0
342
+ self._buffer_glyph_count = 0
343
+ elif (
344
+ character not in {"<", ">", "-", "=", "~", "|", "_"}
345
+ and character.isdigit() is False
346
+ and is_symbol(character)
347
+ ):
348
+ self._is_current_word_bad = True
349
+ self._buffer += character
350
+
351
+ def reset(self) -> None: # Abstract
352
+ self._buffer = ""
353
+ self._is_current_word_bad = False
354
+ self._foreign_long_watch = False
355
+ self._bad_word_count = 0
356
+ self._word_count = 0
357
+ self._character_count = 0
358
+ self._bad_character_count = 0
359
+ self._foreign_long_count = 0
360
+
361
+ @property
362
+ def ratio(self) -> float:
363
+ if self._word_count <= 10 and self._foreign_long_count == 0:
364
+ return 0.0
365
+
366
+ return self._bad_character_count / self._character_count
367
+
368
+
369
+ class CjkUncommonPlugin(MessDetectorPlugin):
370
+ """
371
+ Detect messy CJK text that probably means nothing.
372
+ """
373
+
374
+ def __init__(self) -> None:
375
+ self._character_count: int = 0
376
+ self._uncommon_count: int = 0
377
+
378
+ def eligible(self, character: str) -> bool:
379
+ return is_cjk(character)
380
+
381
+ def feed(self, character: str) -> None:
382
+ self._character_count += 1
383
+
384
+ if is_cjk_uncommon(character):
385
+ self._uncommon_count += 1
386
+ return
387
+
388
+ def reset(self) -> None: # Abstract
389
+ self._character_count = 0
390
+ self._uncommon_count = 0
391
+
392
+ @property
393
+ def ratio(self) -> float:
394
+ if self._character_count < 8:
395
+ return 0.0
396
+
397
+ uncommon_form_usage: float = self._uncommon_count / self._character_count
398
+
399
+ # We can be pretty sure it's garbage when uncommon characters are widely
400
+ # used. Otherwise it could just be traditional Chinese, for example.
401
+ return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
402
+
403
+
404
+ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
405
+ def __init__(self) -> None:
406
+ self._buf: bool = False
407
+
408
+ self._character_count_since_last_sep: int = 0
409
+
410
+ self._successive_upper_lower_count: int = 0
411
+ self._successive_upper_lower_count_final: int = 0
412
+
413
+ self._character_count: int = 0
414
+
415
+ self._last_alpha_seen: str | None = None
416
+ self._current_ascii_only: bool = True
417
+
418
+ def eligible(self, character: str) -> bool:
419
+ return True
420
+
421
+ def feed(self, character: str) -> None:
422
+ is_concerned = character.isalpha() and is_case_variable(character)
423
+ chunk_sep = is_concerned is False
424
+
425
+ if chunk_sep and self._character_count_since_last_sep > 0:
426
+ if (
427
+ self._character_count_since_last_sep <= 64
428
+ and character.isdigit() is False
429
+ and self._current_ascii_only is False
430
+ ):
431
+ self._successive_upper_lower_count_final += (
432
+ self._successive_upper_lower_count
433
+ )
434
+
435
+ self._successive_upper_lower_count = 0
436
+ self._character_count_since_last_sep = 0
437
+ self._last_alpha_seen = None
438
+ self._buf = False
439
+ self._character_count += 1
440
+ self._current_ascii_only = True
441
+
442
+ return
443
+
444
+ if self._current_ascii_only is True and character.isascii() is False:
445
+ self._current_ascii_only = False
446
+
447
+ if self._last_alpha_seen is not None:
448
+ if (character.isupper() and self._last_alpha_seen.islower()) or (
449
+ character.islower() and self._last_alpha_seen.isupper()
450
+ ):
451
+ if self._buf is True:
452
+ self._successive_upper_lower_count += 2
453
+ self._buf = False
454
+ else:
455
+ self._buf = True
456
+ else:
457
+ self._buf = False
458
+
459
+ self._character_count += 1
460
+ self._character_count_since_last_sep += 1
461
+ self._last_alpha_seen = character
462
+
463
+ def reset(self) -> None: # Abstract
464
+ self._character_count = 0
465
+ self._character_count_since_last_sep = 0
466
+ self._successive_upper_lower_count = 0
467
+ self._successive_upper_lower_count_final = 0
468
+ self._last_alpha_seen = None
469
+ self._buf = False
470
+ self._current_ascii_only = True
471
+
472
+ @property
473
+ def ratio(self) -> float:
474
+ if self._character_count == 0:
475
+ return 0.0
476
+
477
+ return self._successive_upper_lower_count_final / self._character_count
478
+
479
+
480
+ class ArabicIsolatedFormPlugin(MessDetectorPlugin):
481
+ def __init__(self) -> None:
482
+ self._character_count: int = 0
483
+ self._isolated_form_count: int = 0
484
+
485
+ def reset(self) -> None: # Abstract
486
+ self._character_count = 0
487
+ self._isolated_form_count = 0
488
+
489
+ def eligible(self, character: str) -> bool:
490
+ return is_arabic(character)
491
+
492
+ def feed(self, character: str) -> None:
493
+ self._character_count += 1
494
+
495
+ if is_arabic_isolated_form(character):
496
+ self._isolated_form_count += 1
497
+
498
+ @property
499
+ def ratio(self) -> float:
500
+ if self._character_count < 8:
501
+ return 0.0
502
+
503
+ isolated_form_usage: float = self._isolated_form_count / self._character_count
504
+
505
+ return isolated_form_usage
506
+
507
+
508
+ @lru_cache(maxsize=1024)
509
+ def is_suspiciously_successive_range(
510
+ unicode_range_a: str | None, unicode_range_b: str | None
511
+ ) -> bool:
512
+ """
513
+ Determine if two Unicode ranges seen next to each other can be considered suspicious.
514
+ """
515
+ if unicode_range_a is None or unicode_range_b is None:
516
+ return True
517
+
518
+ if unicode_range_a == unicode_range_b:
519
+ return False
520
+
521
+ if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
522
+ return False
523
+
524
+ if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
525
+ return False
526
+
527
+ # Latin characters can be accompanied with a combining diacritical mark
528
+ # eg. Vietnamese.
529
+ if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
530
+ "Combining" in unicode_range_a or "Combining" in unicode_range_b
531
+ ):
532
+ return False
533
+
534
+ keywords_range_a, keywords_range_b = (
535
+ unicode_range_a.split(" "),
536
+ unicode_range_b.split(" "),
537
+ )
538
+
539
+ for el in keywords_range_a:
540
+ if el in UNICODE_SECONDARY_RANGE_KEYWORD:
541
+ continue
542
+ if el in keywords_range_b:
543
+ return False
544
+
545
+ # Japanese Exception
546
+ range_a_jp_chars, range_b_jp_chars = (
547
+ unicode_range_a
548
+ in (
549
+ "Hiragana",
550
+ "Katakana",
551
+ ),
552
+ unicode_range_b in ("Hiragana", "Katakana"),
553
+ )
554
+ if (range_a_jp_chars or range_b_jp_chars) and (
555
+ "CJK" in unicode_range_a or "CJK" in unicode_range_b
556
+ ):
557
+ return False
558
+ if range_a_jp_chars and range_b_jp_chars:
559
+ return False
560
+
561
+ if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
562
+ if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
563
+ return False
564
+ if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
565
+ return False
566
+
567
+ # Chinese/Japanese use dedicated range for punctuation and/or separators.
568
+ if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
569
+ unicode_range_a in ["Katakana", "Hiragana"]
570
+ and unicode_range_b in ["Katakana", "Hiragana"]
571
+ ):
572
+ if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
573
+ return False
574
+ if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
575
+ return False
576
+ if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
577
+ return False
578
+
579
+ return True
580
+
581
+
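
A quick sanity check of the range heuristic above; both results follow directly from the rules in the function (the second hits the Japanese exception):

from charset_normalizer.md import is_suspiciously_successive_range

print(is_suspiciously_successive_range("Basic Latin", "Cyrillic"))  # True
print(is_suspiciously_successive_range("Hiragana", "Katakana"))     # False
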
582
+ @lru_cache(maxsize=2048)
583
+ def mess_ratio(
584
+ decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
585
+ ) -> float:
586
+ """
587
+ Compute a mess ratio given a decoded bytes sequence. Reaching the maximum threshold stops the computation early.
588
+ """
589
+
590
+ detectors: list[MessDetectorPlugin] = [
591
+ md_class() for md_class in MessDetectorPlugin.__subclasses__()
592
+ ]
593
+
594
+ length: int = len(decoded_sequence) + 1
595
+
596
+ mean_mess_ratio: float = 0.0
597
+
598
+ if length < 512:
599
+ intermediary_mean_mess_ratio_calc: int = 32
600
+ elif length <= 1024:
601
+ intermediary_mean_mess_ratio_calc = 64
602
+ else:
603
+ intermediary_mean_mess_ratio_calc = 128
604
+
605
+ for character, index in zip(decoded_sequence + "\n", range(length)):
606
+ for detector in detectors:
607
+ if detector.eligible(character):
608
+ detector.feed(character)
609
+
610
+ if (
611
+ index > 0 and index % intermediary_mean_mess_ratio_calc == 0
612
+ ) or index == length - 1:
613
+ mean_mess_ratio = sum(dt.ratio for dt in detectors)
614
+
615
+ if mean_mess_ratio >= maximum_threshold:
616
+ break
617
+
618
+ if debug:
619
+ logger = getLogger("charset_normalizer")
620
+
621
+ logger.log(
622
+ TRACE,
623
+ "Mess-detector extended-analysis start. "
624
+ f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
625
+ f"maximum_threshold={maximum_threshold}",
626
+ )
627
+
628
+ if len(decoded_sequence) > 16:
629
+ logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
630
+ logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
631
+
632
+ for dt in detectors:
633
+ logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
634
+
635
+ return round(mean_mess_ratio, 3)
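
An illustration of mess_ratio() on readable versus mojibake input; the sample strings are assumptions and the exact scores depend on the plugin set:

from charset_normalizer.md import mess_ratio

print(mess_ratio("A perfectly readable English sentence."))  # close to 0.0
print(mess_ratio("L'Ã©tÃ© oÃ¹ je suis nÃ©"))                 # noticeably higher
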
phivenv/Lib/site-packages/charset_normalizer/models.py ADDED
@@ -0,0 +1,360 @@
1
+ from __future__ import annotations
2
+
3
+ from encodings.aliases import aliases
4
+ from hashlib import sha256
5
+ from json import dumps
6
+ from re import sub
7
+ from typing import Any, Iterator, List, Tuple
8
+
9
+ from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
10
+ from .utils import iana_name, is_multi_byte_encoding, unicode_range
11
+
12
+
13
+ class CharsetMatch:
14
+ def __init__(
15
+ self,
16
+ payload: bytes,
17
+ guessed_encoding: str,
18
+ mean_mess_ratio: float,
19
+ has_sig_or_bom: bool,
20
+ languages: CoherenceMatches,
21
+ decoded_payload: str | None = None,
22
+ preemptive_declaration: str | None = None,
23
+ ):
24
+ self._payload: bytes = payload
25
+
26
+ self._encoding: str = guessed_encoding
27
+ self._mean_mess_ratio: float = mean_mess_ratio
28
+ self._languages: CoherenceMatches = languages
29
+ self._has_sig_or_bom: bool = has_sig_or_bom
30
+ self._unicode_ranges: list[str] | None = None
31
+
32
+ self._leaves: list[CharsetMatch] = []
33
+ self._mean_coherence_ratio: float = 0.0
34
+
35
+ self._output_payload: bytes | None = None
36
+ self._output_encoding: str | None = None
37
+
38
+ self._string: str | None = decoded_payload
39
+
40
+ self._preemptive_declaration: str | None = preemptive_declaration
41
+
42
+ def __eq__(self, other: object) -> bool:
43
+ if not isinstance(other, CharsetMatch):
44
+ if isinstance(other, str):
45
+ return iana_name(other) == self.encoding
46
+ return False
47
+ return self.encoding == other.encoding and self.fingerprint == other.fingerprint
48
+
49
+ def __lt__(self, other: object) -> bool:
50
+ """
51
+ Implemented to make sorted() available upon CharsetMatch items.
52
+ """
53
+ if not isinstance(other, CharsetMatch):
54
+ raise ValueError
55
+
56
+ chaos_difference: float = abs(self.chaos - other.chaos)
57
+ coherence_difference: float = abs(self.coherence - other.coherence)
58
+
59
+ # Below 1% difference --> Use Coherence
60
+ if chaos_difference < 0.01 and coherence_difference > 0.02:
61
+ return self.coherence > other.coherence
62
+ elif chaos_difference < 0.01 and coherence_difference <= 0.02:
63
+ # When having a difficult decision, use the result that decoded as many multi-byte characters as possible.
64
+ # preserve RAM usage!
65
+ if len(self._payload) >= TOO_BIG_SEQUENCE:
66
+ return self.chaos < other.chaos
67
+ return self.multi_byte_usage > other.multi_byte_usage
68
+
69
+ return self.chaos < other.chaos
70
+
71
+ @property
72
+ def multi_byte_usage(self) -> float:
73
+ return 1.0 - (len(str(self)) / len(self.raw))
74
+
75
+ def __str__(self) -> str:
76
+ # Lazy Str Loading
77
+ if self._string is None:
78
+ self._string = str(self._payload, self._encoding, "strict")
79
+ return self._string
80
+
81
+ def __repr__(self) -> str:
82
+ return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
83
+
84
+ def add_submatch(self, other: CharsetMatch) -> None:
85
+ if not isinstance(other, CharsetMatch) or other == self:
86
+ raise ValueError(
87
+ "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
88
+ other.__class__
89
+ )
90
+ )
91
+
92
+ other._string = None # Unload RAM usage; dirty trick.
93
+ self._leaves.append(other)
94
+
95
+ @property
96
+ def encoding(self) -> str:
97
+ return self._encoding
98
+
99
+ @property
100
+ def encoding_aliases(self) -> list[str]:
101
+ """
102
+ An encoding is known by many names; this could help, e.g., when searching for IBM855 while it's listed as CP855.
103
+ """
104
+ also_known_as: list[str] = []
105
+ for u, p in aliases.items():
106
+ if self.encoding == u:
107
+ also_known_as.append(p)
108
+ elif self.encoding == p:
109
+ also_known_as.append(u)
110
+ return also_known_as
111
+
112
+ @property
113
+ def bom(self) -> bool:
114
+ return self._has_sig_or_bom
115
+
116
+ @property
117
+ def byte_order_mark(self) -> bool:
118
+ return self._has_sig_or_bom
119
+
120
+ @property
121
+ def languages(self) -> list[str]:
122
+ """
123
+ Return the complete list of possible languages found in the decoded sequence.
124
+ Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
125
+ """
126
+ return [e[0] for e in self._languages]
127
+
128
+ @property
129
+ def language(self) -> str:
130
+ """
131
+ Most probable language found in decoded sequence. If none were detected or inferred, the property will return
132
+ "Unknown".
133
+ """
134
+ if not self._languages:
135
+ # Trying to infer the language based on the given encoding
136
+ # It's either English or we should not commit ourselves in certain cases.
137
+ if "ascii" in self.could_be_from_charset:
138
+ return "English"
139
+
140
+ # doing it there to avoid circular import
141
+ from charset_normalizer.cd import encoding_languages, mb_encoding_languages
142
+
143
+ languages = (
144
+ mb_encoding_languages(self.encoding)
145
+ if is_multi_byte_encoding(self.encoding)
146
+ else encoding_languages(self.encoding)
147
+ )
148
+
149
+ if len(languages) == 0 or "Latin Based" in languages:
150
+ return "Unknown"
151
+
152
+ return languages[0]
153
+
154
+ return self._languages[0][0]
155
+
156
+ @property
157
+ def chaos(self) -> float:
158
+ return self._mean_mess_ratio
159
+
160
+ @property
161
+ def coherence(self) -> float:
162
+ if not self._languages:
163
+ return 0.0
164
+ return self._languages[0][1]
165
+
166
+ @property
167
+ def percent_chaos(self) -> float:
168
+ return round(self.chaos * 100, ndigits=3)
169
+
170
+ @property
171
+ def percent_coherence(self) -> float:
172
+ return round(self.coherence * 100, ndigits=3)
173
+
174
+ @property
175
+ def raw(self) -> bytes:
176
+ """
177
+ Original untouched bytes.
178
+ """
179
+ return self._payload
180
+
181
+ @property
182
+ def submatch(self) -> list[CharsetMatch]:
183
+ return self._leaves
184
+
185
+ @property
186
+ def has_submatch(self) -> bool:
187
+ return len(self._leaves) > 0
188
+
189
+ @property
190
+ def alphabets(self) -> list[str]:
191
+ if self._unicode_ranges is not None:
192
+ return self._unicode_ranges
193
+ # list detected ranges
194
+ detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
195
+ # filter and sort
196
+ self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
197
+ return self._unicode_ranges
198
+
199
+ @property
200
+ def could_be_from_charset(self) -> list[str]:
201
+ """
202
+ The complete list of encodings that output the exact SAME str result and therefore could be the originating
203
+ encoding.
204
+ This list does include the encoding available in the property 'encoding'.
205
+ """
206
+ return [self._encoding] + [m.encoding for m in self._leaves]
207
+
208
+ def output(self, encoding: str = "utf_8") -> bytes:
209
+ """
210
+ Method to get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
211
+ Characters the target encoding cannot represent are replaced by the encoder, NOT silently dropped.
212
+ """
213
+ if self._output_encoding is None or self._output_encoding != encoding:
214
+ self._output_encoding = encoding
215
+ decoded_string = str(self)
216
+ if (
217
+ self._preemptive_declaration is not None
218
+ and self._preemptive_declaration.lower()
219
+ not in ["utf-8", "utf8", "utf_8"]
220
+ ):
221
+ patched_header = sub(
222
+ RE_POSSIBLE_ENCODING_INDICATION,
223
+ lambda m: m.string[m.span()[0] : m.span()[1]].replace(
224
+ m.groups()[0],
225
+ iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
226
+ ),
227
+ decoded_string[:8192],
228
+ count=1,
229
+ )
230
+
231
+ decoded_string = patched_header + decoded_string[8192:]
232
+
233
+ self._output_payload = decoded_string.encode(encoding, "replace")
234
+
235
+ return self._output_payload # type: ignore
236
+
237
+ @property
238
+ def fingerprint(self) -> str:
239
+ """
240
+ Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
241
+ """
242
+ return sha256(self.output()).hexdigest()
243
+
244
+
245
+ class CharsetMatches:
246
+ """
247
+ Container with every CharsetMatch item, ordered by default from the most probable to the least.
248
+ Acts like a list (iterable) but does not implement all related methods.
249
+ """
250
+
251
+ def __init__(self, results: list[CharsetMatch] | None = None):
252
+ self._results: list[CharsetMatch] = sorted(results) if results else []
253
+
254
+ def __iter__(self) -> Iterator[CharsetMatch]:
255
+ yield from self._results
256
+
257
+ def __getitem__(self, item: int | str) -> CharsetMatch:
258
+ """
259
+ Retrieve a single item either by its position or encoding name (alias may be used here).
260
+ Raise KeyError upon invalid index or encoding not present in results.
261
+ """
262
+ if isinstance(item, int):
263
+ return self._results[item]
264
+ if isinstance(item, str):
265
+ item = iana_name(item, False)
266
+ for result in self._results:
267
+ if item in result.could_be_from_charset:
268
+ return result
269
+ raise KeyError
270
+
271
+ def __len__(self) -> int:
272
+ return len(self._results)
273
+
274
+ def __bool__(self) -> bool:
275
+ return len(self._results) > 0
276
+
277
+ def append(self, item: CharsetMatch) -> None:
278
+ """
279
+ Insert a single match. It will be inserted so as to preserve the sort order.
280
+ Can be inserted as a submatch.
281
+ """
282
+ if not isinstance(item, CharsetMatch):
283
+ raise ValueError(
284
+ "Cannot append instance '{}' to CharsetMatches".format(
285
+ str(item.__class__)
286
+ )
287
+ )
288
+ # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
289
+ if len(item.raw) < TOO_BIG_SEQUENCE:
290
+ for match in self._results:
291
+ if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
292
+ match.add_submatch(item)
293
+ return
294
+ self._results.append(item)
295
+ self._results = sorted(self._results)
296
+
297
+ def best(self) -> CharsetMatch | None:
298
+ """
299
+ Simply return the first match. Strict equivalent to matches[0].
300
+ """
301
+ if not self._results:
302
+ return None
303
+ return self._results[0]
304
+
305
+ def first(self) -> CharsetMatch | None:
306
+ """
307
+ Redundant method; use best() instead. Kept for backward-compatibility reasons.
308
+ """
309
+ return self.best()
310
+
311
+
312
+ CoherenceMatch = Tuple[str, float]
313
+ CoherenceMatches = List[CoherenceMatch]
314
+
315
+
316
+ class CliDetectionResult:
317
+ def __init__(
318
+ self,
319
+ path: str,
320
+ encoding: str | None,
321
+ encoding_aliases: list[str],
322
+ alternative_encodings: list[str],
323
+ language: str,
324
+ alphabets: list[str],
325
+ has_sig_or_bom: bool,
326
+ chaos: float,
327
+ coherence: float,
328
+ unicode_path: str | None,
329
+ is_preferred: bool,
330
+ ):
331
+ self.path: str = path
332
+ self.unicode_path: str | None = unicode_path
333
+ self.encoding: str | None = encoding
334
+ self.encoding_aliases: list[str] = encoding_aliases
335
+ self.alternative_encodings: list[str] = alternative_encodings
336
+ self.language: str = language
337
+ self.alphabets: list[str] = alphabets
338
+ self.has_sig_or_bom: bool = has_sig_or_bom
339
+ self.chaos: float = chaos
340
+ self.coherence: float = coherence
341
+ self.is_preferred: bool = is_preferred
342
+
343
+ @property
344
+ def __dict__(self) -> dict[str, Any]: # type: ignore
345
+ return {
346
+ "path": self.path,
347
+ "encoding": self.encoding,
348
+ "encoding_aliases": self.encoding_aliases,
349
+ "alternative_encodings": self.alternative_encodings,
350
+ "language": self.language,
351
+ "alphabets": self.alphabets,
352
+ "has_sig_or_bom": self.has_sig_or_bom,
353
+ "chaos": self.chaos,
354
+ "coherence": self.coherence,
355
+ "unicode_path": self.unicode_path,
356
+ "is_preferred": self.is_preferred,
357
+ }
358
+
359
+ def to_json(self) -> str:
360
+ return dumps(self.__dict__, ensure_ascii=True, indent=4)
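
The container classes above are what from_bytes() returns; a short usage sketch (the sample bytes are an assumption):

from charset_normalizer import from_bytes

matches = from_bytes("Comment ça va ?".encode("cp1252"))  # CharsetMatches
best = matches.best()                                     # CharsetMatch | None
if best is not None:
    print(best.encoding, best.language, best.percent_chaos)
    print(str(best))                   # payload decoded lazily on first use
    print(best.could_be_from_charset)  # every encoding yielding the same text
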
phivenv/Lib/site-packages/charset_normalizer/py.typed ADDED
File without changes
phivenv/Lib/site-packages/charset_normalizer/utils.py ADDED
@@ -0,0 +1,414 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import unicodedata
6
+ from codecs import IncrementalDecoder
7
+ from encodings.aliases import aliases
8
+ from functools import lru_cache
9
+ from re import findall
10
+ from typing import Generator
11
+
12
+ from _multibytecodec import ( # type: ignore[import-not-found,import]
13
+ MultibyteIncrementalDecoder,
14
+ )
15
+
16
+ from .constant import (
17
+ ENCODING_MARKS,
18
+ IANA_SUPPORTED_SIMILAR,
19
+ RE_POSSIBLE_ENCODING_INDICATION,
20
+ UNICODE_RANGES_COMBINED,
21
+ UNICODE_SECONDARY_RANGE_KEYWORD,
22
+ UTF8_MAXIMAL_ALLOCATION,
23
+ COMMON_CJK_CHARACTERS,
24
+ )
25
+
26
+
27
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
28
+ def is_accentuated(character: str) -> bool:
29
+ try:
30
+ description: str = unicodedata.name(character)
31
+ except ValueError: # Defensive: unicode database outdated?
32
+ return False
33
+ return (
34
+ "WITH GRAVE" in description
35
+ or "WITH ACUTE" in description
36
+ or "WITH CEDILLA" in description
37
+ or "WITH DIAERESIS" in description
38
+ or "WITH CIRCUMFLEX" in description
39
+ or "WITH TILDE" in description
40
+ or "WITH MACRON" in description
41
+ or "WITH RING ABOVE" in description
42
+ )
43
+
44
+
45
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
46
+ def remove_accent(character: str) -> str:
47
+ decomposed: str = unicodedata.decomposition(character)
48
+ if not decomposed:
49
+ return character
50
+
51
+ codes: list[str] = decomposed.split(" ")
52
+
53
+ return chr(int(codes[0], 16))
54
+
55
+
56
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
57
+ def unicode_range(character: str) -> str | None:
58
+ """
59
+ Retrieve the Unicode range official name from a single character.
60
+ """
61
+ character_ord: int = ord(character)
62
+
63
+ for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
64
+ if character_ord in ord_range:
65
+ return range_name
66
+
67
+ return None
68
+
69
+
70
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
71
+ def is_latin(character: str) -> bool:
72
+ try:
73
+ description: str = unicodedata.name(character)
74
+ except ValueError: # Defensive: unicode database outdated?
75
+ return False
76
+ return "LATIN" in description
77
+
78
+
79
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
80
+ def is_punctuation(character: str) -> bool:
81
+ character_category: str = unicodedata.category(character)
82
+
83
+ if "P" in character_category:
84
+ return True
85
+
86
+ character_range: str | None = unicode_range(character)
87
+
88
+ if character_range is None:
89
+ return False
90
+
91
+ return "Punctuation" in character_range
92
+
93
+
94
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
95
+ def is_symbol(character: str) -> bool:
96
+ character_category: str = unicodedata.category(character)
97
+
98
+ if "S" in character_category or "N" in character_category:
99
+ return True
100
+
101
+ character_range: str | None = unicode_range(character)
102
+
103
+ if character_range is None:
104
+ return False
105
+
106
+ return "Forms" in character_range and character_category != "Lo"
107
+
108
+
109
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
110
+ def is_emoticon(character: str) -> bool:
111
+ character_range: str | None = unicode_range(character)
112
+
113
+ if character_range is None:
114
+ return False
115
+
116
+ return "Emoticons" in character_range or "Pictographs" in character_range
117
+
118
+
119
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
120
+ def is_separator(character: str) -> bool:
121
+ if character.isspace() or character in {"|", "+", "<", ">"}:
122
+ return True
123
+
124
+ character_category: str = unicodedata.category(character)
125
+
126
+ return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
127
+
128
+
129
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
130
+ def is_case_variable(character: str) -> bool:
131
+ return character.islower() != character.isupper()
132
+
133
+
134
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
135
+ def is_cjk(character: str) -> bool:
136
+ try:
137
+ character_name = unicodedata.name(character)
138
+ except ValueError: # Defensive: unicode database outdated?
139
+ return False
140
+
141
+ return "CJK" in character_name
142
+
143
+
144
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
145
+ def is_hiragana(character: str) -> bool:
146
+ try:
147
+ character_name = unicodedata.name(character)
148
+ except ValueError: # Defensive: unicode database outdated?
149
+ return False
150
+
151
+ return "HIRAGANA" in character_name
152
+
153
+
154
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
155
+ def is_katakana(character: str) -> bool:
156
+ try:
157
+ character_name = unicodedata.name(character)
158
+ except ValueError: # Defensive: unicode database outdated?
159
+ return False
160
+
161
+ return "KATAKANA" in character_name
162
+
163
+
164
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
165
+ def is_hangul(character: str) -> bool:
166
+ try:
167
+ character_name = unicodedata.name(character)
168
+ except ValueError: # Defensive: unicode database outdated?
169
+ return False
170
+
171
+ return "HANGUL" in character_name
172
+
173
+
174
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
175
+ def is_thai(character: str) -> bool:
176
+ try:
177
+ character_name = unicodedata.name(character)
178
+ except ValueError: # Defensive: unicode database outdated?
179
+ return False
180
+
181
+ return "THAI" in character_name
182
+
183
+
184
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
185
+ def is_arabic(character: str) -> bool:
186
+ try:
187
+ character_name = unicodedata.name(character)
188
+ except ValueError: # Defensive: unicode database outdated?
189
+ return False
190
+
191
+ return "ARABIC" in character_name
192
+
193
+
194
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
195
+ def is_arabic_isolated_form(character: str) -> bool:
196
+ try:
197
+ character_name = unicodedata.name(character)
198
+ except ValueError: # Defensive: unicode database outdated?
199
+ return False
200
+
201
+ return "ARABIC" in character_name and "ISOLATED FORM" in character_name
202
+
203
+
204
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
205
+ def is_cjk_uncommon(character: str) -> bool:
206
+ return character not in COMMON_CJK_CHARACTERS
207
+
208
+
209
+ @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
210
+ def is_unicode_range_secondary(range_name: str) -> bool:
211
+ return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
212
+
213
+
214
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
215
+ def is_unprintable(character: str) -> bool:
216
+ return (
217
+ character.isspace() is False # includes \n \t \r \v
218
+ and character.isprintable() is False
219
+ and character != "\x1a" # Why? Its the ASCII substitute character.
220
+ and character != "\ufeff" # bug discovered in Python,
221
+ # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
222
+ )
223
+
224
+
225
+ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
226
+ """
227
+ Extract, using an ASCII-only decoder, any specified encoding in the first n bytes.
228
+ """
229
+ if not isinstance(sequence, bytes):
230
+ raise TypeError
231
+
232
+ seq_len: int = len(sequence)
233
+
234
+ results: list[str] = findall(
235
+ RE_POSSIBLE_ENCODING_INDICATION,
236
+ sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
237
+ )
238
+
239
+ if len(results) == 0:
240
+ return None
241
+
242
+ for specified_encoding in results:
243
+ specified_encoding = specified_encoding.lower().replace("-", "_")
244
+
245
+ encoding_alias: str
246
+ encoding_iana: str
247
+
248
+ for encoding_alias, encoding_iana in aliases.items():
249
+ if encoding_alias == specified_encoding:
250
+ return encoding_iana
251
+ if encoding_iana == specified_encoding:
252
+ return encoding_iana
253
+
254
+ return None
255
+
256
+
257
+ @lru_cache(maxsize=128)
258
+ def is_multi_byte_encoding(name: str) -> bool:
259
+ """
260
+ Verify if a specific encoding is a multi-byte one based on its IANA name.
261
+ """
262
+ return name in {
263
+ "utf_8",
264
+ "utf_8_sig",
265
+ "utf_16",
266
+ "utf_16_be",
267
+ "utf_16_le",
268
+ "utf_32",
269
+ "utf_32_le",
270
+ "utf_32_be",
271
+ "utf_7",
272
+ } or issubclass(
273
+ importlib.import_module(f"encodings.{name}").IncrementalDecoder,
274
+ MultibyteIncrementalDecoder,
275
+ )
276
+
277
+
278
+ def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
279
+ """
280
+ Identify and extract SIG/BOM in given sequence.
281
+ """
282
+
283
+ for iana_encoding in ENCODING_MARKS:
284
+ marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
285
+
286
+ if isinstance(marks, bytes):
287
+ marks = [marks]
288
+
289
+ for mark in marks:
290
+ if sequence.startswith(mark):
291
+ return iana_encoding, mark
292
+
293
+ return None, b""
294
+
295
+
296
+ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
297
+ return iana_encoding not in {"utf_16", "utf_32"}
298
+
299
+
300
+ def iana_name(cp_name: str, strict: bool = True) -> str:
301
+ """Returns the Python normalized encoding name (Not the IANA official name)."""
302
+ cp_name = cp_name.lower().replace("-", "_")
303
+
304
+ encoding_alias: str
305
+ encoding_iana: str
306
+
307
+ for encoding_alias, encoding_iana in aliases.items():
308
+ if cp_name in [encoding_alias, encoding_iana]:
309
+ return encoding_iana
310
+
311
+ if strict:
312
+ raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
313
+
314
+ return cp_name
315
+
316
+
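
A quick check of iana_name()'s normalization, using standard Python codec aliases:

from charset_normalizer.utils import iana_name

print(iana_name("UTF-8"))    # 'utf_8'
print(iana_name("IBM855"))   # 'cp855'
print(iana_name("latin-1"))  # 'latin_1'
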
317
+ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
318
+ if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
319
+ return 0.0
320
+
321
+ decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
322
+ decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
323
+
324
+ id_a: IncrementalDecoder = decoder_a(errors="ignore")
325
+ id_b: IncrementalDecoder = decoder_b(errors="ignore")
326
+
327
+ character_match_count: int = 0
328
+
329
+ for i in range(255):
330
+ to_be_decoded: bytes = bytes([i])
331
+ if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
332
+ character_match_count += 1
333
+
334
+ return character_match_count / 254
335
+
336
+
337
+ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
338
+ """
339
+ Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
340
+ the function cp_similarity.
341
+ """
342
+ return (
343
+ iana_name_a in IANA_SUPPORTED_SIMILAR
344
+ and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
345
+ )
346
+
347
+
348
+ def set_logging_handler(
349
+ name: str = "charset_normalizer",
350
+ level: int = logging.INFO,
351
+ format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
352
+ ) -> None:
353
+ logger = logging.getLogger(name)
354
+ logger.setLevel(level)
355
+
356
+ handler = logging.StreamHandler()
357
+ handler.setFormatter(logging.Formatter(format_string))
358
+ logger.addHandler(handler)
359
+
360
+
361
+ def cut_sequence_chunks(
362
+ sequences: bytes,
363
+ encoding_iana: str,
364
+ offsets: range,
365
+ chunk_size: int,
366
+ bom_or_sig_available: bool,
367
+ strip_sig_or_bom: bool,
368
+ sig_payload: bytes,
369
+ is_multi_byte_decoder: bool,
370
+ decoded_payload: str | None = None,
371
+ ) -> Generator[str, None, None]:
372
+ if decoded_payload and is_multi_byte_decoder is False:
373
+ for i in offsets:
374
+ chunk = decoded_payload[i : i + chunk_size]
375
+ if not chunk:
376
+ break
377
+ yield chunk
378
+ else:
379
+ for i in offsets:
380
+ chunk_end = i + chunk_size
381
+ if chunk_end > len(sequences) + 8:
382
+ continue
383
+
384
+ cut_sequence = sequences[i : i + chunk_size]
385
+
386
+ if bom_or_sig_available and strip_sig_or_bom is False:
387
+ cut_sequence = sig_payload + cut_sequence
388
+
389
+ chunk = cut_sequence.decode(
390
+ encoding_iana,
391
+ errors="ignore" if is_multi_byte_decoder else "strict",
392
+ )
393
+
394
+ # multi-byte bad cutting detector and adjustment
395
+ # not the cleanest way to perform that fix but clever enough for now.
396
+ if is_multi_byte_decoder and i > 0:
397
+ chunk_partial_size_chk: int = min(chunk_size, 16)
398
+
399
+ if (
400
+ decoded_payload
401
+ and chunk[:chunk_partial_size_chk] not in decoded_payload
402
+ ):
403
+ for j in range(i, i - 4, -1):
404
+ cut_sequence = sequences[j:chunk_end]
405
+
406
+ if bom_or_sig_available and strip_sig_or_bom is False:
407
+ cut_sequence = sig_payload + cut_sequence
408
+
409
+ chunk = cut_sequence.decode(encoding_iana, errors="ignore")
410
+
411
+ if chunk[:chunk_partial_size_chk] in decoded_payload:
412
+ break
413
+
414
+ yield chunk
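A minimal usage sketch for the helpers above (``identify_sig_or_bom`` and
``iana_name`` are both defined in this ``utils.py``):

.. code-block:: python

    from charset_normalizer.utils import identify_sig_or_bom, iana_name

    payload = "\ufeffhello".encode("utf_8")   # UTF-8 BOM followed by ASCII
    print(identify_sig_or_bom(payload))       # ('utf_8', b'\xef\xbb\xbf')
    print(identify_sig_or_bom(b"hello"))      # (None, b'')
    print(iana_name("UTF-8"))                 # 'utf_8' (Python-normalized name)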
phivenv/Lib/site-packages/charset_normalizer/version.py ADDED
@@ -0,0 +1,8 @@
+ """
+ Expose version
+ """
+ 
+ from __future__ import annotations
+ 
+ __version__ = "3.4.3"
+ VERSION = __version__.split(".")
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/METADATA ADDED
@@ -0,0 +1,441 @@
+ Metadata-Version: 2.1
+ Name: colorama
+ Version: 0.4.6
+ Summary: Cross-platform colored terminal text.
+ Project-URL: Homepage, https://github.com/tartley/colorama
+ Author-email: Jonathan Hartley <tartley@tartley.com>
+ License-File: LICENSE.txt
+ Keywords: ansi,color,colour,crossplatform,terminal,text,windows,xplatform
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Environment :: Console
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: BSD License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 2
+ Classifier: Programming Language :: Python :: 2.7
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
+ Classifier: Topic :: Terminals
+ Requires-Python: !=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7
+ Description-Content-Type: text/x-rst
+ 
+ .. image:: https://img.shields.io/pypi/v/colorama.svg
+    :target: https://pypi.org/project/colorama/
+    :alt: Latest Version
+ 
+ .. image:: https://img.shields.io/pypi/pyversions/colorama.svg
+    :target: https://pypi.org/project/colorama/
+    :alt: Supported Python versions
+ 
+ .. image:: https://github.com/tartley/colorama/actions/workflows/test.yml/badge.svg
+    :target: https://github.com/tartley/colorama/actions/workflows/test.yml
+    :alt: Build Status
+ 
+ Colorama
+ ========
+ 
+ Makes ANSI escape character sequences (for producing colored terminal text and
+ cursor positioning) work under MS Windows.
+ 
+ .. |donate| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_SM.gif
+    :target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=2MZ9D2GMLYCUJ&item_name=Colorama&currency_code=USD
+    :alt: Donate with Paypal
+ 
+ `PyPI for releases <https://pypi.org/project/colorama/>`_ |
+ `Github for source <https://github.com/tartley/colorama>`_ |
+ `Colorama for enterprise on Tidelift <https://github.com/tartley/colorama/blob/master/ENTERPRISE.md>`_
+ 
+ If you find Colorama useful, please |donate| to the authors. Thank you!
+ 
+ Installation
+ ------------
+ 
+ Tested on CPython 2.7, 3.7, 3.8, 3.9 and 3.10 and Pypy 2.7 and 3.8.
+ 
+ No requirements other than the standard library.
+ 
+ .. code-block:: bash
+ 
+     pip install colorama
+     # or
+     conda install -c anaconda colorama
+ 
+ Description
+ -----------
+ 
+ ANSI escape character sequences have long been used to produce colored terminal
+ text and cursor positioning on Unix and Macs. Colorama makes this work on
+ Windows, too, by wrapping ``stdout``, stripping ANSI sequences it finds (which
+ would appear as gobbledygook in the output), and converting them into the
+ appropriate win32 calls to modify the state of the terminal. On other platforms,
+ Colorama does nothing.
+ 
+ This has the upshot of providing a simple cross-platform API for printing
+ colored terminal text from Python, and has the happy side-effect that existing
+ applications or libraries which use ANSI sequences to produce colored output on
+ Linux or Macs can now also work on Windows, simply by calling
+ ``colorama.just_fix_windows_console()`` (since v0.4.6) or ``colorama.init()``
+ (all versions, but may have other side-effects – see below).
+ 
+ An alternative approach is to install ``ansi.sys`` on Windows machines, which
+ provides the same behaviour for all applications running in terminals. Colorama
+ is intended for situations where that isn't easy (e.g., maybe your app doesn't
+ have an installer.)
+ 
+ Demo scripts in the source code repository print some colored text using
+ ANSI sequences. Compare their output under Gnome-terminal's built in ANSI
+ handling, versus on Windows Command-Prompt using Colorama:
+ 
+ .. image:: https://github.com/tartley/colorama/raw/master/screenshots/ubuntu-demo.png
+    :width: 661
+    :height: 357
+    :alt: ANSI sequences on Ubuntu under gnome-terminal.
+ 
+ .. image:: https://github.com/tartley/colorama/raw/master/screenshots/windows-demo.png
+    :width: 668
+    :height: 325
+    :alt: Same ANSI sequences on Windows, using Colorama.
+ 
+ These screenshots show that, on Windows, Colorama does not support ANSI 'dim
+ text'; it looks the same as 'normal text'.
+ 
+ Usage
+ -----
+ 
+ Initialisation
+ ..............
+ 
+ If the only thing you want from Colorama is to get ANSI escapes to work on
+ Windows, then run:
+ 
+ .. code-block:: python
+ 
+     from colorama import just_fix_windows_console
+     just_fix_windows_console()
+ 
+ If you're on a recent version of Windows 10 or better, and your stdout/stderr
+ are pointing to a Windows console, then this will flip the magic configuration
+ switch to enable Windows' built-in ANSI support.
+ 
+ If you're on an older version of Windows, and your stdout/stderr are pointing to
+ a Windows console, then this will wrap ``sys.stdout`` and/or ``sys.stderr`` in a
+ magic file object that intercepts ANSI escape sequences and issues the
+ appropriate Win32 calls to emulate them.
+ 
+ In all other circumstances, it does nothing whatsoever. Basically the idea is
+ that this makes Windows act like Unix with respect to ANSI escape handling.
+ 
+ It's safe to call this function multiple times. It's safe to call this function
+ on non-Windows platforms, but it won't do anything. It's safe to call this
+ function when one or both of your stdout/stderr are redirected to a file – it
+ won't do anything to those streams.
+ 
+ Alternatively, you can use the older interface with more features (but also more
+ potential footguns):
+ 
+ .. code-block:: python
+ 
+     from colorama import init
+     init()
+ 
+ This does the same thing as ``just_fix_windows_console``, except for the
+ following differences:
+ 
+ - It's not safe to call ``init`` multiple times; you can end up with multiple
+   layers of wrapping and broken ANSI support.
+ 
+ - Colorama will apply a heuristic to guess whether stdout/stderr support ANSI,
+   and if it thinks they don't, then it will wrap ``sys.stdout`` and
+   ``sys.stderr`` in a magic file object that strips out ANSI escape sequences
+   before printing them. This happens on all platforms, and can be convenient if
+   you want to write your code to emit ANSI escape sequences unconditionally, and
+   let Colorama decide whether they should actually be output. But note that
+   Colorama's heuristic is not particularly clever.
+ 
+ - ``init`` also accepts explicit keyword args to enable/disable various
+   functionality – see below.
+ 
+ To stop using Colorama before your program exits, simply call ``deinit()``.
+ This will restore ``stdout`` and ``stderr`` to their original values, so that
+ Colorama is disabled. To resume using Colorama again, call ``reinit()``; it is
+ cheaper than calling ``init()`` again (but does the same thing).
+ 
+ Most users should depend on ``colorama >= 0.4.6``, and use
+ ``just_fix_windows_console``. The old ``init`` interface will be supported
+ indefinitely for backwards compatibility, but we don't plan to fix any issues
+ with it, also for backwards compatibility.
+ 
+ Colored Output
+ ..............
+ 
+ Cross-platform printing of colored text can then be done using Colorama's
+ constant shorthand for ANSI escape sequences. These are deliberately
+ rudimentary, see below.
+ 
+ .. code-block:: python
+ 
+     from colorama import Fore, Back, Style
+     print(Fore.RED + 'some red text')
+     print(Back.GREEN + 'and with a green background')
+     print(Style.DIM + 'and in dim text')
+     print(Style.RESET_ALL)
+     print('back to normal now')
+ 
+ ...or simply by manually printing ANSI sequences from your own code:
+ 
+ .. code-block:: python
+ 
+     print('\033[31m' + 'some red text')
+     print('\033[39m') # and reset to default color
+ 
+ ...or, Colorama can be used in conjunction with existing ANSI libraries
+ such as the venerable `Termcolor <https://pypi.org/project/termcolor/>`_,
+ the fabulous `Blessings <https://pypi.org/project/blessings/>`_,
+ or the incredible `Rich <https://pypi.org/project/rich/>`_.
+ 
+ If you wish Colorama's Fore, Back and Style constants were more capable,
+ then consider using one of the above highly capable libraries to generate
+ colors, etc, and use Colorama just for its primary purpose: to convert
+ those ANSI sequences to also work on Windows:
+ 
+ SIMILARLY, do not send PRs adding the generation of new ANSI types to Colorama.
+ We are only interested in converting ANSI codes to win32 API calls, not
+ shortcuts like the above to generate ANSI characters.
+ 
+ .. code-block:: python
+ 
+     from colorama import just_fix_windows_console
+     from termcolor import colored
+ 
+     # use Colorama to make Termcolor work on Windows too
+     just_fix_windows_console()
+ 
+     # then use Termcolor for all colored text output
+     print(colored('Hello, World!', 'green', 'on_red'))
+ 
+ Available formatting constants are::
+ 
+     Fore: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
+     Back: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
+     Style: DIM, NORMAL, BRIGHT, RESET_ALL
+ 
+ ``Style.RESET_ALL`` resets foreground, background, and brightness. Colorama will
+ perform this reset automatically on program exit.
+ 
+ These are fairly well supported, but not part of the standard::
+ 
+     Fore: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
+     Back: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
+ 
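A minimal sketch using the light variants listed just above (the ``LIGHT*_EX``
constants are defined in ``colorama/ansi.py``):

.. code-block:: python

    from colorama import Back, Fore, Style, init

    init()
    print(Fore.LIGHTGREEN_EX + 'light green text' + Style.RESET_ALL)
    print(Back.LIGHTBLUE_EX + 'on a light blue background' + Style.RESET_ALL)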
+ Cursor Positioning
+ ..................
+ 
+ ANSI codes to reposition the cursor are supported. See ``demos/demo06.py`` for
+ an example of how to generate them.
+ 
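A minimal sketch of cursor repositioning with the ``Cursor`` helper that the
package exports (``POS``, ``UP``, ``DOWN``, ``FORWARD`` and ``BACK`` are
defined in ``colorama/ansi.py``):

.. code-block:: python

    from colorama import Cursor, init

    init()
    # Coordinates are 1-based: column 10 across, row 5 down.
    print(Cursor.POS(10, 5) + 'text placed at (10, 5)')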
+ Init Keyword Args
+ .................
+ 
+ ``init()`` accepts some ``**kwargs`` to override default behaviour.
+ 
+ init(autoreset=False):
+     If you find yourself repeatedly sending reset sequences to turn off color
+     changes at the end of every print, then ``init(autoreset=True)`` will
+     automate that:
+ 
+     .. code-block:: python
+ 
+         from colorama import init
+         init(autoreset=True)
+         print(Fore.RED + 'some red text')
+         print('automatically back to default color again')
+ 
+ init(strip=None):
+     Pass ``True`` or ``False`` to override whether ANSI codes should be
+     stripped from the output. The default behaviour is to strip if on Windows
+     or if output is redirected (not a tty).
+ 
+ init(convert=None):
+     Pass ``True`` or ``False`` to override whether to convert ANSI codes in the
+     output into win32 calls. The default behaviour is to convert if on Windows
+     and output is to a tty (terminal).
+ 
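For example, a minimal sketch that forces ANSI codes through untouched, even
when output is redirected to a file:

.. code-block:: python

    from colorama import init

    # Never strip and never convert: raw escape sequences reach the stream,
    # which is useful when piping colored output into a file or a pager.
    init(strip=False, convert=False)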
+ init(wrap=True):
+     On Windows, Colorama works by replacing ``sys.stdout`` and ``sys.stderr``
+     with proxy objects, which override the ``.write()`` method to do their work.
+     If this wrapping causes you problems, then this can be disabled by passing
+     ``init(wrap=False)``. The default behaviour is to wrap if ``autoreset`` or
+     ``strip`` or ``convert`` are True.
+ 
+     When wrapping is disabled, colored printing on non-Windows platforms will
+     continue to work as normal. To do cross-platform colored output, you can
+     use Colorama's ``AnsiToWin32`` proxy directly:
+ 
+     .. code-block:: python
+ 
+         import sys
+         from colorama import init, AnsiToWin32
+         init(wrap=False)
+         stream = AnsiToWin32(sys.stderr).stream
+ 
+         # Python 2
+         print >>stream, Fore.BLUE + 'blue text on stderr'
+ 
+         # Python 3
+         print(Fore.BLUE + 'blue text on stderr', file=stream)
+ 
+ Recognised ANSI Sequences
+ .........................
+ 
+ ANSI sequences generally take the form::
+ 
+     ESC [ <param> ; <param> ... <command>
+ 
+ Where ``<param>`` is an integer, and ``<command>`` is a single letter. Zero or
+ more params are passed to a ``<command>``. If no params are passed, it is
+ generally synonymous with passing a single zero. No spaces exist in the
+ sequence; they have been inserted here simply to read more easily.
+ 
+ The only ANSI sequences that Colorama converts into win32 calls are::
+ 
+     ESC [ 0 m       # reset all (colors and brightness)
+     ESC [ 1 m       # bright
+     ESC [ 2 m       # dim (looks same as normal brightness)
+     ESC [ 22 m      # normal brightness
+ 
+     # FOREGROUND:
+     ESC [ 30 m      # black
+     ESC [ 31 m      # red
+     ESC [ 32 m      # green
+     ESC [ 33 m      # yellow
+     ESC [ 34 m      # blue
+     ESC [ 35 m      # magenta
+     ESC [ 36 m      # cyan
+     ESC [ 37 m      # white
+     ESC [ 39 m      # reset
+ 
+     # BACKGROUND
+     ESC [ 40 m      # black
+     ESC [ 41 m      # red
+     ESC [ 42 m      # green
+     ESC [ 43 m      # yellow
+     ESC [ 44 m      # blue
+     ESC [ 45 m      # magenta
+     ESC [ 46 m      # cyan
+     ESC [ 47 m      # white
+     ESC [ 49 m      # reset
+ 
+     # cursor positioning
+     ESC [ y;x H     # position cursor at x across, y down
+     ESC [ y;x f     # position cursor at x across, y down
+     ESC [ n A       # move cursor n lines up
+     ESC [ n B       # move cursor n lines down
+     ESC [ n C       # move cursor n characters forward
+     ESC [ n D       # move cursor n characters backward
+ 
+     # clear the screen
+     ESC [ mode J    # clear the screen
+ 
+     # clear the line
+     ESC [ mode K    # clear the line
+ 
+ Multiple numeric params to the ``'m'`` command can be combined into a single
+ sequence::
+ 
+     ESC [ 36 ; 45 ; 1 m     # bright cyan text on magenta background
+ 
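The same combined sequence can be emitted directly from Python (a minimal
sketch):

.. code-block:: python

    # 36 = cyan foreground, 45 = magenta background, 1 = bright
    print('\033[36;45;1m' + 'bright cyan text on magenta' + '\033[0m')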
+ All other ANSI sequences of the form ``ESC [ <param> ; <param> ... <command>``
+ are silently stripped from the output on Windows.
+ 
+ Any other form of ANSI sequence, such as single-character codes or alternative
+ initial characters, are not recognised or stripped. It would be cool to add
+ them though. Let me know if it would be useful for you, via the Issues on
+ GitHub.
+ 
+ Status & Known Problems
+ -----------------------
+ 
+ I've personally only tested it on Windows XP (CMD, Console2), Ubuntu
+ (gnome-terminal, xterm), and OS X.
+ 
+ Some valid ANSI sequences aren't recognised.
+ 
+ If you're hacking on the code, see `README-hacking.md`_. ESPECIALLY, see the
+ explanation there of why we do not want PRs that allow Colorama to generate new
+ types of ANSI codes.
+ 
+ See outstanding issues and wish-list:
+ https://github.com/tartley/colorama/issues
+ 
+ If anything doesn't work for you, or doesn't do what you expected or hoped for,
+ I'd love to hear about it on that issues list, would be delighted by patches,
+ and would be happy to grant commit access to anyone who submits a working patch
+ or two.
+ 
+ .. _README-hacking.md: README-hacking.md
+ 
+ License
+ -------
+ 
+ Copyright Jonathan Hartley & Arnon Yaari, 2013-2020. BSD 3-Clause license; see
+ LICENSE file.
+ 
+ Professional support
+ --------------------
+ 
+ .. |tideliftlogo| image:: https://cdn2.hubspot.net/hubfs/4008838/website/logos/logos_for_download/Tidelift_primary-shorthand-logo.png
+    :alt: Tidelift
+    :target: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
+ 
+ .. list-table::
+    :widths: 10 100
+ 
+    * - |tideliftlogo|
+      - Professional support for colorama is available as part of the
+        `Tidelift Subscription`_.
+        Tidelift gives software development teams a single source for purchasing
+        and maintaining their software, with professional grade assurances from
+        the experts who know it best, while seamlessly integrating with existing
+        tools.
+ 
+ .. _Tidelift Subscription: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
+ 
+ Thanks
+ ------
+ 
+ See the CHANGELOG for more thanks!
+ 
+ * Marc Schlaich (schlamar) for a ``setup.py`` fix for Python2.5.
+ * Marc Abramowitz, reported & fixed a crash on exit with closed ``stdout``,
+   providing a solution to issue #7's setuptools/distutils debate,
+   and other fixes.
+ * User 'eryksun', for guidance on correctly instantiating ``ctypes.windll``.
+ * Matthew McCormick for politely pointing out a longstanding crash on non-Win.
+ * Ben Hoyt, for a magnificent fix under 64-bit Windows.
+ * Jesse at Empty Square for submitting a fix for examples in the README.
+ * User 'jamessp', an observant documentation fix for cursor positioning.
+ * User 'vaal1239', Dave Mckee & Lackner Kristof for a tiny but much-needed Win7
+   fix.
+ * Julien Stuyck, for wisely suggesting Python3 compatible updates to README.
+ * Daniel Griffith for multiple fabulous patches.
+ * Oscar Lesta for a valuable fix to stop ANSI chars being sent to non-tty
+   output.
+ * Roger Binns, for many suggestions, valuable feedback, & bug reports.
+ * Tim Golden for thought and much appreciated feedback on the initial idea.
+ * User 'Zearin' for updates to the README file.
+ * John Szakmeister for adding support for light colors
+ * Charles Merriam for adding documentation to demos
+ * Jurko for a fix on 64-bit Windows CPython2.5 w/o ctypes
+ * Florian Bruhin for a fix when stdout or stderr are None
+ * Thomas Weininger for fixing ValueError on Windows
+ * Remi Rampin for better Github integration and fixes to the README file
+ * Simeon Visser for closing a file handle using 'with' and updating classifiers
+   to include Python 3.3 and 3.4
+ * Andy Neff for fixing RESET of LIGHT_EX colors.
+ * Jonathan Hartley for the initial idea and implementation.
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/RECORD ADDED
@@ -0,0 +1,31 @@
+ colorama-0.4.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ colorama-0.4.6.dist-info/METADATA,sha256=e67SnrUMOym9sz_4TjF3vxvAV4T3aF7NyqRHHH3YEMw,17158
+ colorama-0.4.6.dist-info/RECORD,,
+ colorama-0.4.6.dist-info/WHEEL,sha256=cdcF4Fbd0FPtw2EMIOwH-3rSOTUdTCeOSXRMD1iLUb8,105
+ colorama-0.4.6.dist-info/licenses/LICENSE.txt,sha256=ysNcAmhuXQSlpxQL-zs25zrtSWZW6JEQLkKIhteTAxg,1491
+ colorama/__init__.py,sha256=wePQA4U20tKgYARySLEC047ucNX-g8pRLpYBuiHlLb8,266
+ colorama/__pycache__/__init__.cpython-39.pyc,,
+ colorama/__pycache__/ansi.cpython-39.pyc,,
+ colorama/__pycache__/ansitowin32.cpython-39.pyc,,
+ colorama/__pycache__/initialise.cpython-39.pyc,,
+ colorama/__pycache__/win32.cpython-39.pyc,,
+ colorama/__pycache__/winterm.cpython-39.pyc,,
+ colorama/ansi.py,sha256=Top4EeEuaQdBWdteKMEcGOTeKeF19Q-Wo_6_Cj5kOzQ,2522
+ colorama/ansitowin32.py,sha256=vPNYa3OZbxjbuFyaVo0Tmhmy1FZ1lKMWCnT7odXpItk,11128
+ colorama/initialise.py,sha256=-hIny86ClXo39ixh5iSCfUIa2f_h_bgKRDW7gqs-KLU,3325
+ colorama/tests/__init__.py,sha256=MkgPAEzGQd-Rq0w0PZXSX2LadRWhUECcisJY8lSrm4Q,75
+ colorama/tests/__pycache__/__init__.cpython-39.pyc,,
+ colorama/tests/__pycache__/ansi_test.cpython-39.pyc,,
+ colorama/tests/__pycache__/ansitowin32_test.cpython-39.pyc,,
+ colorama/tests/__pycache__/initialise_test.cpython-39.pyc,,
+ colorama/tests/__pycache__/isatty_test.cpython-39.pyc,,
+ colorama/tests/__pycache__/utils.cpython-39.pyc,,
+ colorama/tests/__pycache__/winterm_test.cpython-39.pyc,,
+ colorama/tests/ansi_test.py,sha256=FeViDrUINIZcr505PAxvU4AjXz1asEiALs9GXMhwRaE,2839
+ colorama/tests/ansitowin32_test.py,sha256=RN7AIhMJ5EqDsYaCjVo-o4u8JzDD4ukJbmevWKS70rY,10678
+ colorama/tests/initialise_test.py,sha256=BbPy-XfyHwJ6zKozuQOvNvQZzsx9vdb_0bYXn7hsBTc,6741
+ colorama/tests/isatty_test.py,sha256=Pg26LRpv0yQDB5Ac-sxgVXG7hsA1NYvapFgApZfYzZg,1866
+ colorama/tests/utils.py,sha256=1IIRylG39z5-dzq09R_ngufxyPZxgldNbrxKxUGwGKE,1079
+ colorama/tests/winterm_test.py,sha256=qoWFPEjym5gm2RuMwpf3pOis3a5r_PJZFCzK254JL8A,3709
+ colorama/win32.py,sha256=YQOKwMTwtGBbsY4dL5HYTvwTeP9wIQra5MvPNddpxZs,6181
+ colorama/winterm.py,sha256=XCQFDHjPi6AHYNdZwy0tA02H-Jh48Jp-HvCjeLeLp3U,7134
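Each RECORD entry has the form ``path,sha256=<digest>,<size>``, where the
digest is an unpadded urlsafe-base64 SHA-256 of the file. A minimal sketch to
recompute one entry (assuming the file is readable from the current
directory):

.. code-block:: python

    import base64
    import hashlib

    with open('colorama/ansi.py', 'rb') as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(
        hashlib.sha256(data).digest()
    ).rstrip(b'=').decode()
    print(f'colorama/ansi.py,sha256={digest},{len(data)}')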
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.11.1
+ Root-Is-Purelib: true
+ Tag: py2-none-any
+ Tag: py3-none-any
phivenv/Lib/site-packages/colorama-0.4.6.dist-info/licenses/LICENSE.txt ADDED
@@ -0,0 +1,27 @@
+ Copyright (c) 2010 Jonathan Hartley
+ All rights reserved.
+ 
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ 
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ 
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ 
+ * Neither the name of the copyright holders, nor those of its contributors
+   may be used to endorse or promote products derived from this software without
+   specific prior written permission.
+ 
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
phivenv/Lib/site-packages/colorama/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
+ from .initialise import init, deinit, reinit, colorama_text, just_fix_windows_console
+ from .ansi import Fore, Back, Style, Cursor
+ from .ansitowin32 import AnsiToWin32
+ 
+ __version__ = '0.4.6'
+ 
phivenv/Lib/site-packages/colorama/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (438 Bytes)
phivenv/Lib/site-packages/colorama/__pycache__/ansi.cpython-39.pyc ADDED
Binary file (3.19 kB)
phivenv/Lib/site-packages/colorama/__pycache__/ansitowin32.cpython-39.pyc ADDED
Binary file (8.27 kB)
phivenv/Lib/site-packages/colorama/__pycache__/initialise.cpython-39.pyc ADDED
Binary file (2.24 kB)
phivenv/Lib/site-packages/colorama/__pycache__/win32.cpython-39.pyc ADDED
Binary file (4.42 kB)
phivenv/Lib/site-packages/colorama/__pycache__/winterm.cpython-39.pyc ADDED
Binary file (5.22 kB)
phivenv/Lib/site-packages/colorama/ansi.py ADDED
@@ -0,0 +1,102 @@
+ # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
+ '''
+ This module generates ANSI character codes for printing colors to terminals.
+ See: http://en.wikipedia.org/wiki/ANSI_escape_code
+ '''
+ 
+ CSI = '\033['
+ OSC = '\033]'
+ BEL = '\a'
+ 
+ 
+ def code_to_chars(code):
+     return CSI + str(code) + 'm'
+ 
+ def set_title(title):
+     return OSC + '2;' + title + BEL
+ 
+ def clear_screen(mode=2):
+     return CSI + str(mode) + 'J'
+ 
+ def clear_line(mode=2):
+     return CSI + str(mode) + 'K'
+ 
+ 
+ class AnsiCodes(object):
+     def __init__(self):
+         # the subclasses declare class attributes which are numbers.
+         # Upon instantiation we define instance attributes, which are the same
+         # as the class attributes but wrapped with the ANSI escape sequence
+         for name in dir(self):
+             if not name.startswith('_'):
+                 value = getattr(self, name)
+                 setattr(self, name, code_to_chars(value))
+ 
+ 
+ class AnsiCursor(object):
+     def UP(self, n=1):
+         return CSI + str(n) + 'A'
+     def DOWN(self, n=1):
+         return CSI + str(n) + 'B'
+     def FORWARD(self, n=1):
+         return CSI + str(n) + 'C'
+     def BACK(self, n=1):
+         return CSI + str(n) + 'D'
+     def POS(self, x=1, y=1):
+         return CSI + str(y) + ';' + str(x) + 'H'
+ 
+ 
+ class AnsiFore(AnsiCodes):
+     BLACK = 30
+     RED = 31
+     GREEN = 32
+     YELLOW = 33
+     BLUE = 34
+     MAGENTA = 35
+     CYAN = 36
+     WHITE = 37
+     RESET = 39
+ 
+     # These are fairly well supported, but not part of the standard.
+     LIGHTBLACK_EX = 90
+     LIGHTRED_EX = 91
+     LIGHTGREEN_EX = 92
+     LIGHTYELLOW_EX = 93
+     LIGHTBLUE_EX = 94
+     LIGHTMAGENTA_EX = 95
+     LIGHTCYAN_EX = 96
+     LIGHTWHITE_EX = 97
+ 
+ 
+ class AnsiBack(AnsiCodes):
+     BLACK = 40
+     RED = 41
+     GREEN = 42
+     YELLOW = 43
+     BLUE = 44
+     MAGENTA = 45
+     CYAN = 46
+     WHITE = 47
+     RESET = 49
+ 
+     # These are fairly well supported, but not part of the standard.
+     LIGHTBLACK_EX = 100
+     LIGHTRED_EX = 101
+     LIGHTGREEN_EX = 102
+     LIGHTYELLOW_EX = 103
+     LIGHTBLUE_EX = 104
+     LIGHTMAGENTA_EX = 105
+     LIGHTCYAN_EX = 106
+     LIGHTWHITE_EX = 107
+ 
+ 
+ class AnsiStyle(AnsiCodes):
+     BRIGHT = 1
+     DIM = 2
+     NORMAL = 22
+     RESET_ALL = 0
+ 
+ Fore = AnsiFore()
+ Back = AnsiBack()
+ Style = AnsiStyle()
+ Cursor = AnsiCursor()
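A minimal sketch of what the ``AnsiCodes`` instantiation above produces: each
numeric class attribute is replaced by its CSI escape string.

.. code-block:: python

    from colorama import Fore, Style

    print(repr(Fore.RED))          # '\x1b[31m', i.e. ESC [ 31 m
    print(repr(Style.RESET_ALL))   # '\x1b[0m', i.e. ESC [ 0 m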
phivenv/Lib/site-packages/colorama/ansitowin32.py ADDED
@@ -0,0 +1,277 @@
+ # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
+ import re
+ import sys
+ import os
+ 
+ from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style, BEL
+ from .winterm import enable_vt_processing, WinTerm, WinColor, WinStyle
+ from .win32 import windll, winapi_test
+ 
+ 
+ winterm = None
+ if windll is not None:
+     winterm = WinTerm()
+ 
+ 
+ class StreamWrapper(object):
+     '''
+     Wraps a stream (such as stdout), acting as a transparent proxy for all
+     attribute access apart from method 'write()', which is delegated to our
+     Converter instance.
+     '''
+     def __init__(self, wrapped, converter):
+         # double-underscore everything to prevent clashes with names of
+         # attributes on the wrapped stream object.
+         self.__wrapped = wrapped
+         self.__convertor = converter
+ 
+     def __getattr__(self, name):
+         return getattr(self.__wrapped, name)
+ 
+     def __enter__(self, *args, **kwargs):
+         # special method lookup bypasses __getattr__/__getattribute__, see
+         # https://stackoverflow.com/questions/12632894/why-doesnt-getattr-work-with-exit
+         # thus, contextlib magic methods are not proxied via __getattr__
+         return self.__wrapped.__enter__(*args, **kwargs)
+ 
+     def __exit__(self, *args, **kwargs):
+         return self.__wrapped.__exit__(*args, **kwargs)
+ 
+     def __setstate__(self, state):
+         self.__dict__ = state
+ 
+     def __getstate__(self):
+         return self.__dict__
+ 
+     def write(self, text):
+         self.__convertor.write(text)
+ 
+     def isatty(self):
+         stream = self.__wrapped
+         if 'PYCHARM_HOSTED' in os.environ:
+             if stream is not None and (stream is sys.__stdout__ or stream is sys.__stderr__):
+                 return True
+         try:
+             stream_isatty = stream.isatty
+         except AttributeError:
+             return False
+         else:
+             return stream_isatty()
+ 
+     @property
+     def closed(self):
+         stream = self.__wrapped
+         try:
+             return stream.closed
+         # AttributeError in the case that the stream doesn't support being closed
+         # ValueError for the case that the stream has already been detached when atexit runs
+         except (AttributeError, ValueError):
+             return True
+ 
+ 
+ class AnsiToWin32(object):
+     '''
+     Implements a 'write()' method which, on Windows, will strip ANSI character
+     sequences from the text, and if outputting to a tty, will convert them into
+     win32 function calls.
+     '''
+     ANSI_CSI_RE = re.compile('\001?\033\\[((?:\\d|;)*)([a-zA-Z])\002?')   # Control Sequence Introducer
+     ANSI_OSC_RE = re.compile('\001?\033\\]([^\a]*)(\a)\002?')             # Operating System Command
+ 
+     def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
+         # The wrapped stream (normally sys.stdout or sys.stderr)
+         self.wrapped = wrapped
+ 
+         # should we reset colors to defaults after every .write()
+         self.autoreset = autoreset
+ 
+         # create the proxy wrapping our output stream
+         self.stream = StreamWrapper(wrapped, self)
+ 
+         on_windows = os.name == 'nt'
+         # We test if the WinAPI works, because even if we are on Windows
+         # we may be using a terminal that doesn't support the WinAPI
+         # (e.g. Cygwin Terminal). In this case it's up to the terminal
+         # to support the ANSI codes.
+         conversion_supported = on_windows and winapi_test()
+         try:
+             fd = wrapped.fileno()
+         except Exception:
+             fd = -1
+         system_has_native_ansi = not on_windows or enable_vt_processing(fd)
+         have_tty = not self.stream.closed and self.stream.isatty()
+         need_conversion = conversion_supported and not system_has_native_ansi
+ 
+         # should we strip ANSI sequences from our output?
+         if strip is None:
+             strip = need_conversion or not have_tty
+         self.strip = strip
+ 
+         # should we convert ANSI sequences into win32 calls?
+         if convert is None:
+             convert = need_conversion and have_tty
+         self.convert = convert
+ 
+         # dict of ansi codes to win32 functions and parameters
+         self.win32_calls = self.get_win32_calls()
+ 
+         # are we wrapping stderr?
+         self.on_stderr = self.wrapped is sys.stderr
+ 
+     def should_wrap(self):
+         '''
+         True if this class is actually needed. If false, then the output
+         stream will not be affected, nor will win32 calls be issued, so
+         wrapping stdout is not actually required. This will generally be
+         False on non-Windows platforms, unless optional functionality like
+         autoreset has been requested using kwargs to init()
+         '''
+         return self.convert or self.strip or self.autoreset
+ 
+     def get_win32_calls(self):
+         if self.convert and winterm:
+             return {
+                 AnsiStyle.RESET_ALL: (winterm.reset_all, ),
+                 AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
+                 AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
+                 AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
+                 AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
+                 AnsiFore.RED: (winterm.fore, WinColor.RED),
+                 AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
+                 AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
+                 AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
+                 AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
+                 AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
+                 AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
+                 AnsiFore.RESET: (winterm.fore, ),
+                 AnsiFore.LIGHTBLACK_EX: (winterm.fore, WinColor.BLACK, True),
+                 AnsiFore.LIGHTRED_EX: (winterm.fore, WinColor.RED, True),
+                 AnsiFore.LIGHTGREEN_EX: (winterm.fore, WinColor.GREEN, True),
+                 AnsiFore.LIGHTYELLOW_EX: (winterm.fore, WinColor.YELLOW, True),
+                 AnsiFore.LIGHTBLUE_EX: (winterm.fore, WinColor.BLUE, True),
+                 AnsiFore.LIGHTMAGENTA_EX: (winterm.fore, WinColor.MAGENTA, True),
+                 AnsiFore.LIGHTCYAN_EX: (winterm.fore, WinColor.CYAN, True),
+                 AnsiFore.LIGHTWHITE_EX: (winterm.fore, WinColor.GREY, True),
+                 AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
+                 AnsiBack.RED: (winterm.back, WinColor.RED),
+                 AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
+                 AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
+                 AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
+                 AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
+                 AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
+                 AnsiBack.WHITE: (winterm.back, WinColor.GREY),
+                 AnsiBack.RESET: (winterm.back, ),
+                 AnsiBack.LIGHTBLACK_EX: (winterm.back, WinColor.BLACK, True),
+                 AnsiBack.LIGHTRED_EX: (winterm.back, WinColor.RED, True),
+                 AnsiBack.LIGHTGREEN_EX: (winterm.back, WinColor.GREEN, True),
+                 AnsiBack.LIGHTYELLOW_EX: (winterm.back, WinColor.YELLOW, True),
+                 AnsiBack.LIGHTBLUE_EX: (winterm.back, WinColor.BLUE, True),
+                 AnsiBack.LIGHTMAGENTA_EX: (winterm.back, WinColor.MAGENTA, True),
+                 AnsiBack.LIGHTCYAN_EX: (winterm.back, WinColor.CYAN, True),
+                 AnsiBack.LIGHTWHITE_EX: (winterm.back, WinColor.GREY, True),
+             }
+         return dict()
+ 
+     def write(self, text):
+         if self.strip or self.convert:
+             self.write_and_convert(text)
+         else:
+             self.wrapped.write(text)
+             self.wrapped.flush()
+         if self.autoreset:
+             self.reset_all()
+ 
+ 
+     def reset_all(self):
+         if self.convert:
+             self.call_win32('m', (0,))
+         elif not self.strip and not self.stream.closed:
+             self.wrapped.write(Style.RESET_ALL)
+ 
+ 
+     def write_and_convert(self, text):
+         '''
+         Write the given text to our wrapped stream, stripping any ANSI
+         sequences from the text, and optionally converting them into win32
+         calls.
+         '''
+         cursor = 0
+         text = self.convert_osc(text)
+         for match in self.ANSI_CSI_RE.finditer(text):
+             start, end = match.span()
+             self.write_plain_text(text, cursor, start)
+             self.convert_ansi(*match.groups())
+             cursor = end
+         self.write_plain_text(text, cursor, len(text))
+ 
+ 
+     def write_plain_text(self, text, start, end):
+         if start < end:
+             self.wrapped.write(text[start:end])
+             self.wrapped.flush()
+ 
+ 
+     def convert_ansi(self, paramstring, command):
+         if self.convert:
+             params = self.extract_params(command, paramstring)
+             self.call_win32(command, params)
+ 
+ 
+     def extract_params(self, command, paramstring):
+         if command in 'Hf':
+             params = tuple(int(p) if len(p) != 0 else 1 for p in paramstring.split(';'))
+             while len(params) < 2:
+                 # defaults:
+                 params = params + (1,)
+         else:
+             params = tuple(int(p) for p in paramstring.split(';') if len(p) != 0)
+             if len(params) == 0:
+                 # defaults:
+                 if command in 'JKm':
+                     params = (0,)
+                 elif command in 'ABCD':
+                     params = (1,)
+ 
+         return params
+ 
+ 
+     def call_win32(self, command, params):
+         if command == 'm':
+             for param in params:
+                 if param in self.win32_calls:
+                     func_args = self.win32_calls[param]
+                     func = func_args[0]
+                     args = func_args[1:]
+                     kwargs = dict(on_stderr=self.on_stderr)
+                     func(*args, **kwargs)
+         elif command in 'J':
+             winterm.erase_screen(params[0], on_stderr=self.on_stderr)
+         elif command in 'K':
+             winterm.erase_line(params[0], on_stderr=self.on_stderr)
+         elif command in 'Hf':     # cursor position - absolute
+             winterm.set_cursor_position(params, on_stderr=self.on_stderr)
+         elif command in 'ABCD':   # cursor position - relative
+             n = params[0]
+             # A - up, B - down, C - forward, D - back
+             x, y = {'A': (0, -n), 'B': (0, n), 'C': (n, 0), 'D': (-n, 0)}[command]
+             winterm.cursor_adjust(x, y, on_stderr=self.on_stderr)
+ 
+ 
+     def convert_osc(self, text):
+         for match in self.ANSI_OSC_RE.finditer(text):
+             start, end = match.span()
+             text = text[:start] + text[end:]
+             paramstring, command = match.groups()
+             if command == BEL:
+                 if paramstring.count(";") == 1:
+                     params = paramstring.split(";")
+                     # 0 - change title and icon (we will only change title)
+                     # 1 - change icon (we don't support this)
+                     # 2 - change title
+                     if params[0] in '02':
+                         winterm.set_title(params[1])
+         return text
+ 
+ 
+     def flush(self):
+         self.wrapped.flush()
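A minimal sketch of how ``extract_params`` above fills in ANSI defaults
(wrapping an in-memory stream only to obtain an instance; behaviour on a real
console differs):

.. code-block:: python

    import io
    from colorama.ansitowin32 import AnsiToWin32

    converter = AnsiToWin32(io.StringIO())
    print(converter.extract_params('m', '36;45;1'))   # (36, 45, 1)
    print(converter.extract_params('H', ''))          # (1, 1): cursor-home default
    print(converter.extract_params('A', ''))          # (1,): move up one line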
phivenv/Lib/site-packages/colorama/initialise.py ADDED
@@ -0,0 +1,121 @@
+ # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
+ import atexit
+ import contextlib
+ import sys
+ 
+ from .ansitowin32 import AnsiToWin32
+ 
+ 
+ def _wipe_internal_state_for_tests():
+     global orig_stdout, orig_stderr
+     orig_stdout = None
+     orig_stderr = None
+ 
+     global wrapped_stdout, wrapped_stderr
+     wrapped_stdout = None
+     wrapped_stderr = None
+ 
+     global atexit_done
+     atexit_done = False
+ 
+     global fixed_windows_console
+     fixed_windows_console = False
+ 
+     try:
+         # no-op if it wasn't registered
+         atexit.unregister(reset_all)
+     except AttributeError:
+         # python 2: no atexit.unregister. Oh well, we did our best.
+         pass
+ 
+ 
+ def reset_all():
+     if AnsiToWin32 is not None:    # Issue #74: objects might become None at exit
+         AnsiToWin32(orig_stdout).reset_all()
+ 
+ 
+ def init(autoreset=False, convert=None, strip=None, wrap=True):
+ 
+     if not wrap and any([autoreset, convert, strip]):
+         raise ValueError('wrap=False conflicts with any other arg=True')
+ 
+     global wrapped_stdout, wrapped_stderr
+     global orig_stdout, orig_stderr
+ 
+     orig_stdout = sys.stdout
+     orig_stderr = sys.stderr
+ 
+     if sys.stdout is None:
+         wrapped_stdout = None
+     else:
+         sys.stdout = wrapped_stdout = \
+             wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
+     if sys.stderr is None:
+         wrapped_stderr = None
+     else:
+         sys.stderr = wrapped_stderr = \
+             wrap_stream(orig_stderr, convert, strip, autoreset, wrap)
+ 
+     global atexit_done
+     if not atexit_done:
+         atexit.register(reset_all)
+         atexit_done = True
+ 
+ 
+ def deinit():
+     if orig_stdout is not None:
+         sys.stdout = orig_stdout
+     if orig_stderr is not None:
+         sys.stderr = orig_stderr
+ 
+ 
+ def just_fix_windows_console():
+     global fixed_windows_console
+ 
+     if sys.platform != "win32":
+         return
+     if fixed_windows_console:
+         return
+     if wrapped_stdout is not None or wrapped_stderr is not None:
+         # Someone already ran init() and it did stuff, so we won't second-guess them
+         return
+ 
+     # On newer versions of Windows, AnsiToWin32.__init__ will implicitly enable the
+     # native ANSI support in the console as a side-effect. We only need to actually
+     # replace sys.stdout/stderr if we're in the old-style conversion mode.
+     new_stdout = AnsiToWin32(sys.stdout, convert=None, strip=None, autoreset=False)
+     if new_stdout.convert:
+         sys.stdout = new_stdout
+     new_stderr = AnsiToWin32(sys.stderr, convert=None, strip=None, autoreset=False)
+     if new_stderr.convert:
+         sys.stderr = new_stderr
+ 
+     fixed_windows_console = True
+ 
+ @contextlib.contextmanager
+ def colorama_text(*args, **kwargs):
+     init(*args, **kwargs)
+     try:
+         yield
+     finally:
+         deinit()
+ 
+ 
+ def reinit():
+     if wrapped_stdout is not None:
+         sys.stdout = wrapped_stdout
+     if wrapped_stderr is not None:
+         sys.stderr = wrapped_stderr
+ 
+ 
+ def wrap_stream(stream, convert, strip, autoreset, wrap):
+     if wrap:
+         wrapper = AnsiToWin32(stream,
+             convert=convert, strip=strip, autoreset=autoreset)
+         if wrapper.should_wrap():
+             stream = wrapper.stream
+     return stream
+ 
+ 
+ # Use this for initial setup as well, to reduce code duplication
+ _wipe_internal_state_for_tests()
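A minimal sketch of the ``colorama_text`` context manager defined above, which
pairs ``init()`` with a guaranteed ``deinit()``:

.. code-block:: python

    from colorama import Fore, colorama_text

    with colorama_text(autoreset=True):
        print(Fore.GREEN + 'green, and auto-reset at the end of each print')
    print('streams are unwrapped again out here')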
phivenv/Lib/site-packages/colorama/tests/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
phivenv/Lib/site-packages/colorama/tests/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (158 Bytes)
phivenv/Lib/site-packages/colorama/tests/__pycache__/ansi_test.cpython-39.pyc ADDED
Binary file (2.52 kB)