Spaces:
Sleeping
Sleeping
Upload 6572 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +140 -0
- venv/.gitignore +2 -0
- venv/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER +1 -0
- venv/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE +20 -0
- venv/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA +46 -0
- venv/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD +43 -0
- venv/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL +5 -0
- venv/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt +2 -0
- venv/Lib/site-packages/__pycache__/_virtualenv.cpython-313.pyc +0 -0
- venv/Lib/site-packages/__pycache__/typing_extensions.cpython-313.pyc +3 -0
- venv/Lib/site-packages/_virtualenv.pth +3 -0
- venv/Lib/site-packages/_virtualenv.py +103 -0
- venv/Lib/site-packages/_yaml/__init__.py +33 -0
- venv/Lib/site-packages/_yaml/__pycache__/__init__.cpython-313.pyc +0 -0
- venv/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER +1 -0
- venv/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE +20 -0
- venv/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA +77 -0
- venv/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD +14 -0
- venv/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL +5 -0
- venv/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt +1 -0
- venv/Lib/site-packages/certifi/__init__.py +4 -0
- venv/Lib/site-packages/certifi/__main__.py +12 -0
- venv/Lib/site-packages/certifi/__pycache__/__init__.cpython-313.pyc +0 -0
- venv/Lib/site-packages/certifi/__pycache__/__main__.cpython-313.pyc +0 -0
- venv/Lib/site-packages/certifi/__pycache__/core.cpython-313.pyc +0 -0
- venv/Lib/site-packages/certifi/cacert.pem +0 -0
- venv/Lib/site-packages/certifi/core.py +114 -0
- venv/Lib/site-packages/certifi/py.typed +0 -0
- venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER +1 -0
- venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE +21 -0
- venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA +721 -0
- venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD +35 -0
- venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL +5 -0
- venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt +2 -0
- venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt +1 -0
- venv/Lib/site-packages/charset_normalizer/__init__.py +48 -0
- venv/Lib/site-packages/charset_normalizer/__main__.py +6 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-313.pyc +0 -0
- venv/Lib/site-packages/charset_normalizer/api.py +668 -0
- venv/Lib/site-packages/charset_normalizer/cd.py +395 -0
- venv/Lib/site-packages/charset_normalizer/cli/__init__.py +8 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,143 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
venv/Lib/site-packages/__pycache__/typing_extensions.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
venv/Lib/site-packages/charset_normalizer/md__mypyc.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
venv/Lib/site-packages/huggingface_hub/__pycache__/hf_api.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
venv/Lib/site-packages/huggingface_hub/inference/__pycache__/_client.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
venv/Lib/site-packages/huggingface_hub/inference/_generated/__pycache__/_async_client.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
venv/Lib/site-packages/idna/__pycache__/uts46data.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
venv/Lib/site-packages/numpy.libs/libscipy_openblas64_-43e11ff0749b8cbe0a615c9cf6737e0e.dll filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
venv/Lib/site-packages/numpy.libs/msvcp140-263139962577ecda4cd9469ca360a746.dll filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
venv/Lib/site-packages/numpy/_core/__pycache__/_add_newdocs.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
venv/Lib/site-packages/numpy/_core/__pycache__/fromnumeric.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
venv/Lib/site-packages/numpy/_core/_multiarray_umath.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
venv/Lib/site-packages/numpy/_core/_simd.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
venv/Lib/site-packages/numpy/_core/lib/npymath.lib filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_datetime.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_dtype.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_multiarray.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_nditer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_numeric.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_regression.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_ufunc.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_umath.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
venv/Lib/site-packages/numpy/f2py/__pycache__/crackfortran.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
venv/Lib/site-packages/numpy/fft/_pocketfft_umath.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
venv/Lib/site-packages/numpy/lib/__pycache__/_function_base_impl.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
venv/Lib/site-packages/numpy/lib/tests/__pycache__/test_function_base.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
venv/Lib/site-packages/numpy/lib/tests/__pycache__/test_io.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
venv/Lib/site-packages/numpy/linalg/__pycache__/_linalg.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
venv/Lib/site-packages/numpy/linalg/_umath_linalg.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
venv/Lib/site-packages/numpy/linalg/tests/__pycache__/test_linalg.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
venv/Lib/site-packages/numpy/ma/__pycache__/core.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
venv/Lib/site-packages/numpy/ma/tests/__pycache__/test_core.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
venv/Lib/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
venv/Lib/site-packages/numpy/random/_bounded_integers.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
venv/Lib/site-packages/numpy/random/_common.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
venv/Lib/site-packages/numpy/random/_generator.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
venv/Lib/site-packages/numpy/random/bit_generator.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
venv/Lib/site-packages/numpy/random/lib/npyrandom.lib filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
venv/Lib/site-packages/numpy/random/mtrand.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
venv/Lib/site-packages/numpy/random/tests/__pycache__/test_generator_mt19937.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
venv/Lib/site-packages/numpy/random/tests/__pycache__/test_random.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
venv/Lib/site-packages/numpy/random/tests/__pycache__/test_randomstate.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
venv/Lib/site-packages/numpy/testing/_private/__pycache__/utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
venv/Lib/site-packages/numpy/testing/tests/__pycache__/test_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
venv/Lib/site-packages/pip/_vendor/__pycache__/typing_extensions.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
venv/Lib/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
venv/Lib/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
venv/Lib/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
venv/Lib/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
venv/Lib/site-packages/pip/_vendor/pkg_resources/__pycache__/__init__.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
venv/Lib/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
venv/Lib/site-packages/pip/_vendor/rich/__pycache__/console.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
venv/Lib/site-packages/regex/__pycache__/_regex_core.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
venv/Lib/site-packages/regex/__pycache__/test_regex.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
venv/Lib/site-packages/regex/_regex.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
venv/Lib/site-packages/safetensors/_safetensors_rust.pyd filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
venv/Lib/site-packages/tokenizers/tokenizers.pyd filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
venv/Lib/site-packages/transformers/__pycache__/__init__.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
venv/Lib/site-packages/transformers/__pycache__/cache_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
venv/Lib/site-packages/transformers/__pycache__/modeling_outputs.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
venv/Lib/site-packages/transformers/__pycache__/modeling_tf_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 96 |
+
venv/Lib/site-packages/transformers/__pycache__/modeling_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 97 |
+
venv/Lib/site-packages/transformers/__pycache__/testing_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 98 |
+
venv/Lib/site-packages/transformers/__pycache__/tokenization_utils_base.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 99 |
+
venv/Lib/site-packages/transformers/__pycache__/trainer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 100 |
+
venv/Lib/site-packages/transformers/__pycache__/training_args.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 101 |
+
venv/Lib/site-packages/transformers/generation/__pycache__/logits_process.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 102 |
+
venv/Lib/site-packages/transformers/generation/__pycache__/tf_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 103 |
+
venv/Lib/site-packages/transformers/generation/__pycache__/utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 104 |
+
venv/Lib/site-packages/transformers/integrations/__pycache__/integration_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 105 |
+
venv/Lib/site-packages/transformers/models/autoformer/__pycache__/modeling_autoformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 106 |
+
venv/Lib/site-packages/transformers/models/bart/__pycache__/modeling_bart.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 107 |
+
venv/Lib/site-packages/transformers/models/big_bird/__pycache__/modeling_big_bird.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 108 |
+
venv/Lib/site-packages/transformers/models/big_bird/__pycache__/modeling_flax_big_bird.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 109 |
+
venv/Lib/site-packages/transformers/models/bigbird_pegasus/__pycache__/modeling_bigbird_pegasus.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 110 |
+
venv/Lib/site-packages/transformers/models/blip_2/__pycache__/modeling_blip_2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 111 |
+
venv/Lib/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 112 |
+
venv/Lib/site-packages/transformers/models/clap/__pycache__/modeling_clap.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 113 |
+
venv/Lib/site-packages/transformers/models/conditional_detr/__pycache__/modeling_conditional_detr.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 114 |
+
venv/Lib/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 115 |
+
venv/Lib/site-packages/transformers/models/deprecated/deta/__pycache__/modeling_deta.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 116 |
+
venv/Lib/site-packages/transformers/models/deprecated/jukebox/__pycache__/modeling_jukebox.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 117 |
+
venv/Lib/site-packages/transformers/models/deprecated/mega/__pycache__/modeling_mega.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 118 |
+
venv/Lib/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/modeling_xlm_prophetnet.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 119 |
+
venv/Lib/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 120 |
+
venv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_esmfold.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 121 |
+
venv/Lib/site-packages/transformers/models/flava/__pycache__/modeling_flava.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 122 |
+
venv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/modeling_grounding_dino.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 123 |
+
venv/Lib/site-packages/transformers/models/groupvit/__pycache__/modeling_tf_groupvit.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 124 |
+
venv/Lib/site-packages/transformers/models/informer/__pycache__/modeling_informer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 125 |
+
venv/Lib/site-packages/transformers/models/kosmos2/__pycache__/modeling_kosmos2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 126 |
+
venv/Lib/site-packages/transformers/models/led/__pycache__/modeling_led.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 127 |
+
venv/Lib/site-packages/transformers/models/led/__pycache__/modeling_tf_led.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 128 |
+
venv/Lib/site-packages/transformers/models/longformer/__pycache__/modeling_longformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 129 |
+
venv/Lib/site-packages/transformers/models/longformer/__pycache__/modeling_tf_longformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 130 |
+
venv/Lib/site-packages/transformers/models/longt5/__pycache__/modeling_flax_longt5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 131 |
+
venv/Lib/site-packages/transformers/models/longt5/__pycache__/modeling_longt5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 132 |
+
venv/Lib/site-packages/transformers/models/luke/__pycache__/modeling_luke.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 133 |
+
venv/Lib/site-packages/transformers/models/mask2former/__pycache__/modeling_mask2former.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 134 |
+
venv/Lib/site-packages/transformers/models/mllama/__pycache__/modeling_mllama.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 135 |
+
venv/Lib/site-packages/transformers/models/moshi/__pycache__/modeling_moshi.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 136 |
+
venv/Lib/site-packages/transformers/models/mt5/__pycache__/modeling_mt5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 137 |
+
venv/Lib/site-packages/transformers/models/musicgen_melody/__pycache__/modeling_musicgen_melody.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 138 |
+
venv/Lib/site-packages/transformers/models/musicgen/__pycache__/modeling_musicgen.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 139 |
+
venv/Lib/site-packages/transformers/models/oneformer/__pycache__/modeling_oneformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 140 |
+
venv/Lib/site-packages/transformers/models/perceiver/__pycache__/modeling_perceiver.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 141 |
+
venv/Lib/site-packages/transformers/models/phi4_multimodal/__pycache__/modeling_phi4_multimodal.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 142 |
+
venv/Lib/site-packages/transformers/models/prophetnet/__pycache__/modeling_prophetnet.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 143 |
+
venv/Lib/site-packages/transformers/models/qwen2_5_vl/__pycache__/modeling_qwen2_5_vl.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 144 |
+
venv/Lib/site-packages/transformers/models/qwen2_vl/__pycache__/modeling_qwen2_vl.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 145 |
+
venv/Lib/site-packages/transformers/models/reformer/__pycache__/modeling_reformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 146 |
+
venv/Lib/site-packages/transformers/models/rt_detr_v2/__pycache__/modeling_rt_detr_v2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 147 |
+
venv/Lib/site-packages/transformers/models/rt_detr/__pycache__/modeling_rt_detr.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 148 |
+
venv/Lib/site-packages/transformers/models/seamless_m4t_v2/__pycache__/modeling_seamless_m4t_v2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 149 |
+
venv/Lib/site-packages/transformers/models/seamless_m4t/__pycache__/modeling_seamless_m4t.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 150 |
+
venv/Lib/site-packages/transformers/models/speecht5/__pycache__/modeling_speecht5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 151 |
+
venv/Lib/site-packages/transformers/models/t5/__pycache__/modeling_t5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 152 |
+
venv/Lib/site-packages/transformers/models/tapas/__pycache__/modeling_tapas.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 153 |
+
venv/Lib/site-packages/transformers/models/tapas/__pycache__/modeling_tf_tapas.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 154 |
+
venv/Lib/site-packages/transformers/models/tapas/__pycache__/tokenization_tapas.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 155 |
+
venv/Lib/site-packages/transformers/models/udop/__pycache__/modeling_udop.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 156 |
+
venv/Lib/site-packages/transformers/models/unispeech_sat/__pycache__/modeling_unispeech_sat.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 157 |
+
venv/Lib/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 158 |
+
venv/Lib/site-packages/transformers/models/wav2vec2/__pycache__/modeling_wav2vec2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 159 |
+
venv/Lib/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 160 |
+
venv/Lib/site-packages/transformers/models/zamba2/__pycache__/modeling_zamba2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 161 |
+
venv/Lib/site-packages/transformers/utils/__pycache__/dummy_pt_objects.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 162 |
+
venv/Lib/site-packages/transformers/utils/__pycache__/dummy_tf_objects.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
|
| 163 |
+
venv/Lib/site-packages/yaml/_yaml.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
| 164 |
+
venv/Scripts/f2py.exe filter=lfs diff=lfs merge=lfs -text
|
| 165 |
+
venv/Scripts/huggingface-cli.exe filter=lfs diff=lfs merge=lfs -text
|
| 166 |
+
venv/Scripts/normalizer.exe filter=lfs diff=lfs merge=lfs -text
|
| 167 |
+
venv/Scripts/numpy-config.exe filter=lfs diff=lfs merge=lfs -text
|
| 168 |
+
venv/Scripts/pip-3.13.exe filter=lfs diff=lfs merge=lfs -text
|
| 169 |
+
venv/Scripts/pip.exe filter=lfs diff=lfs merge=lfs -text
|
| 170 |
+
venv/Scripts/pip3.13.exe filter=lfs diff=lfs merge=lfs -text
|
| 171 |
+
venv/Scripts/pip3.exe filter=lfs diff=lfs merge=lfs -text
|
| 172 |
+
venv/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
|
| 173 |
+
venv/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
|
| 174 |
+
venv/Scripts/tqdm.exe filter=lfs diff=lfs merge=lfs -text
|
| 175 |
+
venv/Scripts/transformers-cli.exe filter=lfs diff=lfs merge=lfs -text
|
venv/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# created by virtualenv automatically
|
| 2 |
+
*
|
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright (c) 2017-2021 Ingy döt Net
|
| 2 |
+
Copyright (c) 2006-2016 Kirill Simonov
|
| 3 |
+
|
| 4 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
| 5 |
+
this software and associated documentation files (the "Software"), to deal in
|
| 6 |
+
the Software without restriction, including without limitation the rights to
|
| 7 |
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
| 8 |
+
of the Software, and to permit persons to whom the Software is furnished to do
|
| 9 |
+
so, subject to the following conditions:
|
| 10 |
+
|
| 11 |
+
The above copyright notice and this permission notice shall be included in all
|
| 12 |
+
copies or substantial portions of the Software.
|
| 13 |
+
|
| 14 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 15 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 16 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 17 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 18 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 19 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 20 |
+
SOFTWARE.
|
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.1
|
| 2 |
+
Name: PyYAML
|
| 3 |
+
Version: 6.0.2
|
| 4 |
+
Summary: YAML parser and emitter for Python
|
| 5 |
+
Home-page: https://pyyaml.org/
|
| 6 |
+
Download-URL: https://pypi.org/project/PyYAML/
|
| 7 |
+
Author: Kirill Simonov
|
| 8 |
+
Author-email: xi@resolvent.net
|
| 9 |
+
License: MIT
|
| 10 |
+
Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
|
| 11 |
+
Project-URL: CI, https://github.com/yaml/pyyaml/actions
|
| 12 |
+
Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
|
| 13 |
+
Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
|
| 14 |
+
Project-URL: Source Code, https://github.com/yaml/pyyaml
|
| 15 |
+
Platform: Any
|
| 16 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 17 |
+
Classifier: Intended Audience :: Developers
|
| 18 |
+
Classifier: License :: OSI Approved :: MIT License
|
| 19 |
+
Classifier: Operating System :: OS Independent
|
| 20 |
+
Classifier: Programming Language :: Cython
|
| 21 |
+
Classifier: Programming Language :: Python
|
| 22 |
+
Classifier: Programming Language :: Python :: 3
|
| 23 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 24 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 25 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 26 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 27 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 28 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 29 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
| 30 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
| 31 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
| 32 |
+
Classifier: Topic :: Text Processing :: Markup
|
| 33 |
+
Requires-Python: >=3.8
|
| 34 |
+
License-File: LICENSE
|
| 35 |
+
|
| 36 |
+
YAML is a data serialization format designed for human readability
|
| 37 |
+
and interaction with scripting languages. PyYAML is a YAML parser
|
| 38 |
+
and emitter for Python.
|
| 39 |
+
|
| 40 |
+
PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
|
| 41 |
+
support, capable extension API, and sensible error messages. PyYAML
|
| 42 |
+
supports standard YAML tags and provides Python-specific tags that
|
| 43 |
+
allow to represent an arbitrary Python object.
|
| 44 |
+
|
| 45 |
+
PyYAML is applicable for a broad range of tasks from complex
|
| 46 |
+
configuration files to object serialization and persistence.
|
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
PyYAML-6.0.2.dist-info/LICENSE,sha256=jTko-dxEkP1jVwfLiOsmvXZBAqcoKVQwfT5RZ6V36KQ,1101
|
| 3 |
+
PyYAML-6.0.2.dist-info/METADATA,sha256=9lwXqTOrXPts-jI2Lo5UwuaAYo0hiRA0BZqjch0WjAk,2106
|
| 4 |
+
PyYAML-6.0.2.dist-info/RECORD,,
|
| 5 |
+
PyYAML-6.0.2.dist-info/WHEEL,sha256=ugue6NJCr9gUOQmWni1lhHLbY_ilTPbmSokNVdK9MnY,102
|
| 6 |
+
PyYAML-6.0.2.dist-info/top_level.txt,sha256=rpj0IVMTisAjh_1vG3Ccf9v5jpCQwAz6cD1IVU5ZdhQ,11
|
| 7 |
+
_yaml/__init__.py,sha256=04Ae_5osxahpJHa3XBZUAf4wi6XX32gR8D6X6p64GEA,1402
|
| 8 |
+
_yaml/__pycache__/__init__.cpython-313.pyc,,
|
| 9 |
+
yaml/__init__.py,sha256=N35S01HMesFTe0aRRMWkPj0Pa8IEbHpE9FK7cr5Bdtw,12311
|
| 10 |
+
yaml/__pycache__/__init__.cpython-313.pyc,,
|
| 11 |
+
yaml/__pycache__/composer.cpython-313.pyc,,
|
| 12 |
+
yaml/__pycache__/constructor.cpython-313.pyc,,
|
| 13 |
+
yaml/__pycache__/cyaml.cpython-313.pyc,,
|
| 14 |
+
yaml/__pycache__/dumper.cpython-313.pyc,,
|
| 15 |
+
yaml/__pycache__/emitter.cpython-313.pyc,,
|
| 16 |
+
yaml/__pycache__/error.cpython-313.pyc,,
|
| 17 |
+
yaml/__pycache__/events.cpython-313.pyc,,
|
| 18 |
+
yaml/__pycache__/loader.cpython-313.pyc,,
|
| 19 |
+
yaml/__pycache__/nodes.cpython-313.pyc,,
|
| 20 |
+
yaml/__pycache__/parser.cpython-313.pyc,,
|
| 21 |
+
yaml/__pycache__/reader.cpython-313.pyc,,
|
| 22 |
+
yaml/__pycache__/representer.cpython-313.pyc,,
|
| 23 |
+
yaml/__pycache__/resolver.cpython-313.pyc,,
|
| 24 |
+
yaml/__pycache__/scanner.cpython-313.pyc,,
|
| 25 |
+
yaml/__pycache__/serializer.cpython-313.pyc,,
|
| 26 |
+
yaml/__pycache__/tokens.cpython-313.pyc,,
|
| 27 |
+
yaml/_yaml.cp313-win_amd64.pyd,sha256=_iGlW4L7exHQxfjFi8uRZ1FD2cvMLGZnUB97b3sPn2g,263680
|
| 28 |
+
yaml/composer.py,sha256=_Ko30Wr6eDWUeUpauUGT3Lcg9QPBnOPVlTnIMRGJ9FM,4883
|
| 29 |
+
yaml/constructor.py,sha256=kNgkfaeLUkwQYY_Q6Ff1Tz2XVw_pG1xVE9Ak7z-viLA,28639
|
| 30 |
+
yaml/cyaml.py,sha256=6ZrAG9fAYvdVe2FK_w0hmXoG7ZYsoYUwapG8CiC72H0,3851
|
| 31 |
+
yaml/dumper.py,sha256=PLctZlYwZLp7XmeUdwRuv4nYOZ2UBnDIUy8-lKfLF-o,2837
|
| 32 |
+
yaml/emitter.py,sha256=jghtaU7eFwg31bG0B7RZea_29Adi9CKmXq_QjgQpCkQ,43006
|
| 33 |
+
yaml/error.py,sha256=Ah9z-toHJUbE9j-M8YpxgSRM5CgLCcwVzJgLLRF2Fxo,2533
|
| 34 |
+
yaml/events.py,sha256=50_TksgQiE4up-lKo_V-nBy-tAIxkIPQxY5qDhKCeHw,2445
|
| 35 |
+
yaml/loader.py,sha256=UVa-zIqmkFSCIYq_PgSGm4NSJttHY2Rf_zQ4_b1fHN0,2061
|
| 36 |
+
yaml/nodes.py,sha256=gPKNj8pKCdh2d4gr3gIYINnPOaOxGhJAUiYhGRnPE84,1440
|
| 37 |
+
yaml/parser.py,sha256=ilWp5vvgoHFGzvOZDItFoGjD6D42nhlZrZyjAwa0oJo,25495
|
| 38 |
+
yaml/reader.py,sha256=0dmzirOiDG4Xo41RnuQS7K9rkY3xjHiVasfDMNTqCNw,6794
|
| 39 |
+
yaml/representer.py,sha256=IuWP-cAW9sHKEnS0gCqSa894k1Bg4cgTxaDwIcbRQ-Y,14190
|
| 40 |
+
yaml/resolver.py,sha256=9L-VYfm4mWHxUD1Vg4X7rjDRK_7VZd6b92wzq7Y2IKY,9004
|
| 41 |
+
yaml/scanner.py,sha256=YEM3iLZSaQwXcQRg2l2R4MdT0zGP2F9eHkKGKnHyWQY,51279
|
| 42 |
+
yaml/serializer.py,sha256=ChuFgmhU01hj4xgI8GaKv6vfM2Bujwa9i7d2FAHj7cA,4165
|
| 43 |
+
yaml/tokens.py,sha256=lTQIzSVw8Mg9wv459-TjiOQe6wVziqaRlqX2_89rp54,2573
|
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: bdist_wheel (0.44.0)
|
| 3 |
+
Root-Is-Purelib: false
|
| 4 |
+
Tag: cp313-cp313-win_amd64
|
| 5 |
+
|
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_yaml
|
| 2 |
+
yaml
|
venv/Lib/site-packages/__pycache__/_virtualenv.cpython-313.pyc
ADDED
|
Binary file (4.24 kB). View file
|
|
|
venv/Lib/site-packages/__pycache__/typing_extensions.cpython-313.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12c04370c14aa791e9c7507e203eb1047185bf2492c88001270566b6abce838c
|
| 3 |
+
size 177778
|
venv/Lib/site-packages/_virtualenv.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69ac3d8f27e679c81b94ab30b3b56e9cd138219b1ba94a1fa3606d5a76a1433d
|
| 3 |
+
size 18
|
venv/Lib/site-packages/_virtualenv.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Patches that are applied at runtime to the virtual environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
VIRTUALENV_PATCH_FILE = os.path.join(__file__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def patch_dist(dist):
|
| 12 |
+
"""
|
| 13 |
+
Distutils allows user to configure some arguments via a configuration file:
|
| 14 |
+
https://docs.python.org/3/install/index.html#distutils-configuration-files.
|
| 15 |
+
|
| 16 |
+
Some of this arguments though don't make sense in context of the virtual environment files, let's fix them up.
|
| 17 |
+
""" # noqa: D205
|
| 18 |
+
# we cannot allow some install config as that would get packages installed outside of the virtual environment
|
| 19 |
+
old_parse_config_files = dist.Distribution.parse_config_files
|
| 20 |
+
|
| 21 |
+
def parse_config_files(self, *args, **kwargs):
|
| 22 |
+
result = old_parse_config_files(self, *args, **kwargs)
|
| 23 |
+
install = self.get_option_dict("install")
|
| 24 |
+
|
| 25 |
+
if "prefix" in install: # the prefix governs where to install the libraries
|
| 26 |
+
install["prefix"] = VIRTUALENV_PATCH_FILE, os.path.abspath(sys.prefix)
|
| 27 |
+
for base in ("purelib", "platlib", "headers", "scripts", "data"):
|
| 28 |
+
key = f"install_{base}"
|
| 29 |
+
if key in install: # do not allow global configs to hijack venv paths
|
| 30 |
+
install.pop(key, None)
|
| 31 |
+
return result
|
| 32 |
+
|
| 33 |
+
dist.Distribution.parse_config_files = parse_config_files
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Import hook that patches some modules to ignore configuration values that break package installation in case
|
| 37 |
+
# of virtual environments.
|
| 38 |
+
_DISTUTILS_PATCH = "distutils.dist", "setuptools.dist"
|
| 39 |
+
# https://docs.python.org/3/library/importlib.html#setting-up-an-importer
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class _Finder:
|
| 43 |
+
"""A meta path finder that allows patching the imported distutils modules."""
|
| 44 |
+
|
| 45 |
+
fullname = None
|
| 46 |
+
|
| 47 |
+
# lock[0] is threading.Lock(), but initialized lazily to avoid importing threading very early at startup,
|
| 48 |
+
# because there are gevent-based applications that need to be first to import threading by themselves.
|
| 49 |
+
# See https://github.com/pypa/virtualenv/issues/1895 for details.
|
| 50 |
+
lock = [] # noqa: RUF012
|
| 51 |
+
|
| 52 |
+
def find_spec(self, fullname, path, target=None): # noqa: ARG002
|
| 53 |
+
if fullname in _DISTUTILS_PATCH and self.fullname is None: # noqa: PLR1702
|
| 54 |
+
# initialize lock[0] lazily
|
| 55 |
+
if len(self.lock) == 0:
|
| 56 |
+
import threading # noqa: PLC0415
|
| 57 |
+
|
| 58 |
+
lock = threading.Lock()
|
| 59 |
+
# there is possibility that two threads T1 and T2 are simultaneously running into find_spec,
|
| 60 |
+
# observing .lock as empty, and further going into hereby initialization. However due to the GIL,
|
| 61 |
+
# list.append() operation is atomic and this way only one of the threads will "win" to put the lock
|
| 62 |
+
# - that every thread will use - into .lock[0].
|
| 63 |
+
# https://docs.python.org/3/faq/library.html#what-kinds-of-global-value-mutation-are-thread-safe
|
| 64 |
+
self.lock.append(lock)
|
| 65 |
+
|
| 66 |
+
from functools import partial # noqa: PLC0415
|
| 67 |
+
from importlib.util import find_spec # noqa: PLC0415
|
| 68 |
+
|
| 69 |
+
with self.lock[0]:
|
| 70 |
+
self.fullname = fullname
|
| 71 |
+
try:
|
| 72 |
+
spec = find_spec(fullname, path)
|
| 73 |
+
if spec is not None:
|
| 74 |
+
# https://www.python.org/dev/peps/pep-0451/#how-loading-will-work
|
| 75 |
+
is_new_api = hasattr(spec.loader, "exec_module")
|
| 76 |
+
func_name = "exec_module" if is_new_api else "load_module"
|
| 77 |
+
old = getattr(spec.loader, func_name)
|
| 78 |
+
func = self.exec_module if is_new_api else self.load_module
|
| 79 |
+
if old is not func:
|
| 80 |
+
try: # noqa: SIM105
|
| 81 |
+
setattr(spec.loader, func_name, partial(func, old))
|
| 82 |
+
except AttributeError:
|
| 83 |
+
pass # C-Extension loaders are r/o such as zipimporter with <3.7
|
| 84 |
+
return spec
|
| 85 |
+
finally:
|
| 86 |
+
self.fullname = None
|
| 87 |
+
return None
|
| 88 |
+
|
| 89 |
+
@staticmethod
|
| 90 |
+
def exec_module(old, module):
|
| 91 |
+
old(module)
|
| 92 |
+
if module.__name__ in _DISTUTILS_PATCH:
|
| 93 |
+
patch_dist(module)
|
| 94 |
+
|
| 95 |
+
@staticmethod
|
| 96 |
+
def load_module(old, name):
|
| 97 |
+
module = old(name)
|
| 98 |
+
if module.__name__ in _DISTUTILS_PATCH:
|
| 99 |
+
patch_dist(module)
|
| 100 |
+
return module
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
sys.meta_path.insert(0, _Finder())
|
venv/Lib/site-packages/_yaml/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This is a stub package designed to roughly emulate the _yaml
|
| 2 |
+
# extension module, which previously existed as a standalone module
|
| 3 |
+
# and has been moved into the `yaml` package namespace.
|
| 4 |
+
# It does not perfectly mimic its old counterpart, but should get
|
| 5 |
+
# close enough for anyone who's relying on it even when they shouldn't.
|
| 6 |
+
import yaml
|
| 7 |
+
|
| 8 |
+
# in some circumstances, the yaml module we imoprted may be from a different version, so we need
|
| 9 |
+
# to tread carefully when poking at it here (it may not have the attributes we expect)
|
| 10 |
+
if not getattr(yaml, '__with_libyaml__', False):
|
| 11 |
+
from sys import version_info
|
| 12 |
+
|
| 13 |
+
exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
|
| 14 |
+
raise exc("No module named '_yaml'")
|
| 15 |
+
else:
|
| 16 |
+
from yaml._yaml import *
|
| 17 |
+
import warnings
|
| 18 |
+
warnings.warn(
|
| 19 |
+
'The _yaml extension module is now located at yaml._yaml'
|
| 20 |
+
' and its location is subject to change. To use the'
|
| 21 |
+
' LibYAML-based parser and emitter, import from `yaml`:'
|
| 22 |
+
' `from yaml import CLoader as Loader, CDumper as Dumper`.',
|
| 23 |
+
DeprecationWarning
|
| 24 |
+
)
|
| 25 |
+
del warnings
|
| 26 |
+
# Don't `del yaml` here because yaml is actually an existing
|
| 27 |
+
# namespace member of _yaml.
|
| 28 |
+
|
| 29 |
+
__name__ = '_yaml'
|
| 30 |
+
# If the module is top-level (i.e. not a part of any specific package)
|
| 31 |
+
# then the attribute should be set to ''.
|
| 32 |
+
# https://docs.python.org/3.8/library/types.html
|
| 33 |
+
__package__ = ''
|
venv/Lib/site-packages/_yaml/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (872 Bytes). View file
|
|
|
venv/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
venv/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
This package contains a modified version of ca-bundle.crt:
|
| 2 |
+
|
| 3 |
+
ca-bundle.crt -- Bundle of CA Root Certificates
|
| 4 |
+
|
| 5 |
+
This is a bundle of X.509 certificates of public Certificate Authorities
|
| 6 |
+
(CA). These were automatically extracted from Mozilla's root certificates
|
| 7 |
+
file (certdata.txt). This file can be found in the mozilla source tree:
|
| 8 |
+
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
| 9 |
+
It contains the certificates in PEM format and therefore
|
| 10 |
+
can be directly used with curl / libcurl / php_curl, or with
|
| 11 |
+
an Apache+mod_ssl webserver for SSL client authentication.
|
| 12 |
+
Just configure this file as the SSLCACertificateFile.#
|
| 13 |
+
|
| 14 |
+
***** BEGIN LICENSE BLOCK *****
|
| 15 |
+
This Source Code Form is subject to the terms of the Mozilla Public License,
|
| 16 |
+
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
| 17 |
+
one at http://mozilla.org/MPL/2.0/.
|
| 18 |
+
|
| 19 |
+
***** END LICENSE BLOCK *****
|
| 20 |
+
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
venv/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.2
|
| 2 |
+
Name: certifi
|
| 3 |
+
Version: 2025.1.31
|
| 4 |
+
Summary: Python package for providing Mozilla's CA Bundle.
|
| 5 |
+
Home-page: https://github.com/certifi/python-certifi
|
| 6 |
+
Author: Kenneth Reitz
|
| 7 |
+
Author-email: me@kennethreitz.com
|
| 8 |
+
License: MPL-2.0
|
| 9 |
+
Project-URL: Source, https://github.com/certifi/python-certifi
|
| 10 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 11 |
+
Classifier: Intended Audience :: Developers
|
| 12 |
+
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
| 13 |
+
Classifier: Natural Language :: English
|
| 14 |
+
Classifier: Programming Language :: Python
|
| 15 |
+
Classifier: Programming Language :: Python :: 3
|
| 16 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
| 17 |
+
Classifier: Programming Language :: Python :: 3.6
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.7
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 22 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 23 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 24 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 25 |
+
Requires-Python: >=3.6
|
| 26 |
+
License-File: LICENSE
|
| 27 |
+
Dynamic: author
|
| 28 |
+
Dynamic: author-email
|
| 29 |
+
Dynamic: classifier
|
| 30 |
+
Dynamic: description
|
| 31 |
+
Dynamic: home-page
|
| 32 |
+
Dynamic: license
|
| 33 |
+
Dynamic: project-url
|
| 34 |
+
Dynamic: requires-python
|
| 35 |
+
Dynamic: summary
|
| 36 |
+
|
| 37 |
+
Certifi: Python SSL Certificates
|
| 38 |
+
================================
|
| 39 |
+
|
| 40 |
+
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
| 41 |
+
validating the trustworthiness of SSL certificates while verifying the identity
|
| 42 |
+
of TLS hosts. It has been extracted from the `Requests`_ project.
|
| 43 |
+
|
| 44 |
+
Installation
|
| 45 |
+
------------
|
| 46 |
+
|
| 47 |
+
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
| 48 |
+
|
| 49 |
+
$ pip install certifi
|
| 50 |
+
|
| 51 |
+
Usage
|
| 52 |
+
-----
|
| 53 |
+
|
| 54 |
+
To reference the installed certificate authority (CA) bundle, you can use the
|
| 55 |
+
built-in function::
|
| 56 |
+
|
| 57 |
+
>>> import certifi
|
| 58 |
+
|
| 59 |
+
>>> certifi.where()
|
| 60 |
+
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
| 61 |
+
|
| 62 |
+
Or from the command line::
|
| 63 |
+
|
| 64 |
+
$ python -m certifi
|
| 65 |
+
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
| 66 |
+
|
| 67 |
+
Enjoy!
|
| 68 |
+
|
| 69 |
+
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
| 70 |
+
|
| 71 |
+
Addition/Removal of Certificates
|
| 72 |
+
--------------------------------
|
| 73 |
+
|
| 74 |
+
Certifi does not support any addition/removal or other modification of the
|
| 75 |
+
CA trust store content. This project is intended to provide a reliable and
|
| 76 |
+
highly portable root of trust to python deployments. Look to upstream projects
|
| 77 |
+
for methods to use alternate trust.
|
venv/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
certifi-2025.1.31.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
certifi-2025.1.31.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
|
| 3 |
+
certifi-2025.1.31.dist-info/METADATA,sha256=t5kcT5aGu0dQ6_psUNZYTqnC0uCRnponewm3uYjeHbg,2451
|
| 4 |
+
certifi-2025.1.31.dist-info/RECORD,,
|
| 5 |
+
certifi-2025.1.31.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
| 6 |
+
certifi-2025.1.31.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
| 7 |
+
certifi/__init__.py,sha256=neIaAf7BM36ygmQCmy-ZsSyjnvjWghFeu13wwEAnjj0,94
|
| 8 |
+
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
| 9 |
+
certifi/__pycache__/__init__.cpython-313.pyc,,
|
| 10 |
+
certifi/__pycache__/__main__.cpython-313.pyc,,
|
| 11 |
+
certifi/__pycache__/core.cpython-313.pyc,,
|
| 12 |
+
certifi/cacert.pem,sha256=xVsh-Qf3-G1IrdCTVS-1ZRdJ_1-GBQjMu0I9bB-9gMc,297255
|
| 13 |
+
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
|
| 14 |
+
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
venv/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: setuptools (75.8.0)
|
| 3 |
+
Root-Is-Purelib: true
|
| 4 |
+
Tag: py3-none-any
|
| 5 |
+
|
venv/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
certifi
|
venv/Lib/site-packages/certifi/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .core import contents, where
|
| 2 |
+
|
| 3 |
+
__all__ = ["contents", "where"]
|
| 4 |
+
__version__ = "2025.01.31"
|
venv/Lib/site-packages/certifi/__main__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
from certifi import contents, where
|
| 4 |
+
|
| 5 |
+
parser = argparse.ArgumentParser()
|
| 6 |
+
parser.add_argument("-c", "--contents", action="store_true")
|
| 7 |
+
args = parser.parse_args()
|
| 8 |
+
|
| 9 |
+
if args.contents:
|
| 10 |
+
print(contents())
|
| 11 |
+
else:
|
| 12 |
+
print(where())
|
venv/Lib/site-packages/certifi/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (306 Bytes). View file
|
|
|
venv/Lib/site-packages/certifi/__pycache__/__main__.cpython-313.pyc
ADDED
|
Binary file (623 Bytes). View file
|
|
|
venv/Lib/site-packages/certifi/__pycache__/core.cpython-313.pyc
ADDED
|
Binary file (3.2 kB). View file
|
|
|
venv/Lib/site-packages/certifi/cacert.pem
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
venv/Lib/site-packages/certifi/core.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
certifi.py
|
| 3 |
+
~~~~~~~~~~
|
| 4 |
+
|
| 5 |
+
This module returns the installation location of cacert.pem or its contents.
|
| 6 |
+
"""
|
| 7 |
+
import sys
|
| 8 |
+
import atexit
|
| 9 |
+
|
| 10 |
+
def exit_cacert_ctx() -> None:
|
| 11 |
+
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
if sys.version_info >= (3, 11):
|
| 15 |
+
|
| 16 |
+
from importlib.resources import as_file, files
|
| 17 |
+
|
| 18 |
+
_CACERT_CTX = None
|
| 19 |
+
_CACERT_PATH = None
|
| 20 |
+
|
| 21 |
+
def where() -> str:
|
| 22 |
+
# This is slightly terrible, but we want to delay extracting the file
|
| 23 |
+
# in cases where we're inside of a zipimport situation until someone
|
| 24 |
+
# actually calls where(), but we don't want to re-extract the file
|
| 25 |
+
# on every call of where(), so we'll do it once then store it in a
|
| 26 |
+
# global variable.
|
| 27 |
+
global _CACERT_CTX
|
| 28 |
+
global _CACERT_PATH
|
| 29 |
+
if _CACERT_PATH is None:
|
| 30 |
+
# This is slightly janky, the importlib.resources API wants you to
|
| 31 |
+
# manage the cleanup of this file, so it doesn't actually return a
|
| 32 |
+
# path, it returns a context manager that will give you the path
|
| 33 |
+
# when you enter it and will do any cleanup when you leave it. In
|
| 34 |
+
# the common case of not needing a temporary file, it will just
|
| 35 |
+
# return the file system location and the __exit__() is a no-op.
|
| 36 |
+
#
|
| 37 |
+
# We also have to hold onto the actual context manager, because
|
| 38 |
+
# it will do the cleanup whenever it gets garbage collected, so
|
| 39 |
+
# we will also store that at the global level as well.
|
| 40 |
+
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
|
| 41 |
+
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
| 42 |
+
atexit.register(exit_cacert_ctx)
|
| 43 |
+
|
| 44 |
+
return _CACERT_PATH
|
| 45 |
+
|
| 46 |
+
def contents() -> str:
|
| 47 |
+
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
| 48 |
+
|
| 49 |
+
elif sys.version_info >= (3, 7):
|
| 50 |
+
|
| 51 |
+
from importlib.resources import path as get_path, read_text
|
| 52 |
+
|
| 53 |
+
_CACERT_CTX = None
|
| 54 |
+
_CACERT_PATH = None
|
| 55 |
+
|
| 56 |
+
def where() -> str:
|
| 57 |
+
# This is slightly terrible, but we want to delay extracting the
|
| 58 |
+
# file in cases where we're inside of a zipimport situation until
|
| 59 |
+
# someone actually calls where(), but we don't want to re-extract
|
| 60 |
+
# the file on every call of where(), so we'll do it once then store
|
| 61 |
+
# it in a global variable.
|
| 62 |
+
global _CACERT_CTX
|
| 63 |
+
global _CACERT_PATH
|
| 64 |
+
if _CACERT_PATH is None:
|
| 65 |
+
# This is slightly janky, the importlib.resources API wants you
|
| 66 |
+
# to manage the cleanup of this file, so it doesn't actually
|
| 67 |
+
# return a path, it returns a context manager that will give
|
| 68 |
+
# you the path when you enter it and will do any cleanup when
|
| 69 |
+
# you leave it. In the common case of not needing a temporary
|
| 70 |
+
# file, it will just return the file system location and the
|
| 71 |
+
# __exit__() is a no-op.
|
| 72 |
+
#
|
| 73 |
+
# We also have to hold onto the actual context manager, because
|
| 74 |
+
# it will do the cleanup whenever it gets garbage collected, so
|
| 75 |
+
# we will also store that at the global level as well.
|
| 76 |
+
_CACERT_CTX = get_path("certifi", "cacert.pem")
|
| 77 |
+
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
| 78 |
+
atexit.register(exit_cacert_ctx)
|
| 79 |
+
|
| 80 |
+
return _CACERT_PATH
|
| 81 |
+
|
| 82 |
+
def contents() -> str:
|
| 83 |
+
return read_text("certifi", "cacert.pem", encoding="ascii")
|
| 84 |
+
|
| 85 |
+
else:
|
| 86 |
+
import os
|
| 87 |
+
import types
|
| 88 |
+
from typing import Union
|
| 89 |
+
|
| 90 |
+
Package = Union[types.ModuleType, str]
|
| 91 |
+
Resource = Union[str, "os.PathLike"]
|
| 92 |
+
|
| 93 |
+
# This fallback will work for Python versions prior to 3.7 that lack the
|
| 94 |
+
# importlib.resources module but relies on the existing `where` function
|
| 95 |
+
# so won't address issues with environments like PyOxidizer that don't set
|
| 96 |
+
# __file__ on modules.
|
| 97 |
+
def read_text(
|
| 98 |
+
package: Package,
|
| 99 |
+
resource: Resource,
|
| 100 |
+
encoding: str = 'utf-8',
|
| 101 |
+
errors: str = 'strict'
|
| 102 |
+
) -> str:
|
| 103 |
+
with open(where(), encoding=encoding) as data:
|
| 104 |
+
return data.read()
|
| 105 |
+
|
| 106 |
+
# If we don't have importlib.resources, then we will just do the old logic
|
| 107 |
+
# of assuming we're on the filesystem and munge the path directly.
|
| 108 |
+
def where() -> str:
|
| 109 |
+
f = os.path.dirname(__file__)
|
| 110 |
+
|
| 111 |
+
return os.path.join(f, "cacert.pem")
|
| 112 |
+
|
| 113 |
+
def contents() -> str:
|
| 114 |
+
return read_text("certifi", "cacert.pem", encoding="ascii")
|
venv/Lib/site-packages/certifi/py.typed
ADDED
|
File without changes
|
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 TAHRI Ahmed R.
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA
ADDED
|
@@ -0,0 +1,721 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.1
|
| 2 |
+
Name: charset-normalizer
|
| 3 |
+
Version: 3.4.1
|
| 4 |
+
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
| 5 |
+
Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
| 6 |
+
Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
| 7 |
+
License: MIT
|
| 8 |
+
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
|
| 9 |
+
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
|
| 10 |
+
Project-URL: Code, https://github.com/jawah/charset_normalizer
|
| 11 |
+
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
|
| 12 |
+
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
| 13 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 14 |
+
Classifier: Intended Audience :: Developers
|
| 15 |
+
Classifier: License :: OSI Approved :: MIT License
|
| 16 |
+
Classifier: Operating System :: OS Independent
|
| 17 |
+
Classifier: Programming Language :: Python
|
| 18 |
+
Classifier: Programming Language :: Python :: 3
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.7
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 22 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 23 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 24 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 25 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 26 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
| 27 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
| 28 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
| 29 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
| 30 |
+
Classifier: Topic :: Utilities
|
| 31 |
+
Classifier: Typing :: Typed
|
| 32 |
+
Requires-Python: >=3.7
|
| 33 |
+
Description-Content-Type: text/markdown
|
| 34 |
+
License-File: LICENSE
|
| 35 |
+
Provides-Extra: unicode-backport
|
| 36 |
+
|
| 37 |
+
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
| 38 |
+
|
| 39 |
+
<p align="center">
|
| 40 |
+
<sup>The Real First Universal Charset Detector</sup><br>
|
| 41 |
+
<a href="https://pypi.org/project/charset-normalizer">
|
| 42 |
+
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
| 43 |
+
</a>
|
| 44 |
+
<a href="https://pepy.tech/project/charset-normalizer/">
|
| 45 |
+
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
| 46 |
+
</a>
|
| 47 |
+
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
| 48 |
+
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
| 49 |
+
</a>
|
| 50 |
+
</p>
|
| 51 |
+
<p align="center">
|
| 52 |
+
<sup><i>Featured Packages</i></sup><br>
|
| 53 |
+
<a href="https://github.com/jawah/niquests">
|
| 54 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
|
| 55 |
+
</a>
|
| 56 |
+
<a href="https://github.com/jawah/wassima">
|
| 57 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
|
| 58 |
+
</a>
|
| 59 |
+
</p>
|
| 60 |
+
<p align="center">
|
| 61 |
+
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
|
| 62 |
+
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
| 63 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
| 64 |
+
</a>
|
| 65 |
+
</p>
|
| 66 |
+
|
| 67 |
+
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
| 68 |
+
> I'm trying to resolve the issue by taking a new approach.
|
| 69 |
+
> All IANA character set names for which the Python core library provides codecs are supported.
|
| 70 |
+
|
| 71 |
+
<p align="center">
|
| 72 |
+
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
| 73 |
+
</p>
|
| 74 |
+
|
| 75 |
+
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
| 76 |
+
|
| 77 |
+
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
| 78 |
+
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
| 79 |
+
| `Fast` | ❌ | ✅ | ✅ |
|
| 80 |
+
| `Universal**` | ❌ | ✅ | ❌ |
|
| 81 |
+
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
| 82 |
+
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
| 83 |
+
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
| 84 |
+
| `Native Python` | ✅ | ✅ | ❌ |
|
| 85 |
+
| `Detect spoken language` | ❌ | ✅ | N/A |
|
| 86 |
+
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
| 87 |
+
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
| 88 |
+
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
| 89 |
+
|
| 90 |
+
<p align="center">
|
| 91 |
+
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
| 92 |
+
</p>
|
| 93 |
+
|
| 94 |
+
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
| 95 |
+
|
| 96 |
+
## ⚡ Performance
|
| 97 |
+
|
| 98 |
+
This package offers better performance than its counterpart Chardet. Here are some numbers.
|
| 99 |
+
|
| 100 |
+
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
| 101 |
+
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
| 102 |
+
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
|
| 103 |
+
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
| 104 |
+
|
| 105 |
+
| Package | 99th percentile | 95th percentile | 50th percentile |
|
| 106 |
+
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
| 107 |
+
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
|
| 108 |
+
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
| 109 |
+
|
| 110 |
+
_updated as of December 2024 using CPython 3.12_
|
| 111 |
+
|
| 112 |
+
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
|
| 113 |
+
|
| 114 |
+
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
| 115 |
+
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
| 116 |
+
> The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
|
| 117 |
+
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
| 118 |
+
> (e.g. Supported Encoding) Challenge-them if you want.
|
| 119 |
+
|
| 120 |
+
## ✨ Installation
|
| 121 |
+
|
| 122 |
+
Using pip:
|
| 123 |
+
|
| 124 |
+
```sh
|
| 125 |
+
pip install charset-normalizer -U
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
## 🚀 Basic Usage
|
| 129 |
+
|
| 130 |
+
### CLI
|
| 131 |
+
This package comes with a CLI.
|
| 132 |
+
|
| 133 |
+
```
|
| 134 |
+
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
| 135 |
+
file [file ...]
|
| 136 |
+
|
| 137 |
+
The Real First Universal Charset Detector. Discover originating encoding used
|
| 138 |
+
on text file. Normalize text to unicode.
|
| 139 |
+
|
| 140 |
+
positional arguments:
|
| 141 |
+
files File(s) to be analysed
|
| 142 |
+
|
| 143 |
+
optional arguments:
|
| 144 |
+
-h, --help show this help message and exit
|
| 145 |
+
-v, --verbose Display complementary information about file if any.
|
| 146 |
+
Stdout will contain logs about the detection process.
|
| 147 |
+
-a, --with-alternative
|
| 148 |
+
Output complementary possibilities if any. Top-level
|
| 149 |
+
JSON WILL be a list.
|
| 150 |
+
-n, --normalize Permit to normalize input file. If not set, program
|
| 151 |
+
does not write anything.
|
| 152 |
+
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
| 153 |
+
JSON output.
|
| 154 |
+
-r, --replace Replace file when trying to normalize it instead of
|
| 155 |
+
creating a new one.
|
| 156 |
+
-f, --force Replace file without asking if you are sure, use this
|
| 157 |
+
flag with caution.
|
| 158 |
+
-t THRESHOLD, --threshold THRESHOLD
|
| 159 |
+
Define a custom maximum amount of chaos allowed in
|
| 160 |
+
decoded content. 0. <= chaos <= 1.
|
| 161 |
+
--version Show version information and exit.
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
```bash
|
| 165 |
+
normalizer ./data/sample.1.fr.srt
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
or
|
| 169 |
+
|
| 170 |
+
```bash
|
| 171 |
+
python -m charset_normalizer ./data/sample.1.fr.srt
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
🎉 Since version 1.4.0 the CLI produces easily usable stdout results in JSON format.
|
| 175 |
+
|
| 176 |
+
```json
|
| 177 |
+
{
|
| 178 |
+
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
| 179 |
+
"encoding": "cp1252",
|
| 180 |
+
"encoding_aliases": [
|
| 181 |
+
"1252",
|
| 182 |
+
"windows_1252"
|
| 183 |
+
],
|
| 184 |
+
"alternative_encodings": [
|
| 185 |
+
"cp1254",
|
| 186 |
+
"cp1256",
|
| 187 |
+
"cp1258",
|
| 188 |
+
"iso8859_14",
|
| 189 |
+
"iso8859_15",
|
| 190 |
+
"iso8859_16",
|
| 191 |
+
"iso8859_3",
|
| 192 |
+
"iso8859_9",
|
| 193 |
+
"latin_1",
|
| 194 |
+
"mbcs"
|
| 195 |
+
],
|
| 196 |
+
"language": "French",
|
| 197 |
+
"alphabets": [
|
| 198 |
+
"Basic Latin",
|
| 199 |
+
"Latin-1 Supplement"
|
| 200 |
+
],
|
| 201 |
+
"has_sig_or_bom": false,
|
| 202 |
+
"chaos": 0.149,
|
| 203 |
+
"coherence": 97.152,
|
| 204 |
+
"unicode_path": null,
|
| 205 |
+
"is_preferred": true
|
| 206 |
+
}
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
### Python
|
| 210 |
+
*Just print out normalized text*
|
| 211 |
+
```python
|
| 212 |
+
from charset_normalizer import from_path
|
| 213 |
+
|
| 214 |
+
results = from_path('./my_subtitle.srt')
|
| 215 |
+
|
| 216 |
+
print(str(results.best()))
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
*Upgrade your code without effort*
|
| 220 |
+
```python
|
| 221 |
+
from charset_normalizer import detect
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
| 225 |
+
|
| 226 |
+
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
| 227 |
+
|
| 228 |
+
## 😇 Why
|
| 229 |
+
|
| 230 |
+
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
| 231 |
+
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
| 232 |
+
|
| 233 |
+
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
| 234 |
+
produce **two identical rendered strings.**
|
| 235 |
+
What I want is to get readable text, the best I can.
|
| 236 |
+
|
| 237 |
+
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
| 238 |
+
|
| 239 |
+
Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's is to convert a raw file in an unknown encoding to Unicode.
|
| 240 |
+
|
| 241 |
+
## 🍰 How
|
| 242 |
+
|
| 243 |
+
- Discard all charset encoding table that could not fit the binary content.
|
| 244 |
+
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
| 245 |
+
- Extract matches with the lowest mess detected.
|
| 246 |
+
- Additionally, we measure coherence / probe for a language.
|
| 247 |
+
|
| 248 |
+
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
| 249 |
+
|
| 250 |
+
*Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
| 251 |
+
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
|
| 252 |
+
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
| 253 |
+
improve or rewrite it.
|
| 254 |
+
|
| 255 |
+
*Coherence :* For each language on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
| 256 |
+
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
| 257 |
+
|
| 258 |
+
## ⚡ Known limitations
|
| 259 |
+
|
| 260 |
+
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
| 261 |
+
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
|
| 262 |
+
|
| 263 |
+
## ⚠️ About Python EOLs
|
| 264 |
+
|
| 265 |
+
**If you are running:**
|
| 266 |
+
|
| 267 |
+
- Python >=2.7,<3.5: Unsupported
|
| 268 |
+
- Python 3.5: charset-normalizer < 2.1
|
| 269 |
+
- Python 3.6: charset-normalizer < 3.1
|
| 270 |
+
- Python 3.7: charset-normalizer < 4.0
|
| 271 |
+
|
| 272 |
+
Upgrade your Python interpreter as soon as possible.
|
| 273 |
+
|
| 274 |
+
## 👤 Contributing
|
| 275 |
+
|
| 276 |
+
Contributions, issues and feature requests are very much welcome.<br />
|
| 277 |
+
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
| 278 |
+
|
| 279 |
+
## 📝 License
|
| 280 |
+
|
| 281 |
+
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
| 282 |
+
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
| 283 |
+
|
| 284 |
+
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
| 285 |
+
|
| 286 |
+
## 💼 For Enterprise
|
| 287 |
+
|
| 288 |
+
Professional support for charset-normalizer is available as part of the [Tidelift
|
| 289 |
+
Subscription][1]. Tidelift gives software development teams a single source for
|
| 290 |
+
purchasing and maintaining their software, with professional grade assurances
|
| 291 |
+
from the experts who know it best, while seamlessly integrating with existing
|
| 292 |
+
tools.
|
| 293 |
+
|
| 294 |
+
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
| 295 |
+
|
| 296 |
+
[](https://www.bestpractices.dev/projects/7297)
|
| 297 |
+
|
| 298 |
+
# Changelog
|
| 299 |
+
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
| 300 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
| 301 |
+
|
| 302 |
+
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
|
| 303 |
+
|
| 304 |
+
### Changed
|
| 305 |
+
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
|
| 306 |
+
- Enforce annotation delayed loading for a simpler and consistent types in the project.
|
| 307 |
+
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
|
| 308 |
+
|
| 309 |
+
### Added
|
| 310 |
+
- pre-commit configuration.
|
| 311 |
+
- noxfile.
|
| 312 |
+
|
| 313 |
+
### Removed
|
| 314 |
+
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
|
| 315 |
+
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
|
| 316 |
+
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
|
| 317 |
+
- Unused `utils.range_scan` function.
|
| 318 |
+
|
| 319 |
+
### Fixed
|
| 320 |
+
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
|
| 321 |
+
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
|
| 322 |
+
|
| 323 |
+
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
|
| 324 |
+
|
| 325 |
+
### Added
|
| 326 |
+
- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
|
| 327 |
+
- Support for Python 3.13 (#512)
|
| 328 |
+
|
| 329 |
+
### Fixed
|
| 330 |
+
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
|
| 331 |
+
- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
|
| 332 |
+
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
|
| 333 |
+
|
| 334 |
+
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
| 335 |
+
|
| 336 |
+
### Fixed
|
| 337 |
+
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
| 338 |
+
- Regression on some detection case showcased in the documentation (#371)
|
| 339 |
+
|
| 340 |
+
### Added
|
| 341 |
+
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
|
| 342 |
+
|
| 343 |
+
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
| 344 |
+
|
| 345 |
+
### Changed
|
| 346 |
+
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
| 347 |
+
- Improved the general detection reliability based on reports from the community
|
| 348 |
+
|
| 349 |
+
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
| 350 |
+
|
| 351 |
+
### Added
|
| 352 |
+
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
| 353 |
+
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
| 354 |
+
|
| 355 |
+
### Removed
|
| 356 |
+
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
| 357 |
+
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
| 358 |
+
|
| 359 |
+
### Changed
|
| 360 |
+
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
| 361 |
+
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
| 362 |
+
|
| 363 |
+
### Fixed
|
| 364 |
+
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
| 365 |
+
|
| 366 |
+
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
| 367 |
+
|
| 368 |
+
### Changed
|
| 369 |
+
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
|
| 370 |
+
- Minor improvement over the global detection reliability
|
| 371 |
+
|
| 372 |
+
### Added
|
| 373 |
+
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
| 374 |
+
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
| 375 |
+
- Explicit support for Python 3.12
|
| 376 |
+
|
| 377 |
+
### Fixed
|
| 378 |
+
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
| 379 |
+
|
| 380 |
+
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
| 381 |
+
|
| 382 |
+
### Added
|
| 383 |
+
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
| 384 |
+
|
| 385 |
+
### Removed
|
| 386 |
+
- Support for Python 3.6 (PR #260)
|
| 387 |
+
|
| 388 |
+
### Changed
|
| 389 |
+
- Optional speedup provided by mypy/c 1.0.1
|
| 390 |
+
|
| 391 |
+
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
| 392 |
+
|
| 393 |
+
### Fixed
|
| 394 |
+
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
| 395 |
+
|
| 396 |
+
### Changed
|
| 397 |
+
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
| 398 |
+
|
| 399 |
+
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
| 400 |
+
|
| 401 |
+
### Added
|
| 402 |
+
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
| 403 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
| 404 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
| 405 |
+
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
| 406 |
+
|
| 407 |
+
### Changed
|
| 408 |
+
- Build with static metadata using 'build' frontend
|
| 409 |
+
- Make the language detection stricter
|
| 410 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
| 411 |
+
|
| 412 |
+
### Fixed
|
| 413 |
+
- CLI with opt --normalize fail when using full path for files
|
| 414 |
+
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
| 415 |
+
- Sphinx warnings when generating the documentation
|
| 416 |
+
|
| 417 |
+
### Removed
|
| 418 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
| 419 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
| 420 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
| 421 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
| 422 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
| 423 |
+
- Breaking: Top-level function `normalize`
|
| 424 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
| 425 |
+
- Support for the backport `unicodedata2`
|
| 426 |
+
|
| 427 |
+
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
| 428 |
+
|
| 429 |
+
### Added
|
| 430 |
+
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
| 431 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
| 432 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
| 433 |
+
|
| 434 |
+
### Changed
|
| 435 |
+
- Build with static metadata using 'build' frontend
|
| 436 |
+
- Make the language detection stricter
|
| 437 |
+
|
| 438 |
+
### Fixed
|
| 439 |
+
- CLI with opt --normalize fail when using full path for files
|
| 440 |
+
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
| 441 |
+
|
| 442 |
+
### Removed
|
| 443 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
| 444 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
| 445 |
+
|
| 446 |
+
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
| 447 |
+
|
| 448 |
+
### Added
|
| 449 |
+
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
| 450 |
+
|
| 451 |
+
### Removed
|
| 452 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
| 453 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
| 454 |
+
|
| 455 |
+
### Fixed
|
| 456 |
+
- Sphinx warnings when generating the documentation
|
| 457 |
+
|
| 458 |
+
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
| 459 |
+
|
| 460 |
+
### Changed
|
| 461 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
| 462 |
+
|
| 463 |
+
### Removed
|
| 464 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
| 465 |
+
- Breaking: Top-level function `normalize`
|
| 466 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
| 467 |
+
- Support for the backport `unicodedata2`
|
| 468 |
+
|
| 469 |
+
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
| 470 |
+
|
| 471 |
+
### Deprecated
|
| 472 |
+
- Function `normalize` scheduled for removal in 3.0
|
| 473 |
+
|
| 474 |
+
### Changed
|
| 475 |
+
- Removed useless call to decode in fn is_unprintable (#206)
|
| 476 |
+
|
| 477 |
+
### Fixed
|
| 478 |
+
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
| 479 |
+
|
| 480 |
+
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
| 481 |
+
|
| 482 |
+
### Added
|
| 483 |
+
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
| 484 |
+
|
| 485 |
+
### Changed
|
| 486 |
+
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
| 487 |
+
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
| 488 |
+
|
| 489 |
+
### Fixed
|
| 490 |
+
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
| 491 |
+
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
| 492 |
+
|
| 493 |
+
### Removed
|
| 494 |
+
- Support for Python 3.5 (PR #192)
|
| 495 |
+
|
| 496 |
+
### Deprecated
|
| 497 |
+
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
| 498 |
+
|
| 499 |
+
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
| 500 |
+
|
| 501 |
+
### Fixed
|
| 502 |
+
- ASCII miss-detection on rare cases (PR #170)
|
| 503 |
+
|
| 504 |
+
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
| 505 |
+
|
| 506 |
+
### Added
|
| 507 |
+
- Explicit support for Python 3.11 (PR #164)
|
| 508 |
+
|
| 509 |
+
### Changed
|
| 510 |
+
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
| 511 |
+
|
| 512 |
+
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
| 513 |
+
|
| 514 |
+
### Fixed
|
| 515 |
+
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
| 516 |
+
|
| 517 |
+
### Changed
|
| 518 |
+
- Skipping the language-detection (CD) on ASCII (PR #155)
|
| 519 |
+
|
| 520 |
+
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
| 521 |
+
|
| 522 |
+
### Changed
|
| 523 |
+
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
| 524 |
+
|
| 525 |
+
### Fixed
|
| 526 |
+
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
| 527 |
+
|
| 528 |
+
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
| 529 |
+
### Changed
|
| 530 |
+
- Improvement over Vietnamese detection (PR #126)
|
| 531 |
+
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
| 532 |
+
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
| 533 |
+
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
| 534 |
+
- Code style as refactored by Sourcery-AI (PR #131)
|
| 535 |
+
- Minor adjustment on the MD around european words (PR #133)
|
| 536 |
+
- Remove and replace SRTs from assets / tests (PR #139)
|
| 537 |
+
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
| 538 |
+
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
| 539 |
+
|
| 540 |
+
### Fixed
|
| 541 |
+
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
| 542 |
+
- Avoid using too insignificant chunk (PR #137)
|
| 543 |
+
|
| 544 |
+
### Added
|
| 545 |
+
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
| 546 |
+
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
| 547 |
+
|
| 548 |
+
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
| 549 |
+
### Added
|
| 550 |
+
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
| 551 |
+
|
| 552 |
+
### Changed
|
| 553 |
+
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
| 554 |
+
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
| 555 |
+
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
| 556 |
+
- Various detection improvement (MD+CD) (PR #117)
|
| 557 |
+
|
| 558 |
+
### Removed
|
| 559 |
+
- Remove redundant logging entry about detected language(s) (PR #115)
|
| 560 |
+
|
| 561 |
+
### Fixed
|
| 562 |
+
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
| 563 |
+
|
| 564 |
+
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
| 565 |
+
### Fixed
|
| 566 |
+
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
| 567 |
+
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
| 568 |
+
|
| 569 |
+
### Changed
|
| 570 |
+
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
| 571 |
+
|
| 572 |
+
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
| 573 |
+
### Changed
|
| 574 |
+
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
| 575 |
+
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
| 576 |
+
- The Unicode detection is slightly improved (PR #93)
|
| 577 |
+
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
| 578 |
+
|
| 579 |
+
### Removed
|
| 580 |
+
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
|
| 581 |
+
|
| 582 |
+
### Fixed
|
| 583 |
+
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
| 584 |
+
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
| 585 |
+
- The MANIFEST.in was not exhaustive (PR #78)
|
| 586 |
+
|
| 587 |
+
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
| 588 |
+
### Fixed
|
| 589 |
+
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
|
| 590 |
+
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
| 591 |
+
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
| 592 |
+
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
| 593 |
+
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
| 594 |
+
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
| 595 |
+
|
| 596 |
+
### Changed
|
| 597 |
+
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
| 598 |
+
- Allow fallback on specified encoding if any (PR #71)
|
| 599 |
+
|
| 600 |
+
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
| 601 |
+
### Changed
|
| 602 |
+
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
| 603 |
+
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
| 604 |
+
|
| 605 |
+
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
| 606 |
+
### Fixed
|
| 607 |
+
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
| 608 |
+
|
| 609 |
+
### Changed
|
| 610 |
+
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
| 611 |
+
|
| 612 |
+
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
| 613 |
+
### Fixed
|
| 614 |
+
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
| 615 |
+
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
|
| 616 |
+
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
|
| 617 |
+
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
| 618 |
+
|
| 619 |
+
### Changed
|
| 620 |
+
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
| 621 |
+
|
| 622 |
+
### Added
|
| 623 |
+
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
| 624 |
+
|
| 625 |
+
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
| 626 |
+
### Changed
|
| 627 |
+
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
| 628 |
+
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
|
| 629 |
+
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
| 630 |
+
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
| 631 |
+
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
|
| 632 |
+
- utf_7 detection has been reinstated.
|
| 633 |
+
|
| 634 |
+
### Removed
|
| 635 |
+
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
|
| 636 |
+
- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
| 637 |
+
- The exception hook on UnicodeDecodeError has been removed.
|
| 638 |
+
|
| 639 |
+
### Deprecated
|
| 640 |
+
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
| 641 |
+
|
| 642 |
+
### Fixed
|
| 643 |
+
- The CLI output used the relative path of the file(s). Should be absolute.
|
| 644 |
+
|
| 645 |
+
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
| 646 |
+
### Fixed
|
| 647 |
+
- Logger configuration/usage no longer conflict with others (PR #44)
|
| 648 |
+
|
| 649 |
+
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
| 650 |
+
### Removed
|
| 651 |
+
- Using standard logging instead of using the package loguru.
|
| 652 |
+
- Dropping nose test framework in favor of the maintained pytest.
|
| 653 |
+
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
| 654 |
+
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
| 655 |
+
- Stop support for UTF-7 that does not contain a SIG.
|
| 656 |
+
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
| 657 |
+
|
| 658 |
+
### Fixed
|
| 659 |
+
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
| 660 |
+
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
| 661 |
+
|
| 662 |
+
### Changed
|
| 663 |
+
- Improving the package final size by compressing frequencies.json.
|
| 664 |
+
- Huge improvement over the larges payload.
|
| 665 |
+
|
| 666 |
+
### Added
|
| 667 |
+
- CLI now produces JSON consumable output.
|
| 668 |
+
- Return ASCII if given sequences fit. Given reasonable confidence.
|
| 669 |
+
|
| 670 |
+
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
| 671 |
+
|
| 672 |
+
### Fixed
|
| 673 |
+
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
| 674 |
+
|
| 675 |
+
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
| 676 |
+
|
| 677 |
+
### Fixed
|
| 678 |
+
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
| 679 |
+
|
| 680 |
+
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
| 681 |
+
|
| 682 |
+
### Fixed
|
| 683 |
+
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
| 684 |
+
|
| 685 |
+
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
| 686 |
+
|
| 687 |
+
### Changed
|
| 688 |
+
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
| 689 |
+
|
| 690 |
+
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
| 691 |
+
|
| 692 |
+
### Fixed
|
| 693 |
+
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
| 694 |
+
|
| 695 |
+
### Changed
|
| 696 |
+
- Dependencies refactoring, constraints revised.
|
| 697 |
+
|
| 698 |
+
### Added
|
| 699 |
+
- Add python 3.9 and 3.10 to the supported interpreters
|
| 700 |
+
|
| 701 |
+
MIT License
|
| 702 |
+
|
| 703 |
+
Copyright (c) 2025 TAHRI Ahmed R.
|
| 704 |
+
|
| 705 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 706 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 707 |
+
in the Software without restriction, including without limitation the rights
|
| 708 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 709 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 710 |
+
furnished to do so, subject to the following conditions:
|
| 711 |
+
|
| 712 |
+
The above copyright notice and this permission notice shall be included in all
|
| 713 |
+
copies or substantial portions of the Software.
|
| 714 |
+
|
| 715 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 716 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 717 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 718 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 719 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 720 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 721 |
+
SOFTWARE.
|
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
../../Scripts/normalizer.exe,sha256=4_Drg6MZgKEgGq4qpOAk6mQHRVD7X8yi-_wkphtVVJY,108425
|
| 2 |
+
charset_normalizer-3.4.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 3 |
+
charset_normalizer-3.4.1.dist-info/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
|
| 4 |
+
charset_normalizer-3.4.1.dist-info/METADATA,sha256=0_fAC3DknimRZusm6kkP4ylPD0JVzBq5mKHWLNBJM6w,36034
|
| 5 |
+
charset_normalizer-3.4.1.dist-info/RECORD,,
|
| 6 |
+
charset_normalizer-3.4.1.dist-info/WHEEL,sha256=4-iQBlRoDdX1wfPofc7KLWa5Cys4eZSgXs6GVU8fKlQ,101
|
| 7 |
+
charset_normalizer-3.4.1.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
|
| 8 |
+
charset_normalizer-3.4.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
| 9 |
+
charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
|
| 10 |
+
charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
|
| 11 |
+
charset_normalizer/__pycache__/__init__.cpython-313.pyc,,
|
| 12 |
+
charset_normalizer/__pycache__/__main__.cpython-313.pyc,,
|
| 13 |
+
charset_normalizer/__pycache__/api.cpython-313.pyc,,
|
| 14 |
+
charset_normalizer/__pycache__/cd.cpython-313.pyc,,
|
| 15 |
+
charset_normalizer/__pycache__/constant.cpython-313.pyc,,
|
| 16 |
+
charset_normalizer/__pycache__/legacy.cpython-313.pyc,,
|
| 17 |
+
charset_normalizer/__pycache__/md.cpython-313.pyc,,
|
| 18 |
+
charset_normalizer/__pycache__/models.cpython-313.pyc,,
|
| 19 |
+
charset_normalizer/__pycache__/utils.cpython-313.pyc,,
|
| 20 |
+
charset_normalizer/__pycache__/version.cpython-313.pyc,,
|
| 21 |
+
charset_normalizer/api.py,sha256=2a0p2Gnhbdo9O6C04CNxTSN23fIbgOF20nxb0pWPNFM,23285
|
| 22 |
+
charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
|
| 23 |
+
charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
|
| 24 |
+
charset_normalizer/cli/__main__.py,sha256=lZ89qRWun7FRxX0qm1GhK-m0DH0i048yiMAX1mVIuRg,10731
|
| 25 |
+
charset_normalizer/cli/__pycache__/__init__.cpython-313.pyc,,
|
| 26 |
+
charset_normalizer/cli/__pycache__/__main__.cpython-313.pyc,,
|
| 27 |
+
charset_normalizer/constant.py,sha256=7OKYi28cJjZxIcX3lQCwfK9ijoOgaVEbERww7SqqNSY,42475
|
| 28 |
+
charset_normalizer/legacy.py,sha256=v8An1aAQHUu036UWOhyIaDGkirZ0t4hfNVlyje5KInU,2394
|
| 29 |
+
charset_normalizer/md.cp313-win_amd64.pyd,sha256=H4pRc9i_5sVp6Bxzi4MIADB-1FhtKumsXME6RoxuGJI,10752
|
| 30 |
+
charset_normalizer/md.py,sha256=e452fhwIAguEUr3FJzG7QZvFgXI-dVLOh_M1ZUiFI6U,20666
|
| 31 |
+
charset_normalizer/md__mypyc.cp313-win_amd64.pyd,sha256=Q3zrdee8fHLJCQVYOX7zWYsLx7xJlDSvWCcCgIPTAMo,125440
|
| 32 |
+
charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
|
| 33 |
+
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 34 |
+
charset_normalizer/utils.py,sha256=oH9Q3WcAMwmsSB7uM8uDozz9DXnkYecbkTNbdnMbgzI,12410
|
| 35 |
+
charset_normalizer/version.py,sha256=7_thI7FzRQxEsbtUYwrJs3FCFWF666mw74H8mggPRR0,123
|
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: setuptools (75.6.0)
|
| 3 |
+
Root-Is-Purelib: false
|
| 4 |
+
Tag: cp313-cp313-win_amd64
|
| 5 |
+
|
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
normalizer = charset_normalizer:cli.cli_detect
|
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
charset_normalizer
|
venv/Lib/site-packages/charset_normalizer/__init__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Charset-Normalizer
|
| 3 |
+
~~~~~~~~~~~~~~
|
| 4 |
+
The Real First Universal Charset Detector.
|
| 5 |
+
A library that helps you read text from an unknown charset encoding.
|
| 6 |
+
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
| 7 |
+
All IANA character set names for which the Python core library provides codecs are supported.
|
| 8 |
+
|
| 9 |
+
Basic usage:
|
| 10 |
+
>>> from charset_normalizer import from_bytes
|
| 11 |
+
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
| 12 |
+
>>> best_guess = results.best()
|
| 13 |
+
>>> str(best_guess)
|
| 14 |
+
'Bсеки човек има право на образование. Oбразованието!'
|
| 15 |
+
|
| 16 |
+
Others methods and usages are available - see the full documentation
|
| 17 |
+
at <https://github.com/Ousret/charset_normalizer>.
|
| 18 |
+
:copyright: (c) 2021 by Ahmed TAHRI
|
| 19 |
+
:license: MIT, see LICENSE for more details.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import logging
|
| 25 |
+
|
| 26 |
+
from .api import from_bytes, from_fp, from_path, is_binary
|
| 27 |
+
from .legacy import detect
|
| 28 |
+
from .models import CharsetMatch, CharsetMatches
|
| 29 |
+
from .utils import set_logging_handler
|
| 30 |
+
from .version import VERSION, __version__
|
| 31 |
+
|
| 32 |
+
__all__ = (
|
| 33 |
+
"from_fp",
|
| 34 |
+
"from_path",
|
| 35 |
+
"from_bytes",
|
| 36 |
+
"is_binary",
|
| 37 |
+
"detect",
|
| 38 |
+
"CharsetMatch",
|
| 39 |
+
"CharsetMatches",
|
| 40 |
+
"__version__",
|
| 41 |
+
"VERSION",
|
| 42 |
+
"set_logging_handler",
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
# Attach a NullHandler to the top level logger by default
|
| 46 |
+
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
| 47 |
+
|
| 48 |
+
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
venv/Lib/site-packages/charset_normalizer/__main__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from .cli import cli_detect
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
|
| 6 |
+
cli_detect()
|
venv/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (1.77 kB). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-313.pyc
ADDED
|
Binary file (349 Bytes). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-313.pyc
ADDED
|
Binary file (18.7 kB). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-313.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-313.pyc
ADDED
|
Binary file (38.7 kB). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-313.pyc
ADDED
|
Binary file (2.91 kB). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-313.pyc
ADDED
|
Binary file (25.5 kB). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-313.pyc
ADDED
|
Binary file (17.3 kB). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-313.pyc
ADDED
|
Binary file (13.8 kB). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-313.pyc
ADDED
|
Binary file (374 Bytes). View file
|
|
|
venv/Lib/site-packages/charset_normalizer/api.py
ADDED
|
@@ -0,0 +1,668 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from os import PathLike
|
| 5 |
+
from typing import BinaryIO
|
| 6 |
+
|
| 7 |
+
from .cd import (
|
| 8 |
+
coherence_ratio,
|
| 9 |
+
encoding_languages,
|
| 10 |
+
mb_encoding_languages,
|
| 11 |
+
merge_coherence_ratios,
|
| 12 |
+
)
|
| 13 |
+
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
| 14 |
+
from .md import mess_ratio
|
| 15 |
+
from .models import CharsetMatch, CharsetMatches
|
| 16 |
+
from .utils import (
|
| 17 |
+
any_specified_encoding,
|
| 18 |
+
cut_sequence_chunks,
|
| 19 |
+
iana_name,
|
| 20 |
+
identify_sig_or_bom,
|
| 21 |
+
is_cp_similar,
|
| 22 |
+
is_multi_byte_encoding,
|
| 23 |
+
should_strip_sig_or_bom,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger("charset_normalizer")
|
| 27 |
+
explain_handler = logging.StreamHandler()
|
| 28 |
+
explain_handler.setFormatter(
|
| 29 |
+
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def from_bytes(
|
| 34 |
+
sequences: bytes | bytearray,
|
| 35 |
+
steps: int = 5,
|
| 36 |
+
chunk_size: int = 512,
|
| 37 |
+
threshold: float = 0.2,
|
| 38 |
+
cp_isolation: list[str] | None = None,
|
| 39 |
+
cp_exclusion: list[str] | None = None,
|
| 40 |
+
preemptive_behaviour: bool = True,
|
| 41 |
+
explain: bool = False,
|
| 42 |
+
language_threshold: float = 0.1,
|
| 43 |
+
enable_fallback: bool = True,
|
| 44 |
+
) -> CharsetMatches:
|
| 45 |
+
"""
|
| 46 |
+
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
|
| 47 |
+
If there is no results, it is a strong indicator that the source is binary/not text.
|
| 48 |
+
By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
|
| 49 |
+
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
| 50 |
+
|
| 51 |
+
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
|
| 52 |
+
but never take it for granted. Can improve the performance.
|
| 53 |
+
|
| 54 |
+
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
|
| 55 |
+
purpose.
|
| 56 |
+
|
| 57 |
+
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
| 58 |
+
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
|
| 59 |
+
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
| 60 |
+
Custom logging format and handler can be set manually.
|
| 61 |
+
"""
|
| 62 |
+
|
| 63 |
+
if not isinstance(sequences, (bytearray, bytes)):
|
| 64 |
+
raise TypeError(
|
| 65 |
+
"Expected object of type bytes or bytearray, got: {}".format(
|
| 66 |
+
type(sequences)
|
| 67 |
+
)
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
if explain:
|
| 71 |
+
previous_logger_level: int = logger.level
|
| 72 |
+
logger.addHandler(explain_handler)
|
| 73 |
+
logger.setLevel(TRACE)
|
| 74 |
+
|
| 75 |
+
length: int = len(sequences)
|
| 76 |
+
|
| 77 |
+
if length == 0:
|
| 78 |
+
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
| 79 |
+
if explain: # Defensive: ensure exit path clean handler
|
| 80 |
+
logger.removeHandler(explain_handler)
|
| 81 |
+
logger.setLevel(previous_logger_level or logging.WARNING)
|
| 82 |
+
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
| 83 |
+
|
| 84 |
+
if cp_isolation is not None:
|
| 85 |
+
logger.log(
|
| 86 |
+
TRACE,
|
| 87 |
+
"cp_isolation is set. use this flag for debugging purpose. "
|
| 88 |
+
"limited list of encoding allowed : %s.",
|
| 89 |
+
", ".join(cp_isolation),
|
| 90 |
+
)
|
| 91 |
+
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
| 92 |
+
else:
|
| 93 |
+
cp_isolation = []
|
| 94 |
+
|
| 95 |
+
if cp_exclusion is not None:
|
| 96 |
+
logger.log(
|
| 97 |
+
TRACE,
|
| 98 |
+
"cp_exclusion is set. use this flag for debugging purpose. "
|
| 99 |
+
"limited list of encoding excluded : %s.",
|
| 100 |
+
", ".join(cp_exclusion),
|
| 101 |
+
)
|
| 102 |
+
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
| 103 |
+
else:
|
| 104 |
+
cp_exclusion = []
|
| 105 |
+
|
| 106 |
+
if length <= (chunk_size * steps):
|
| 107 |
+
logger.log(
|
| 108 |
+
TRACE,
|
| 109 |
+
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
| 110 |
+
steps,
|
| 111 |
+
chunk_size,
|
| 112 |
+
length,
|
| 113 |
+
)
|
| 114 |
+
steps = 1
|
| 115 |
+
chunk_size = length
|
| 116 |
+
|
| 117 |
+
if steps > 1 and length / steps < chunk_size:
|
| 118 |
+
chunk_size = int(length / steps)
|
| 119 |
+
|
| 120 |
+
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
| 121 |
+
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
| 122 |
+
|
| 123 |
+
if is_too_small_sequence:
|
| 124 |
+
logger.log(
|
| 125 |
+
TRACE,
|
| 126 |
+
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
| 127 |
+
length
|
| 128 |
+
),
|
| 129 |
+
)
|
| 130 |
+
elif is_too_large_sequence:
|
| 131 |
+
logger.log(
|
| 132 |
+
TRACE,
|
| 133 |
+
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
| 134 |
+
length
|
| 135 |
+
),
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
prioritized_encodings: list[str] = []
|
| 139 |
+
|
| 140 |
+
specified_encoding: str | None = (
|
| 141 |
+
any_specified_encoding(sequences) if preemptive_behaviour else None
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
if specified_encoding is not None:
|
| 145 |
+
prioritized_encodings.append(specified_encoding)
|
| 146 |
+
logger.log(
|
| 147 |
+
TRACE,
|
| 148 |
+
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
| 149 |
+
specified_encoding,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
tested: set[str] = set()
|
| 153 |
+
tested_but_hard_failure: list[str] = []
|
| 154 |
+
tested_but_soft_failure: list[str] = []
|
| 155 |
+
|
| 156 |
+
fallback_ascii: CharsetMatch | None = None
|
| 157 |
+
fallback_u8: CharsetMatch | None = None
|
| 158 |
+
fallback_specified: CharsetMatch | None = None
|
| 159 |
+
|
| 160 |
+
results: CharsetMatches = CharsetMatches()
|
| 161 |
+
|
| 162 |
+
early_stop_results: CharsetMatches = CharsetMatches()
|
| 163 |
+
|
| 164 |
+
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
| 165 |
+
|
| 166 |
+
if sig_encoding is not None:
|
| 167 |
+
prioritized_encodings.append(sig_encoding)
|
| 168 |
+
logger.log(
|
| 169 |
+
TRACE,
|
| 170 |
+
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
| 171 |
+
len(sig_payload),
|
| 172 |
+
sig_encoding,
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
prioritized_encodings.append("ascii")
|
| 176 |
+
|
| 177 |
+
if "utf_8" not in prioritized_encodings:
|
| 178 |
+
prioritized_encodings.append("utf_8")
|
| 179 |
+
|
| 180 |
+
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
|
| 181 |
+
if cp_isolation and encoding_iana not in cp_isolation:
|
| 182 |
+
continue
|
| 183 |
+
|
| 184 |
+
if cp_exclusion and encoding_iana in cp_exclusion:
|
| 185 |
+
continue
|
| 186 |
+
|
| 187 |
+
if encoding_iana in tested:
|
| 188 |
+
continue
|
| 189 |
+
|
| 190 |
+
tested.add(encoding_iana)
|
| 191 |
+
|
| 192 |
+
decoded_payload: str | None = None
|
| 193 |
+
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
| 194 |
+
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
| 195 |
+
encoding_iana
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
| 199 |
+
logger.log(
|
| 200 |
+
TRACE,
|
| 201 |
+
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
| 202 |
+
encoding_iana,
|
| 203 |
+
)
|
| 204 |
+
continue
|
| 205 |
+
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
|
| 206 |
+
logger.log(
|
| 207 |
+
TRACE,
|
| 208 |
+
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
|
| 209 |
+
encoding_iana,
|
| 210 |
+
)
|
| 211 |
+
continue
|
| 212 |
+
|
| 213 |
+
try:
|
| 214 |
+
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
| 215 |
+
except (ModuleNotFoundError, ImportError):
|
| 216 |
+
logger.log(
|
| 217 |
+
TRACE,
|
| 218 |
+
"Encoding %s does not provide an IncrementalDecoder",
|
| 219 |
+
encoding_iana,
|
| 220 |
+
)
|
| 221 |
+
continue
|
| 222 |
+
|
| 223 |
+
try:
|
| 224 |
+
if is_too_large_sequence and is_multi_byte_decoder is False:
|
| 225 |
+
str(
|
| 226 |
+
(
|
| 227 |
+
sequences[: int(50e4)]
|
| 228 |
+
if strip_sig_or_bom is False
|
| 229 |
+
else sequences[len(sig_payload) : int(50e4)]
|
| 230 |
+
),
|
| 231 |
+
encoding=encoding_iana,
|
| 232 |
+
)
|
| 233 |
+
else:
|
| 234 |
+
decoded_payload = str(
|
| 235 |
+
(
|
| 236 |
+
sequences
|
| 237 |
+
if strip_sig_or_bom is False
|
| 238 |
+
else sequences[len(sig_payload) :]
|
| 239 |
+
),
|
| 240 |
+
encoding=encoding_iana,
|
| 241 |
+
)
|
| 242 |
+
except (UnicodeDecodeError, LookupError) as e:
|
| 243 |
+
if not isinstance(e, LookupError):
|
| 244 |
+
logger.log(
|
| 245 |
+
TRACE,
|
| 246 |
+
"Code page %s does not fit given bytes sequence at ALL. %s",
|
| 247 |
+
encoding_iana,
|
| 248 |
+
str(e),
|
| 249 |
+
)
|
| 250 |
+
tested_but_hard_failure.append(encoding_iana)
|
| 251 |
+
continue
|
| 252 |
+
|
| 253 |
+
similar_soft_failure_test: bool = False
|
| 254 |
+
|
| 255 |
+
for encoding_soft_failed in tested_but_soft_failure:
|
| 256 |
+
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
| 257 |
+
similar_soft_failure_test = True
|
| 258 |
+
break
|
| 259 |
+
|
| 260 |
+
if similar_soft_failure_test:
|
| 261 |
+
logger.log(
|
| 262 |
+
TRACE,
|
| 263 |
+
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
|
| 264 |
+
encoding_iana,
|
| 265 |
+
encoding_soft_failed,
|
| 266 |
+
)
|
| 267 |
+
continue
|
| 268 |
+
|
| 269 |
+
r_ = range(
|
| 270 |
+
0 if not bom_or_sig_available else len(sig_payload),
|
| 271 |
+
length,
|
| 272 |
+
int(length / steps),
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
multi_byte_bonus: bool = (
|
| 276 |
+
is_multi_byte_decoder
|
| 277 |
+
and decoded_payload is not None
|
| 278 |
+
and len(decoded_payload) < length
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
if multi_byte_bonus:
|
| 282 |
+
logger.log(
|
| 283 |
+
TRACE,
|
| 284 |
+
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
| 285 |
+
"was encoded using n-bytes.",
|
| 286 |
+
encoding_iana,
|
| 287 |
+
)
|
| 288 |
+
|
| 289 |
+
max_chunk_gave_up: int = int(len(r_) / 4)
|
| 290 |
+
|
| 291 |
+
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
| 292 |
+
early_stop_count: int = 0
|
| 293 |
+
lazy_str_hard_failure = False
|
| 294 |
+
|
| 295 |
+
md_chunks: list[str] = []
|
| 296 |
+
md_ratios = []
|
| 297 |
+
|
| 298 |
+
try:
|
| 299 |
+
for chunk in cut_sequence_chunks(
|
| 300 |
+
sequences,
|
| 301 |
+
encoding_iana,
|
| 302 |
+
r_,
|
| 303 |
+
chunk_size,
|
| 304 |
+
bom_or_sig_available,
|
| 305 |
+
strip_sig_or_bom,
|
| 306 |
+
sig_payload,
|
| 307 |
+
is_multi_byte_decoder,
|
| 308 |
+
decoded_payload,
|
| 309 |
+
):
|
| 310 |
+
md_chunks.append(chunk)
|
| 311 |
+
|
| 312 |
+
md_ratios.append(
|
| 313 |
+
mess_ratio(
|
| 314 |
+
chunk,
|
| 315 |
+
threshold,
|
| 316 |
+
explain is True and 1 <= len(cp_isolation) <= 2,
|
| 317 |
+
)
|
| 318 |
+
)
|
| 319 |
+
|
| 320 |
+
if md_ratios[-1] >= threshold:
|
| 321 |
+
early_stop_count += 1
|
| 322 |
+
|
| 323 |
+
if (early_stop_count >= max_chunk_gave_up) or (
|
| 324 |
+
bom_or_sig_available and strip_sig_or_bom is False
|
| 325 |
+
):
|
| 326 |
+
break
|
| 327 |
+
except (
|
| 328 |
+
UnicodeDecodeError
|
| 329 |
+
) as e: # Lazy str loading may have missed something there
|
| 330 |
+
logger.log(
|
| 331 |
+
TRACE,
|
| 332 |
+
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
| 333 |
+
encoding_iana,
|
| 334 |
+
str(e),
|
| 335 |
+
)
|
| 336 |
+
early_stop_count = max_chunk_gave_up
|
| 337 |
+
lazy_str_hard_failure = True
|
| 338 |
+
|
| 339 |
+
# We might want to check the sequence again with the whole content
|
| 340 |
+
# Only if initial MD tests passes
|
| 341 |
+
if (
|
| 342 |
+
not lazy_str_hard_failure
|
| 343 |
+
and is_too_large_sequence
|
| 344 |
+
and not is_multi_byte_decoder
|
| 345 |
+
):
|
| 346 |
+
try:
|
| 347 |
+
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
| 348 |
+
except UnicodeDecodeError as e:
|
| 349 |
+
logger.log(
|
| 350 |
+
TRACE,
|
| 351 |
+
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
| 352 |
+
encoding_iana,
|
| 353 |
+
str(e),
|
| 354 |
+
)
|
| 355 |
+
tested_but_hard_failure.append(encoding_iana)
|
| 356 |
+
continue
|
| 357 |
+
|
| 358 |
+
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
| 359 |
+
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
| 360 |
+
tested_but_soft_failure.append(encoding_iana)
|
| 361 |
+
logger.log(
|
| 362 |
+
TRACE,
|
| 363 |
+
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
| 364 |
+
"Computed mean chaos is %f %%.",
|
| 365 |
+
encoding_iana,
|
| 366 |
+
early_stop_count,
|
| 367 |
+
round(mean_mess_ratio * 100, ndigits=3),
|
| 368 |
+
)
|
| 369 |
+
# Preparing those fallbacks in case we got nothing.
|
| 370 |
+
if (
|
| 371 |
+
enable_fallback
|
| 372 |
+
and encoding_iana in ["ascii", "utf_8", specified_encoding]
|
| 373 |
+
and not lazy_str_hard_failure
|
| 374 |
+
):
|
| 375 |
+
fallback_entry = CharsetMatch(
|
| 376 |
+
sequences,
|
| 377 |
+
encoding_iana,
|
| 378 |
+
threshold,
|
| 379 |
+
False,
|
| 380 |
+
[],
|
| 381 |
+
decoded_payload,
|
| 382 |
+
preemptive_declaration=specified_encoding,
|
| 383 |
+
)
|
| 384 |
+
if encoding_iana == specified_encoding:
|
| 385 |
+
fallback_specified = fallback_entry
|
| 386 |
+
elif encoding_iana == "ascii":
|
| 387 |
+
fallback_ascii = fallback_entry
|
| 388 |
+
else:
|
| 389 |
+
fallback_u8 = fallback_entry
|
| 390 |
+
continue
|
| 391 |
+
|
| 392 |
+
logger.log(
|
| 393 |
+
TRACE,
|
| 394 |
+
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
| 395 |
+
encoding_iana,
|
| 396 |
+
round(mean_mess_ratio * 100, ndigits=3),
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
if not is_multi_byte_decoder:
|
| 400 |
+
target_languages: list[str] = encoding_languages(encoding_iana)
|
| 401 |
+
else:
|
| 402 |
+
target_languages = mb_encoding_languages(encoding_iana)
|
| 403 |
+
|
| 404 |
+
if target_languages:
|
| 405 |
+
logger.log(
|
| 406 |
+
TRACE,
|
| 407 |
+
"{} should target any language(s) of {}".format(
|
| 408 |
+
encoding_iana, str(target_languages)
|
| 409 |
+
),
|
| 410 |
+
)
|
| 411 |
+
|
| 412 |
+
cd_ratios = []
|
| 413 |
+
|
| 414 |
+
# We shall skip the CD when its about ASCII
|
| 415 |
+
# Most of the time its not relevant to run "language-detection" on it.
|
| 416 |
+
if encoding_iana != "ascii":
|
| 417 |
+
for chunk in md_chunks:
|
| 418 |
+
chunk_languages = coherence_ratio(
|
| 419 |
+
chunk,
|
| 420 |
+
language_threshold,
|
| 421 |
+
",".join(target_languages) if target_languages else None,
|
| 422 |
+
)
|
| 423 |
+
|
| 424 |
+
cd_ratios.append(chunk_languages)
|
| 425 |
+
|
| 426 |
+
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
| 427 |
+
|
| 428 |
+
if cd_ratios_merged:
|
| 429 |
+
logger.log(
|
| 430 |
+
TRACE,
|
| 431 |
+
"We detected language {} using {}".format(
|
| 432 |
+
cd_ratios_merged, encoding_iana
|
| 433 |
+
),
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
current_match = CharsetMatch(
|
| 437 |
+
sequences,
|
| 438 |
+
encoding_iana,
|
| 439 |
+
mean_mess_ratio,
|
| 440 |
+
bom_or_sig_available,
|
| 441 |
+
cd_ratios_merged,
|
| 442 |
+
(
|
| 443 |
+
decoded_payload
|
| 444 |
+
if (
|
| 445 |
+
is_too_large_sequence is False
|
| 446 |
+
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
| 447 |
+
)
|
| 448 |
+
else None
|
| 449 |
+
),
|
| 450 |
+
preemptive_declaration=specified_encoding,
|
| 451 |
+
)
|
| 452 |
+
|
| 453 |
+
results.append(current_match)
|
| 454 |
+
|
| 455 |
+
if (
|
| 456 |
+
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
| 457 |
+
and mean_mess_ratio < 0.1
|
| 458 |
+
):
|
| 459 |
+
# If md says nothing to worry about, then... stop immediately!
|
| 460 |
+
if mean_mess_ratio == 0.0:
|
| 461 |
+
logger.debug(
|
| 462 |
+
"Encoding detection: %s is most likely the one.",
|
| 463 |
+
current_match.encoding,
|
| 464 |
+
)
|
| 465 |
+
if explain: # Defensive: ensure exit path clean handler
|
| 466 |
+
logger.removeHandler(explain_handler)
|
| 467 |
+
logger.setLevel(previous_logger_level)
|
| 468 |
+
return CharsetMatches([current_match])
|
| 469 |
+
|
| 470 |
+
early_stop_results.append(current_match)
|
| 471 |
+
|
| 472 |
+
if (
|
| 473 |
+
len(early_stop_results)
|
| 474 |
+
and (specified_encoding is None or specified_encoding in tested)
|
| 475 |
+
and "ascii" in tested
|
| 476 |
+
and "utf_8" in tested
|
| 477 |
+
):
|
| 478 |
+
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
|
| 479 |
+
logger.debug(
|
| 480 |
+
"Encoding detection: %s is most likely the one.",
|
| 481 |
+
probable_result.encoding,
|
| 482 |
+
)
|
| 483 |
+
if explain: # Defensive: ensure exit path clean handler
|
| 484 |
+
logger.removeHandler(explain_handler)
|
| 485 |
+
logger.setLevel(previous_logger_level)
|
| 486 |
+
|
| 487 |
+
return CharsetMatches([probable_result])
|
| 488 |
+
|
| 489 |
+
if encoding_iana == sig_encoding:
|
| 490 |
+
logger.debug(
|
| 491 |
+
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
| 492 |
+
"the beginning of the sequence.",
|
| 493 |
+
encoding_iana,
|
| 494 |
+
)
|
| 495 |
+
if explain: # Defensive: ensure exit path clean handler
|
| 496 |
+
logger.removeHandler(explain_handler)
|
| 497 |
+
logger.setLevel(previous_logger_level)
|
| 498 |
+
return CharsetMatches([results[encoding_iana]])
|
| 499 |
+
|
| 500 |
+
if len(results) == 0:
|
| 501 |
+
if fallback_u8 or fallback_ascii or fallback_specified:
|
| 502 |
+
logger.log(
|
| 503 |
+
TRACE,
|
| 504 |
+
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
| 505 |
+
)
|
| 506 |
+
|
| 507 |
+
if fallback_specified:
|
| 508 |
+
logger.debug(
|
| 509 |
+
"Encoding detection: %s will be used as a fallback match",
|
| 510 |
+
fallback_specified.encoding,
|
| 511 |
+
)
|
| 512 |
+
results.append(fallback_specified)
|
| 513 |
+
elif (
|
| 514 |
+
(fallback_u8 and fallback_ascii is None)
|
| 515 |
+
or (
|
| 516 |
+
fallback_u8
|
| 517 |
+
and fallback_ascii
|
| 518 |
+
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
| 519 |
+
)
|
| 520 |
+
or (fallback_u8 is not None)
|
| 521 |
+
):
|
| 522 |
+
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
| 523 |
+
results.append(fallback_u8)
|
| 524 |
+
elif fallback_ascii:
|
| 525 |
+
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
| 526 |
+
results.append(fallback_ascii)
|
| 527 |
+
|
| 528 |
+
if results:
|
| 529 |
+
logger.debug(
|
| 530 |
+
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
| 531 |
+
results.best().encoding, # type: ignore
|
| 532 |
+
len(results) - 1,
|
| 533 |
+
)
|
| 534 |
+
else:
|
| 535 |
+
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
| 536 |
+
|
| 537 |
+
if explain:
|
| 538 |
+
logger.removeHandler(explain_handler)
|
| 539 |
+
logger.setLevel(previous_logger_level)
|
| 540 |
+
|
| 541 |
+
return results
|
| 542 |
+
|
| 543 |
+
|
| 544 |
+
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Charset detection on an already-opened binary file object.

    Reads the remaining content of *fp* and delegates to from_bytes with
    the same tuning parameters. The file pointer is consumed but never
    closed by this function.
    """
    # Detection needs the whole remaining buffer; read it once.
    payload = fp.read()

    return from_bytes(
        payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
def from_path(
    path: str | bytes | PathLike,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Charset detection on a file designated by its path.

    Opens the file in binary mode, hands the stream to from_fp, and closes
    it afterwards. Can raise IOError if the file cannot be opened or read.
    """
    with open(path, "rb") as fp:
        return from_fp(
            fp,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
|
| 603 |
+
|
| 604 |
+
|
| 605 |
+
def is_binary(
    fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
    """
    # All three detection entry points take the same keyword set; build it once.
    detection_kwargs = dict(
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )

    # Dispatch on the argument's type: path-like, raw payload, or open stream.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        guesses = from_path(fp_or_path_or_payload, **detection_kwargs)
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        guesses = from_bytes(fp_or_path_or_payload, **detection_kwargs)
    else:
        guesses = from_fp(fp_or_path_or_payload, **detection_kwargs)

    # No plausible text decoding at all -> treat the content as binary.
    return not guesses
|
venv/Lib/site-packages/charset_normalizer/cd.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import importlib
|
| 4 |
+
from codecs import IncrementalDecoder
|
| 5 |
+
from collections import Counter
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from typing import Counter as TypeCounter
|
| 8 |
+
|
| 9 |
+
from .constant import (
|
| 10 |
+
FREQUENCIES,
|
| 11 |
+
KO_NAMES,
|
| 12 |
+
LANGUAGE_SUPPORTED_COUNT,
|
| 13 |
+
TOO_SMALL_SEQUENCE,
|
| 14 |
+
ZH_NAMES,
|
| 15 |
+
)
|
| 16 |
+
from .md import is_suspiciously_successive_range
|
| 17 |
+
from .models import CoherenceMatches
|
| 18 |
+
from .utils import (
|
| 19 |
+
is_accentuated,
|
| 20 |
+
is_latin,
|
| 21 |
+
is_multi_byte_encoding,
|
| 22 |
+
is_unicode_range_secondary,
|
| 23 |
+
unicode_range,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return associated unicode ranges in a single byte code page.

    Decodes each byte in 0x40..0xFE with the given codec and keeps every
    non-secondary range that accounts for at least 15% of the decodable
    characters. Raises OSError for multi-byte encodings.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = p.decode(bytes([byte_value]))

        if not decoded:
            continue

        detected_range: str | None = unicode_range(decoded)

        # Skip undecodable bytes and "secondary" ranges (punctuation, symbols, ...)
        # so only alphabet-bearing ranges are counted.
        if detected_range is None or is_unicode_range_secondary(detected_range):
            continue

        seen_ranges[detected_range] = seen_ranges.get(detected_range, 0) + 1
        character_count += 1

    return sorted(
        detected_range
        for detected_range in seen_ranges
        if seen_ranges[detected_range] / character_count >= 0.15
    )
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return inferred languages used with a unicode range.

    A language qualifies as soon as one character of its frequency table
    belongs to *primary_range*.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
    This function does the correspondence.
    """
    # The first non-Latin range drives the association; a purely Latin
    # code page gets the generic "Latin Based" label.
    primary_range: str | None = next(
        (
            specified_range
            for specified_range in encoding_unicode_range(iana_name)
            if "Latin" not in specified_range
        ),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@lru_cache()
|
| 100 |
+
def mb_encoding_languages(iana_name: str) -> list[str]:
|
| 101 |
+
"""
|
| 102 |
+
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
| 103 |
+
This function does the correspondence.
|
| 104 |
+
"""
|
| 105 |
+
if (
|
| 106 |
+
iana_name.startswith("shift_")
|
| 107 |
+
or iana_name.startswith("iso2022_jp")
|
| 108 |
+
or iana_name.startswith("euc_j")
|
| 109 |
+
or iana_name == "cp932"
|
| 110 |
+
):
|
| 111 |
+
return ["Japanese"]
|
| 112 |
+
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
| 113 |
+
return ["Chinese"]
|
| 114 |
+
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
| 115 |
+
return ["Korean"]
|
| 116 |
+
|
| 117 |
+
return []
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.

    Returns (have_accents, pure_latin), computed over the language's
    frequency table. Cached, one slot per supported language.
    """
    characters = FREQUENCIES[language]

    target_have_accents: bool = any(is_accentuated(c) for c in characters)
    target_pure_latin: bool = all(is_latin(c) for c in characters)

    return target_have_accents, target_pure_latin
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return associated languages associated to given characters.

    Candidate languages are filtered by accent/Latin compatibility and kept
    when at least 20% of their frequency table appears in *characters*,
    ordered from strongest to weakest overlap.

    :param characters: Characters observed in the probed text.
    :param ignore_non_latin: Drop languages whose alphabet is not pure Latin.
    """
    languages: list[tuple[str, float]] = []

    # `c in list` inside the per-language loop was O(len(characters)) per
    # lookup; a set makes each membership test O(1) with identical results.
    character_set = set(characters)

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # An accent-free language cannot have produced accented input.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = sum(
            1 for c in language_characters if c in character_set
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: Key into FREQUENCIES; raises ValueError when unsupported.
    :param ordered_characters: Characters sorted from most to least frequent in the probed text.
    :return: Fraction of ordered_characters whose popularity rank is plausible for the language.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    character_approved_count: int = 0
    # Set view of the language's frequency table for O(1) membership tests.
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    # Large alphabets (e.g. > 26 entries) get a looser rank tolerance below.
    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        # Rescale the observed rank onto the language table's length so the
        # two rank spaces are comparable.
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabet: a projected rank more than 4 positions off is a
        # definite mismatch for this character.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabet: within a third of the table length counts as a match
        # outright, without the neighborhood comparison below.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Borderline case: compare the character's neighborhoods — which
        # characters rank before/after it in the text vs in the language table.
        characters_before_source: list[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # NOTE(review): when the source-side neighborhood is empty the match
        # count is necessarily 0, so the `<= 4` bound always holds; the effect
        # is to approve characters sitting at either end of the language table.
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # Approve when at least 40% of either neighborhood overlaps.
        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.
    """
    layers: dict[str, str] = {}

    for character in decoded_sequence:
        # Only alphabetic characters carry alphabet information.
        if not character.isalpha():
            continue

        character_range: str | None = unicode_range(character)

        if character_range is None:
            continue

        # Attach the character to the first existing layer whose range is a
        # plausible companion of this one; otherwise open a new layer.
        target_range: str = character_range
        for known_range in layers:
            if not is_suspiciously_successive_range(known_range, character_range):
                target_range = known_range
                break

        layers[target_range] = layers.get(target_range, "") + character.lower()

    return list(layers.values())
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.

    Ratios reported for the same language across chunks are averaged
    (rounded to 4 decimals) and the merged list is sorted best-first.
    """
    per_language_ratios: dict[str, list[float]] = {}

    for chunk_result in results:
        for language, ratio in chunk_result:
            per_language_ratios.setdefault(language, []).append(ratio)

    merged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in per_language_ratios.items()
    ]

    return sorted(merged, key=lambda entry: entry[1], reverse=True)
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    # Group ratios under the language name with any em-dash marker stripped.
    index_results: dict[str, list[float]] = {}

    for language, ratio in results:
        index_results.setdefault(language.replace("—", ""), []).append(ratio)

    # No language had an alternative entry: nothing to collapse.
    if all(len(ratios) <= 1 for ratios in index_results.values()):
        return results

    # Keep the strongest ratio per (de-duplicated) language.
    return [
        (language, max(ratios)) for language, ratios in index_results.items()
    ]
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
    A layer = Character extraction by alphabets/ranges.

    :param decoded_sequence: Text already decoded to str.
    :param threshold: Minimum popularity ratio for a language to be reported.
    :param lg_inclusion: Optional comma-separated list restricting which languages are probed.
    :return: (language, ratio) pairs sorted best-first, alternatives collapsed.
    """

    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False

    # Count of "very strong" (>= 0.8) matches; detection stops early at 3.
    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    # "Latin Based" is a pseudo-language flag, not a probe target: it narrows
    # the candidate set to pure-Latin alphabets.
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        # Too few characters in this alphabet layer to judge reliably.
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        # Characters of the layer ordered from most to least frequent.
        popular_character_ordered: list[str] = [c for c, o in most_common]

        # Probe the caller-specified languages, or infer candidates from
        # the observed alphabet when none were given.
        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Three near-certain matches is enough evidence for this layer.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
|
venv/Lib/site-packages/charset_normalizer/cli/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from .__main__ import cli_detect, query_yes_no
|
| 4 |
+
|
| 5 |
+
__all__ = (
|
| 6 |
+
"cli_detect",
|
| 7 |
+
"query_yes_no",
|
| 8 |
+
)
|