ricitos2001 committed
Commit a7b70a9 · verified · 1 Parent(s): 424798f

Upload 6572 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50):
  1. .gitattributes +140 -0
  2. venv/.gitignore +2 -0
  3. venv/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER +1 -0
  4. venv/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE +20 -0
  5. venv/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA +46 -0
  6. venv/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD +43 -0
  7. venv/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL +5 -0
  8. venv/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt +2 -0
  9. venv/Lib/site-packages/__pycache__/_virtualenv.cpython-313.pyc +0 -0
  10. venv/Lib/site-packages/__pycache__/typing_extensions.cpython-313.pyc +3 -0
  11. venv/Lib/site-packages/_virtualenv.pth +3 -0
  12. venv/Lib/site-packages/_virtualenv.py +103 -0
  13. venv/Lib/site-packages/_yaml/__init__.py +33 -0
  14. venv/Lib/site-packages/_yaml/__pycache__/__init__.cpython-313.pyc +0 -0
  15. venv/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER +1 -0
  16. venv/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE +20 -0
  17. venv/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA +77 -0
  18. venv/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD +14 -0
  19. venv/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL +5 -0
  20. venv/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt +1 -0
  21. venv/Lib/site-packages/certifi/__init__.py +4 -0
  22. venv/Lib/site-packages/certifi/__main__.py +12 -0
  23. venv/Lib/site-packages/certifi/__pycache__/__init__.cpython-313.pyc +0 -0
  24. venv/Lib/site-packages/certifi/__pycache__/__main__.cpython-313.pyc +0 -0
  25. venv/Lib/site-packages/certifi/__pycache__/core.cpython-313.pyc +0 -0
  26. venv/Lib/site-packages/certifi/cacert.pem +0 -0
  27. venv/Lib/site-packages/certifi/core.py +114 -0
  28. venv/Lib/site-packages/certifi/py.typed +0 -0
  29. venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER +1 -0
  30. venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE +21 -0
  31. venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA +721 -0
  32. venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD +35 -0
  33. venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL +5 -0
  34. venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt +2 -0
  35. venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt +1 -0
  36. venv/Lib/site-packages/charset_normalizer/__init__.py +48 -0
  37. venv/Lib/site-packages/charset_normalizer/__main__.py +6 -0
  38. venv/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-313.pyc +0 -0
  39. venv/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-313.pyc +0 -0
  40. venv/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-313.pyc +0 -0
  41. venv/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-313.pyc +0 -0
  42. venv/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-313.pyc +0 -0
  43. venv/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-313.pyc +0 -0
  44. venv/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-313.pyc +0 -0
  45. venv/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-313.pyc +0 -0
  46. venv/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-313.pyc +0 -0
  47. venv/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-313.pyc +0 -0
  48. venv/Lib/site-packages/charset_normalizer/api.py +668 -0
  49. venv/Lib/site-packages/charset_normalizer/cd.py +395 -0
  50. venv/Lib/site-packages/charset_normalizer/cli/__init__.py +8 -0
.gitattributes CHANGED
@@ -33,3 +33,143 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/__pycache__/typing_extensions.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/charset_normalizer/md__mypyc.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/huggingface_hub/__pycache__/hf_api.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/huggingface_hub/inference/__pycache__/_client.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/huggingface_hub/inference/_generated/__pycache__/_async_client.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/idna/__pycache__/uts46data.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy.libs/libscipy_openblas64_-43e11ff0749b8cbe0a615c9cf6737e0e.dll filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy.libs/msvcp140-263139962577ecda4cd9469ca360a746.dll filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/__pycache__/_add_newdocs.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/__pycache__/fromnumeric.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/_multiarray_umath.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/_simd.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/lib/npymath.lib filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_datetime.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_dtype.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_multiarray.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_nditer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_numeric.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_regression.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_ufunc.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/_core/tests/__pycache__/test_umath.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/f2py/__pycache__/crackfortran.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/fft/_pocketfft_umath.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/lib/__pycache__/_function_base_impl.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/lib/tests/__pycache__/test_function_base.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/lib/tests/__pycache__/test_io.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/linalg/__pycache__/_linalg.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/linalg/_umath_linalg.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/linalg/tests/__pycache__/test_linalg.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/ma/__pycache__/core.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/ma/tests/__pycache__/test_core.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/_bounded_integers.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/_common.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/_generator.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/bit_generator.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/lib/npyrandom.lib filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/mtrand.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/tests/__pycache__/test_generator_mt19937.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/tests/__pycache__/test_random.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/random/tests/__pycache__/test_randomstate.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/testing/_private/__pycache__/utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/numpy/testing/tests/__pycache__/test_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/pip/_vendor/__pycache__/typing_extensions.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/pip/_vendor/pkg_resources/__pycache__/__init__.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/pip/_vendor/rich/__pycache__/console.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/regex/__pycache__/_regex_core.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/regex/__pycache__/test_regex.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/regex/_regex.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/safetensors/_safetensors_rust.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/tokenizers/tokenizers.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/__init__.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/cache_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/modeling_outputs.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/modeling_tf_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/modeling_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/testing_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/tokenization_utils_base.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/trainer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/__pycache__/training_args.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/generation/__pycache__/logits_process.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/generation/__pycache__/tf_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/generation/__pycache__/utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/integrations/__pycache__/integration_utils.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/autoformer/__pycache__/modeling_autoformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/bart/__pycache__/modeling_bart.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/big_bird/__pycache__/modeling_big_bird.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/big_bird/__pycache__/modeling_flax_big_bird.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/bigbird_pegasus/__pycache__/modeling_bigbird_pegasus.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/blip_2/__pycache__/modeling_blip_2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/clap/__pycache__/modeling_clap.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/conditional_detr/__pycache__/modeling_conditional_detr.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/deprecated/deta/__pycache__/modeling_deta.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/deprecated/jukebox/__pycache__/modeling_jukebox.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/deprecated/mega/__pycache__/modeling_mega.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/modeling_xlm_prophetnet.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_esmfold.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/flava/__pycache__/modeling_flava.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/modeling_grounding_dino.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/groupvit/__pycache__/modeling_tf_groupvit.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/informer/__pycache__/modeling_informer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/kosmos2/__pycache__/modeling_kosmos2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/led/__pycache__/modeling_led.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/led/__pycache__/modeling_tf_led.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/longformer/__pycache__/modeling_longformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/longformer/__pycache__/modeling_tf_longformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/longt5/__pycache__/modeling_flax_longt5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/longt5/__pycache__/modeling_longt5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/luke/__pycache__/modeling_luke.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/mask2former/__pycache__/modeling_mask2former.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/mllama/__pycache__/modeling_mllama.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/moshi/__pycache__/modeling_moshi.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/mt5/__pycache__/modeling_mt5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/musicgen_melody/__pycache__/modeling_musicgen_melody.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/musicgen/__pycache__/modeling_musicgen.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/oneformer/__pycache__/modeling_oneformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/perceiver/__pycache__/modeling_perceiver.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/phi4_multimodal/__pycache__/modeling_phi4_multimodal.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/prophetnet/__pycache__/modeling_prophetnet.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/qwen2_5_vl/__pycache__/modeling_qwen2_5_vl.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/qwen2_vl/__pycache__/modeling_qwen2_vl.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/reformer/__pycache__/modeling_reformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/rt_detr_v2/__pycache__/modeling_rt_detr_v2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/rt_detr/__pycache__/modeling_rt_detr.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/seamless_m4t_v2/__pycache__/modeling_seamless_m4t_v2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/seamless_m4t/__pycache__/modeling_seamless_m4t.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/speecht5/__pycache__/modeling_speecht5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/t5/__pycache__/modeling_t5.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/tapas/__pycache__/modeling_tapas.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/tapas/__pycache__/modeling_tf_tapas.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/tapas/__pycache__/tokenization_tapas.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/udop/__pycache__/modeling_udop.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/unispeech_sat/__pycache__/modeling_unispeech_sat.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/wav2vec2/__pycache__/modeling_wav2vec2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/models/zamba2/__pycache__/modeling_zamba2.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/utils/__pycache__/dummy_pt_objects.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/transformers/utils/__pycache__/dummy_tf_objects.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ venv/Lib/site-packages/yaml/_yaml.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/f2py.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/huggingface-cli.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/normalizer.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/numpy-config.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/pip-3.13.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/pip.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/pip3.13.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/pip3.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/tqdm.exe filter=lfs diff=lfs merge=lfs -text
+ venv/Scripts/transformers-cli.exe filter=lfs diff=lfs merge=lfs -text
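The rules above route the virtual environment's native binaries (`.pyd`, `.dll`, `.exe`, `.lib`) and a handful of oversized `.pyc` caches through Git LFS. As a rough, hedged illustration (not part of the commit), this sketch enumerates the files under `venv/` that such suffix-based rules would target, assuming it is run from the repository root:

```python
# Hedged sketch, not part of the diff: list files under venv/ whose
# extensions match the binary patterns the new .gitattributes rules
# send through Git LFS.
from pathlib import Path

LFS_SUFFIXES = {".pyd", ".dll", ".exe", ".lib"}

for path in sorted(Path("venv").rglob("*")):
    if path.suffix in LFS_SUFFIXES:
        print(path)
```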
venv/.gitignore ADDED
@@ -0,0 +1,2 @@
+ # created by virtualenv automatically
+ *
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2017-2021 Ingy döt Net
+ Copyright (c) 2006-2016 Kirill Simonov
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
+ this software and associated documentation files (the "Software"), to deal in
+ the Software without restriction, including without limitation the rights to
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ of the Software, and to permit persons to whom the Software is furnished to do
+ so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA ADDED
@@ -0,0 +1,46 @@
+ Metadata-Version: 2.1
+ Name: PyYAML
+ Version: 6.0.2
+ Summary: YAML parser and emitter for Python
+ Home-page: https://pyyaml.org/
+ Download-URL: https://pypi.org/project/PyYAML/
+ Author: Kirill Simonov
+ Author-email: xi@resolvent.net
+ License: MIT
+ Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
+ Project-URL: CI, https://github.com/yaml/pyyaml/actions
+ Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
+ Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
+ Project-URL: Source Code, https://github.com/yaml/pyyaml
+ Platform: Any
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Cython
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Text Processing :: Markup
+ Requires-Python: >=3.8
+ License-File: LICENSE
+
+ YAML is a data serialization format designed for human readability
+ and interaction with scripting languages. PyYAML is a YAML parser
+ and emitter for Python.
+
+ PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
+ support, a capable extension API, and sensible error messages. PyYAML
+ supports standard YAML tags and provides Python-specific tags that
+ allow representing arbitrary Python objects.
+
+ PyYAML is applicable for a broad range of tasks from complex
+ configuration files to object serialization and persistence.
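As a quick, hedged illustration of the parser/emitter behaviour this description refers to (plain library usage, not part of the diff):

```python
# Minimal PyYAML round-trip: safe_load parses YAML 1.1 into Python
# objects, safe_dump emits it back (sketch, not part of the commit).
import yaml

doc = yaml.safe_load("name: PyYAML\nversion: 6.0.2\ntags: [parser, emitter]")
print(doc["version"])            # -> 6.0.2
print(yaml.safe_dump(doc), end="")
```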
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD ADDED
@@ -0,0 +1,43 @@
+ PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ PyYAML-6.0.2.dist-info/LICENSE,sha256=jTko-dxEkP1jVwfLiOsmvXZBAqcoKVQwfT5RZ6V36KQ,1101
+ PyYAML-6.0.2.dist-info/METADATA,sha256=9lwXqTOrXPts-jI2Lo5UwuaAYo0hiRA0BZqjch0WjAk,2106
+ PyYAML-6.0.2.dist-info/RECORD,,
+ PyYAML-6.0.2.dist-info/WHEEL,sha256=ugue6NJCr9gUOQmWni1lhHLbY_ilTPbmSokNVdK9MnY,102
+ PyYAML-6.0.2.dist-info/top_level.txt,sha256=rpj0IVMTisAjh_1vG3Ccf9v5jpCQwAz6cD1IVU5ZdhQ,11
+ _yaml/__init__.py,sha256=04Ae_5osxahpJHa3XBZUAf4wi6XX32gR8D6X6p64GEA,1402
+ _yaml/__pycache__/__init__.cpython-313.pyc,,
+ yaml/__init__.py,sha256=N35S01HMesFTe0aRRMWkPj0Pa8IEbHpE9FK7cr5Bdtw,12311
+ yaml/__pycache__/__init__.cpython-313.pyc,,
+ yaml/__pycache__/composer.cpython-313.pyc,,
+ yaml/__pycache__/constructor.cpython-313.pyc,,
+ yaml/__pycache__/cyaml.cpython-313.pyc,,
+ yaml/__pycache__/dumper.cpython-313.pyc,,
+ yaml/__pycache__/emitter.cpython-313.pyc,,
+ yaml/__pycache__/error.cpython-313.pyc,,
+ yaml/__pycache__/events.cpython-313.pyc,,
+ yaml/__pycache__/loader.cpython-313.pyc,,
+ yaml/__pycache__/nodes.cpython-313.pyc,,
+ yaml/__pycache__/parser.cpython-313.pyc,,
+ yaml/__pycache__/reader.cpython-313.pyc,,
+ yaml/__pycache__/representer.cpython-313.pyc,,
+ yaml/__pycache__/resolver.cpython-313.pyc,,
+ yaml/__pycache__/scanner.cpython-313.pyc,,
+ yaml/__pycache__/serializer.cpython-313.pyc,,
+ yaml/__pycache__/tokens.cpython-313.pyc,,
+ yaml/_yaml.cp313-win_amd64.pyd,sha256=_iGlW4L7exHQxfjFi8uRZ1FD2cvMLGZnUB97b3sPn2g,263680
+ yaml/composer.py,sha256=_Ko30Wr6eDWUeUpauUGT3Lcg9QPBnOPVlTnIMRGJ9FM,4883
+ yaml/constructor.py,sha256=kNgkfaeLUkwQYY_Q6Ff1Tz2XVw_pG1xVE9Ak7z-viLA,28639
+ yaml/cyaml.py,sha256=6ZrAG9fAYvdVe2FK_w0hmXoG7ZYsoYUwapG8CiC72H0,3851
+ yaml/dumper.py,sha256=PLctZlYwZLp7XmeUdwRuv4nYOZ2UBnDIUy8-lKfLF-o,2837
+ yaml/emitter.py,sha256=jghtaU7eFwg31bG0B7RZea_29Adi9CKmXq_QjgQpCkQ,43006
+ yaml/error.py,sha256=Ah9z-toHJUbE9j-M8YpxgSRM5CgLCcwVzJgLLRF2Fxo,2533
+ yaml/events.py,sha256=50_TksgQiE4up-lKo_V-nBy-tAIxkIPQxY5qDhKCeHw,2445
+ yaml/loader.py,sha256=UVa-zIqmkFSCIYq_PgSGm4NSJttHY2Rf_zQ4_b1fHN0,2061
+ yaml/nodes.py,sha256=gPKNj8pKCdh2d4gr3gIYINnPOaOxGhJAUiYhGRnPE84,1440
+ yaml/parser.py,sha256=ilWp5vvgoHFGzvOZDItFoGjD6D42nhlZrZyjAwa0oJo,25495
+ yaml/reader.py,sha256=0dmzirOiDG4Xo41RnuQS7K9rkY3xjHiVasfDMNTqCNw,6794
+ yaml/representer.py,sha256=IuWP-cAW9sHKEnS0gCqSa894k1Bg4cgTxaDwIcbRQ-Y,14190
+ yaml/resolver.py,sha256=9L-VYfm4mWHxUD1Vg4X7rjDRK_7VZd6b92wzq7Y2IKY,9004
+ yaml/scanner.py,sha256=YEM3iLZSaQwXcQRg2l2R4MdT0zGP2F9eHkKGKnHyWQY,51279
+ yaml/serializer.py,sha256=ChuFgmhU01hj4xgI8GaKv6vfM2Bujwa9i7d2FAHj7cA,4165
+ yaml/tokens.py,sha256=lTQIzSVw8Mg9wv459-TjiOQe6wVziqaRlqX2_89rp54,2573
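Each RECORD row is a `path,sha256=<digest>,size` triple, with the hash and size left empty for files such as RECORD itself and byte-compiled caches. A hedged sketch of reading that layout (assumption: plain CSV rows exactly as shown above):

```python
# Hedged sketch: RECORD rows are comma-separated path/hash/size triples,
# so the csv module parses them directly (empty fields mean "unhashed").
import csv
import io

rows = (
    "PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4\n"
    "PyYAML-6.0.2.dist-info/RECORD,,\n"
    "yaml/__pycache__/__init__.cpython-313.pyc,,\n"
)

for path, digest, size in csv.reader(io.StringIO(rows)):
    print(path, digest or "<no hash>", size or "<no size>")
```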
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: bdist_wheel (0.44.0)
+ Root-Is-Purelib: false
+ Tag: cp313-cp313-win_amd64
+
venv/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
+ _yaml
+ yaml
venv/Lib/site-packages/__pycache__/_virtualenv.cpython-313.pyc ADDED
Binary file (4.24 kB).
 
venv/Lib/site-packages/__pycache__/typing_extensions.cpython-313.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12c04370c14aa791e9c7507e203eb1047185bf2492c88001270566b6abce838c
+ size 177778
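Several of the committed files are stored as Git LFS pointers like the one above rather than as raw bytes. A hedged sketch of reading such a pointer (the three `key value` lines follow the git-lfs spec v1 layout shown here; the parser is illustrative, not part of the commit):

```python
# Hedged sketch: split each "key value" line of an LFS pointer file.
# The pointer text is the one committed for typing_extensions.cpython-313.pyc.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:12c04370c14aa791e9c7507e203eb1047185bf2492c88001270566b6abce838c\n"
    "size 177778\n"
)
info = parse_lfs_pointer(pointer)
print(info["oid"], info["size"])  # sha256:12c0... 177778
```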
venv/Lib/site-packages/_virtualenv.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69ac3d8f27e679c81b94ab30b3b56e9cd138219b1ba94a1fa3606d5a76a1433d
+ size 18
venv/Lib/site-packages/_virtualenv.py ADDED
@@ -0,0 +1,103 @@
+ """Patches that are applied at runtime to the virtual environment."""
+
+ from __future__ import annotations
+
+ import os
+ import sys
+
+ VIRTUALENV_PATCH_FILE = os.path.join(__file__)
+
+
+ def patch_dist(dist):
+     """
+     Distutils allows users to configure some arguments via a configuration file:
+     https://docs.python.org/3/install/index.html#distutils-configuration-files.
+
+     Some of these arguments, though, don't make sense in the context of the virtual environment files; let's fix them up.
+     """  # noqa: D205
+     # we cannot allow some install config as that would get packages installed outside of the virtual environment
+     old_parse_config_files = dist.Distribution.parse_config_files
+
+     def parse_config_files(self, *args, **kwargs):
+         result = old_parse_config_files(self, *args, **kwargs)
+         install = self.get_option_dict("install")
+
+         if "prefix" in install:  # the prefix governs where to install the libraries
+             install["prefix"] = VIRTUALENV_PATCH_FILE, os.path.abspath(sys.prefix)
+         for base in ("purelib", "platlib", "headers", "scripts", "data"):
+             key = f"install_{base}"
+             if key in install:  # do not allow global configs to hijack venv paths
+                 install.pop(key, None)
+         return result
+
+     dist.Distribution.parse_config_files = parse_config_files
+
+
+ # Import hook that patches some modules to ignore configuration values that break package installation in case
+ # of virtual environments.
+ _DISTUTILS_PATCH = "distutils.dist", "setuptools.dist"
+ # https://docs.python.org/3/library/importlib.html#setting-up-an-importer
+
+
+ class _Finder:
+     """A meta path finder that allows patching the imported distutils modules."""
+
+     fullname = None
+
+     # lock[0] is threading.Lock(), but initialized lazily to avoid importing threading very early at startup,
+     # because there are gevent-based applications that need to be first to import threading by themselves.
+     # See https://github.com/pypa/virtualenv/issues/1895 for details.
+     lock = []  # noqa: RUF012
+
+     def find_spec(self, fullname, path, target=None):  # noqa: ARG002
+         if fullname in _DISTUTILS_PATCH and self.fullname is None:  # noqa: PLR1702
+             # initialize lock[0] lazily
+             if len(self.lock) == 0:
+                 import threading  # noqa: PLC0415
+
+                 lock = threading.Lock()
+                 # there is a possibility that two threads T1 and T2 are simultaneously running into find_spec,
+                 # observing .lock as empty, and further going into this initialization. However, due to the GIL,
+                 # the list.append() operation is atomic and this way only one of the threads will "win" to put the lock
+                 # - that every thread will use - into .lock[0].
+                 # https://docs.python.org/3/faq/library.html#what-kinds-of-global-value-mutation-are-thread-safe
+                 self.lock.append(lock)
+
+             from functools import partial  # noqa: PLC0415
+             from importlib.util import find_spec  # noqa: PLC0415
+
+             with self.lock[0]:
+                 self.fullname = fullname
+                 try:
+                     spec = find_spec(fullname, path)
+                     if spec is not None:
+                         # https://www.python.org/dev/peps/pep-0451/#how-loading-will-work
+                         is_new_api = hasattr(spec.loader, "exec_module")
+                         func_name = "exec_module" if is_new_api else "load_module"
+                         old = getattr(spec.loader, func_name)
+                         func = self.exec_module if is_new_api else self.load_module
+                         if old is not func:
+                             try:  # noqa: SIM105
+                                 setattr(spec.loader, func_name, partial(func, old))
+                             except AttributeError:
+                                 pass  # C-Extension loaders are r/o such as zipimporter with <3.7
+                     return spec
+                 finally:
+                     self.fullname = None
+         return None
+
+     @staticmethod
+     def exec_module(old, module):
+         old(module)
+         if module.__name__ in _DISTUTILS_PATCH:
+             patch_dist(module)
+
+     @staticmethod
+     def load_module(old, name):
+         module = old(name)
+         if module.__name__ in _DISTUTILS_PATCH:
+             patch_dist(module)
+         return module
+
+
+ sys.meta_path.insert(0, _Finder())
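_virtualenv.py works by placing a finder at the front of `sys.meta_path`, so it observes every import before the normal machinery resolves it. A stripped-down, hedged sketch of just that mechanism (illustrative only; not the shipped code):

```python
# Minimal sys.meta_path sketch: a finder whose find_spec() is consulted
# first; returning None defers to the remaining finders, which is how a
# patching finder stays transparent for modules it does not care about.
import sys

class AnnouncingFinder:
    WATCHED = {"distutils.dist", "setuptools.dist"}

    def find_spec(self, fullname, path, target=None):
        if fullname in self.WATCHED:
            print(f"import of {fullname} observed")
        return None  # let the default import machinery resolve the module

sys.meta_path.insert(0, AnnouncingFinder())
```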
venv/Lib/site-packages/_yaml/__init__.py ADDED
@@ -0,0 +1,33 @@
+ # This is a stub package designed to roughly emulate the _yaml
+ # extension module, which previously existed as a standalone module
+ # and has been moved into the `yaml` package namespace.
+ # It does not perfectly mimic its old counterpart, but should get
+ # close enough for anyone who's relying on it even when they shouldn't.
+ import yaml
+
+ # in some circumstances, the yaml module we imported may be from a different version, so we need
+ # to tread carefully when poking at it here (it may not have the attributes we expect)
+ if not getattr(yaml, '__with_libyaml__', False):
+     from sys import version_info
+
+     exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
+     raise exc("No module named '_yaml'")
+ else:
+     from yaml._yaml import *
+     import warnings
+     warnings.warn(
+         'The _yaml extension module is now located at yaml._yaml'
+         ' and its location is subject to change. To use the'
+         ' LibYAML-based parser and emitter, import from `yaml`:'
+         ' `from yaml import CLoader as Loader, CDumper as Dumper`.',
+         DeprecationWarning
+     )
+     del warnings
+     # Don't `del yaml` here because yaml is actually an existing
+     # namespace member of _yaml.
+
+ __name__ = '_yaml'
+ # If the module is top-level (i.e. not a part of any specific package)
+ # then the attribute should be set to ''.
+ # https://docs.python.org/3.8/library/types.html
+ __package__ = ''
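The deprecation warning in this stub spells out the supported replacement. A hedged sketch of that import pattern, with a pure-Python fallback for PyYAML builds without libyaml:

```python
# Import the libyaml-backed classes as the warning recommends, falling
# back to the pure-Python implementations when libyaml is unavailable.
import yaml

try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

print(yaml.load("answer: 42", Loader=Loader))  # {'answer': 42}
```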
venv/Lib/site-packages/_yaml/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (872 Bytes).
 
venv/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
venv/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE ADDED
@@ -0,0 +1,20 @@
+ This package contains a modified version of ca-bundle.crt:
+
+ ca-bundle.crt -- Bundle of CA Root Certificates
+
+ This is a bundle of X.509 certificates of public Certificate Authorities
+ (CA). These were automatically extracted from Mozilla's root certificates
+ file (certdata.txt). This file can be found in the mozilla source tree:
+ https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
+ It contains the certificates in PEM format and therefore
+ can be directly used with curl / libcurl / php_curl, or with
+ an Apache+mod_ssl webserver for SSL client authentication.
+ Just configure this file as the SSLCACertificateFile.#
+
+ ***** BEGIN LICENSE BLOCK *****
+ This Source Code Form is subject to the terms of the Mozilla Public License,
+ v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
+ one at http://mozilla.org/MPL/2.0/.
+
+ ***** END LICENSE BLOCK *****
+ @(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
venv/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA ADDED
@@ -0,0 +1,77 @@
+ Metadata-Version: 2.2
+ Name: certifi
+ Version: 2025.1.31
+ Summary: Python package for providing Mozilla's CA Bundle.
+ Home-page: https://github.com/certifi/python-certifi
+ Author: Kenneth Reitz
+ Author-email: me@kennethreitz.com
+ License: MPL-2.0
+ Project-URL: Source, https://github.com/certifi/python-certifi
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
+ Classifier: Natural Language :: English
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.6
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.6
+ License-File: LICENSE
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: project-url
+ Dynamic: requires-python
+ Dynamic: summary
+
+ Certifi: Python SSL Certificates
+ ================================
+
+ Certifi provides Mozilla's carefully curated collection of Root Certificates for
+ validating the trustworthiness of SSL certificates while verifying the identity
+ of TLS hosts. It has been extracted from the `Requests`_ project.
+
+ Installation
+ ------------
+
+ ``certifi`` is available on PyPI. Simply install it with ``pip``::
+
+     $ pip install certifi
+
+ Usage
+ -----
+
+ To reference the installed certificate authority (CA) bundle, you can use the
+ built-in function::
+
+     >>> import certifi
+
+     >>> certifi.where()
+     '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
+
+ Or from the command line::
+
+     $ python -m certifi
+     /usr/local/lib/python3.7/site-packages/certifi/cacert.pem
+
+ Enjoy!
+
+ .. _`Requests`: https://requests.readthedocs.io/en/master/
+
+ Addition/Removal of Certificates
+ --------------------------------
+
+ Certifi does not support any addition/removal or other modification of the
+ CA trust store content. This project is intended to provide a reliable and
+ highly portable root of trust to python deployments. Look to upstream projects
+ for methods to use alternate trust.
venv/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+ certifi-2025.1.31.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ certifi-2025.1.31.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
+ certifi-2025.1.31.dist-info/METADATA,sha256=t5kcT5aGu0dQ6_psUNZYTqnC0uCRnponewm3uYjeHbg,2451
+ certifi-2025.1.31.dist-info/RECORD,,
+ certifi-2025.1.31.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ certifi-2025.1.31.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
+ certifi/__init__.py,sha256=neIaAf7BM36ygmQCmy-ZsSyjnvjWghFeu13wwEAnjj0,94
+ certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
+ certifi/__pycache__/__init__.cpython-313.pyc,,
+ certifi/__pycache__/__main__.cpython-313.pyc,,
+ certifi/__pycache__/core.cpython-313.pyc,,
+ certifi/cacert.pem,sha256=xVsh-Qf3-G1IrdCTVS-1ZRdJ_1-GBQjMu0I9bB-9gMc,297255
+ certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
+ certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
venv/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (75.8.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
venv/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ certifi
venv/Lib/site-packages/certifi/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .core import contents, where
+
+ __all__ = ["contents", "where"]
+ __version__ = "2025.01.31"
venv/Lib/site-packages/certifi/__main__.py ADDED
@@ -0,0 +1,12 @@
+ import argparse
+
+ from certifi import contents, where
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-c", "--contents", action="store_true")
+ args = parser.parse_args()
+
+ if args.contents:
+     print(contents())
+ else:
+     print(where())
venv/Lib/site-packages/certifi/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (306 Bytes).
 
venv/Lib/site-packages/certifi/__pycache__/__main__.cpython-313.pyc ADDED
Binary file (623 Bytes).
 
venv/Lib/site-packages/certifi/__pycache__/core.cpython-313.pyc ADDED
Binary file (3.2 kB).
 
venv/Lib/site-packages/certifi/cacert.pem ADDED
The diff for this file is too large to render. See raw diff
 
venv/Lib/site-packages/certifi/core.py ADDED
@@ -0,0 +1,114 @@
+ """
+ certifi.py
+ ~~~~~~~~~~
+
+ This module returns the installation location of cacert.pem or its contents.
+ """
+ import sys
+ import atexit
+
+ def exit_cacert_ctx() -> None:
+     _CACERT_CTX.__exit__(None, None, None)  # type: ignore[union-attr]
+
+
+ if sys.version_info >= (3, 11):
+
+     from importlib.resources import as_file, files
+
+     _CACERT_CTX = None
+     _CACERT_PATH = None
+
+     def where() -> str:
+         # This is slightly terrible, but we want to delay extracting the file
+         # in cases where we're inside of a zipimport situation until someone
+         # actually calls where(), but we don't want to re-extract the file
+         # on every call of where(), so we'll do it once then store it in a
+         # global variable.
+         global _CACERT_CTX
+         global _CACERT_PATH
+         if _CACERT_PATH is None:
+             # This is slightly janky, the importlib.resources API wants you to
+             # manage the cleanup of this file, so it doesn't actually return a
+             # path, it returns a context manager that will give you the path
+             # when you enter it and will do any cleanup when you leave it. In
+             # the common case of not needing a temporary file, it will just
+             # return the file system location and the __exit__() is a no-op.
+             #
+             # We also have to hold onto the actual context manager, because
+             # it will do the cleanup whenever it gets garbage collected, so
+             # we will also store that at the global level as well.
+             _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
+             _CACERT_PATH = str(_CACERT_CTX.__enter__())
+             atexit.register(exit_cacert_ctx)
+
+         return _CACERT_PATH
+
+     def contents() -> str:
+         return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
+
+ elif sys.version_info >= (3, 7):
+
+     from importlib.resources import path as get_path, read_text
+
+     _CACERT_CTX = None
+     _CACERT_PATH = None
+
+     def where() -> str:
+         # This is slightly terrible, but we want to delay extracting the
+         # file in cases where we're inside of a zipimport situation until
+         # someone actually calls where(), but we don't want to re-extract
+         # the file on every call of where(), so we'll do it once then store
+         # it in a global variable.
+         global _CACERT_CTX
+         global _CACERT_PATH
+         if _CACERT_PATH is None:
+             # This is slightly janky, the importlib.resources API wants you
+             # to manage the cleanup of this file, so it doesn't actually
+             # return a path, it returns a context manager that will give
+             # you the path when you enter it and will do any cleanup when
+             # you leave it. In the common case of not needing a temporary
+             # file, it will just return the file system location and the
+             # __exit__() is a no-op.
+             #
+             # We also have to hold onto the actual context manager, because
+             # it will do the cleanup whenever it gets garbage collected, so
+             # we will also store that at the global level as well.
+             _CACERT_CTX = get_path("certifi", "cacert.pem")
+             _CACERT_PATH = str(_CACERT_CTX.__enter__())
+             atexit.register(exit_cacert_ctx)
+
+         return _CACERT_PATH
+
+     def contents() -> str:
+         return read_text("certifi", "cacert.pem", encoding="ascii")
+
+ else:
+     import os
+     import types
+     from typing import Union
+
+     Package = Union[types.ModuleType, str]
+     Resource = Union[str, "os.PathLike"]
+
+     # This fallback will work for Python versions prior to 3.7 that lack the
+     # importlib.resources module but relies on the existing `where` function
+     # so won't address issues with environments like PyOxidizer that don't set
+     # __file__ on modules.
+     def read_text(
+         package: Package,
+         resource: Resource,
+         encoding: str = 'utf-8',
+         errors: str = 'strict'
+     ) -> str:
+         with open(where(), encoding=encoding) as data:
+             return data.read()
+
+     # If we don't have importlib.resources, then we will just do the old logic
+     # of assuming we're on the filesystem and munge the path directly.
+     def where() -> str:
+         f = os.path.dirname(__file__)
+
+         return os.path.join(f, "cacert.pem")
+
+     def contents() -> str:
+         return read_text("certifi", "cacert.pem", encoding="ascii")
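Outside of certifi itself, the usual way to consume `where()` is to hand the bundle path to the standard library's `ssl` module. A hedged sketch (stdlib plus certifi only; not part of the diff):

```python
# Build an SSL context that validates against certifi's CA bundle.
import ssl
import certifi

context = ssl.create_default_context(cafile=certifi.where())
print(certifi.where())          # filesystem path of cacert.pem
print(len(certifi.contents()))  # size of the PEM text in characters
```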
venv/Lib/site-packages/certifi/py.typed ADDED
File without changes
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 TAHRI Ahmed R.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: charset-normalizer
3
+ Version: 3.4.1
4
+ Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
5
+ Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
6
+ Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
7
+ License: MIT
8
+ Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
9
+ Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
10
+ Project-URL: Code, https://github.com/jawah/charset_normalizer
11
+ Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
12
+ Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.7
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Programming Language :: Python :: 3.13
26
+ Classifier: Programming Language :: Python :: 3 :: Only
27
+ Classifier: Programming Language :: Python :: Implementation :: CPython
28
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
29
+ Classifier: Topic :: Text Processing :: Linguistic
30
+ Classifier: Topic :: Utilities
31
+ Classifier: Typing :: Typed
32
+ Requires-Python: >=3.7
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Provides-Extra: unicode-backport
36
+
37
+ <h1 align="center">Charset Detection, for Everyone 👋</h1>
38
+
39
+ <p align="center">
40
+ <sup>The Real First Universal Charset Detector</sup><br>
41
+ <a href="https://pypi.org/project/charset-normalizer">
42
+ <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
43
+ </a>
44
+ <a href="https://pepy.tech/project/charset-normalizer/">
45
+ <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
46
+ </a>
47
+ <a href="https://bestpractices.coreinfrastructure.org/projects/7297">
48
+ <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
49
+ </a>
50
+ </p>
51
+ <p align="center">
52
+ <sup><i>Featured Packages</i></sup><br>
53
+ <a href="https://github.com/jawah/niquests">
54
+ <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
55
+ </a>
56
+ <a href="https://github.com/jawah/wassima">
57
+ <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
58
+ </a>
59
+ </p>
60
+ <p align="center">
61
+ <sup><i>In other language (unofficial port - by the community)</i></sup><br>
62
+ <a href="https://github.com/nickspring/charset-normalizer-rs">
63
+ <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
64
+ </a>
65
+ </p>
66
+
67
+ > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
68
+ > I'm trying to resolve the issue by taking a new approach.
69
+ > All IANA character set names for which the Python core library provides codecs are supported.
70
+
71
+ <p align="center">
72
+ >>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
73
+ </p>
74
+
75
+ This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
76
+
77
+ | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
78
+ |--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
79
+ | `Fast` | ❌ | ✅ | ✅ |
80
+ | `Universal**` | ❌ | ✅ | ❌ |
81
+ | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
82
+ | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
83
+ | `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
84
+ | `Native Python` | ✅ | ✅ | ❌ |
85
+ | `Detect spoken language` | ❌ | ✅ | N/A |
86
+ | `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
87
+ | `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
88
+ | `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
89
+
90
+ <p align="center">
91
+ <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
92
+ </p>
93
+
94
+ *\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
95
+
96
+ ## ⚡ Performance
97
+
98
+ This package offer better performance than its counterpart Chardet. Here are some numbers.
99
+
100
+ | Package | Accuracy | Mean per file (ms) | File per sec (est) |
101
+ |-----------------------------------------------|:--------:|:------------------:|:------------------:|
102
+ | [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
103
+ | charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
104
+
105
+ | Package | 99th percentile | 95th percentile | 50th percentile |
106
+ |-----------------------------------------------|:---------------:|:---------------:|:---------------:|
107
+ | [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
108
+ | charset-normalizer | 100 ms | 50 ms | 5 ms |
109
+
110
+ _updated as of december 2024 using CPython 3.12_
111
+
112
+ Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
113
+
114
+ > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
115
+ > And yes, these results might change at any time. The dataset can be updated to include more files.
116
+ > The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
117
+ > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
118
+ > (e.g. Supported Encoding) Challenge-them if you want.
119
+
120
+ ## ✨ Installation
121
+
122
+ Using pip:
123
+
124
+ ```sh
125
+ pip install charset-normalizer -U
126
+ ```
127
+
128
+ ## 🚀 Basic Usage
129
+
130
+ ### CLI
131
+ This package comes with a CLI.
132
+
133
+ ```
134
+ usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
135
+ file [file ...]
136
+
137
+ The Real First Universal Charset Detector. Discover originating encoding used
138
+ on text file. Normalize text to unicode.
139
+
140
+ positional arguments:
141
+ files File(s) to be analysed
142
+
143
+ optional arguments:
144
+ -h, --help show this help message and exit
145
+ -v, --verbose Display complementary information about file if any.
146
+ Stdout will contain logs about the detection process.
147
+ -a, --with-alternative
148
+ Output complementary possibilities if any. Top-level
149
+ JSON WILL be a list.
150
+ -n, --normalize Permit to normalize input file. If not set, program
151
+ does not write anything.
152
+ -m, --minimal Only output the charset detected to STDOUT. Disabling
153
+ JSON output.
154
+ -r, --replace Replace file when trying to normalize it instead of
155
+ creating a new one.
156
+ -f, --force Replace file without asking if you are sure, use this
157
+ flag with caution.
158
+ -t THRESHOLD, --threshold THRESHOLD
159
+ Define a custom maximum amount of chaos allowed in
160
+ decoded content. 0. <= chaos <= 1.
161
+ --version Show version information and exit.
162
+ ```
163
+
164
+ ```bash
165
+ normalizer ./data/sample.1.fr.srt
166
+ ```
167
+
168
+ or
169
+
170
+ ```bash
171
+ python -m charset_normalizer ./data/sample.1.fr.srt
172
+ ```
173
+
174
+ 🎉 Since version 1.4.0, the CLI produces easily usable stdout results in JSON format.
175
+
176
+ ```json
177
+ {
178
+ "path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
179
+ "encoding": "cp1252",
180
+ "encoding_aliases": [
181
+ "1252",
182
+ "windows_1252"
183
+ ],
184
+ "alternative_encodings": [
185
+ "cp1254",
186
+ "cp1256",
187
+ "cp1258",
188
+ "iso8859_14",
189
+ "iso8859_15",
190
+ "iso8859_16",
191
+ "iso8859_3",
192
+ "iso8859_9",
193
+ "latin_1",
194
+ "mbcs"
195
+ ],
196
+ "language": "French",
197
+ "alphabets": [
198
+ "Basic Latin",
199
+ "Latin-1 Supplement"
200
+ ],
201
+ "has_sig_or_bom": false,
202
+ "chaos": 0.149,
203
+ "coherence": 97.152,
204
+ "unicode_path": null,
205
+ "is_preferred": true
206
+ }
207
+ ```
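+ 
+ If you only need the detected charset itself, e.g. for shell scripting, the `-m` / `--minimal` flag listed in the help above prints just the charset to STDOUT instead of the JSON document:
+ 
+ ```bash
+ normalizer -m ./data/sample.1.fr.srt
+ ```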
208
+
209
+ ### Python
210
+ *Just print out normalized text*
211
+ ```python
212
+ from charset_normalizer import from_path
213
+
214
+ results = from_path('./my_subtitle.srt')
215
+
216
+ print(str(results.best()))
217
+ ```
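+ 
+ `best()` returns a `CharsetMatch` (or `None` when nothing fits), so you can also inspect the detection itself rather than just the decoded text. A minimal sketch, assuming a match was found:
+ 
+ ```python
+ from charset_normalizer import from_path
+ 
+ best_guess = from_path('./my_subtitle.srt').best()
+ 
+ if best_guess is not None:
+     print(best_guess.encoding)  # e.g. 'cp1252', as in the JSON sample above
+ ```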
218
+
219
+ *Upgrade your code without effort*
220
+ ```python
221
+ from charset_normalizer import detect
222
+ ```
223
+
224
+ The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) backward-compatible result possible.
225
+
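+ A minimal drop-in sketch: like chardet's `detect`, the legacy function takes raw bytes and returns a dict with `encoding`, `language` and `confidence` keys.
+ 
+ ```python
+ from charset_normalizer import detect
+ 
+ # chardet-compatible API: pass raw bytes, get a result dict back
+ result = detect('Bсеки човек има право на образование.'.encode('utf_8'))
+ print(result['encoding'])  # e.g. 'utf-8'
+ ```
+ 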
226
+ See the docs for advanced usage: [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
227
+
228
+ ## 😇 Why
229
+
230
+ When I started using Chardet, I noticed that it did not suit my expectations, and I wanted to propose a
231
+ reliable alternative using a completely different method. Also, I never back down from a good challenge!
232
+
233
+ I **don't care** about the **originating charset** encoding, because **two different tables** can
234
+ produce **two identical rendered strings.**
235
+ What I want is to get readable text, the best I can.
236
+
237
+ In a way, **I'm brute forcing text decoding.** How cool is that? 😎
238
+
239
+ Don't confuse the **ftfy** package with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer converts a raw file in an unknown encoding to Unicode.
240
+
241
+ ## 🍰 How
242
+
243
+ - Discard all charset encoding tables that could not fit the binary content.
244
+ - Measure the noise, or mess, once opened (in chunks) with a corresponding charset encoding.
245
+ - Extract the matches with the lowest mess detected.
246
+ - Additionally, we measure coherence / probe for a language (see the sketch below).
247
+
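+ For illustration only, here is a hedged, conceptual sketch of that loop; the candidate list and the "mess" heuristic below are simplifications, not the library's actual tables or scoring:
+ 
+ ```python
+ # Conceptual sketch only, not charset-normalizer's actual implementation.
+ CANDIDATES = ["utf_8", "cp1252", "latin_1", "utf_16"]  # illustrative subset of tables
+ 
+ def naive_detect(payload: bytes) -> list:
+     scored = []
+     for codec in CANDIDATES:
+         try:
+             text = payload.decode(codec)  # step 1: discard tables that cannot fit
+         except (UnicodeDecodeError, LookupError):
+             continue
+         # step 2: toy "mess" measure, the share of characters that are neither printable nor whitespace
+         mess = sum(not (c.isprintable() or c.isspace()) for c in text) / max(len(text), 1)
+         scored.append((codec, mess))
+     return sorted(scored, key=lambda item: item[1])  # step 3: lowest mess first
+ ```
+ 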
248
+ **Wait a minute**, what is noise/mess and coherence according to **YOU?**
249
+
250
+ *Noise:* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
252
+ **I established** some ground rules about **what is obvious** when **it seems like** a mess (i.e. defining noise in rendered text).
253
+ I know that my interpretation of what counts as noise is probably incomplete; feel free to contribute in order to
254
+ improve or rewrite it.
254
+
255
+ *Coherence:* For each language on Earth, we have computed ranked letter-appearance occurrences (as best we can). So I thought
256
+ that intel is worth something here. So I use those records against the decoded text to check if I can detect intelligent design.
257
+
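+ A toy illustration of that coherence idea; the English letter ranking below is the rough, well-known "ETAOIN SHRDLU" ordering, not the project's actual frequency data:
+ 
+ ```python
+ from collections import Counter
+ 
+ FREQ_EN = list("etaoinshrdlcumwfgypbvkjxqz")  # rough English letter ranking, illustrative only
+ 
+ def coherence(text: str, ranking: list = FREQ_EN, top: int = 10) -> float:
+     letters = [c for c in text.lower() if c.isalpha()]
+     observed = [letter for letter, _ in Counter(letters).most_common(top)]
+     expected = set(ranking[:top])
+     # Fraction of the text's most frequent letters found among the expected top letters.
+     return sum(letter in expected for letter in observed) / max(len(observed), 1)
+ ```
+ 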
258
+ ## ⚡ Known limitations
259
+
260
+ - Language detection is unreliable when the text contains two or more languages sharing identical letters (e.g. HTML with English tags + Turkish content, both using Latin characters).
261
+ - Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
262
+
263
+ ## ⚠️ About Python EOLs
264
+
265
+ **If you are running:**
266
+
267
+ - Python >=2.7,<3.5: Unsupported
268
+ - Python 3.5: charset-normalizer < 2.1
269
+ - Python 3.6: charset-normalizer < 3.1
270
+ - Python 3.7: charset-normalizer < 4.0
271
+
272
+ Upgrade your Python interpreter as soon as possible.
273
+
274
+ ## 👤 Contributing
275
+
276
+ Contributions, issues and feature requests are very much welcome.<br />
277
+ Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
278
+
279
+ ## 📝 License
280
+
281
+ Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
282
+ This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
283
+
284
+ Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
285
+
286
+ ## 💼 For Enterprise
287
+
288
+ Professional support for charset-normalizer is available as part of the [Tidelift
289
+ Subscription][1]. Tidelift gives software development teams a single source for
290
+ purchasing and maintaining their software, with professional grade assurances
291
+ from the experts who know it best, while seamlessly integrating with existing
292
+ tools.
293
+
294
+ [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
295
+
296
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
297
+
298
+ # Changelog
299
+ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
300
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
301
+
302
+ ## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
303
+
304
+ ### Changed
305
+ - Project metadata is now stored using `pyproject.toml` instead of `setup.cfg`, using setuptools as the build backend.
306
+ - Enforce delayed annotation loading for simpler and more consistent types in the project.
307
+ - Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
308
+
309
+ ### Added
310
+ - pre-commit configuration.
311
+ - noxfile.
312
+
313
+ ### Removed
314
+ - `build-requirements.txt` as per using `pyproject.toml` native build configuration.
315
+ - `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
316
+ - `setup.cfg` in favor of `pyproject.toml` metadata configuration.
317
+ - Unused `utils.range_scan` function.
318
+
319
+ ### Fixed
320
+ - Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
321
+ - Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
322
+
323
+ ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
324
+
325
+ ### Added
326
+ - Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.
327
+ - Support for Python 3.13 (#512)
328
+
329
+ ### Fixed
330
+ - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.
330
+ - Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
332
+ - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
333
+
334
+ ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
335
+
336
+ ### Fixed
337
+ - Unintentional memory usage regression when using a large payload that matches several encodings (#376)
338
+ - Regression on some detection cases showcased in the documentation (#371)
339
+
340
+ ### Added
341
+ - Noise (md) probe that identifies malformed Arabic representations due to the presence of letters in isolated form (credit to my wife)
342
+
343
+ ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
344
+
345
+ ### Changed
346
+ - Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
347
+ - Improved the general detection reliability based on reports from the community
348
+
349
+ ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
350
+
351
+ ### Added
352
+ - Allow executing the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
353
+ - Support for 9 forgotten encodings that are supported by Python but unlisted in `encodings.aliases` as they have no alias (#323)
354
+
355
+ ### Removed
356
+ - (internal) Redundant utils.is_ascii function and unused function is_private_use_only
357
+ - (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
358
+
359
+ ### Changed
360
+ - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
361
+ - Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
362
+
363
+ ### Fixed
364
+ - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
365
+
366
+ ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
367
+
368
+ ### Changed
369
+ - Typehint for function `from_path` no longer enforces `PathLike` as its first argument
370
+ - Minor improvement over the global detection reliability
371
+
372
+ ### Added
373
+ - Introduce function `is_binary` that relies on the main capabilities, and is optimized to detect binaries
374
+ - Propagate the `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp`, allowing deeper control over the detection (default True)
375
+ - Explicit support for Python 3.12
376
+
377
+ ### Fixed
378
+ - Edge case detection failure where a file would contain a 'very-long' camel-cased word (Issue #289)
379
+
380
+ ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
381
+
382
+ ### Added
383
+ - Argument `should_rename_legacy` for the legacy function `detect`, which now disregards any new arguments without errors (PR #262)
384
+
385
+ ### Removed
386
+ - Support for Python 3.6 (PR #260)
387
+
388
+ ### Changed
389
+ - Optional speedup provided by mypy/c 1.0.1
390
+
391
+ ## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
392
+
393
+ ### Fixed
394
+ - Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
395
+
396
+ ### Changed
397
+ - Speedup provided by mypy/c 0.990 on Python >= 3.7
398
+
399
+ ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
400
+
401
+ ### Added
402
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one); will log the details of the Mess-detector results
403
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
404
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
405
+ - `normalizer --version` now specifies whether the current version provides an extra speedup (meaning a mypyc-compiled wheel)
406
+
407
+ ### Changed
408
+ - Build with static metadata using 'build' frontend
409
+ - Make the language detection stricter
410
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
411
+
412
+ ### Fixed
413
+ - CLI with opt --normalize failing when using a full path for files
414
+ - TooManyAccentuatedPlugin inducing false positives on the mess detection when too few alpha characters have been fed to it
415
+ - Sphinx warnings when generating the documentation
416
+
417
+ ### Removed
418
+ - Coherence detector no longer returns 'Simple English'; it returns 'English' instead
419
+ - Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
420
+ - Breaking: Method `first()` and `best()` from CharsetMatch
421
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (it is unreliable / conflicts with ASCII)
422
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
423
+ - Breaking: Top-level function `normalize`
424
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
425
+ - Support for the backport `unicodedata2`
426
+
427
+ ## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
428
+
429
+ ### Added
430
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one); will log the details of the Mess-detector results
431
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
432
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
433
+
434
+ ### Changed
435
+ - Build with static metadata using 'build' frontend
436
+ - Make the language detection stricter
437
+
438
+ ### Fixed
439
+ - CLI with opt --normalize failing when using a full path for files
440
+ - TooManyAccentuatedPlugin inducing false positives on the mess detection when too few alpha characters have been fed to it
441
+
442
+ ### Removed
443
+ - Coherence detector no longer returns 'Simple English'; it returns 'English' instead
444
+ - Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
445
+
446
+ ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
447
+
448
+ ### Added
449
+ - `normalizer --version` now specifies whether the current version provides an extra speedup (meaning a mypyc-compiled wheel)
450
+
451
+ ### Removed
452
+ - Breaking: Method `first()` and `best()` from CharsetMatch
453
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (it is unreliable / conflicts with ASCII)
454
+
455
+ ### Fixed
456
+ - Sphinx warnings when generating the documentation
457
+
458
+ ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
459
+
460
+ ### Changed
461
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
462
+
463
+ ### Removed
464
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
465
+ - Breaking: Top-level function `normalize`
466
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
467
+ - Support for the backport `unicodedata2`
468
+
469
+ ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
470
+
471
+ ### Deprecated
472
+ - Function `normalize` scheduled for removal in 3.0
473
+
474
+ ### Changed
475
+ - Removed useless call to decode in fn is_unprintable (#206)
476
+
477
+ ### Fixed
478
+ - Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
479
+
480
+ ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
481
+
482
+ ### Added
483
+ - Output the Unicode table version when running the CLI with `--version` (PR #194)
484
+
485
+ ### Changed
486
+ - Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
487
+ - Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
488
+
489
+ ### Fixed
490
+ - Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
491
+ - CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
492
+
493
+ ### Removed
494
+ - Support for Python 3.5 (PR #192)
495
+
496
+ ### Deprecated
497
+ - Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
498
+
499
+ ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
500
+
501
+ ### Fixed
502
+ - ASCII mis-detection in rare cases (PR #170)
503
+
504
+ ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
505
+
506
+ ### Added
507
+ - Explicit support for Python 3.11 (PR #164)
508
+
509
+ ### Changed
510
+ - The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
511
+
512
+ ## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
513
+
514
+ ### Fixed
515
+ - Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
516
+
517
+ ### Changed
518
+ - Skipping the language-detection (CD) on ASCII (PR #155)
519
+
520
+ ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
521
+
522
+ ### Changed
523
+ - Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
524
+
525
+ ### Fixed
526
+ - Wrong logging level applied when setting kwarg `explain` to True (PR #146)
527
+
528
+ ## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
529
+ ### Changed
530
+ - Improvement over Vietnamese detection (PR #126)
531
+ - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
532
+ - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
533
+ - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
534
+ - Code style as refactored by Sourcery-AI (PR #131)
535
+ - Minor adjustment on the MD around european words (PR #133)
536
+ - Remove and replace SRTs from assets / tests (PR #139)
537
+ - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
538
+ - Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
539
+
540
+ ### Fixed
541
+ - Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
542
+ - Avoid using too insignificant chunk (PR #137)
543
+
544
+ ### Added
545
+ - Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
546
+ - Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
547
+
548
+ ## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
549
+ ### Added
550
+ - Add support for Kazakh (Cyrillic) language detection (PR #109)
551
+
552
+ ### Changed
553
+ - Further improved inferring the language from a given single-byte code page (PR #112)
554
+ - Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
555
+ - Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
556
+ - Various detection improvement (MD+CD) (PR #117)
557
+
558
+ ### Removed
559
+ - Remove redundant logging entry about detected language(s) (PR #115)
560
+
561
+ ### Fixed
562
+ - Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
563
+
564
+ ## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
565
+ ### Fixed
566
+ - Unforeseen regression with the loss of backward compatibility with some older minor versions of Python 3.5.x (PR #100)
567
+ - Fix CLI crash when using --minimal output in certain cases (PR #103)
568
+
569
+ ### Changed
570
+ - Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
571
+
572
+ ## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
573
+ ### Changed
574
+ - The project now complies with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
575
+ - The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
576
+ - The Unicode detection is slightly improved (PR #93)
577
+ - Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
578
+
579
+ ### Removed
580
+ - The project no longer raises a warning on tiny content given for detection; it is simply logged as a warning instead (PR #92)
581
+
582
+ ### Fixed
583
+ - In some rare cases, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
584
+ - Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
585
+ - The MANIFEST.in was not exhaustive (PR #78)
586
+
587
+ ## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
588
+ ### Fixed
589
+ - The CLI no longer raises an unexpected exception when no encoding has been found (PR #70)
590
+ - Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
591
+ - The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
592
+ - Submatch factoring could be wrong in rare edge cases (PR #72)
593
+ - Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
594
+ - Fix line endings from CRLF to LF for certain project files (PR #67)
595
+
596
+ ### Changed
597
+ - Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
598
+ - Allow fallback on specified encoding if any (PR #71)
599
+
600
+ ## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
601
+ ### Changed
602
+ - Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
603
+ - According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
604
+
605
+ ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
606
+ ### Fixed
607
+ - Empty/too-small JSON payload mis-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
608
+
609
+ ### Changed
610
+ - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
611
+
612
+ ## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
613
+ ### Fixed
614
+ - Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
615
+ - Using explain=False permanently disabled the verbose output in the current runtime (PR #47)
616
+ - One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
617
+ - Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
618
+
619
+ ### Changed
620
+ - Public function normalize default argument values were not aligned with from_bytes (PR #53)
621
+
622
+ ### Added
623
+ - You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
624
+
625
+ ## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
626
+ ### Changed
627
+ - 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
628
+ - Emphasis has been put on UTF-8 detection; it should perform near-instantaneously.
629
+ - The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
630
+ - The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
631
+ - The program has been rewritten to ease readability and maintainability (+ using static typing).
632
+ - utf_7 detection has been reinstated.
633
+
634
+ ### Removed
635
+ - This package no longer requires anything when used with Python 3.5 (dropped cached_property)
636
+ - Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbo-Croatian.
637
+ - The exception hook on UnicodeDecodeError has been removed.
638
+
639
+ ### Deprecated
640
+ - Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
641
+
642
+ ### Fixed
643
+ - The CLI output used the relative path of the file(s). Should be absolute.
644
+
645
+ ## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
646
+ ### Fixed
647
+ - Logger configuration/usage no longer conflict with others (PR #44)
648
+
649
+ ## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
650
+ ### Removed
651
+ - Using standard logging instead of using the package loguru.
652
+ - Dropping nose test framework in favor of the maintained pytest.
653
+ - Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
654
+ - Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
655
+ - Stop support for UTF-7 that does not contain a SIG.
656
+ - Dropping PrettyTable, replaced with pure JSON output in CLI.
657
+
658
+ ### Fixed
659
+ - BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
660
+ - Not searching properly for the BOM when trying utf32/16 parent codec.
661
+
662
+ ### Changed
663
+ - Improving the package final size by compressing frequencies.json.
664
+ - Huge improvement on the largest payloads.
665
+
666
+ ### Added
667
+ - CLI now produces JSON consumable output.
668
+ - Return ASCII if given sequences fit. Given reasonable confidence.
669
+
670
+ ## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
671
+
672
+ ### Fixed
673
+ - In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
674
+
675
+ ## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
676
+
677
+ ### Fixed
678
+ - Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
679
+
680
+ ## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
681
+
682
+ ### Fixed
683
+ - The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
684
+
685
+ ## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
686
+
687
+ ### Changed
688
+ - Amend the previous release to allow prettytable 2.0 (PR #35)
689
+
690
+ ## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
691
+
692
+ ### Fixed
693
+ - Fix error while using the package with a python pre-release interpreter (PR #33)
694
+
695
+ ### Changed
696
+ - Dependencies refactoring, constraints revised.
697
+
698
+ ### Added
699
+ - Add python 3.9 and 3.10 to the supported interpreters
700
+
701
+ MIT License
702
+
703
+ Copyright (c) 2025 TAHRI Ahmed R.
704
+
705
+ Permission is hereby granted, free of charge, to any person obtaining a copy
706
+ of this software and associated documentation files (the "Software"), to deal
707
+ in the Software without restriction, including without limitation the rights
708
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
709
+ copies of the Software, and to permit persons to whom the Software is
710
+ furnished to do so, subject to the following conditions:
711
+
712
+ The above copyright notice and this permission notice shall be included in all
713
+ copies or substantial portions of the Software.
714
+
715
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
716
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
717
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
718
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
719
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
720
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
721
+ SOFTWARE.
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD ADDED
@@ -0,0 +1,35 @@
1
+ ../../Scripts/normalizer.exe,sha256=4_Drg6MZgKEgGq4qpOAk6mQHRVD7X8yi-_wkphtVVJY,108425
2
+ charset_normalizer-3.4.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
3
+ charset_normalizer-3.4.1.dist-info/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
4
+ charset_normalizer-3.4.1.dist-info/METADATA,sha256=0_fAC3DknimRZusm6kkP4ylPD0JVzBq5mKHWLNBJM6w,36034
5
+ charset_normalizer-3.4.1.dist-info/RECORD,,
6
+ charset_normalizer-3.4.1.dist-info/WHEEL,sha256=4-iQBlRoDdX1wfPofc7KLWa5Cys4eZSgXs6GVU8fKlQ,101
7
+ charset_normalizer-3.4.1.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
8
+ charset_normalizer-3.4.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
9
+ charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
10
+ charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
11
+ charset_normalizer/__pycache__/__init__.cpython-313.pyc,,
12
+ charset_normalizer/__pycache__/__main__.cpython-313.pyc,,
13
+ charset_normalizer/__pycache__/api.cpython-313.pyc,,
14
+ charset_normalizer/__pycache__/cd.cpython-313.pyc,,
15
+ charset_normalizer/__pycache__/constant.cpython-313.pyc,,
16
+ charset_normalizer/__pycache__/legacy.cpython-313.pyc,,
17
+ charset_normalizer/__pycache__/md.cpython-313.pyc,,
18
+ charset_normalizer/__pycache__/models.cpython-313.pyc,,
19
+ charset_normalizer/__pycache__/utils.cpython-313.pyc,,
20
+ charset_normalizer/__pycache__/version.cpython-313.pyc,,
21
+ charset_normalizer/api.py,sha256=2a0p2Gnhbdo9O6C04CNxTSN23fIbgOF20nxb0pWPNFM,23285
22
+ charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
23
+ charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
24
+ charset_normalizer/cli/__main__.py,sha256=lZ89qRWun7FRxX0qm1GhK-m0DH0i048yiMAX1mVIuRg,10731
25
+ charset_normalizer/cli/__pycache__/__init__.cpython-313.pyc,,
26
+ charset_normalizer/cli/__pycache__/__main__.cpython-313.pyc,,
27
+ charset_normalizer/constant.py,sha256=7OKYi28cJjZxIcX3lQCwfK9ijoOgaVEbERww7SqqNSY,42475
28
+ charset_normalizer/legacy.py,sha256=v8An1aAQHUu036UWOhyIaDGkirZ0t4hfNVlyje5KInU,2394
29
+ charset_normalizer/md.cp313-win_amd64.pyd,sha256=H4pRc9i_5sVp6Bxzi4MIADB-1FhtKumsXME6RoxuGJI,10752
30
+ charset_normalizer/md.py,sha256=e452fhwIAguEUr3FJzG7QZvFgXI-dVLOh_M1ZUiFI6U,20666
31
+ charset_normalizer/md__mypyc.cp313-win_amd64.pyd,sha256=Q3zrdee8fHLJCQVYOX7zWYsLx7xJlDSvWCcCgIPTAMo,125440
32
+ charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
33
+ charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ charset_normalizer/utils.py,sha256=oH9Q3WcAMwmsSB7uM8uDozz9DXnkYecbkTNbdnMbgzI,12410
35
+ charset_normalizer/version.py,sha256=7_thI7FzRQxEsbtUYwrJs3FCFWF666mw74H8mggPRR0,123
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.6.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp313-cp313-win_amd64
5
+
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ normalizer = charset_normalizer:cli.cli_detect
venv/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ charset_normalizer
venv/Lib/site-packages/charset_normalizer/__init__.py ADDED
@@ -0,0 +1,48 @@
1
+ """
2
+ Charset-Normalizer
3
+ ~~~~~~~~~~~~~~
4
+ The Real First Universal Charset Detector.
5
+ A library that helps you read text from an unknown charset encoding.
6
+ Motivated by chardet, this package tries to resolve the issue by taking a new approach.
7
+ All IANA character set names for which the Python core library provides codecs are supported.
8
+
9
+ Basic usage:
10
+ >>> from charset_normalizer import from_bytes
11
+ >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
12
+ >>> best_guess = results.best()
13
+ >>> str(best_guess)
14
+ 'Bсеки човек има право на образование. Oбразованието!'
15
+
16
+ Other methods and usages are available - see the full documentation
17
+ at <https://github.com/Ousret/charset_normalizer>.
18
+ :copyright: (c) 2021 by Ahmed TAHRI
19
+ :license: MIT, see LICENSE for more details.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+
26
+ from .api import from_bytes, from_fp, from_path, is_binary
27
+ from .legacy import detect
28
+ from .models import CharsetMatch, CharsetMatches
29
+ from .utils import set_logging_handler
30
+ from .version import VERSION, __version__
31
+
32
+ __all__ = (
33
+ "from_fp",
34
+ "from_path",
35
+ "from_bytes",
36
+ "is_binary",
37
+ "detect",
38
+ "CharsetMatch",
39
+ "CharsetMatches",
40
+ "__version__",
41
+ "VERSION",
42
+ "set_logging_handler",
43
+ )
44
+
45
+ # Attach a NullHandler to the top level logger by default
46
+ # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
47
+
48
+ logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
venv/Lib/site-packages/charset_normalizer/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ from .cli import cli_detect
4
+
5
+ if __name__ == "__main__":
6
+ cli_detect()
venv/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.77 kB). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-313.pyc ADDED
Binary file (349 Bytes). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-313.pyc ADDED
Binary file (18.7 kB). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-313.pyc ADDED
Binary file (13.4 kB). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-313.pyc ADDED
Binary file (38.7 kB). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-313.pyc ADDED
Binary file (2.91 kB). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-313.pyc ADDED
Binary file (25.5 kB). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-313.pyc ADDED
Binary file (17.3 kB). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-313.pyc ADDED
Binary file (13.8 kB). View file
 
venv/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-313.pyc ADDED
Binary file (374 Bytes). View file
 
venv/Lib/site-packages/charset_normalizer/api.py ADDED
@@ -0,0 +1,668 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from os import PathLike
5
+ from typing import BinaryIO
6
+
7
+ from .cd import (
8
+ coherence_ratio,
9
+ encoding_languages,
10
+ mb_encoding_languages,
11
+ merge_coherence_ratios,
12
+ )
13
+ from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
14
+ from .md import mess_ratio
15
+ from .models import CharsetMatch, CharsetMatches
16
+ from .utils import (
17
+ any_specified_encoding,
18
+ cut_sequence_chunks,
19
+ iana_name,
20
+ identify_sig_or_bom,
21
+ is_cp_similar,
22
+ is_multi_byte_encoding,
23
+ should_strip_sig_or_bom,
24
+ )
25
+
26
+ logger = logging.getLogger("charset_normalizer")
27
+ explain_handler = logging.StreamHandler()
28
+ explain_handler.setFormatter(
29
+ logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
30
+ )
31
+
32
+
33
+ def from_bytes(
34
+ sequences: bytes | bytearray,
35
+ steps: int = 5,
36
+ chunk_size: int = 512,
37
+ threshold: float = 0.2,
38
+ cp_isolation: list[str] | None = None,
39
+ cp_exclusion: list[str] | None = None,
40
+ preemptive_behaviour: bool = True,
41
+ explain: bool = False,
42
+ language_threshold: float = 0.1,
43
+ enable_fallback: bool = True,
44
+ ) -> CharsetMatches:
45
+ """
46
+ Given a raw bytes sequence, return the best possible charsets usable to render str objects.
47
+ If there are no results, it is a strong indicator that the source is binary/not text.
48
+ By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence.
49
+ It will give up on a particular code page after 20% of measured mess. Those criteria are customizable at will.
50
+
51
+ The preemptive behavior DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
52
+ but never takes it for granted. It can improve performance.
53
+
54
+ You may want to focus your attention on some code pages and/or exclude others; use cp_isolation and cp_exclusion for that
55
+ purpose.
56
+
57
+ This function will strip the SIG in the payload/sequence every time, except for UTF-16 and UTF-32.
58
+ By default the library does not set up any handler other than the NullHandler; if you choose to set the 'explain'
59
+ toggle to True, it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
60
+ Custom logging format and handler can be set manually.
61
+ """
62
+
63
+ if not isinstance(sequences, (bytearray, bytes)):
64
+ raise TypeError(
65
+ "Expected object of type bytes or bytearray, got: {}".format(
66
+ type(sequences)
67
+ )
68
+ )
69
+
70
+ if explain:
71
+ previous_logger_level: int = logger.level
72
+ logger.addHandler(explain_handler)
73
+ logger.setLevel(TRACE)
74
+
75
+ length: int = len(sequences)
76
+
77
+ if length == 0:
78
+ logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
79
+ if explain: # Defensive: ensure exit path clean handler
80
+ logger.removeHandler(explain_handler)
81
+ logger.setLevel(previous_logger_level or logging.WARNING)
82
+ return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
83
+
84
+ if cp_isolation is not None:
85
+ logger.log(
86
+ TRACE,
87
+ "cp_isolation is set. use this flag for debugging purpose. "
88
+ "limited list of encoding allowed : %s.",
89
+ ", ".join(cp_isolation),
90
+ )
91
+ cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
92
+ else:
93
+ cp_isolation = []
94
+
95
+ if cp_exclusion is not None:
96
+ logger.log(
97
+ TRACE,
98
+ "cp_exclusion is set. use this flag for debugging purpose. "
99
+ "limited list of encoding excluded : %s.",
100
+ ", ".join(cp_exclusion),
101
+ )
102
+ cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
103
+ else:
104
+ cp_exclusion = []
105
+
106
+ if length <= (chunk_size * steps):
107
+ logger.log(
108
+ TRACE,
109
+ "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
110
+ steps,
111
+ chunk_size,
112
+ length,
113
+ )
114
+ steps = 1
115
+ chunk_size = length
116
+
117
+ if steps > 1 and length / steps < chunk_size:
118
+ chunk_size = int(length / steps)
119
+
120
+ is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
121
+ is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
122
+
123
+ if is_too_small_sequence:
124
+ logger.log(
125
+ TRACE,
126
+ "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
127
+ length
128
+ ),
129
+ )
130
+ elif is_too_large_sequence:
131
+ logger.log(
132
+ TRACE,
133
+ "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
134
+ length
135
+ ),
136
+ )
137
+
138
+ prioritized_encodings: list[str] = []
139
+
140
+ specified_encoding: str | None = (
141
+ any_specified_encoding(sequences) if preemptive_behaviour else None
142
+ )
143
+
144
+ if specified_encoding is not None:
145
+ prioritized_encodings.append(specified_encoding)
146
+ logger.log(
147
+ TRACE,
148
+ "Detected declarative mark in sequence. Priority +1 given for %s.",
149
+ specified_encoding,
150
+ )
151
+
152
+ tested: set[str] = set()
153
+ tested_but_hard_failure: list[str] = []
154
+ tested_but_soft_failure: list[str] = []
155
+
156
+ fallback_ascii: CharsetMatch | None = None
157
+ fallback_u8: CharsetMatch | None = None
158
+ fallback_specified: CharsetMatch | None = None
159
+
160
+ results: CharsetMatches = CharsetMatches()
161
+
162
+ early_stop_results: CharsetMatches = CharsetMatches()
163
+
164
+ sig_encoding, sig_payload = identify_sig_or_bom(sequences)
165
+
166
+ if sig_encoding is not None:
167
+ prioritized_encodings.append(sig_encoding)
168
+ logger.log(
169
+ TRACE,
170
+ "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
171
+ len(sig_payload),
172
+ sig_encoding,
173
+ )
174
+
175
+ prioritized_encodings.append("ascii")
176
+
177
+ if "utf_8" not in prioritized_encodings:
178
+ prioritized_encodings.append("utf_8")
179
+
180
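+ # Try candidates in order: prioritized hints first (declared charset, SIG/BOM, ascii, utf_8), then every remaining IANA-supported codec.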
+ for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
181
+ if cp_isolation and encoding_iana not in cp_isolation:
182
+ continue
183
+
184
+ if cp_exclusion and encoding_iana in cp_exclusion:
185
+ continue
186
+
187
+ if encoding_iana in tested:
188
+ continue
189
+
190
+ tested.add(encoding_iana)
191
+
192
+ decoded_payload: str | None = None
193
+ bom_or_sig_available: bool = sig_encoding == encoding_iana
194
+ strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
195
+ encoding_iana
196
+ )
197
+
198
+ if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
199
+ logger.log(
200
+ TRACE,
201
+ "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
202
+ encoding_iana,
203
+ )
204
+ continue
205
+ if encoding_iana in {"utf_7"} and not bom_or_sig_available:
206
+ logger.log(
207
+ TRACE,
208
+ "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
209
+ encoding_iana,
210
+ )
211
+ continue
212
+
213
+ try:
214
+ is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
215
+ except (ModuleNotFoundError, ImportError):
216
+ logger.log(
217
+ TRACE,
218
+ "Encoding %s does not provide an IncrementalDecoder",
219
+ encoding_iana,
220
+ )
221
+ continue
222
+
223
+ try:
224
+ if is_too_large_sequence and is_multi_byte_decoder is False:
225
+ str(
226
+ (
227
+ sequences[: int(50e4)]
228
+ if strip_sig_or_bom is False
229
+ else sequences[len(sig_payload) : int(50e4)]
230
+ ),
231
+ encoding=encoding_iana,
232
+ )
233
+ else:
234
+ decoded_payload = str(
235
+ (
236
+ sequences
237
+ if strip_sig_or_bom is False
238
+ else sequences[len(sig_payload) :]
239
+ ),
240
+ encoding=encoding_iana,
241
+ )
242
+ except (UnicodeDecodeError, LookupError) as e:
243
+ if not isinstance(e, LookupError):
244
+ logger.log(
245
+ TRACE,
246
+ "Code page %s does not fit given bytes sequence at ALL. %s",
247
+ encoding_iana,
248
+ str(e),
249
+ )
250
+ tested_but_hard_failure.append(encoding_iana)
251
+ continue
252
+
253
+ similar_soft_failure_test: bool = False
254
+
255
+ for encoding_soft_failed in tested_but_soft_failure:
256
+ if is_cp_similar(encoding_iana, encoding_soft_failed):
257
+ similar_soft_failure_test = True
258
+ break
259
+
260
+ if similar_soft_failure_test:
261
+ logger.log(
262
+ TRACE,
263
+ "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
264
+ encoding_iana,
265
+ encoding_soft_failed,
266
+ )
267
+ continue
268
+
269
+ r_ = range(
270
+ 0 if not bom_or_sig_available else len(sig_payload),
271
+ length,
272
+ int(length / steps),
273
+ )
274
+
275
+ multi_byte_bonus: bool = (
276
+ is_multi_byte_decoder
277
+ and decoded_payload is not None
278
+ and len(decoded_payload) < length
279
+ )
280
+
281
+ if multi_byte_bonus:
282
+ logger.log(
283
+ TRACE,
284
+ "Code page %s is a multi byte encoding table and it appear that at least one character "
285
+ "was encoded using n-bytes.",
286
+ encoding_iana,
287
+ )
288
+
289
+ max_chunk_gave_up: int = int(len(r_) / 4)
290
+
291
+ max_chunk_gave_up = max(max_chunk_gave_up, 2)
292
+ early_stop_count: int = 0
293
+ lazy_str_hard_failure = False
294
+
295
+ md_chunks: list[str] = []
296
+ md_ratios = []
297
+
298
+ try:
299
+ for chunk in cut_sequence_chunks(
300
+ sequences,
301
+ encoding_iana,
302
+ r_,
303
+ chunk_size,
304
+ bom_or_sig_available,
305
+ strip_sig_or_bom,
306
+ sig_payload,
307
+ is_multi_byte_decoder,
308
+ decoded_payload,
309
+ ):
310
+ md_chunks.append(chunk)
311
+
312
+ md_ratios.append(
313
+ mess_ratio(
314
+ chunk,
315
+ threshold,
316
+ explain is True and 1 <= len(cp_isolation) <= 2,
317
+ )
318
+ )
319
+
320
+ if md_ratios[-1] >= threshold:
321
+ early_stop_count += 1
322
+
323
+ if (early_stop_count >= max_chunk_gave_up) or (
324
+ bom_or_sig_available and strip_sig_or_bom is False
325
+ ):
326
+ break
327
+ except (
328
+ UnicodeDecodeError
329
+ ) as e: # Lazy str loading may have missed something there
330
+ logger.log(
331
+ TRACE,
332
+ "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
333
+ encoding_iana,
334
+ str(e),
335
+ )
336
+ early_stop_count = max_chunk_gave_up
337
+ lazy_str_hard_failure = True
338
+
339
+ # We might want to check the sequence again with the whole content
340
+ # Only if the initial MD tests pass
341
+ if (
342
+ not lazy_str_hard_failure
343
+ and is_too_large_sequence
344
+ and not is_multi_byte_decoder
345
+ ):
346
+ try:
347
+ sequences[int(50e3) :].decode(encoding_iana, errors="strict")
348
+ except UnicodeDecodeError as e:
349
+ logger.log(
350
+ TRACE,
351
+ "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
352
+ encoding_iana,
353
+ str(e),
354
+ )
355
+ tested_but_hard_failure.append(encoding_iana)
356
+ continue
357
+
358
+ mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
359
+ if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
360
+ tested_but_soft_failure.append(encoding_iana)
361
+ logger.log(
362
+ TRACE,
363
+ "%s was excluded because of initial chaos probing. Gave up %i time(s). "
364
+ "Computed mean chaos is %f %%.",
365
+ encoding_iana,
366
+ early_stop_count,
367
+ round(mean_mess_ratio * 100, ndigits=3),
368
+ )
369
+ # Preparing those fallbacks in case we got nothing.
370
+ if (
371
+ enable_fallback
372
+ and encoding_iana in ["ascii", "utf_8", specified_encoding]
373
+ and not lazy_str_hard_failure
374
+ ):
375
+ fallback_entry = CharsetMatch(
376
+ sequences,
377
+ encoding_iana,
378
+ threshold,
379
+ False,
380
+ [],
381
+ decoded_payload,
382
+ preemptive_declaration=specified_encoding,
383
+ )
384
+ if encoding_iana == specified_encoding:
385
+ fallback_specified = fallback_entry
386
+ elif encoding_iana == "ascii":
387
+ fallback_ascii = fallback_entry
388
+ else:
389
+ fallback_u8 = fallback_entry
390
+ continue
391
+
392
+ logger.log(
393
+ TRACE,
394
+ "%s passed initial chaos probing. Mean measured chaos is %f %%",
395
+ encoding_iana,
396
+ round(mean_mess_ratio * 100, ndigits=3),
397
+ )
398
+
399
+ if not is_multi_byte_decoder:
400
+ target_languages: list[str] = encoding_languages(encoding_iana)
401
+ else:
402
+ target_languages = mb_encoding_languages(encoding_iana)
403
+
404
+ if target_languages:
405
+ logger.log(
406
+ TRACE,
407
+ "{} should target any language(s) of {}".format(
408
+ encoding_iana, str(target_languages)
409
+ ),
410
+ )
411
+
412
+ cd_ratios = []
413
+
414
+ # We shall skip the CD when it's about ASCII
415
+ # Most of the time it's not relevant to run "language-detection" on it.
416
+ if encoding_iana != "ascii":
417
+ for chunk in md_chunks:
418
+ chunk_languages = coherence_ratio(
419
+ chunk,
420
+ language_threshold,
421
+ ",".join(target_languages) if target_languages else None,
422
+ )
423
+
424
+ cd_ratios.append(chunk_languages)
425
+
426
+ cd_ratios_merged = merge_coherence_ratios(cd_ratios)
427
+
428
+ if cd_ratios_merged:
429
+ logger.log(
430
+ TRACE,
431
+ "We detected language {} using {}".format(
432
+ cd_ratios_merged, encoding_iana
433
+ ),
434
+ )
435
+
436
+ current_match = CharsetMatch(
437
+ sequences,
438
+ encoding_iana,
439
+ mean_mess_ratio,
440
+ bom_or_sig_available,
441
+ cd_ratios_merged,
442
+ (
443
+ decoded_payload
444
+ if (
445
+ is_too_large_sequence is False
446
+ or encoding_iana in [specified_encoding, "ascii", "utf_8"]
447
+ )
448
+ else None
449
+ ),
450
+ preemptive_declaration=specified_encoding,
451
+ )
452
+
453
+ results.append(current_match)
454
+
455
+ if (
456
+ encoding_iana in [specified_encoding, "ascii", "utf_8"]
457
+ and mean_mess_ratio < 0.1
458
+ ):
459
+ # If md says nothing to worry about, then... stop immediately!
460
+ if mean_mess_ratio == 0.0:
461
+ logger.debug(
462
+ "Encoding detection: %s is most likely the one.",
463
+ current_match.encoding,
464
+ )
465
+ if explain: # Defensive: ensure exit path clean handler
466
+ logger.removeHandler(explain_handler)
467
+ logger.setLevel(previous_logger_level)
468
+ return CharsetMatches([current_match])
469
+
470
+ early_stop_results.append(current_match)
471
+
472
+ if (
473
+ len(early_stop_results)
474
+ and (specified_encoding is None or specified_encoding in tested)
475
+ and "ascii" in tested
476
+ and "utf_8" in tested
477
+ ):
478
+ probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
479
+ logger.debug(
480
+ "Encoding detection: %s is most likely the one.",
481
+ probable_result.encoding,
482
+ )
483
+ if explain: # Defensive: ensure exit path clean handler
484
+ logger.removeHandler(explain_handler)
485
+ logger.setLevel(previous_logger_level)
486
+
487
+ return CharsetMatches([probable_result])
488
+
489
+ if encoding_iana == sig_encoding:
490
+ logger.debug(
491
+ "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
492
+ "the beginning of the sequence.",
493
+ encoding_iana,
494
+ )
495
+ if explain: # Defensive: ensure exit path clean handler
496
+ logger.removeHandler(explain_handler)
497
+ logger.setLevel(previous_logger_level)
498
+ return CharsetMatches([results[encoding_iana]])
499
+
500
+ if len(results) == 0:
501
+ if fallback_u8 or fallback_ascii or fallback_specified:
502
+ logger.log(
503
+ TRACE,
504
+ "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
505
+ )
506
+
507
+ if fallback_specified:
508
+ logger.debug(
509
+ "Encoding detection: %s will be used as a fallback match",
510
+ fallback_specified.encoding,
511
+ )
512
+ results.append(fallback_specified)
513
+ elif (
514
+ (fallback_u8 and fallback_ascii is None)
515
+ or (
516
+ fallback_u8
517
+ and fallback_ascii
518
+ and fallback_u8.fingerprint != fallback_ascii.fingerprint
519
+ )
520
+ or (fallback_u8 is not None)
521
+ ):
522
+ logger.debug("Encoding detection: utf_8 will be used as a fallback match")
523
+ results.append(fallback_u8)
524
+ elif fallback_ascii:
525
+ logger.debug("Encoding detection: ascii will be used as a fallback match")
526
+ results.append(fallback_ascii)
527
+
528
+ if results:
529
+ logger.debug(
530
+ "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
531
+ results.best().encoding, # type: ignore
532
+ len(results) - 1,
533
+ )
534
+ else:
535
+ logger.debug("Encoding detection: Unable to determine any suitable charset.")
536
+
537
+ if explain:
538
+ logger.removeHandler(explain_handler)
539
+ logger.setLevel(previous_logger_level)
540
+
541
+ return results
542
+
543
+
544
+ def from_fp(
545
+ fp: BinaryIO,
546
+ steps: int = 5,
547
+ chunk_size: int = 512,
548
+ threshold: float = 0.20,
549
+ cp_isolation: list[str] | None = None,
550
+ cp_exclusion: list[str] | None = None,
551
+ preemptive_behaviour: bool = True,
552
+ explain: bool = False,
553
+ language_threshold: float = 0.1,
554
+ enable_fallback: bool = True,
555
+ ) -> CharsetMatches:
556
+ """
557
+ Same as the function from_bytes, but using a file pointer that is already ready.
558
+ Will not close the file pointer.
559
+ """
560
+ return from_bytes(
561
+ fp.read(),
562
+ steps,
563
+ chunk_size,
564
+ threshold,
565
+ cp_isolation,
566
+ cp_exclusion,
567
+ preemptive_behaviour,
568
+ explain,
569
+ language_threshold,
570
+ enable_fallback,
571
+ )
572
+
573
+
574
+ def from_path(
575
+ path: str | bytes | PathLike, # type: ignore[type-arg]
576
+ steps: int = 5,
577
+ chunk_size: int = 512,
578
+ threshold: float = 0.20,
579
+ cp_isolation: list[str] | None = None,
580
+ cp_exclusion: list[str] | None = None,
581
+ preemptive_behaviour: bool = True,
582
+ explain: bool = False,
583
+ language_threshold: float = 0.1,
584
+ enable_fallback: bool = True,
585
+ ) -> CharsetMatches:
586
+ """
587
+ Same as the function from_bytes, but with one extra step: opening and reading the given file path in binary mode.
588
+ Can raise IOError.
589
+ """
590
+ with open(path, "rb") as fp:
591
+ return from_fp(
592
+ fp,
593
+ steps,
594
+ chunk_size,
595
+ threshold,
596
+ cp_isolation,
597
+ cp_exclusion,
598
+ preemptive_behaviour,
599
+ explain,
600
+ language_threshold,
601
+ enable_fallback,
602
+ )
603
+
604
+
605
+ def is_binary(
606
+ fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
607
+ steps: int = 5,
608
+ chunk_size: int = 512,
609
+ threshold: float = 0.20,
610
+ cp_isolation: list[str] | None = None,
611
+ cp_exclusion: list[str] | None = None,
612
+ preemptive_behaviour: bool = True,
613
+ explain: bool = False,
614
+ language_threshold: float = 0.1,
615
+ enable_fallback: bool = False,
616
+ ) -> bool:
617
+ """
618
+ Detect if the given input (file, bytes, or path) points to a binary file, i.e. not text.
619
+ Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
620
+ are disabled, to be stricter with content that is ASCII-compatible but unlikely to be text.
621
+ """
622
+ if isinstance(fp_or_path_or_payload, (str, PathLike)):
623
+ guesses = from_path(
624
+ fp_or_path_or_payload,
625
+ steps=steps,
626
+ chunk_size=chunk_size,
627
+ threshold=threshold,
628
+ cp_isolation=cp_isolation,
629
+ cp_exclusion=cp_exclusion,
630
+ preemptive_behaviour=preemptive_behaviour,
631
+ explain=explain,
632
+ language_threshold=language_threshold,
633
+ enable_fallback=enable_fallback,
634
+ )
635
+ elif isinstance(
636
+ fp_or_path_or_payload,
637
+ (
638
+ bytes,
639
+ bytearray,
640
+ ),
641
+ ):
642
+ guesses = from_bytes(
643
+ fp_or_path_or_payload,
644
+ steps=steps,
645
+ chunk_size=chunk_size,
646
+ threshold=threshold,
647
+ cp_isolation=cp_isolation,
648
+ cp_exclusion=cp_exclusion,
649
+ preemptive_behaviour=preemptive_behaviour,
650
+ explain=explain,
651
+ language_threshold=language_threshold,
652
+ enable_fallback=enable_fallback,
653
+ )
654
+ else:
655
+ guesses = from_fp(
656
+ fp_or_path_or_payload,
657
+ steps=steps,
658
+ chunk_size=chunk_size,
659
+ threshold=threshold,
660
+ cp_isolation=cp_isolation,
661
+ cp_exclusion=cp_exclusion,
662
+ preemptive_behaviour=preemptive_behaviour,
663
+ explain=explain,
664
+ language_threshold=language_threshold,
665
+ enable_fallback=enable_fallback,
666
+ )
667
+
668
+ return not guesses
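A small usage sketch, not part of the uploaded file, of the is_binary convenience wrapper; the payloads are illustrative:

# Illustrative only: quick binary-vs-text check on raw payloads.
from charset_normalizer import is_binary

print(is_binary(b"\x00\xfe\x00\xff\x00\xfd"))              # expected True: control bytes
print(is_binary("A plain, decodable sentence.".encode()))  # expected False: real text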
venv/Lib/site-packages/charset_normalizer/cd.py ADDED
@@ -0,0 +1,395 @@
+ from __future__ import annotations
+
+ import importlib
+ from codecs import IncrementalDecoder
+ from collections import Counter
+ from functools import lru_cache
+ from typing import Counter as TypeCounter
+
+ from .constant import (
+     FREQUENCIES,
+     KO_NAMES,
+     LANGUAGE_SUPPORTED_COUNT,
+     TOO_SMALL_SEQUENCE,
+     ZH_NAMES,
+ )
+ from .md import is_suspiciously_successive_range
+ from .models import CoherenceMatches
+ from .utils import (
+     is_accentuated,
+     is_latin,
+     is_multi_byte_encoding,
+     is_unicode_range_secondary,
+     unicode_range,
+ )
+
+
+ def encoding_unicode_range(iana_name: str) -> list[str]:
+     """
+     Return the Unicode ranges associated with a single-byte code page.
+     """
+     if is_multi_byte_encoding(iana_name):
+         raise OSError("Function not supported on multi-byte code page")
+
+     decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
+
+     p: IncrementalDecoder = decoder(errors="ignore")
+     seen_ranges: dict[str, int] = {}
+     character_count: int = 0
+
+     for i in range(0x40, 0xFF):
+         chunk: str = p.decode(bytes([i]))
+
+         if chunk:
+             character_range: str | None = unicode_range(chunk)
+
+             if character_range is None:
+                 continue
+
+             if is_unicode_range_secondary(character_range) is False:
+                 if character_range not in seen_ranges:
+                     seen_ranges[character_range] = 0
+                 seen_ranges[character_range] += 1
+                 character_count += 1
+
+     return sorted(
+         [
+             character_range
+             for character_range in seen_ranges
+             if seen_ranges[character_range] / character_count >= 0.15
+         ]
+     )
+
+
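A minimal sketch (not part of the uploaded file) of this helper; the printed ranges are indicative, not guaranteed:

# Illustrative only: Unicode ranges reachable from a single-byte code page.
from charset_normalizer.cd import encoding_unicode_range

print(encoding_unicode_range("cp1251"))  # e.g. ['Basic Latin', 'Cyrillic']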
+ def unicode_range_languages(primary_range: str) -> list[str]:
+     """
+     Return inferred languages used with a unicode range.
+     """
+     languages: list[str] = []
+
+     for language, characters in FREQUENCIES.items():
+         for character in characters:
+             if unicode_range(character) == primary_range:
+                 languages.append(language)
+                 break
+
+     return languages
+
+
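A minimal sketch (not part of the uploaded file); the exact language list depends on the bundled FREQUENCIES table:

# Illustrative only: languages whose frequency tables use a given range.
from charset_normalizer.cd import unicode_range_languages

print(unicode_range_languages("Cyrillic"))  # e.g. ['Russian', 'Ukrainian', ...]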
+ @lru_cache()
+ def encoding_languages(iana_name: str) -> list[str]:
+     """
+     Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
+     This function provides that correspondence.
+     """
+     unicode_ranges: list[str] = encoding_unicode_range(iana_name)
+     primary_range: str | None = None
+
+     for specified_range in unicode_ranges:
+         if "Latin" not in specified_range:
+             primary_range = specified_range
+             break
+
+     if primary_range is None:
+         return ["Latin Based"]
+
+     return unicode_range_languages(primary_range)
+
+
+ @lru_cache()
+ def mb_encoding_languages(iana_name: str) -> list[str]:
+     """
+     Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
+     This function provides that correspondence.
+     """
+     if (
+         iana_name.startswith("shift_")
+         or iana_name.startswith("iso2022_jp")
+         or iana_name.startswith("euc_j")
+         or iana_name == "cp932"
+     ):
+         return ["Japanese"]
+     if iana_name.startswith("gb") or iana_name in ZH_NAMES:
+         return ["Chinese"]
+     if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
+         return ["Korean"]
+
+     return []
+
+
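A minimal sketch (not part of the uploaded file) exercising both association helpers:

# Illustrative only: code-page -> language association.
from charset_normalizer.cd import encoding_languages, mb_encoding_languages

print(encoding_languages("cp1251"))        # inferred from its non-Latin primary range
print(mb_encoding_languages("shift_jis"))  # ['Japanese']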
+ @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
+ def get_target_features(language: str) -> tuple[bool, bool]:
+     """
+     Determine two main aspects of a supported language: whether it contains accents and whether it is pure Latin.
+     """
+     target_have_accents: bool = False
+     target_pure_latin: bool = True
+
+     for character in FREQUENCIES[language]:
+         if not target_have_accents and is_accentuated(character):
+             target_have_accents = True
+         if target_pure_latin and is_latin(character) is False:
+             target_pure_latin = False
+
+     return target_have_accents, target_pure_latin
+
+
+ def alphabet_languages(
+     characters: list[str], ignore_non_latin: bool = False
+ ) -> list[str]:
+     """
+     Return the languages associated with the given characters.
+     """
+     languages: list[tuple[str, float]] = []
+
+     source_have_accents = any(is_accentuated(character) for character in characters)
+
+     for language, language_characters in FREQUENCIES.items():
+         target_have_accents, target_pure_latin = get_target_features(language)
+
+         if ignore_non_latin and target_pure_latin is False:
+             continue
+
+         if target_have_accents is False and source_have_accents:
+             continue
+
+         character_count: int = len(language_characters)
+
+         character_match_count: int = len(
+             [c for c in language_characters if c in characters]
+         )
+
+         ratio: float = character_match_count / character_count
+
+         if ratio >= 0.2:
+             languages.append((language, ratio))
+
+     languages = sorted(languages, key=lambda x: x[1], reverse=True)
+
+     return [compatible_language[0] for compatible_language in languages]
+
+
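A minimal sketch (not part of the uploaded file); candidate lists depend on the FREQUENCIES table and may vary across versions:

# Illustrative only: candidate languages for a set of observed letters.
from charset_normalizer.cd import alphabet_languages, get_target_features

print(get_target_features("French"))           # (has accents, is pure Latin)
print(alphabet_languages(list("etaoinshrd")))  # candidate languages; order may vary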
+ def characters_popularity_compare(
+     language: str, ordered_characters: list[str]
+ ) -> float:
+     """
+     Determine if an ordered character list (by occurrence, from most frequent to rarest) matches a particular language.
+     The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
+     Beware that this function is not strict on the match, in order to ease detection (meaning a close match counts as 1.).
+     """
+     if language not in FREQUENCIES:
+         raise ValueError(f"{language} not available")
+
+     character_approved_count: int = 0
+     FREQUENCIES_language_set = set(FREQUENCIES[language])
+
+     ordered_characters_count: int = len(ordered_characters)
+     target_language_characters_count: int = len(FREQUENCIES[language])
+
+     large_alphabet: bool = target_language_characters_count > 26
+
+     for character, character_rank in zip(
+         ordered_characters, range(0, ordered_characters_count)
+     ):
+         if character not in FREQUENCIES_language_set:
+             continue
+
+         character_rank_in_language: int = FREQUENCIES[language].index(character)
+         expected_projection_ratio: float = (
+             target_language_characters_count / ordered_characters_count
+         )
+         character_rank_projection: int = int(character_rank * expected_projection_ratio)
+
+         if (
+             large_alphabet is False
+             and abs(character_rank_projection - character_rank_in_language) > 4
+         ):
+             continue
+
+         if (
+             large_alphabet is True
+             and abs(character_rank_projection - character_rank_in_language)
+             < target_language_characters_count / 3
+         ):
+             character_approved_count += 1
+             continue
+
+         characters_before_source: list[str] = FREQUENCIES[language][
+             0:character_rank_in_language
+         ]
+         characters_after_source: list[str] = FREQUENCIES[language][
+             character_rank_in_language:
+         ]
+         characters_before: list[str] = ordered_characters[0:character_rank]
+         characters_after: list[str] = ordered_characters[character_rank:]
+
+         before_match_count: int = len(
+             set(characters_before) & set(characters_before_source)
+         )
+
+         after_match_count: int = len(
+             set(characters_after) & set(characters_after_source)
+         )
+
+         if len(characters_before_source) == 0 and before_match_count <= 4:
+             character_approved_count += 1
+             continue
+
+         if len(characters_after_source) == 0 and after_match_count <= 4:
+             character_approved_count += 1
+             continue
+
+         if (
+             before_match_count / len(characters_before_source) >= 0.4
+             or after_match_count / len(characters_after_source) >= 0.4
+         ):
+             character_approved_count += 1
+             continue
+
+     return character_approved_count / len(ordered_characters)
+
+
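A minimal sketch (not part of the uploaded file); the exact ratio depends on the bundled English frequency table:

# Illustrative only: how well an observed ranking fits a language table.
from charset_normalizer.cd import characters_popularity_compare

rough_english_ranking = list("etaoinshrdlu")
print(characters_popularity_compare("English", rough_english_ranking))  # expected high, near 1.0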
+ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
+     """
+     Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
+     E.g. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
+     one containing the Latin letters and the other the Hebrew ones.
+     """
+     layers: dict[str, str] = {}
+
+     for character in decoded_sequence:
+         if character.isalpha() is False:
+             continue
+
+         character_range: str | None = unicode_range(character)
+
+         if character_range is None:
+             continue
+
+         layer_target_range: str | None = None
+
+         for discovered_range in layers:
+             if (
+                 is_suspiciously_successive_range(discovered_range, character_range)
+                 is False
+             ):
+                 layer_target_range = discovered_range
+                 break
+
+         if layer_target_range is None:
+             layer_target_range = character_range
+
+         if layer_target_range not in layers:
+             layers[layer_target_range] = character.lower()
+             continue
+
+         layers[layer_target_range] += character.lower()
+
+     return list(layers.values())
+
+
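A minimal sketch (not part of the uploaded file) of the layer split on mixed-script input:

# Illustrative only: split mixed-script text into per-range layers.
from charset_normalizer.cd import alpha_unicode_split

print(alpha_unicode_split("Hello שלום"))  # e.g. ['hello', 'שלום']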
+ def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
+     """
+     This function merges results previously produced by coherence_ratio.
+     The return type is the same as that of coherence_ratio.
+     """
+     per_language_ratios: dict[str, list[float]] = {}
+     for result in results:
+         for sub_result in result:
+             language, ratio = sub_result
+             if language not in per_language_ratios:
+                 per_language_ratios[language] = [ratio]
+                 continue
+             per_language_ratios[language].append(ratio)
+
+     merge = [
+         (
+             language,
+             round(
+                 sum(per_language_ratios[language]) / len(per_language_ratios[language]),
+                 4,
+             ),
+         )
+         for language in per_language_ratios
+     ]
+
+     return sorted(merge, key=lambda x: x[1], reverse=True)
+
+
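A minimal sketch (not part of the uploaded file) with hand-written layer results:

# Illustrative only: average per-language ratios across layers.
from charset_normalizer.cd import merge_coherence_ratios

layers = [[("English", 0.8), ("French", 0.4)], [("English", 0.6)]]
print(merge_coherence_ratios(layers))  # [('English', 0.7), ('French', 0.4)]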
+ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
+     """
+     We shall NOT return "English—" in CoherenceMatches because it is an alternative
+     of "English". This function keeps only the best match and removes the em-dash from its name.
+     """
+     index_results: dict[str, list[float]] = dict()
+
+     for result in results:
+         language, ratio = result
+         no_em_name: str = language.replace("—", "")
+
+         if no_em_name not in index_results:
+             index_results[no_em_name] = []
+
+         index_results[no_em_name].append(ratio)
+
+     if any(len(index_results[e]) > 1 for e in index_results):
+         filtered_results: CoherenceMatches = []
+
+         for language in index_results:
+             filtered_results.append((language, max(index_results[language])))
+
+         return filtered_results
+
+     return results
+
+
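A minimal sketch (not part of the uploaded file) of the em-dash de-duplication:

# Illustrative only: collapse "English—" alternatives into "English".
from charset_normalizer.cd import filter_alt_coherence_matches

print(filter_alt_coherence_matches([("English", 0.9), ("English—", 0.85)]))
# -> [('English', 0.9)]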
+ @lru_cache(maxsize=2048)
+ def coherence_ratio(
+     decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
+ ) -> CoherenceMatches:
+     """
+     Detect ANY language that can be identified in the given sequence. The sequence is analysed by layers.
+     A layer = character extraction by alphabets/ranges.
+     """
+
+     results: list[tuple[str, float]] = []
+     ignore_non_latin: bool = False
+
+     sufficient_match_count: int = 0
+
+     lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
+     if "Latin Based" in lg_inclusion_list:
+         ignore_non_latin = True
+         lg_inclusion_list.remove("Latin Based")
+
+     for layer in alpha_unicode_split(decoded_sequence):
+         sequence_frequencies: TypeCounter[str] = Counter(layer)
+         most_common = sequence_frequencies.most_common()
+
+         character_count: int = sum(o for c, o in most_common)
+
+         if character_count <= TOO_SMALL_SEQUENCE:
+             continue
+
+         popular_character_ordered: list[str] = [c for c, o in most_common]
+
+         for language in lg_inclusion_list or alphabet_languages(
+             popular_character_ordered, ignore_non_latin
+         ):
+             ratio: float = characters_popularity_compare(
+                 language, popular_character_ordered
+             )
+
+             if ratio < threshold:
+                 continue
+             elif ratio >= 0.8:
+                 sufficient_match_count += 1
+
+             results.append((language, round(ratio, 4)))
+
+             if sufficient_match_count >= 3:
+                 break
+
+     return sorted(
+         filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
+     )
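A minimal sketch (not part of the uploaded file); ratios vary with the bundled frequency tables:

# Illustrative only: end-to-end language coherence on a decoded string.
from charset_normalizer.cd import coherence_ratio

text = "The quick brown fox jumps over the lazy dog. " * 2
print(coherence_ratio(text))  # e.g. [('English', 0.9xxx), ...]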
venv/Lib/site-packages/charset_normalizer/cli/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from __future__ import annotations
+
+ from .__main__ import cli_detect, query_yes_no
+
+ __all__ = (
+     "cli_detect",
+     "query_yes_no",
+ )
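A minimal sketch (not part of the uploaded file); cli_detect is the function behind the `normalizer` console command, and "./sample.txt" is a hypothetical path:

# Illustrative only: invoke the CLI entry point programmatically.
from charset_normalizer.cli import cli_detect

exit_code = cli_detect(["./sample.txt"])  # same argv as the CLI; returns 0 on success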