|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import sys |
|
|
import typing |
|
|
from json import dumps |
|
|
from os.path import abspath, basename, dirname, join, realpath |
|
|
from platform import python_version |
|
|
from unicodedata import unidata_version |
|
|
|
|
|
import charset_normalizer.md as md_module |
|
|
from charset_normalizer import from_fp |
|
|
from charset_normalizer.models import CliDetectionResult |
|
|
from charset_normalizer.version import __version__ |
|
|
|
|
|
|
|
|
def query_yes_no(question: str, default: str | None = "yes") -> bool:
    """Ask a yes/no question via input() and return the answer.

    :param question: Text presented to the user; a ``[y/n]`` style hint
        reflecting *default* is appended to it.
    :param default: Presumed answer if the user just hits <Enter>. It must be
        "yes" (the default), "no" or None (meaning an answer is required of
        the user).
    :return: True for "yes", False for "no".
    :raises ValueError: If *default* is not "yes", "no" or None.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # The capital letter in the hint shows which answer <Enter> selects.
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError(f"invalid default answer: '{default}'")

    while True:
        sys.stdout.write(question + prompt)
        # Surrounding whitespace carries no meaning in a yes/no answer, so
        # "y " or " no" are accepted instead of causing another prompt.
        choice = input().strip().lower()

        if choice == "" and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]

        # Unrecognised answer: explain what is expected and ask again.
        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
|
|
|
|
|
|
|
class FileType:
    """Callable converting a command line string into an open file object.

    Instances of FileType are typically passed as type= arguments to the
    ArgumentParser add_argument() method.

    Keyword Arguments:
        - mode -- How the file is to be opened. Accepts the same values as
            the builtin open() function.
        - bufsize -- The desired buffer size. Accepts the same values as the
            builtin open() function.
        - encoding -- The encoding of the file. Accepts the same values as
            the builtin open() function.
        - errors -- How encoding and decoding errors are to be handled.
            Accepts the same value as the builtin open() function.

    Backported from CPython 3.12
    """

    def __init__(
        self,
        mode: str = "r",
        bufsize: int = -1,
        encoding: str | None = None,
        errors: str | None = None,
    ):
        self._mode, self._bufsize = mode, bufsize
        self._encoding, self._errors = encoding, errors

    def __call__(self, string: str) -> typing.IO:
        """Return the file object designated by *string*.

        Any value other than "-" is handed to open(); "-" selects one of the
        standard streams according to the configured mode.

        :raises argparse.ArgumentTypeError: If open() fails with an OSError.
        :raises ValueError: If "-" is given with a mode that is neither a
            reading nor a writing mode.
        """
        if string != "-":
            try:
                return open(
                    string, self._mode, self._bufsize, self._encoding, self._errors
                )
            except OSError as e:
                message = f"can't open '{string}': {e}"
                raise argparse.ArgumentTypeError(message)

        # The special argument "-" means sys.std{in,out}.
        wants_binary = "b" in self._mode

        if "r" in self._mode:
            return sys.stdin.buffer if wants_binary else sys.stdin

        if not set("wax").isdisjoint(self._mode):
            return sys.stdout.buffer if wants_binary else sys.stdout

        raise ValueError(f'argument "-" with mode {self._mode}')

    def __repr__(self) -> str:
        # Positional values equal to -1 (the bufsize default) are left out.
        parts = [repr(value) for value in (self._mode, self._bufsize) if value != -1]

        # Keyword values are only shown when they were actually provided.
        for keyword, value in (("encoding", self._encoding), ("errors", self._errors)):
            if value is not None:
                parts.append(f"{keyword}={value!r}")

        return f"{type(self).__name__}({', '.join(parts)})"
|
|
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser describing the command line interface."""
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
    )

    # Every plain on/off switch as (short flag, long flag, dest, help).
    # The order here is the order shown in --help.
    switches = (
        (
            "-v",
            "--verbose",
            "verbose",
            "Display complementary information about file if any. "
            "Stdout will contain logs about the detection process.",
        ),
        (
            "-a",
            "--with-alternative",
            "alternatives",
            "Output complementary possibilities if any. Top-level JSON WILL be a list.",
        ),
        (
            "-n",
            "--normalize",
            "normalize",
            "Permit to normalize input file. If not set, program does not write anything.",
        ),
        (
            "-m",
            "--minimal",
            "minimal",
            "Only output the charset detected to STDOUT. Disabling JSON output.",
        ),
        (
            "-r",
            "--replace",
            "replace",
            "Replace file when trying to normalize it instead of creating a new one.",
        ),
        (
            "-f",
            "--force",
            "force",
            "Replace file without asking if you are sure, use this flag with caution.",
        ),
        (
            "-i",
            "--no-preemptive",
            "no_preemptive",
            "Disable looking at a charset declaration to hint the detector.",
        ),
    )
    for short_flag, long_flag, dest, help_text in switches:
        parser.add_argument(
            short_flag,
            long_flag,
            action="store_true",
            default=False,
            dest=dest,
            help=help_text,
        )

    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            # A ".py" module file is reported as "OFF", anything else as "ON".
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    return parser


def _find_usage_error(args: argparse.Namespace) -> str | None:
    """Return the message for the first invalid option combination, or None."""
    if args.replace and not args.normalize:
        return "Use --replace in addition of --normalize only."
    if args.force and not args.replace:
        return "Use --force in addition of --replace only."
    # A chained comparison also rejects NaN, which type=float happily parses
    # and which would slip through separate "< 0.0" / "> 1.0" tests.
    if not 0.0 <= args.threshold <= 1.0:
        return "--threshold VALUE should be between 0. AND 1."
    return None


def _close_files(files: typing.Iterable[typing.IO]) -> None:
    """Close every file object in *files* that is still open."""
    for handle in files:
        if handle.closed is False:
            handle.close()


def _as_cli_result(
    path: str, match: typing.Any, is_preferred: bool
) -> CliDetectionResult:
    """Build the CliDetectionResult reported for *match* found in *path*."""
    return CliDetectionResult(
        path,
        match.encoding,
        match.encoding_aliases,
        # The detected encoding itself is not listed among its alternatives.
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,
        is_preferred,
    )


def _write_normalized(
    source_name: str,
    best_guess: typing.Any,
    result: CliDetectionResult,
    args: argparse.Namespace,
) -> bool:
    """Write the normalized form of *source_name*, recording its path on *result*.

    Nothing is written when the detected encoding already starts with "utf"
    or when the user declines the replacement prompt.

    :return: False if writing failed with an OSError (already reported on
        stderr), True otherwise.
    """
    if best_guess.encoding.startswith("utf"):
        print(
            f'"{source_name}" file does not need to be normalized, as it already came from unicode.',
            file=sys.stderr,
        )
        return True

    resolved = realpath(source_name)
    name_parts: list[str] = basename(resolved).split(".")

    if not args.replace:
        # Insert the encoding before the last dot-separated part:
        # "name.ext" becomes "name.<encoding>.ext". A name without any dot
        # gets the encoding as a prefix, as that is how insert(-1) behaves.
        name_parts.insert(-1, best_guess.encoding)
    elif not args.force and not query_yes_no(
        f'Are you sure to normalize "{source_name}" by replacing it ?', "no"
    ):
        return True

    try:
        # Stored on the entry of the file being processed, so that each
        # input reports its own output path when several files are given.
        result.unicode_path = join(dirname(resolved), ".".join(name_parts))

        with open(result.unicode_path, "wb") as fp:
            fp.write(best_guess.output())
    except OSError as e:
        print(str(e), file=sys.stderr)
        return False

    return True


def cli_detect(argv: list[str] | None = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    Detects the encoding of every given file, optionally writes a normalized
    copy, and prints the outcome to stdout (JSON, or bare encoding names with
    --minimal).

    :param argv: Arguments to parse; None lets argparse read sys.argv.
    :return: 0 if everything is fine, anything else equal trouble
        (1: invalid option combination, 2: normalized output not writable)
    """
    args = _build_parser().parse_args(argv)

    results: list[CliDetectionResult] = []

    try:
        usage_error = _find_usage_error(args)
        if usage_error is not None:
            print(usage_error, file=sys.stderr)
            return 1

        for my_file in args.files:
            path = abspath(my_file.name)

            matches = from_fp(
                my_file,
                threshold=args.threshold,
                explain=args.verbose,
                preemptive_behaviour=args.no_preemptive is False,
            )
            best_guess = matches.best()

            if best_guess is None:
                hint = (
                    "Maybe try increasing maximum amount of chaos."
                    if args.threshold < 1.0
                    else ""
                )
                print(
                    f'Unable to identify originating encoding for "{my_file.name}". {hint}',
                    file=sys.stderr,
                )
                # Placeholder entry so the file still shows up in the output.
                results.append(
                    CliDetectionResult(
                        path,
                        None,
                        [],
                        [],
                        "Unknown",
                        [],
                        False,
                        1.0,
                        0.0,
                        None,
                        True,
                    )
                )
            else:
                current = _as_cli_result(path, best_guess, True)
                results.append(current)

                if args.alternatives and len(matches) > 1:
                    results.extend(
                        _as_cli_result(path, el, False)
                        for el in matches
                        if el != best_guess
                    )

                if args.normalize and not _write_normalized(
                    my_file.name, best_guess, current, args
                ):
                    return 2

            # Release each handle as soon as its file has been dealt with.
            _close_files([my_file])
    finally:
        # FileType opened every input while parsing; make sure none of them
        # stays open on early returns or unexpected errors.
        _close_files(args.files)

    if args.minimal:
        # One line per input file, listing the encodings reported for it.
        for my_file in args.files:
            path = abspath(my_file.name)
            print(
                ", ".join(
                    el.encoding or "undefined" for el in results if el.path == path
                )
            )
    else:
        # A single result is emitted as a JSON object, several as a list.
        print(
            dumps(
                (
                    [el.__dict__ for el in results]
                    if len(results) > 1
                    else results[0].__dict__
                ),
                ensure_ascii=True,
                indent=4,
            )
        )

    return 0
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Hand the status code returned by cli_detect() to the interpreter so it
    # becomes the process exit status (non-zero signals trouble to callers).
    sys.exit(cli_detect())
|
|
|