| | from __future__ import annotations |
| |
|
| | import argparse |
| | import sys |
| | import typing |
| | from json import dumps |
| | from os.path import abspath, basename, dirname, join, realpath |
| | from platform import python_version |
| | from unicodedata import unidata_version |
| |
|
| | import charset_normalizer.md as md_module |
| | from charset_normalizer import from_fp |
| | from charset_normalizer.models import CliDetectionResult |
| | from charset_normalizer.version import __version__ |
| |
|
| |
|
def query_yes_no(question: str, default: str | None = "yes") -> bool:
    """Ask a yes/no question via input() and return the user's answer.

    :param question: text presented to the user (the choice hint is appended).
    :param default: the presumed answer if the user just hits <Enter>.
        It must be "yes" (the default), "no" or None (meaning an answer is
        required of the user).
    :return: True for "yes", False for "no".
    :raises ValueError: if ``default`` is not one of "yes", "no" or None.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # The capital letter in the hint shows which answer <Enter> selects.
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError(f"invalid default answer: '{default}'")

    while True:
        sys.stdout.write(question + prompt)
        # Surrounding whitespace is not meaningful in a yes/no answer, so
        # "yes " or " y" are accepted instead of triggering a re-prompt.
        choice = input().strip().lower()
        if default is not None and choice == "":
            return valid[default]
        if choice in valid:
            return valid[choice]
        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
| |
|
class FileType:
    """Factory for creating file object types

    Instances of FileType are typically passed as type= arguments to the
    ArgumentParser add_argument() method.

    Keyword Arguments:
        - mode -- A string indicating how the file is to be opened. Accepts the
            same values as the builtin open() function.
        - bufsize -- The file's desired buffer size. Accepts the same values as
            the builtin open() function.
        - encoding -- The file's encoding. Accepts the same values as the
            builtin open() function.
        - errors -- A string indicating how encoding and decoding errors are to
            be handled. Accepts the same value as the builtin open() function.

    Backported from CPython 3.12
    """

    def __init__(
        self,
        mode: str = "r",
        bufsize: int = -1,
        encoding: str | None = None,
        errors: str | None = None,
    ):
        self._mode = mode
        self._bufsize = bufsize
        self._encoding = encoding
        self._errors = errors

    def __call__(self, string: str) -> typing.IO:
        """Turn a command-line argument into an open file object.

        :param string: a file name, or "-" for the standard streams.
        :return: the opened file, or the matching standard stream for "-".
        :raises ValueError: if "-" is combined with a mode that is neither a
            read mode nor a write/append/exclusive-create mode.
        :raises argparse.ArgumentTypeError: if the file cannot be opened.
        """
        # the special argument "-" means sys.std{in,out}
        if string == "-":
            if "r" in self._mode:
                return sys.stdin.buffer if "b" in self._mode else sys.stdin
            if any(c in self._mode for c in "wax"):
                return sys.stdout.buffer if "b" in self._mode else sys.stdout
            msg = f'argument "-" with mode {self._mode}'
            raise ValueError(msg)

        # all other arguments are used as file names
        try:
            return open(string, self._mode, self._bufsize, self._encoding, self._errors)
        except OSError as e:
            message = f"can't open '{string}': {e}"
            # Chain explicitly so the underlying OSError stays available as
            # __cause__ to anyone inspecting the argparse error.
            raise argparse.ArgumentTypeError(message) from e

    def __repr__(self) -> str:
        """Return a constructor-like representation, omitting default values."""
        args = self._mode, self._bufsize
        kwargs = [("encoding", self._encoding), ("errors", self._errors)]
        args_str = ", ".join(
            [repr(arg) for arg in args if arg != -1]
            + [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
        )
        return f"{type(self).__name__}({args_str})"
|
| |
|
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser describing every CLI option."""
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-i",
        "--no-preemptive",
        action="store_true",
        default=False,
        dest="no_preemptive",
        help="Disable looking at a charset declaration to hint the detector.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            # A plain ".py" module file is reported as SpeedUp OFF.
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )
    return parser


def _close_files(files: typing.Iterable[typing.IO]) -> None:
    """Close every handle in *files* that is still open."""
    for handle in files:
        if handle.closed is False:
            handle.close()


def _match_to_result(path: str, match, is_preferred: bool) -> CliDetectionResult:
    """Convert one detection match for *path* into a CliDetectionResult."""
    return CliDetectionResult(
        path,
        match.encoding,
        match.encoding_aliases,
        # The detected encoding itself is not listed among its alternatives.
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,
        is_preferred,
    )


def cli_detect(argv: list[str] | None = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    :param argv: arguments to parse; None lets argparse read sys.argv.
    :return: 0 if everything is fine, 1 on an invalid combination of options
        or an out-of-range threshold, 2 if a normalized file could not be
        written.
    """
    args = _build_parser().parse_args(argv)

    # Option sanity checks; argparse already opened every input file, so they
    # must be closed before bailing out.
    usage_error = None
    if args.replace is True and args.normalize is False:
        usage_error = "Use --replace in addition of --normalize only."
    elif args.force is True and args.replace is False:
        usage_error = "Use --force in addition of --replace only."
    elif args.threshold < 0.0 or args.threshold > 1.0:
        usage_error = "--threshold VALUE should be between 0. AND 1."

    if usage_error is not None:
        _close_files(args.files)
        print(usage_error, file=sys.stderr)
        return 1

    x_ = []

    for my_file in args.files:
        matches = from_fp(
            my_file,
            threshold=args.threshold,
            explain=args.verbose,
            preemptive_behaviour=args.no_preemptive is False,
        )

        best_guess = matches.best()
        file_path = abspath(my_file.name)
        current_result = None

        if best_guess is None:
            hint = (
                "Maybe try increasing maximum amount of chaos."
                if args.threshold < 1.0
                else ""
            )
            print(
                f'Unable to identify originating encoding for "{my_file.name}". {hint}',
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    file_path,
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            current_result = _match_to_result(file_path, best_guess, True)
            x_.append(current_result)

            if len(matches) > 1 and args.alternatives:
                x_.extend(
                    _match_to_result(file_path, el, False)
                    for el in matches
                    if el != best_guess
                )

        # Detection is over for this file. Close it now so that it is never
        # still open when --replace reopens the same path for writing.
        if my_file.closed is False:
            my_file.close()

        if current_result is None or args.normalize is False:
            continue

        if best_guess.encoding.startswith("utf"):
            print(
                f'"{my_file.name}" file does not need to be normalized, as it already came from unicode.',
                file=sys.stderr,
            )
            continue

        resolved_path = realpath(my_file.name)
        dir_path = dirname(resolved_path)
        o_: list[str] = basename(resolved_path).split(".")

        if args.replace is False:
            # Put the detected encoding before the last dot-separated part.
            # NOTE: for a name without any dot this places it in front.
            o_.insert(-1, best_guess.encoding)
        elif (
            args.force is False
            and query_yes_no(
                f'Are you sure to normalize "{my_file.name}" by replacing it ?',
                "no",
            )
            is False
        ):
            continue

        try:
            # Record the output path on THIS file's result; indexing x_[0]
            # would attach it to the first file when several are processed.
            current_result.unicode_path = join(dir_path, ".".join(o_))

            with open(current_result.unicode_path, "wb") as fp:
                fp.write(best_guess.output())
        except OSError as e:
            print(str(e), file=sys.stderr)
            # Files that were not processed yet are still open.
            _close_files(args.files)
            return 2

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0
|
| |
|
if __name__ == "__main__":
    # Propagate cli_detect()'s integer status as the process exit code.
    sys.exit(cli_detect())
| |
|