Add files using upload-large-folder tool

51c0549 verified 3 months ago

11.9 kB

	from __future__ import annotations

	import argparse
	import sys
	import typing
	from json import dumps
	from os.path import abspath, basename, dirname, join, realpath
	from platform import python_version
	from unicodedata import unidata_version

	import charset_normalizer.md as md_module
	from charset_normalizer import from_fp
	from charset_normalizer.models import CliDetectionResult
	from charset_normalizer.version import __version__


	def query_yes_no(question: str, default: str = "yes") -> bool:  # Defensive:
	    """Prompt with *question* via input() until a yes/no answer is given.

	    An empty reply selects the default: True when *default* is "yes",
	    False for any other value. Replies are matched case-insensitively
	    after stripping surrounding whitespace.
	    """
	    default_is_yes = default == "yes"
	    suffix = " [Y/n] " if default_is_yes else " [y/N] "
	    answers = {"y": True, "yes": True, "n": False, "no": False}

	    while True:
	        reply = input(question + suffix).strip().lower()
	        if reply == "":
	            return default_is_yes
	        if reply in answers:
	            return answers[reply]
	        # Unrecognised reply: explain and ask again.
	        print("Please respond with 'y' or 'n'.")


	class FileType:
	    """Factory for creating file object types

	    Instances of FileType are typically passed as type= arguments to the
	    ArgumentParser add_argument() method.

	    Keyword Arguments:
	        - mode -- A string indicating how the file is to be opened. Accepts the
	            same values as the builtin open() function.
	        - bufsize -- The file's desired buffer size. Accepts the same values as
	            the builtin open() function.
	        - encoding -- The file's encoding. Accepts the same values as the
	            builtin open() function.
	        - errors -- A string indicating how encoding and decoding errors are to
	            be handled. Accepts the same value as the builtin open() function.

	    Backported from CPython 3.12
	    """

	    def __init__(
	        self,
	        mode: str = "r",
	        bufsize: int = -1,
	        encoding: str | None = None,
	        errors: str | None = None,
	    ) -> None:
	        self._mode = mode
	        self._bufsize = bufsize
	        self._encoding = encoding
	        self._errors = errors

	    def __call__(self, string: str) -> typing.IO:  # type: ignore[type-arg]
	        """Return the file object designated by *string*.

	        "-" selects a standard stream according to the configured mode;
	        any other value is opened as a file name.

	        :raises ValueError: if *string* is "-" and the mode is neither a
	            read mode nor one of the write modes (w, a, x).
	        :raises argparse.ArgumentTypeError: if the file cannot be opened.
	        """
	        # the special argument "-" means sys.std{in,out}
	        if string == "-":
	            binary = "b" in self._mode
	            if "r" in self._mode:
	                return sys.stdin.buffer if binary else sys.stdin
	            if any(c in self._mode for c in "wax"):
	                return sys.stdout.buffer if binary else sys.stdout
	            msg = f'argument "-" with mode {self._mode}'
	            raise ValueError(msg)

	        # all other arguments are used as file names
	        try:
	            return open(string, self._mode, self._bufsize, self._encoding, self._errors)
	        except OSError as e:
	            message = f"can't open '{string}': {e}"
	            # Chain explicitly so the underlying OSError stays attached as the cause.
	            raise argparse.ArgumentTypeError(message) from e

	    def __repr__(self) -> str:
	        args = self._mode, self._bufsize
	        kwargs = [("encoding", self._encoding), ("errors", self._errors)]
	        # A bufsize of -1 (the default) and unset keyword values are omitted.
	        args_str = ", ".join(
	            [repr(arg) for arg in args if arg != -1]
	            + [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
	        )
	        return f"{type(self).__name__}({args_str})"


	def _close_files(files: typing.Iterable[typing.IO[typing.Any]]) -> None:
	    """Close every file object in *files* that is still open."""
	    for handle in files:
	        if not handle.closed:
	            handle.close()


	def _match_to_result(path: str, match: typing.Any, is_best: bool) -> CliDetectionResult:
	    """Build the CLI record describing *match* for the file at *path*.

	    *is_best* is forwarded as the record's last positional field: callers pass
	    True for the best guess and False for an alternative.
	    """
	    return CliDetectionResult(
	        abspath(path),
	        match.encoding,
	        match.encoding_aliases,
	        [cp for cp in match.could_be_from_charset if cp != match.encoding],
	        match.language,
	        match.alphabets,
	        match.bom,
	        match.percent_chaos,
	        match.percent_coherence,
	        None,
	        is_best,
	    )


	def _build_parser() -> argparse.ArgumentParser:
	    """Create the argument parser describing the command line interface."""
	    parser = argparse.ArgumentParser(
	        description="The Real First Universal Charset Detector. "
	        "Discover originating encoding used on text file. "
	        "Normalize text to unicode."
	    )

	    parser.add_argument(
	        "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
	    )
	    parser.add_argument(
	        "-v",
	        "--verbose",
	        action="store_true",
	        default=False,
	        dest="verbose",
	        help="Display complementary information about file if any. "
	        "Stdout will contain logs about the detection process.",
	    )
	    parser.add_argument(
	        "-a",
	        "--with-alternative",
	        action="store_true",
	        default=False,
	        dest="alternatives",
	        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
	    )
	    parser.add_argument(
	        "-n",
	        "--normalize",
	        action="store_true",
	        default=False,
	        dest="normalize",
	        help="Permit to normalize input file. If not set, program does not write anything.",
	    )
	    parser.add_argument(
	        "-m",
	        "--minimal",
	        action="store_true",
	        default=False,
	        dest="minimal",
	        help="Only output the charset detected to STDOUT. Disabling JSON output.",
	    )
	    parser.add_argument(
	        "-r",
	        "--replace",
	        action="store_true",
	        default=False,
	        dest="replace",
	        help="Replace file when trying to normalize it instead of creating a new one.",
	    )
	    parser.add_argument(
	        "-f",
	        "--force",
	        action="store_true",
	        default=False,
	        dest="force",
	        help="Replace file without asking if you are sure, use this flag with caution.",
	    )
	    parser.add_argument(
	        "-i",
	        "--no-preemptive",
	        action="store_true",
	        default=False,
	        dest="no_preemptive",
	        help="Disable looking at a charset declaration to hint the detector.",
	    )
	    parser.add_argument(
	        "-t",
	        "--threshold",
	        action="store",
	        default=0.2,
	        type=float,
	        dest="threshold",
	        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
	    )
	    parser.add_argument(
	        "--version",
	        action="version",
	        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
	            __version__,
	            python_version(),
	            unidata_version,
	            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
	        ),
	        help="Show version information and exit.",
	    )
	    return parser


	def _run_detection(args: argparse.Namespace) -> int:
	    """Validate *args*, analyse every input file and print the report.

	    Returns the exit code documented on cli_detect(). Input files are closed
	    as soon as they are no longer needed on the regular paths; the caller
	    closes whatever is still open when this function returns or raises.
	    """
	    if args.replace and not args.normalize:
	        print("Use --replace in addition of --normalize only.", file=sys.stderr)
	        return 1

	    if args.force and not args.replace:
	        print("Use --force in addition of --replace only.", file=sys.stderr)
	        return 1

	    # Chained comparison so that NaN is rejected as well as out-of-range values.
	    if not 0.0 <= args.threshold <= 1.0:
	        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
	        return 1

	    results = []

	    for my_file in args.files:
	        matches = from_fp(
	            my_file,
	            threshold=args.threshold,
	            explain=args.verbose,
	            preemptive_behaviour=not args.no_preemptive,
	        )

	        best_guess = matches.best()

	        if best_guess is None:
	            hint = (
	                "Maybe try increasing maximum amount of chaos."
	                if args.threshold < 1.0
	                else ""
	            )
	            print(
	                f'Unable to identify originating encoding for "{my_file.name}". {hint}',
	                file=sys.stderr,
	            )
	            # Placeholder record so the file still appears in the report.
	            results.append(
	                CliDetectionResult(
	                    abspath(my_file.name),
	                    None,
	                    [],
	                    [],
	                    "Unknown",
	                    [],
	                    False,
	                    1.0,
	                    0.0,
	                    None,
	                    True,
	                )
	            )
	            _close_files((my_file,))
	            continue

	        cli_result = _match_to_result(my_file.name, best_guess, True)
	        results.append(cli_result)

	        if len(matches) > 1 and args.alternatives:
	            results.extend(
	                _match_to_result(my_file.name, el, False)
	                for el in matches
	                if el != best_guess
	            )

	        if args.normalize:
	            if best_guess.encoding.startswith("utf"):
	                print(
	                    '"{}" file does not need to be normalized, as it already came from unicode.'.format(
	                        my_file.name
	                    ),
	                    file=sys.stderr,
	                )
	                _close_files((my_file,))
	                continue

	            real_path = realpath(my_file.name)
	            dir_path = dirname(real_path)
	            name_parts: list[str] = basename(real_path).split(".")

	            if not args.replace:
	                # Writing to a new file: put the detected encoding before the
	                # last dot-separated component of the file name.
	                name_parts.insert(-1, best_guess.encoding)
	                _close_files((my_file,))
	            elif not args.force and not query_yes_no(
	                'Are you sure to normalize "{}" by replacing it ?'.format(
	                    my_file.name
	                ),
	                "no",
	            ):
	                # The user declined the in-place replacement.
	                _close_files((my_file,))
	                continue

	            try:
	                cli_result.unicode_path = join(dir_path, ".".join(name_parts))

	                with open(cli_result.unicode_path, "wb") as fp:
	                    fp.write(best_guess.output())
	            except OSError as e:  # Defensive:
	                print(str(e), file=sys.stderr)
	                return 2

	        _close_files((my_file,))

	    if not args.minimal:
	        # A single record is printed as a JSON object, several as a JSON list.
	        print(
	            dumps(
	                [el.__dict__ for el in results]
	                if len(results) > 1
	                else results[0].__dict__,
	                ensure_ascii=True,
	                indent=4,
	            )
	        )
	    else:
	        for my_file in args.files:
	            print(
	                ", ".join(
	                    [
	                        el.encoding or "undefined"
	                        for el in results
	                        if el.path == abspath(my_file.name)
	                    ]
	                )
	            )

	    return 0


	def cli_detect(argv: list[str] | None = None) -> int:
	    """
	    CLI assistant using ARGV and ArgumentParser

	    :param argv: Arguments to parse, handed to ArgumentParser.parse_args().
	    :return: 0 if everything is fine, anything else equal trouble
	        (1 for an invalid option combination or threshold, 2 when the
	        normalized output cannot be written)
	    """
	    args = _build_parser().parse_args(argv)

	    try:
	        return _run_detection(args)
	    finally:
	        # Guarantee that no input file stays open, including on early
	        # returns and when detection raises.
	        _close_files(args.files)


	if __name__ == "__main__":  # Defensive:
	    # Hand cli_detect()'s return value to the interpreter as the exit status;
	    # a bare call would discard it and always exit with 0.
	    sys.exit(cli_detect())