# chinese-translator-paste / chinese_file_translator.py
# NOTE(review): the three lines below were Hugging Face file-page residue
# ("algorembrant's picture / Upload 8 files / b03b79f verified") pasted in
# above the shebang; kept here as a comment so the module parses.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ChineseFileTranslator v1.0.0
================================
Author : algorembrant
License : MIT
Version : 1.0.0
Translate Chinese text inside .txt or .md files to English.
Preserves Markdown structure (headings, bold, italics, code blocks, tables, links).
Supports batch/vectorized processing, multiple translation backends,
auto-detection of Chinese script, and history logging.
USAGE COMMANDS
--------------
Translate a single file (default: Google backend):
python chinese_file_translator.py input.txt
Translate and save to a specific output file:
python chinese_file_translator.py input.md -o translated.md
Translate using the offline Helsinki-NLP MarianMT model:
python chinese_file_translator.py input.txt --backend offline
Translate using Microsoft Translator (requires API key in config):
python chinese_file_translator.py input.txt --backend microsoft
Force Simplified Chinese OCR/detection:
python chinese_file_translator.py input.txt --lang simplified
Force Traditional Chinese:
python chinese_file_translator.py input.txt --lang traditional
Auto-detect Chinese script (default):
python chinese_file_translator.py input.txt --lang auto
Enable GPU (CUDA) for offline model:
python chinese_file_translator.py input.txt --backend offline --gpu
Set OCR confidence threshold (0.0 - 1.0, default 0.3):
python chinese_file_translator.py input.txt --confidence 0.4
Batch translate all .txt and .md files in a directory:
python chinese_file_translator.py --batch ./my_folder/
Batch translate with output directory:
python chinese_file_translator.py --batch ./input/ --batch-out ./output/
Set chunk size for large files (default 4000 chars):
python chinese_file_translator.py input.txt --chunk-size 2000
Append both Chinese source and English translation side-by-side:
python chinese_file_translator.py input.txt --bilingual
Only extract and print detected Chinese text (no translation):
python chinese_file_translator.py input.txt --extract-only
Print translated output to stdout instead of file:
python chinese_file_translator.py input.txt --stdout
Export translation history to JSON on exit:
python chinese_file_translator.py input.txt --export-history out.json
Enable verbose/debug logging:
python chinese_file_translator.py input.txt --verbose
Show version and exit:
python chinese_file_translator.py --version
Show full help:
python chinese_file_translator.py --help
SUPPORTED FILE TYPES
--------------------
- Plain text (.txt) : All Chinese detected and translated in-place
- Markdown (.md) : Chinese content translated; Markdown syntax preserved
Preserved: headings (#), bold (**), italic (*), inline code (`),
fenced code blocks (```), blockquotes (>), tables (|),
links ([text](url)), images (![alt](url)), horizontal rules
SUPPORTED CHINESE VARIANTS
---------------------------
- Simplified Chinese (Mandarin, simplified/simp)
- Traditional Chinese (Mandarin / Hong Kong / Taiwan)
- Cantonese / Yue (detected via Unicode CJK ranges)
- Classical Chinese (Literary Chinese, treated as Traditional)
- Mixed Chinese-English (Chinglish / code-switching)
TRANSLATION BACKENDS
--------------------
1. Google Translate (online, fast, default, no API key needed)
2. Microsoft Translate (online, fallback, requires Azure API key)
3. Helsinki-NLP MarianMT (offline, opus-mt-zh-en, ~300 MB download on first use)
CONFIGURATION
-------------
Config is stored at: ~/.chinese_file_translator/config.json
History is stored at: ~/.chinese_file_translator/history.json
Logs are stored at: ~/.chinese_file_translator/app.log
EXTERNAL SETUP REQUIRED
-----------------------
PyTorch (required only for offline backend):
CPU-only:
pip install torch --index-url https://download.pytorch.org/whl/cpu
CUDA 11.8:
pip install torch --index-url https://download.pytorch.org/whl/cu118
CUDA 12.1:
pip install torch --index-url https://download.pytorch.org/whl/cu121
Helsinki-NLP model is downloaded automatically on first offline run (~300 MB):
Model: Helsinki-NLP/opus-mt-zh-en
Cache: ~/.chinese_file_translator/models/
Microsoft Translator (optional):
Get a free API key from Azure Cognitive Services and add to config.json:
{ "microsoft_api_key": "YOUR_KEY_HERE", "microsoft_region": "eastus" }
"""
# ── Standard Library ──────────────────────────────────────────────────────────
import os
import re
import sys
import json
import time
import logging
import argparse
import textwrap
import threading
import unicodedata
from copy import deepcopy
from pathlib import Path
from datetime import datetime
from typing import (
Any, Dict, Generator, List, Optional, Sequence, Tuple
)
# ── Online Translation ────────────────────────────────────────────────────────
try:
from deep_translator import GoogleTranslator, MicrosoftTranslator
DEEP_TRANSLATOR_AVAILABLE = True
except ImportError:
DEEP_TRANSLATOR_AVAILABLE = False
# ── Offline Translation ───────────────────────────────────────────────────────
OFFLINE_AVAILABLE = False
try:
from transformers import MarianMTModel, MarianTokenizer
import torch
OFFLINE_AVAILABLE = True
except ImportError:
pass
# ── Progress bar (optional) ───────────────────────────────────────────────────
try:
from tqdm import tqdm
TQDM_AVAILABLE = True
except ImportError:
TQDM_AVAILABLE = False
# ── Clipboard (optional) ─────────────────────────────────────────────────────
try:
import pyperclip
CLIPBOARD_AVAILABLE = True
except ImportError:
CLIPBOARD_AVAILABLE = False
# ── Constants ─────────────────────────────────────────────────────────────────
APP_NAME = "ChineseFileTranslator"
APP_VERSION = "1.0.0"
APP_AUTHOR = "algorembrant"
_HOME = Path.home() / ".chinese_file_translator"
CONFIG_FILE = _HOME / "config.json"
HISTORY_FILE = _HOME / "history.json"
LOG_FILE = _HOME / "app.log"
OFFLINE_MODEL = "Helsinki-NLP/opus-mt-zh-en"
OFFLINE_MODEL_T = "Helsinki-NLP/opus-mt-zht-en"
# CJK Unicode blocks used for Chinese detection
_CJK_RANGES: Tuple[Tuple[int, int], ...] = (
(0x4E00, 0x9FFF), # CJK Unified Ideographs
(0x3400, 0x4DBF), # CJK Extension A
(0x20000, 0x2A6DF), # CJK Extension B
(0x2A700, 0x2B73F), # CJK Extension C
(0x2B740, 0x2B81F), # CJK Extension D
(0xF900, 0xFAFF), # CJK Compatibility Ideographs
(0x2F800, 0x2FA1F), # CJK Compatibility Supplement
(0x3000, 0x303F), # CJK Symbols and Punctuation
(0xFF00, 0xFFEF), # Fullwidth / Halfwidth Forms
(0xFE30, 0xFE4F), # CJK Compatibility Forms
)
# Markdown patterns that must NOT be translated
_MD_CODE_FENCE = re.compile(r"```[\s\S]*?```")
_MD_INLINE_CODE = re.compile(r"`[^`\n]*?`")
_MD_LINK = re.compile(r"(!?\[[^\]]*?\])\(([^)]*?)\)")
_MD_HTML_TAG = re.compile(r"<[a-zA-Z/][^>]*?>")
_MD_FRONTMATTER = re.compile(r"^---[\s\S]*?^---", re.MULTILINE)
# ════════════════════════════════════════════════════════════════════════════
# LOGGING
# ════════════════════════════════════════════════════════════════════════════
def setup_logging(verbose: bool = False) -> logging.Logger:
_HOME.mkdir(parents=True, exist_ok=True)
level = logging.DEBUG if verbose else logging.INFO
fmt = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
handlers: List[logging.Handler] = [
logging.FileHandler(LOG_FILE, encoding="utf-8"),
logging.StreamHandler(sys.stdout),
]
logging.basicConfig(level=level, format=fmt, handlers=handlers)
return logging.getLogger(APP_NAME)
logger = logging.getLogger(APP_NAME)
# ════════════════════════════════════════════════════════════════════════════
# CONFIG
# ════════════════════════════════════════════════════════════════════════════
class Config:
"""Persistent JSON configuration. CLI args override stored values."""
DEFAULTS: Dict[str, Any] = {
"backend" : "google",
"lang" : "auto",
"use_gpu" : False,
"confidence_threshold" : 0.30,
"chunk_size" : 4000,
"batch_size" : 10,
"bilingual" : False,
"preserve_whitespace" : True,
"microsoft_api_key" : "",
"microsoft_region" : "eastus",
"offline_model_dir" : str(_HOME / "models"),
"max_history" : 1000,
"output_suffix" : "_translated",
"retry_attempts" : 3,
"retry_delay_seconds" : 1.5,
}
def __init__(self) -> None:
self._data: Dict[str, Any] = dict(self.DEFAULTS)
_HOME.mkdir(parents=True, exist_ok=True)
self._load()
def _load(self) -> None:
if CONFIG_FILE.exists():
try:
with open(CONFIG_FILE, "r", encoding="utf-8") as f:
self._data.update(json.load(f))
except Exception as exc:
logger.warning(f"Config load failed ({exc}). Using defaults.")
def save(self) -> None:
try:
with open(CONFIG_FILE, "w", encoding="utf-8") as f:
json.dump(self._data, f, indent=2, ensure_ascii=False)
except Exception as exc:
logger.error(f"Config save failed: {exc}")
def get(self, key: str, default: Any = None) -> Any:
return self._data.get(key, self.DEFAULTS.get(key, default))
def set(self, key: str, value: Any) -> None:
self._data[key] = value
self.save()
def apply_args(self, args: argparse.Namespace) -> None:
if getattr(args, "backend", None):
self._data["backend"] = args.backend
if getattr(args, "lang", None):
self._data["lang"] = args.lang
if getattr(args, "gpu", False):
self._data["use_gpu"] = True
if getattr(args, "confidence", None) is not None:
self._data["confidence_threshold"] = args.confidence
if getattr(args, "chunk_size", None) is not None:
self._data["chunk_size"] = args.chunk_size
if getattr(args, "bilingual", False):
self._data["bilingual"] = True
if getattr(args, "offline", False):
self._data["backend"] = "offline"
# ════════════════════════════════════════════════════════════════════════════
# CHINESE DETECTION UTILITIES
# ════════════════════════════════════════════════════════════════════════════
def _is_cjk(char: str) -> bool:
"""Return True if the character falls within any CJK Unicode range."""
cp = ord(char)
return any(lo <= cp <= hi for lo, hi in _CJK_RANGES)
def contains_chinese(text: str, min_ratio: float = 0.0) -> bool:
"""
Return True when Chinese characters are present in `text`.
If `min_ratio` is > 0, requires that fraction of non-whitespace characters.
"""
if not text or not text.strip():
return False
non_ws = [c for c in text if not c.isspace()]
if not non_ws:
return False
cjk_count = sum(1 for c in non_ws if _is_cjk(c))
if min_ratio <= 0:
return cjk_count > 0
return (cjk_count / len(non_ws)) >= min_ratio
def chinese_ratio(text: str) -> float:
"""Return the fraction of non-whitespace chars that are CJK."""
non_ws = [c for c in text if not c.isspace()]
if not non_ws:
return 0.0
return sum(1 for c in non_ws if _is_cjk(c)) / len(non_ws)
def detect_script(text: str) -> str:
"""
Heuristic: Traditional Chinese uses specific code points absent from
Simplified. Returns 'traditional', 'simplified', or 'mixed'.
"""
# Characters common in Traditional but rarely in Simplified
_TRAD_MARKERS = set(
"繁體國語臺灣學習問題開發電腦時間工作歷史語言文化"
"經濟機會關係發展環境教育政府社會應該雖然雖然認為"
)
_SIMP_MARKERS = set(
"简体国语台湾学习问题开发电脑时间工作历史语言文化"
"经济机会关系发展环境教育政府社会应该虽然认为"
)
trad = sum(1 for c in text if c in _TRAD_MARKERS)
simp = sum(1 for c in text if c in _SIMP_MARKERS)
if trad > simp:
return "traditional"
if simp > trad:
return "simplified"
return "simplified" # default fallback
# ════════════════════════════════════════════════════════════════════════════
# TRANSLATION ENGINE
# ════════════════════════════════════════════════════════════════════════════
class TranslationEngine:
"""
Multi-backend Chinese-to-English translation.
Vectorized batch mode is used for the offline (MarianMT) backend.
Online backends (Google, Microsoft) chunk by character limit with
sentence-boundary awareness and automatic retry on transient errors.
"""
_GOOGLE_LIMIT = 4500 # chars per Google request
_MS_LIMIT = 10000 # chars per Microsoft request
_OFFLINE_LIMIT = 512 # tokens; use 400-char char proxy
def __init__(self, config: Config) -> None:
self.cfg = config
self._offline_model: Any = None
self._offline_tok: Any = None
self._lock = threading.Lock()
# ── Public API ────────────────────────────────────────────────────────
def translate(
self, text: str, source_lang: str = "auto"
) -> Tuple[str, str]:
"""
Translate `text` to English.
Returns (translated_text, backend_name).
"""
if not text or not text.strip():
return text, "passthrough"
backend = self.cfg.get("backend", "google")
attempt_order: List[str] = _dedupe_list([backend, "google", "offline"])
last_exc: Optional[Exception] = None
for b in attempt_order:
try:
result = self._call_backend(b, text, source_lang)
return result, b
except Exception as exc:
logger.warning(f"Backend '{b}' failed for [{text}]: {exc}")
last_exc = exc
# NEVER CRASH: return original if all failed
logger.error(f"All translation backends failed for [{text}]. Returning original.")
return text, "failed"
def translate_batch(
self,
texts: List[str],
source_lang: str = "auto",
) -> List[Tuple[str, str]]:
"""
Translate a list of strings.
Uses vectorized batching for the offline backend; serial calls for
online backends (rate-limit friendly).
"""
backend = self.cfg.get("backend", "google")
if backend == "offline" and OFFLINE_AVAILABLE:
return self._translate_batch_offline(texts)
# Serial with progress
results: List[Tuple[str, str]] = []
iterable = (
tqdm(texts, desc="Translating", unit="chunk")
if TQDM_AVAILABLE else texts
)
for text in iterable:
results.append(self.translate(text, source_lang))
# Small delay for online backends to avoid rate limits
if backend in ("google", "microsoft"):
time.sleep(0.3)
return results
# ── Backend dispatch ──────────────────────────────────────────────────
def _call_backend(
self, backend: str, text: str, source_lang: str
) -> str:
retries = int(self.cfg.get("retry_attempts", 3))
delay = float(self.cfg.get("retry_delay_seconds", 1.5))
last_exc2: Optional[Exception] = None
for attempt in range(retries):
try:
if backend == "google":
return self._google(text, source_lang)
elif backend == "microsoft":
return self._microsoft(text, source_lang)
elif backend == "offline":
translated, _ = self._offline_single(text)
return translated
else:
raise ValueError(f"Unknown backend: {backend}")
except Exception as exc:
last_exc2 = exc
if attempt < retries - 1:
time.sleep(delay * (attempt + 1))
raise RuntimeError(
f"Backend '{backend}' failed after {retries} attempts: {last_exc2}"
)
# ── Google ────────────────────────────────────────────────────────────
def _google(self, text: str, source_lang: str) -> str:
if not DEEP_TRANSLATOR_AVAILABLE:
raise RuntimeError("deep-translator not installed.")
lang_map = {"simplified": "zh-CN", "traditional": "zh-TW", "auto": "auto"}
src = lang_map.get(source_lang, "auto")
chunks = list(_split_text(text, self._GOOGLE_LIMIT))
parts: List[str] = []
for chunk in chunks:
try:
translated = GoogleTranslator(source=src, target="en").translate(chunk)
# If it's None or returned original Chinese, it failed
if not translated or (translated.strip() == chunk.strip() and contains_chinese(chunk)):
raise RuntimeError("Google returned original or None")
parts.append(translated)
except Exception as e:
raise RuntimeError(f"Google translate error: {e}")
return " ".join(parts)
# ── Microsoft ─────────────────────────────────────────────────────────
def _microsoft(self, text: str, source_lang: str) -> str:
if not DEEP_TRANSLATOR_AVAILABLE:
raise RuntimeError(
"deep-translator not installed. Run: pip install deep-translator"
)
api_key = str(self.cfg.get("microsoft_api_key", ""))
region = str(self.cfg.get("microsoft_region", "eastus"))
if not api_key:
raise ValueError(
"Microsoft API key not configured. "
"Add 'microsoft_api_key' to ~/.chinese_file_translator/config.json"
)
lang_map = {"simplified": "zh-Hans", "traditional": "zh-Hant", "auto": "auto"}
src = lang_map.get(source_lang, "auto")
chunks = list(_split_text(text, self._MS_LIMIT))
parts = []
for chunk in chunks:
tr = MicrosoftTranslator(
api_key=api_key, region=region, source=src, target="en"
).translate(chunk)
parts.append(tr or chunk)
return " ".join(parts)
# ── Offline (MarianMT) ────────────────────────────────────────────────
def _load_offline(self) -> None:
if not OFFLINE_AVAILABLE:
raise RuntimeError("Offline model dependencies not installed.")
model_dir = str(self.cfg.get("offline_model_dir", str(_HOME / "models")))
Path(model_dir).mkdir(parents=True, exist_ok=True)
# ...
self._offline_tok = MarianTokenizer.from_pretrained(
OFFLINE_MODEL, cache_dir=model_dir
)
model = MarianMTModel.from_pretrained(
OFFLINE_MODEL, cache_dir=model_dir
)
use_gpu = bool(self.cfg.get("use_gpu", False))
device = "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu"
self._offline_model = model.to(device)
logger.info(f"Offline model loaded on '{device}'.")
def _offline_single(self, text: str) -> Tuple[str, str]:
with self._lock:
if self._offline_model is None:
self._load_offline()
chunks = list(_split_text(text, self._OFFLINE_LIMIT))
results = self._vectorized_translate(chunks)
return " ".join(results), "offline"
def _translate_batch_offline(
self, texts: List[str]
) -> List[Tuple[str, str]]:
"""Vectorized: flatten all chunks, translate in one pass, reassemble."""
with self._lock:
if self._offline_model is None:
self._load_offline()
# Build chunk index: (text_idx, chunk_idx) -> flat_idx
all_chunks: List[str] = []
chunk_map: List[Tuple[int, int]] = [] # (text_idx, n_chunks)
for t_idx, text in enumerate(texts):
if not text or not text.strip():
chunk_map.append((t_idx, 0))
continue
chunks = list(_split_text(text, self._OFFLINE_LIMIT))
start = len(all_chunks)
all_chunks.extend(chunks)
chunk_map.append((t_idx, len(chunks)))
if not all_chunks:
return [(t, "passthrough") for t in texts]
# One vectorized forward pass
translated_chunks = self._vectorized_translate(all_chunks)
# Reassemble
results: List[Tuple[str, str]] = []
flat_idx = 0
for t_idx, n in chunk_map:
if n == 0:
results.append((texts[t_idx], "passthrough"))
else:
assembled = " ".join(translated_chunks[flat_idx : flat_idx + n])
results.append((assembled, "offline"))
flat_idx += n
return results
def _vectorized_translate(self, chunks: List[str]) -> List[str]:
"""Run MarianMT on a list of strings in one batched forward pass."""
if not chunks:
return []
tok = self._offline_tok
model = self._offline_model
if tok is None or model is None:
raise RuntimeError("Offline model not loaded.")
device = next(model.parameters()).device
batch_size = int(self.cfg.get("batch_size", 10))
results: List[str] = []
# Split into mini-batches to avoid OOM on large inputs
for i in range(0, len(chunks), batch_size):
mini = chunks[i : i + batch_size]
enc = tok(
mini,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512,
).to(device)
with torch.no_grad():
out = model.generate(**enc)
decoded = tok.batch_decode(out, skip_special_tokens=True)
results.extend(decoded)
return results
# ════════════════════════════════════════════════════════════════════════════
# TEXT SPLITTING UTILITIES
# ════════════════════════════════════════════════════════════════════════════
def _split_text(text: str, max_len: int) -> Generator[str, None, None]:
"""Split text at sentence boundaries for chunking."""
if len(text) <= max_len:
yield text
return
sentence_ends = re.compile(r"[。!?\n!?\.]")
current: List[str] = []
current_len = 0
for segment in sentence_ends.split(text):
seg = segment.strip()
if not seg:
continue
if current_len + len(seg) + 1 > max_len and current:
yield " ".join(current)
current = [seg]
current_len = len(seg)
else:
current.append(seg)
current_len += len(seg) + 1
if current:
yield " ".join(current)
def _dedupe_list(lst: List[str]) -> List[str]:
seen: set = set()
out: List[str] = []
for item in lst:
if item not in seen:
seen.add(item)
out.append(item)
return out
# ════════════════════════════════════════════════════════════════════════════
# MARKDOWN PARSER / SEGMENT EXTRACTOR
# ════════════════════════════════════════════════════════════════════════════
class MarkdownProcessor:
"""Ultra-robust Markdown protection."""
_TOKEN = "___MY_PROTECT_PH_{idx}___"
def __init__(self) -> None:
self._protected: Dict[int, str] = {}
self._ph_counter = 0
def _next_placeholder(self, original: str) -> str:
idx = self._ph_counter
token = self._TOKEN.format(idx=idx)
self._protected[idx] = original
self._ph_counter += 1
return token
def protect(self, text: str) -> str:
"""Replace code/links/tags with unique tokens."""
self._protected.clear()
self._ph_counter = 0
# Protect YAML
text = _MD_FRONTMATTER.sub(lambda m: self._next_placeholder(m.group(0)), text)
# Protect Code Fences but leave content if it has Chinese
def _fence_sub(m: re.Match) -> str:
full = m.group(0)
if contains_chinese(full):
# Only protect the ``` lines
lines = full.splitlines()
if len(lines) >= 2:
p1 = self._next_placeholder(lines[0])
p2 = self._next_placeholder(lines[-1])
content = "\n".join(lines[1:-1])
return f"{p1}\n{content}\n{p2}"
return self._next_placeholder(full)
text = _MD_CODE_FENCE.sub(_fence_sub, text)
# Protect HTML and Inline Code and Links
text = _MD_HTML_TAG.sub(lambda m: self._next_placeholder(m.group(0)), text)
text = _MD_LINK.sub(lambda m: f"{m.group(1)}({self._next_placeholder(m.group(2))})", text)
text = _MD_INLINE_CODE.sub(lambda m: self._next_placeholder(m.group(0)), text)
return text
def restore(self, text: str) -> str:
"""Sequential replacement of all tokens."""
# We replace them in reverse to avoid partial matches if idx 10 and 1 exist
for idx in sorted(self._protected.keys(), reverse=True):
token = self._TOKEN.format(idx=idx)
original = self._protected[idx]
# Use regex to handle potential space mangling by Google
pattern = re.compile(re.escape(token).replace(r"\_", r"\s*\_*"), re.IGNORECASE)
text = pattern.sub(original.replace("\\", "\\\\"), text)
return text
class FileTranslator:
"""Orchestrates translation with 'Never Miss' strategy."""
def __init__(self, config: Config) -> None:
self.cfg = config
self.engine = TranslationEngine(config)
self._md_proc = MarkdownProcessor()
def translate_file(
self,
input_path: Path,
output_path: Optional[Path] = None,
extract_only: bool = False,
to_stdout: bool = False,
) -> Path:
input_path = Path(input_path).resolve()
if not input_path.exists(): raise FileNotFoundError(f"Missing: {input_path}")
suffix = input_path.suffix.lower()
if suffix not in (".txt", ".md"): raise ValueError("Unsupported type")
raw = input_path.read_text(encoding="utf-8", errors="replace")
if extract_only:
extracted = "\n".join([l for l in raw.splitlines() if contains_chinese(l)])
if to_stdout: print(extracted); return input_path
out = output_path or _default_output(input_path, self.cfg)
out.write_text(extracted, encoding="utf-8")
return out
res = self._translate_md(raw) if suffix == ".md" else self._translate_txt(raw)
if to_stdout: print(res); return input_path
out = output_path or _default_output(input_path, self.cfg)
out.write_text(res, encoding="utf-8")
return out
def _translate_txt(self, text: str) -> str:
lines = text.splitlines(keepends=True)
bilingual = bool(self.cfg.get("bilingual", False))
out_lines = []
for line in lines:
stripped = line.rstrip("\n\r")
if contains_chinese(stripped):
tr = self._translate_granular(stripped)
eol = "\n" if line.endswith("\n") else ""
out_lines.append(f"{stripped}\n{tr}{eol}" if bilingual else f"{tr}{eol}")
else:
out_lines.append(line)
return "".join(out_lines)
def _translate_md(self, text: str) -> str:
"""Global Surgical Batch Translation with fixed CJK regex."""
# 1. Protect structure
protected = self._md_proc.protect(text)
# 2. Extract all CJK blocks (Inclusive range for stability)
CJK_BLOCK_RE = re.compile(
r"["
r"\u4e00-\u9fff" # Basic
r"\u3400-\u4dbf" # Ext A
r"\U00020000-\U0002ceaf" # Ext B-E
r"\uf900-\ufaff" # Compatibility
r"\u3000-\u303f" # Symbols/Punctuation
r"\uff00-\uffef" # Fullwidth
r"\u00b7" # Middle dot
r"\u2014-\u2027" # Punctuation ranges
r"]+"
)
# Filter out blocks that are ONLY numbers or symbols if they don't have AT LEAST ONE CJK
def _has_real_cjk(s):
return any('\u4e00' <= c <= '\u9fff' or '\u3400' <= c <= '\u4dbf' or ord(c) > 0xffff for c in s)
all_candidate_blocks = CJK_BLOCK_RE.findall(protected)
all_blocks = _dedupe_list([b for b in all_candidate_blocks if _has_real_cjk(b)])
if not all_blocks:
return self._md_proc.restore(protected)
# 3. Batch translate unique blocks
logger.info(f"Found {len(all_blocks)} unique Chinese blocks. Batch translating...")
translated = self.engine.translate_batch(all_blocks, source_lang="simplified")
# 4. Global replacement
mapping = {}
for orig, (tr, _) in zip(all_blocks, translated):
if tr.strip() and tr.strip() != orig.strip():
mapping[orig] = tr
else:
try:
t, _ = self.engine.translate(orig, source_lang="simplified")
mapping[orig] = t
except:
mapping[orig] = orig
sorted_orig = sorted(mapping.keys(), key=len, reverse=True)
final_text = protected
for orig in sorted_orig:
final_text = final_text.replace(orig, mapping[orig])
# 5. Restore
return self._md_proc.restore(final_text)
def _translate_granular(self, text: str) -> str:
"""Fallback for TXT or other sparse areas."""
CJK_BLOCK_RE = re.compile(
r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002ceaf\u3000-\u303f\uff00-\uffef]+"
)
def _sub(m: re.Match) -> str:
chunk = m.group(0)
if not any('\u4e00' <= c <= '\u9fff' for c in chunk): return chunk
try:
t, _ = self.engine.translate(chunk, source_lang="simplified")
return t
except:
return chunk
return CJK_BLOCK_RE.sub(_sub, text)
@staticmethod
def _extract_chinese_lines(text: str) -> List[str]:
"""Return only lines that contain Chinese text."""
return [
line for line in text.splitlines()
if contains_chinese(line)
]
def _detect_script_bulk(self, texts: List[str]) -> str:
"""Detect dominant script from a list of strings."""
lang_mode = str(self.cfg.get("lang", "auto"))
if lang_mode in ("simplified", "traditional"):
return lang_mode
combined = " ".join(texts[:50]) # sample first 50 segments
return detect_script(combined)
# ── Batch directory translation ───────────────────────────────────────
def translate_directory(
self,
input_dir: Path,
output_dir: Optional[Path] = None,
) -> List[Path]:
"""Translate all .txt and .md files in `input_dir`."""
input_dir = Path(input_dir).resolve()
if not input_dir.is_dir():
raise NotADirectoryError(f"Not a directory: {input_dir}")
files = sorted(
list(input_dir.glob("*.txt")) + list(input_dir.glob("*.md"))
)
if not files:
logger.warning(f"No .txt or .md files found in {input_dir}")
return []
logger.info(f"Batch translating {len(files)} file(s) from {input_dir}")
out_paths: List[Path] = []
iterable = (
tqdm(files, desc="Files", unit="file")
if TQDM_AVAILABLE else files
)
for fpath in iterable:
try:
if output_dir:
out_file = Path(output_dir) / fpath.name
Path(output_dir).mkdir(parents=True, exist_ok=True)
else:
out_file = _default_output(fpath, self.cfg)
result = self.translate_file(fpath, output_path=out_file)
out_paths.append(result)
logger.info(f" Done: {fpath.name} -> {result.name}")
except Exception as exc:
logger.error(f" Failed: {fpath.name}: {exc}")
return out_paths
# ════════════════════════════════════════════════════════════════════════════
# HISTORY MANAGER
# ════════════════════════════════════════════════════════════════════════════
class HistoryManager:
"""Log translation sessions to a persistent JSON file."""
def __init__(self, config: Config) -> None:
self.cfg = config
self._items: List[Dict[str, Any]] = []
_HOME.mkdir(parents=True, exist_ok=True)
self._load()
def _load(self) -> None:
if HISTORY_FILE.exists():
try:
with open(HISTORY_FILE, "r", encoding="utf-8") as f:
self._items = json.load(f)
except Exception:
self._items = []
def save(self) -> None:
try:
with open(HISTORY_FILE, "w", encoding="utf-8") as f:
json.dump(self._items, f, ensure_ascii=False, indent=2)
except Exception as exc:
logger.error(f"History save error: {exc}")
def add(
self,
input_file: str,
output_file: str,
backend: str,
script: str,
segments_count: int,
elapsed_seconds: float,
) -> None:
entry: Dict[str, Any] = {
"timestamp" : datetime.now().isoformat(),
"input_file" : input_file,
"output_file" : output_file,
"backend" : backend,
"script" : script,
"segments_count" : segments_count,
"elapsed_seconds": round(elapsed_seconds, 2),
}
self._items.insert(0, entry)
max_h = int(self.cfg.get("max_history", 1000))
while len(self._items) > max_h:
self._items.pop()
self.save()
def export(self, path: str) -> None:
with open(path, "w", encoding="utf-8") as f:
json.dump(self._items, f, ensure_ascii=False, indent=2)
logger.info(f"History exported to {path}")
def get_all(self) -> List[Dict[str, Any]]:
return list(self._items)
# ════════════════════════════════════════════════════════════════════════════
# PATH HELPERS
# ════════════════════════════════════════════════════════════════════════════
def _default_output(input_path: Path, config: Config) -> Path:
"""Derive default output path: input_translated.ext"""
suffix = str(config.get("output_suffix", "_translated"))
return input_path.with_stem(input_path.stem + suffix)
# ════════════════════════════════════════════════════════════════════════════
# CLI ARG PARSER
# ════════════════════════════════════════════════════════════════════════════
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
prog="chinese_file_translator",
description=(
f"{APP_NAME} v{APP_VERSION} by {APP_AUTHOR}\n"
"Translate Chinese text inside .txt or .md files to English."
),
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=textwrap.dedent("""
Examples:
python chinese_file_translator.py input.txt
python chinese_file_translator.py input.md -o translated.md
python chinese_file_translator.py input.txt --backend offline --gpu
python chinese_file_translator.py input.txt --bilingual
python chinese_file_translator.py input.txt --extract-only
python chinese_file_translator.py --batch ./docs/ --batch-out ./out/
python chinese_file_translator.py input.txt --stdout
"""),
)
parser.add_argument(
"input",
nargs="?",
help="Input .txt or .md file path",
)
parser.add_argument(
"-o", "--output",
dest="output",
metavar="FILE",
help="Output file path (default: <input>_translated.<ext>)",
)
parser.add_argument(
"--batch",
metavar="DIR",
help="Translate all .txt and .md files in a directory",
)
parser.add_argument(
"--batch-out",
dest="batch_out",
metavar="DIR",
help="Output directory for batch translation",
)
parser.add_argument(
"--backend",
choices=["google", "microsoft", "offline"],
help="Translation backend (default: google)",
)
parser.add_argument(
"--offline",
action="store_true",
help="Shorthand for --backend offline",
)
parser.add_argument(
"--lang",
choices=["auto", "simplified", "traditional"],
default="auto",
help="Chinese script mode (default: auto)",
)
parser.add_argument(
"--gpu",
action="store_true",
help="Use GPU (CUDA) for offline translation",
)
parser.add_argument(
"--confidence",
type=float,
metavar="0.0-1.0",
help="Chinese detection confidence threshold (default: 0.05 ratio)",
)
parser.add_argument(
"--chunk-size",
dest="chunk_size",
type=int,
metavar="N",
help="Max characters per translation request (default: 4000)",
)
parser.add_argument(
"--bilingual",
action="store_true",
help="Keep original Chinese alongside English translation",
)
parser.add_argument(
"--extract-only",
dest="extract_only",
action="store_true",
help="Only extract and save detected Chinese lines, no translation",
)
parser.add_argument(
"--stdout",
action="store_true",
help="Print translated output to stdout instead of writing a file",
)
parser.add_argument(
"--export-history",
dest="export_history",
metavar="FILE",
help="Export translation history to a JSON file",
)
parser.add_argument(
"--version",
action="version",
version=f"{APP_NAME} {APP_VERSION}",
)
parser.add_argument(
"--verbose",
action="store_true",
help="Enable DEBUG-level logging",
)
return parser
# ════════════════════════════════════════════════════════════════════════════
# DEPENDENCY CHECK
# ════════════════════════════════════════════════════════════════════════════
def check_dependencies(args: argparse.Namespace) -> None:
issues: List[str] = []
want_offline = getattr(args, "offline", False) or getattr(args, "backend", "") == "offline"
if not DEEP_TRANSLATOR_AVAILABLE:
issues.append(
"deep-translator -> pip install deep-translator"
)
if want_offline and not OFFLINE_AVAILABLE:
issues.append(
"transformers / torch -> pip install transformers torch\n"
" (CPU) pip install torch --index-url https://download.pytorch.org/whl/cpu\n"
" (CUDA) pip install torch --index-url https://download.pytorch.org/whl/cu121"
)
if issues:
print("\n" + "=" * 55)
print(f"[{APP_NAME}] Missing dependencies:")
for i in issues:
print(f" {i}")
print("=" * 55 + "\n")
# ════════════════════════════════════════════════════════════════════════════
# MAIN
# ════════════════════════════════════════════════════════════════════════════
def main() -> None:
parser = _build_parser()
args = parser.parse_args()
setup_logging(verbose=getattr(args, "verbose", False))
check_dependencies(args)
cfg = Config()
cfg.apply_args(args)
history = HistoryManager(cfg)
translator = FileTranslator(cfg)
# ── Export history shortcut ───────────────────────────────────────────
if getattr(args, "export_history", None):
history.export(args.export_history)
if not args.input and not args.batch:
return
# ── Batch mode ────────────────────────────────────────────────────────
if getattr(args, "batch", None):
batch_dir = Path(args.batch)
out_dir = Path(args.batch_out) if getattr(args, "batch_out", None) else None
t0 = time.time()
out_paths = translator.translate_directory(batch_dir, output_dir=out_dir)
elapsed = time.time() - t0
print(
f"\nBatch complete: {len(out_paths)} file(s) translated "
f"in {elapsed:.1f}s"
)
for p in out_paths:
print(f" -> {p}")
history.add(
input_file=str(batch_dir),
output_file=str(out_dir or batch_dir),
backend=str(cfg.get("backend")),
script=str(cfg.get("lang")),
segments_count=len(out_paths),
elapsed_seconds=elapsed,
)
return
# ── Single file mode ──────────────────────────────────────────────────
if not args.input:
parser.print_help()
sys.exit(0)
input_path = Path(args.input)
output_path = Path(args.output) if getattr(args, "output", None) else None
t0 = time.time()
try:
out = translator.translate_file(
input_path = input_path,
output_path = output_path,
extract_only = getattr(args, "extract_only", False),
to_stdout = getattr(args, "stdout", False),
)
except (FileNotFoundError, ValueError, RuntimeError) as exc:
logger.error(str(exc))
sys.exit(1)
elapsed = time.time() - t0
if not getattr(args, "stdout", False):
print(f"\n{APP_NAME} v{APP_VERSION}")
print(f"Input : {input_path}")
print(f"Output : {out}")
print(f"Backend : {cfg.get('backend')}")
print(f"Script : {cfg.get('lang')}")
print(f"Elapsed : {elapsed:.2f}s")
print(f"Config : {CONFIG_FILE}")
print(f"Log : {LOG_FILE}")
history.add(
input_file = str(input_path),
output_file = str(out),
backend = str(cfg.get("backend")),
script = str(cfg.get("lang")),
segments_count = 0,
elapsed_seconds = elapsed,
)
if __name__ == "__main__":
main()