Upload 8 files
Browse files- .gitattributes +65 -35
- .gitignore +132 -0
- LICENSE +21 -0
- README.md +288 -0
- chinese_file_translator.py +1159 -0
- input.md +105 -0
- input_test_SUCCESS_DEFINITIVE.md +105 -0
- requirements.txt +33 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,65 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
*
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
*.
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
*.
|
| 14 |
-
*.
|
| 15 |
-
*.
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
*.
|
| 19 |
-
*.
|
| 20 |
-
*.
|
| 21 |
-
*.
|
| 22 |
-
*.
|
| 23 |
-
*.
|
| 24 |
-
*.
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
*.
|
| 28 |
-
*.
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
*.
|
| 32 |
-
*.
|
| 33 |
-
*.
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ════════════════════════════════════════════════════════════════════════
|
| 2 |
+
# ChineseFileTranslator — .gitattributes
|
| 3 |
+
# Author: algorembrant
|
| 4 |
+
# ════════════════════════════════════════════════════════════════════════
|
| 5 |
+
|
| 6 |
+
# ── Default: normalize all text files to LF on commit ────────────────────
|
| 7 |
+
* text=auto eol=lf
|
| 8 |
+
|
| 9 |
+
# ── Python source files ───────────────────────────────────────────────────
|
| 10 |
+
*.py text eol=lf diff=python
|
| 11 |
+
|
| 12 |
+
# ── Markdown and documentation ────────────────────────────────────────────
|
| 13 |
+
*.md text eol=lf
|
| 14 |
+
*.rst text eol=lf
|
| 15 |
+
*.txt text eol=lf
|
| 16 |
+
|
| 17 |
+
# ── Config and data files ─────────────────────────────────────────────────
|
| 18 |
+
*.json text eol=lf
|
| 19 |
+
*.yaml text eol=lf
|
| 20 |
+
*.yml text eol=lf
|
| 21 |
+
*.toml text eol=lf
|
| 22 |
+
*.cfg text eol=lf
|
| 23 |
+
*.ini text eol=lf
|
| 24 |
+
*.env text eol=lf
|
| 25 |
+
|
| 26 |
+
# ── Shell scripts ─────────────────────────────────────────────────────────
|
| 27 |
+
*.sh text eol=lf
|
| 28 |
+
*.bash text eol=lf
|
| 29 |
+
|
| 30 |
+
# ── Windows batch / PowerShell scripts (CRLF required) ───────────────────
|
| 31 |
+
*.bat text eol=crlf
|
| 32 |
+
*.cmd text eol=crlf
|
| 33 |
+
*.ps1 text eol=crlf
|
| 34 |
+
|
| 35 |
+
# ── Binary files — do not modify line endings ─────────────────────────────
|
| 36 |
+
*.png binary
|
| 37 |
+
*.jpg binary
|
| 38 |
+
*.jpeg binary
|
| 39 |
+
*.gif binary
|
| 40 |
+
*.bmp binary
|
| 41 |
+
*.ico binary
|
| 42 |
+
*.svg binary
|
| 43 |
+
*.pdf binary
|
| 44 |
+
*.zip binary
|
| 45 |
+
*.tar.gz binary
|
| 46 |
+
*.whl binary
|
| 47 |
+
|
| 48 |
+
# ── HuggingFace Large File Storage (LFS) — model weights ──────────────────
|
| 49 |
+
# Uncomment if storing model checkpoints in this repo
|
| 50 |
+
# *.bin filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
# *.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
# *.pt filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
# *.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
# *.h5 filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
|
| 56 |
+
# ── Linguist overrides (GitHub language detection) ────────────────────────
|
| 57 |
+
*.md linguist-documentation
|
| 58 |
+
*.txt linguist-documentation
|
| 59 |
+
requirements.txt linguist-documentation
|
| 60 |
+
|
| 61 |
+
# ── Export-ignore (files excluded from git archive / release tarballs) ────
|
| 62 |
+
.gitattributes export-ignore
|
| 63 |
+
.gitignore export-ignore
|
| 64 |
+
.github/ export-ignore
|
| 65 |
+
tests/ export-ignore
|
.gitignore
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ════════════════════════════════════════════════════════════════════════
|
| 2 |
+
# ChineseFileTranslator — .gitignore
|
| 3 |
+
# Author: algorembrant
|
| 4 |
+
# ════════════════════════════════════════════════════════════════════════
|
| 5 |
+
|
| 6 |
+
# ── Python ────────────────────────────────────────────────────────────────
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.py[cod]
|
| 9 |
+
*.pyo
|
| 10 |
+
*.pyd
|
| 11 |
+
*.pyc
|
| 12 |
+
*.so
|
| 13 |
+
*.egg
|
| 14 |
+
*.egg-info/
|
| 15 |
+
dist/
|
| 16 |
+
build/
|
| 17 |
+
wheels/
|
| 18 |
+
*.whl
|
| 19 |
+
*.spec
|
| 20 |
+
pip-log.txt
|
| 21 |
+
pip-delete-this-directory.txt
|
| 22 |
+
.Python
|
| 23 |
+
lib/
|
| 24 |
+
lib64/
|
| 25 |
+
parts/
|
| 26 |
+
sdist/
|
| 27 |
+
var/
|
| 28 |
+
*.manifest
|
| 29 |
+
*.egg-link
|
| 30 |
+
.installed.cfg
|
| 31 |
+
MANIFEST
|
| 32 |
+
|
| 33 |
+
# ── Virtual environments ──────────────────────────────────────────────────
|
| 34 |
+
venv/
|
| 35 |
+
env/
|
| 36 |
+
.venv/
|
| 37 |
+
.env/
|
| 38 |
+
ENV/
|
| 39 |
+
env.bak/
|
| 40 |
+
venv.bak/
|
| 41 |
+
.python-version
|
| 42 |
+
|
| 43 |
+
# ── Application runtime data ──────────────────────────────────────────────
|
| 44 |
+
# Translation history and logs are stored in ~/.chinese_file_translator/
|
| 45 |
+
# Do not commit user-generated runtime files from within the project dir
|
| 46 |
+
history.json
|
| 47 |
+
app.log
|
| 48 |
+
config.json
|
| 49 |
+
*_translated.txt
|
| 50 |
+
*_translated.md
|
| 51 |
+
|
| 52 |
+
# ── HuggingFace / Transformers model cache ────────────────────────────────
|
| 53 |
+
models/
|
| 54 |
+
*.bin
|
| 55 |
+
*.safetensors
|
| 56 |
+
*.pt
|
| 57 |
+
*.ckpt
|
| 58 |
+
*.h5
|
| 59 |
+
pytorch_model*
|
| 60 |
+
tf_model*
|
| 61 |
+
flax_model*
|
| 62 |
+
tokenizer.json
|
| 63 |
+
tokenizer_config.json
|
| 64 |
+
vocab.json
|
| 65 |
+
merges.txt
|
| 66 |
+
special_tokens_map.json
|
| 67 |
+
sentencepiece.bpe.model
|
| 68 |
+
source.spm
|
| 69 |
+
target.spm
|
| 70 |
+
|
| 71 |
+
# ── Jupyter notebook checkpoints ─────────────────────────────────────────
|
| 72 |
+
.ipynb_checkpoints/
|
| 73 |
+
*.ipynb
|
| 74 |
+
|
| 75 |
+
# ── IDE / Editor ──────────────────────────────────────────────────────────
|
| 76 |
+
.vscode/
|
| 77 |
+
.idea/
|
| 78 |
+
*.sublime-project
|
| 79 |
+
*.sublime-workspace
|
| 80 |
+
*.suo
|
| 81 |
+
*.user
|
| 82 |
+
*.sln.docstates
|
| 83 |
+
.vs/
|
| 84 |
+
*.swp
|
| 85 |
+
*.swo
|
| 86 |
+
*~
|
| 87 |
+
.DS_Store
|
| 88 |
+
Thumbs.db
|
| 89 |
+
desktop.ini
|
| 90 |
+
|
| 91 |
+
# ── Testing and coverage ──────────────────────────────────────────────────
|
| 92 |
+
.tox/
|
| 93 |
+
.nox/
|
| 94 |
+
.coverage
|
| 95 |
+
.coverage.*
|
| 96 |
+
coverage.xml
|
| 97 |
+
htmlcov/
|
| 98 |
+
*.coveragerc
|
| 99 |
+
.pytest_cache/
|
| 100 |
+
.mypy_cache/
|
| 101 |
+
.dmypy.json
|
| 102 |
+
dmypy.json
|
| 103 |
+
.pytype/
|
| 104 |
+
.pyre/
|
| 105 |
+
|
| 106 |
+
# ── Distribution / packaging ──────────────────────────────────────────────
|
| 107 |
+
.eggs/
|
| 108 |
+
*.egg-info/
|
| 109 |
+
dist/
|
| 110 |
+
build/
|
| 111 |
+
RECORD
|
| 112 |
+
|
| 113 |
+
# ── Documentation builds ─────────────────────────────────────────────────
|
| 114 |
+
docs/_build/
|
| 115 |
+
site/
|
| 116 |
+
_site/
|
| 117 |
+
|
| 118 |
+
# ── OS temp files ─────────────────────────────────────────────────────────
|
| 119 |
+
*.tmp
|
| 120 |
+
*.bak
|
| 121 |
+
*.swp
|
| 122 |
+
*.orig
|
| 123 |
+
*.rej
|
| 124 |
+
|
| 125 |
+
# ── Secrets and credentials ───────────────────────────────────────────────
|
| 126 |
+
.env
|
| 127 |
+
.env.*
|
| 128 |
+
secrets.json
|
| 129 |
+
*.pem
|
| 130 |
+
*.key
|
| 131 |
+
*.p12
|
| 132 |
+
*.pfx
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 algorembrant
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- zh
|
| 4 |
+
- en
|
| 5 |
+
tags:
|
| 6 |
+
- translation
|
| 7 |
+
- chinese
|
| 8 |
+
- nlp
|
| 9 |
+
- text-processing
|
| 10 |
+
- markdown
|
| 11 |
+
- offline
|
| 12 |
+
- deep-translator
|
| 13 |
+
- marianmt
|
| 14 |
+
license: mit
|
| 15 |
+
library_name: transformers
|
| 16 |
+
pipeline_tag: translation
|
| 17 |
+
model-index:
|
| 18 |
+
- name: Helsinki-NLP/opus-mt-zh-en
|
| 19 |
+
results: []
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
# ChineseFileTranslator
|
| 23 |
+
|
| 24 |
+
[](https://www.python.org/)
|
| 25 |
+
[](LICENSE)
|
| 26 |
+
[](CHANGELOG.md)
|
| 27 |
+
[](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en)
|
| 28 |
+
[](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en)
|
| 29 |
+
[](https://github.com/algorembrant/ChineseFileTranslator)
|
| 30 |
+
[](https://peps.python.org/pep-0008/)
|
| 31 |
+
|
| 32 |
+
Translate Chinese text (Simplified, Traditional, Cantonese, Classical) inside `.txt` and `.md` files
|
| 33 |
+
to English. Preserves full Markdown syntax. Supports Google Translate, Microsoft Translator, and a
|
| 34 |
+
fully offline Helsinki-NLP MarianMT backend with vectorized batching.
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
### Key Features
|
| 39 |
+
|
| 40 |
+
- **'Never Miss' Global Surgical Translation**: Unique strategy to capture ALL Chinese while protecting structure.
|
| 41 |
+
- **Inclusive CJK Detection**: Comprehensive CJK Unicode block coverage (Basic, Extensions A–E, Symbols, Punctuation).
|
| 42 |
+
- **Proactive Markdown Protection**: Frontmatter, code blocks, links, and HTML are safely tokenized.
|
| 43 |
+
- **Robust Placeholder Restoration**: Space-lenient, case-insensitive restoration handles engine mangling.
|
| 44 |
+
- **Unstoppable Backend Resilience**: Explicit failure detection with automatic retries and non-crashing fallbacks.
|
| 45 |
+
- **Offline First Option**: Fully local Helsinki-NLP MarianMT backend with vectorized batching.
|
| 46 |
+
- **Bilingual Mode**: Optional side-by-side Chinese and English output.
|
| 47 |
+
- **Batch Processing**: Translate entire directories with recursive discovery and persistent configuration.
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## Project Structure
|
| 52 |
+
|
| 53 |
+
```
|
| 54 |
+
ChineseFileTranslator/
|
| 55 |
+
├── chinese_file_translator.py # Main script (single-file, no extra modules)
|
| 56 |
+
├── requirements.txt # Python dependencies
|
| 57 |
+
├── README.md # This file
|
| 58 |
+
├── .gitattributes # Git line-ending and LFS rules
|
| 59 |
+
├── .gitignore # Ignored paths
|
| 60 |
+
└── LICENSE # MIT License
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
## Quickstart
|
| 66 |
+
|
| 67 |
+
### 1. Clone the repository
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
git clone https://github.com/algorembrant/ChineseFileTranslator.git
|
| 71 |
+
cd ChineseFileTranslator
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
### 2. Create and activate a virtual environment (recommended)
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
python -m venv venv
|
| 78 |
+
# Windows
|
| 79 |
+
venv\Scripts\activate
|
| 80 |
+
# Linux / macOS
|
| 81 |
+
source venv/bin/activate
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
### 3. Install core dependencies
|
| 85 |
+
|
| 86 |
+
```bash
|
| 87 |
+
pip install -r requirements.txt
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
### 4. (Optional) Install offline translation backend
|
| 91 |
+
|
| 92 |
+
Choose the correct PyTorch build for your system:
|
| 93 |
+
|
| 94 |
+
```bash
|
| 95 |
+
# CPU only
|
| 96 |
+
pip install torch --index-url https://download.pytorch.org/whl/cpu
|
| 97 |
+
|
| 98 |
+
# CUDA 12.1
|
| 99 |
+
pip install torch --index-url https://download.pytorch.org/whl/cu121
|
| 100 |
+
|
| 101 |
+
# Then install Transformers stack
|
| 102 |
+
pip install transformers sentencepiece sacremoses
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
The Helsinki-NLP/opus-mt-zh-en model (~300 MB) downloads automatically on first use.
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
## Usage
|
| 110 |
+
|
| 111 |
+
### Command Reference
|
| 112 |
+
|
| 113 |
+
| Command | Description |
|
| 114 |
+
|---|---|
|
| 115 |
+
| `python chinese_file_translator.py input.txt` | Translate a plain-text file (Google backend) |
|
| 116 |
+
| `python chinese_file_translator.py input.md` | Translate a Markdown file, preserve structure |
|
| 117 |
+
| `python chinese_file_translator.py input.txt -o out.txt` | Set explicit output path |
|
| 118 |
+
| `python chinese_file_translator.py input.txt --backend offline` | Use offline MarianMT model |
|
| 119 |
+
| `python chinese_file_translator.py input.txt --backend microsoft` | Use Microsoft Translator |
|
| 120 |
+
| `python chinese_file_translator.py input.txt --offline --gpu` | Offline + GPU (CUDA) |
|
| 121 |
+
| `python chinese_file_translator.py input.txt --lang simplified` | Force Simplified Chinese |
|
| 122 |
+
| `python chinese_file_translator.py input.txt --lang traditional` | Force Traditional Chinese |
|
| 123 |
+
| `python chinese_file_translator.py input.txt --bilingual` | Keep Chinese + show English |
|
| 124 |
+
| `python chinese_file_translator.py input.txt --extract-only` | Extract Chinese lines only |
|
| 125 |
+
| `python chinese_file_translator.py input.txt --stdout` | Print output to terminal |
|
| 126 |
+
| `python chinese_file_translator.py --batch ./docs/` | Batch translate a directory |
|
| 127 |
+
| `python chinese_file_translator.py --batch ./in/ --batch-out ./out/` | Batch with output dir |
|
| 128 |
+
| `python chinese_file_translator.py input.txt --chunk-size 2000` | Custom chunk size |
|
| 129 |
+
| `python chinese_file_translator.py input.txt --export-history h.json` | Export history |
|
| 130 |
+
| `python chinese_file_translator.py input.txt --verbose` | Debug logging |
|
| 131 |
+
| `python chinese_file_translator.py --version` | Print version |
|
| 132 |
+
| `python chinese_file_translator.py --help` | Full help |
|
| 133 |
+
|
| 134 |
+
### Arguments
|
| 135 |
+
|
| 136 |
+
| Argument | Type | Default | Description |
|
| 137 |
+
|---|---|---|---|
|
| 138 |
+
| `input` | positional | — | Path to `.txt` or `.md` file |
|
| 139 |
+
| `-o / --output` | string | `<name>_translated.<ext>` | Output file path |
|
| 140 |
+
| `--batch DIR` | string | — | Directory to batch translate |
|
| 141 |
+
| `--batch-out DIR` | string | same as `--batch` | Output directory for batch |
|
| 142 |
+
| `--backend` | choice | `google` | `google`, `microsoft`, `offline` |
|
| 143 |
+
| `--offline` | flag | `false` | Shorthand for `--backend offline` |
|
| 144 |
+
| `--lang` | choice | `auto` | `auto`, `simplified`, `traditional` |
|
| 145 |
+
| `--gpu` | flag | `false` | Use CUDA for offline model |
|
| 146 |
+
| `--confidence` | float | `0.05` | Min Chinese character ratio for detection |
|
| 147 |
+
| `--chunk-size` | int | `4000` | Max chars per translation request |
|
| 148 |
+
| `--bilingual` | flag | `false` | Output both Chinese and English |
|
| 149 |
+
| `--extract-only` | flag | `false` | Save only the detected Chinese lines |
|
| 150 |
+
| `--stdout` | flag | `false` | Print result to stdout |
|
| 151 |
+
| `--export-history` | string | — | Save session history to JSON |
|
| 152 |
+
| `--verbose` | flag | `false` | Enable DEBUG logging |
|
| 153 |
+
| `--version` | flag | — | Show version and exit |
|
| 154 |
+
|
| 155 |
+
---
|
| 156 |
+
|
| 157 |
+
## Configuration
|
| 158 |
+
|
| 159 |
+
The tool writes a JSON config file on first run:
|
| 160 |
+
|
| 161 |
+
```
|
| 162 |
+
~/.chinese_file_translator/config.json
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
Example `config.json`:
|
| 166 |
+
|
| 167 |
+
```json
|
| 168 |
+
{
|
| 169 |
+
"backend": "google",
|
| 170 |
+
"lang": "auto",
|
| 171 |
+
"use_gpu": false,
|
| 172 |
+
"chunk_size": 4000,
|
| 173 |
+
"batch_size": 10,
|
| 174 |
+
"bilingual": false,
|
| 175 |
+
"microsoft_api_key": "YOUR_KEY_HERE",
|
| 176 |
+
"microsoft_region": "eastus",
|
| 177 |
+
"offline_model_dir": "~/.chinese_file_translator/models",
|
| 178 |
+
"output_suffix": "_translated",
|
| 179 |
+
"retry_attempts": 3,
|
| 180 |
+
"retry_delay_seconds": 1.5,
|
| 181 |
+
"max_history": 1000
|
| 182 |
+
}
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
---
|
| 186 |
+
|
| 187 |
+
## Supported Chinese Variants
|
| 188 |
+
|
| 189 |
+
| Variant | Notes |
|
| 190 |
+
|---|---|
|
| 191 |
+
| Simplified Chinese | Mandarin, mainland China standard |
|
| 192 |
+
| Traditional Chinese | Taiwan, Hong Kong, Macau standard |
|
| 193 |
+
| Cantonese / Yue | Detected via CJK Unicode ranges |
|
| 194 |
+
| Classical Chinese | Treated as Traditional for translation |
|
| 195 |
+
| Mixed Chinese-English | Code-switching text handled transparently |
|
| 196 |
+
|
| 197 |
+
---
|
| 198 |
+
|
| 199 |
+
## Translation Backends
|
| 200 |
+
|
| 201 |
+
| Backend | Requires | Speed | Quality | Internet |
|
| 202 |
+
|---|---|---|---|---|
|
| 203 |
+
| Google Translate | `deep-translator` | Fast | High | Yes |
|
| 204 |
+
| Microsoft Translator | Azure API key + `deep-translator` | Fast | High | Yes |
|
| 205 |
+
| Helsinki-NLP MarianMT | `transformers`, `torch` | Medium | Good | No (after download) |
|
| 206 |
+
|
| 207 |
+
Google Translate is the default. If it fails, the tool falls back to the offline model automatically.
|
| 208 |
+
|
| 209 |
+
---
|
| 210 |
+
|
| 211 |
+
---
|
| 212 |
+
|
| 213 |
+
## Technical Strategy: 'Never Miss' Logic
|
| 214 |
+
|
| 215 |
+
The tool employs a sophisticated "Global Surgical" approach to ensure no Chinese fragment is overlooked, regardless of its depth in JSON, HTML, or complex Markdown.
|
| 216 |
+
|
| 217 |
+
### 1. Surgical Block Extraction
|
| 218 |
+
Instead of line-by-line translation, the script identifies every continuous block of CJK characters (including ideographic symbols and punctuation) across the entire document. This ensures that contextually related characters are translated together for better accuracy.
|
| 219 |
+
|
| 220 |
+
### 2. Structural Protection
|
| 221 |
+
Markdown and metadata structures are tokenized using unique, collision-resistant placeholders (`___MY_PROTECT_PH_{idx}___`).
|
| 222 |
+
- **YAML/TOML**: Frontmatter is protected globally.
|
| 223 |
+
- **Code Fences**: Backticks and language identifiers are protected; Chinese content *inside* comments or strings remains translatable.
|
| 224 |
+
- **Links & HTML**: URLs and tag names are guarded, while display text is surgically translated.
|
| 225 |
+
|
| 226 |
+
### 3. Verification & Restoration
|
| 227 |
+
- **Longest-First Replacement**: Translated segments are restored starting from the longest strings to prevent partial match overwrites.
|
| 228 |
+
- **Fuzzy Restoration**: The restoration logic is space-lenient and case-insensitive to handle cases where online translation engines mangle the placeholder tokens.
|
| 229 |
+
|
| 230 |
+
---
|
| 231 |
+
|
| 232 |
+
## Markdown Preservation
|
| 233 |
+
|
| 234 |
+
The following elements are meticulously protected:
|
| 235 |
+
|
| 236 |
+
| Element | Example | Protection Method |
|
| 237 |
+
|---|---|---|
|
| 238 |
+
| Front Matter | `---\ntitle: ...\n---` | Full Tokenization |
|
| 239 |
+
| Fenced Code | ` ```python ... ``` ` | Boundary Tokenization |
|
| 240 |
+
| Inline Code | `` `code` `` | Full Tokenization |
|
| 241 |
+
| Links / Images | `[text](url)` | URL Tokenization |
|
| 242 |
+
| HTML Tags | `<div class="...">` | Tag Tokenization |
|
| 243 |
+
| Symbols | `©`, `&#x...;` | Entity Tokenization |
|
| 244 |
+
|
| 245 |
+
---
|
| 246 |
+
|
| 247 |
+
## Microsoft Translator Setup
|
| 248 |
+
|
| 249 |
+
1. Go to [Azure Cognitive Services](https://portal.azure.com/)
|
| 250 |
+
2. Create a Translator resource (Free tier: 2M chars/month)
|
| 251 |
+
3. Copy your API key and region
|
| 252 |
+
4. Add them to `~/.chinese_file_translator/config.json`:
|
| 253 |
+
|
| 254 |
+
```json
|
| 255 |
+
{
|
| 256 |
+
"microsoft_api_key": "abc123...",
|
| 257 |
+
"microsoft_region": "eastus"
|
| 258 |
+
}
|
| 259 |
+
```
|
| 260 |
+
|
| 261 |
+
Then run:
|
| 262 |
+
|
| 263 |
+
```bash
|
| 264 |
+
python chinese_file_translator.py input.txt --backend microsoft
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
## Files Generated
|
| 270 |
+
|
| 271 |
+
| Path | Description |
|
| 272 |
+
|---|---|
|
| 273 |
+
| `~/.chinese_file_translator/config.json` | Persistent settings |
|
| 274 |
+
| `~/.chinese_file_translator/history.json` | Session history log |
|
| 275 |
+
| `~/.chinese_file_translator/app.log` | Application log file |
|
| 276 |
+
| `~/.chinese_file_translator/models/` | Offline model cache (if used) |
|
| 277 |
+
|
| 278 |
+
---
|
| 279 |
+
|
| 280 |
+
## Author
|
| 281 |
+
|
| 282 |
+
**algorembrant**
|
| 283 |
+
|
| 284 |
+
---
|
| 285 |
+
|
| 286 |
+
## License
|
| 287 |
+
|
| 288 |
+
MIT License. See [LICENSE](LICENSE) for details.
|
chinese_file_translator.py
ADDED
|
@@ -0,0 +1,1159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
ChineseFileTranslator v1.0.0
|
| 5 |
+
================================
|
| 6 |
+
Author : algorembrant
|
| 7 |
+
License : MIT
|
| 8 |
+
Version : 1.0.0
|
| 9 |
+
|
| 10 |
+
Translate Chinese text inside .txt or .md files to English.
|
| 11 |
+
Preserves Markdown structure (headings, bold, italics, code blocks, tables, links).
|
| 12 |
+
Supports batch/vectorized processing, multiple translation backends,
|
| 13 |
+
auto-detection of Chinese script, and history logging.
|
| 14 |
+
|
| 15 |
+
USAGE COMMANDS
|
| 16 |
+
--------------
|
| 17 |
+
Translate a single file (default: Google backend):
|
| 18 |
+
python chinese_file_translator.py input.txt
|
| 19 |
+
|
| 20 |
+
Translate and save to a specific output file:
|
| 21 |
+
python chinese_file_translator.py input.md -o translated.md
|
| 22 |
+
|
| 23 |
+
Translate using the offline Helsinki-NLP MarianMT model:
|
| 24 |
+
python chinese_file_translator.py input.txt --backend offline
|
| 25 |
+
|
| 26 |
+
Translate using Microsoft Translator (requires API key in config):
|
| 27 |
+
python chinese_file_translator.py input.txt --backend microsoft
|
| 28 |
+
|
| 29 |
+
Force Simplified Chinese OCR/detection:
|
| 30 |
+
python chinese_file_translator.py input.txt --lang simplified
|
| 31 |
+
|
| 32 |
+
Force Traditional Chinese:
|
| 33 |
+
python chinese_file_translator.py input.txt --lang traditional
|
| 34 |
+
|
| 35 |
+
Auto-detect Chinese script (default):
|
| 36 |
+
python chinese_file_translator.py input.txt --lang auto
|
| 37 |
+
|
| 38 |
+
Enable GPU (CUDA) for offline model:
|
| 39 |
+
python chinese_file_translator.py input.txt --backend offline --gpu
|
| 40 |
+
|
| 41 |
+
Set OCR confidence threshold (0.0 - 1.0, default 0.3):
|
| 42 |
+
python chinese_file_translator.py input.txt --confidence 0.4
|
| 43 |
+
|
| 44 |
+
Batch translate all .txt and .md files in a directory:
|
| 45 |
+
python chinese_file_translator.py --batch ./my_folder/
|
| 46 |
+
|
| 47 |
+
Batch translate with output directory:
|
| 48 |
+
python chinese_file_translator.py --batch ./input/ --batch-out ./output/
|
| 49 |
+
|
| 50 |
+
Set chunk size for large files (default 4000 chars):
|
| 51 |
+
python chinese_file_translator.py input.txt --chunk-size 2000
|
| 52 |
+
|
| 53 |
+
Append both Chinese source and English translation side-by-side:
|
| 54 |
+
python chinese_file_translator.py input.txt --bilingual
|
| 55 |
+
|
| 56 |
+
Only extract and print detected Chinese text (no translation):
|
| 57 |
+
python chinese_file_translator.py input.txt --extract-only
|
| 58 |
+
|
| 59 |
+
Print translated output to stdout instead of file:
|
| 60 |
+
python chinese_file_translator.py input.txt --stdout
|
| 61 |
+
|
| 62 |
+
Export translation history to JSON on exit:
|
| 63 |
+
python chinese_file_translator.py input.txt --export-history out.json
|
| 64 |
+
|
| 65 |
+
Enable verbose/debug logging:
|
| 66 |
+
python chinese_file_translator.py input.txt --verbose
|
| 67 |
+
|
| 68 |
+
Show version and exit:
|
| 69 |
+
python chinese_file_translator.py --version
|
| 70 |
+
|
| 71 |
+
Show full help:
|
| 72 |
+
python chinese_file_translator.py --help
|
| 73 |
+
|
| 74 |
+
SUPPORTED FILE TYPES
|
| 75 |
+
--------------------
|
| 76 |
+
- Plain text (.txt) : All Chinese detected and translated in-place
|
| 77 |
+
- Markdown (.md) : Chinese content translated; Markdown syntax preserved
|
| 78 |
+
Preserved: headings (#), bold (**), italic (*), inline code (`),
|
| 79 |
+
fenced code blocks (```), blockquotes (>), tables (|),
|
| 80 |
+
                       links ([text](url)), images (![alt](url)), horizontal rules
|
| 81 |
+
|
| 82 |
+
SUPPORTED CHINESE VARIANTS
|
| 83 |
+
---------------------------
|
| 84 |
+
- Simplified Chinese (Mandarin, simplified/simp)
|
| 85 |
+
- Traditional Chinese (Mandarin / Hong Kong / Taiwan)
|
| 86 |
+
- Cantonese / Yue (detected via Unicode CJK ranges)
|
| 87 |
+
- Classical Chinese (Literary Chinese, treated as Traditional)
|
| 88 |
+
- Mixed Chinese-English (Chinglish / code-switching)
|
| 89 |
+
|
| 90 |
+
TRANSLATION BACKENDS
|
| 91 |
+
--------------------
|
| 92 |
+
1. Google Translate (online, fast, default, no API key needed)
|
| 93 |
+
2. Microsoft Translate (online, fallback, requires Azure API key)
|
| 94 |
+
3. Helsinki-NLP MarianMT (offline, opus-mt-zh-en, ~300 MB download on first use)
|
| 95 |
+
|
| 96 |
+
CONFIGURATION
|
| 97 |
+
-------------
|
| 98 |
+
Config is stored at: ~/.chinese_file_translator/config.json
|
| 99 |
+
History is stored at: ~/.chinese_file_translator/history.json
|
| 100 |
+
Logs are stored at: ~/.chinese_file_translator/app.log
|
| 101 |
+
|
| 102 |
+
EXTERNAL SETUP REQUIRED
|
| 103 |
+
-----------------------
|
| 104 |
+
PyTorch (required only for offline backend):
|
| 105 |
+
CPU-only:
|
| 106 |
+
pip install torch --index-url https://download.pytorch.org/whl/cpu
|
| 107 |
+
CUDA 11.8:
|
| 108 |
+
pip install torch --index-url https://download.pytorch.org/whl/cu118
|
| 109 |
+
CUDA 12.1:
|
| 110 |
+
pip install torch --index-url https://download.pytorch.org/whl/cu121
|
| 111 |
+
|
| 112 |
+
Helsinki-NLP model is downloaded automatically on first offline run (~300 MB):
|
| 113 |
+
Model: Helsinki-NLP/opus-mt-zh-en
|
| 114 |
+
Cache: ~/.chinese_file_translator/models/
|
| 115 |
+
|
| 116 |
+
Microsoft Translator (optional):
|
| 117 |
+
Get a free API key from Azure Cognitive Services and add to config.json:
|
| 118 |
+
{ "microsoft_api_key": "YOUR_KEY_HERE", "microsoft_region": "eastus" }
|
| 119 |
+
"""
|
| 120 |
+
|
| 121 |
+
# ── Standard Library ──────────────────────────────────────────────────────────
|
| 122 |
+
import os
|
| 123 |
+
import re
|
| 124 |
+
import sys
|
| 125 |
+
import json
|
| 126 |
+
import time
|
| 127 |
+
import logging
|
| 128 |
+
import argparse
|
| 129 |
+
import textwrap
|
| 130 |
+
import threading
|
| 131 |
+
import unicodedata
|
| 132 |
+
from copy import deepcopy
|
| 133 |
+
from pathlib import Path
|
| 134 |
+
from datetime import datetime
|
| 135 |
+
from typing import (
|
| 136 |
+
Any, Dict, Generator, List, Optional, Sequence, Tuple
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
# ── Online Translation ────────────────────────────────────────────────────────
|
| 140 |
+
try:
|
| 141 |
+
from deep_translator import GoogleTranslator, MicrosoftTranslator
|
| 142 |
+
DEEP_TRANSLATOR_AVAILABLE = True
|
| 143 |
+
except ImportError:
|
| 144 |
+
DEEP_TRANSLATOR_AVAILABLE = False
|
| 145 |
+
|
| 146 |
+
# ── Offline Translation ───────────────────────────────────────────────────────
|
| 147 |
+
OFFLINE_AVAILABLE = False
|
| 148 |
+
try:
|
| 149 |
+
from transformers import MarianMTModel, MarianTokenizer
|
| 150 |
+
import torch
|
| 151 |
+
OFFLINE_AVAILABLE = True
|
| 152 |
+
except ImportError:
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
# ── Progress bar (optional) ───────────────────────────────────────────────────
|
| 156 |
+
try:
|
| 157 |
+
from tqdm import tqdm
|
| 158 |
+
TQDM_AVAILABLE = True
|
| 159 |
+
except ImportError:
|
| 160 |
+
TQDM_AVAILABLE = False
|
| 161 |
+
|
| 162 |
+
# ── Clipboard (optional) ─────────────────────────────────────────────────────
|
| 163 |
+
try:
|
| 164 |
+
import pyperclip
|
| 165 |
+
CLIPBOARD_AVAILABLE = True
|
| 166 |
+
except ImportError:
|
| 167 |
+
CLIPBOARD_AVAILABLE = False
|
| 168 |
+
|
| 169 |
+
# ── Constants ─────────────────────────────────────────────────────────────────
APP_NAME = "ChineseFileTranslator"
APP_VERSION = "1.0.0"
APP_AUTHOR = "algorembrant"
# Per-user application directory: holds config, history, log, and model cache.
_HOME = Path.home() / ".chinese_file_translator"
CONFIG_FILE = _HOME / "config.json"
HISTORY_FILE = _HOME / "history.json"
LOG_FILE = _HOME / "app.log"
# Hugging Face model IDs for the offline MarianMT backend
# (zh-en for Simplified, zht-en for Traditional).
OFFLINE_MODEL = "Helsinki-NLP/opus-mt-zh-en"
OFFLINE_MODEL_T = "Helsinki-NLP/opus-mt-zht-en"

# CJK Unicode blocks used for Chinese detection
_CJK_RANGES: Tuple[Tuple[int, int], ...] = (
    (0x4E00, 0x9FFF),    # CJK Unified Ideographs
    (0x3400, 0x4DBF),    # CJK Extension A
    (0x20000, 0x2A6DF),  # CJK Extension B
    (0x2A700, 0x2B73F),  # CJK Extension C
    (0x2B740, 0x2B81F),  # CJK Extension D
    (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
    (0x2F800, 0x2FA1F),  # CJK Compatibility Supplement
    (0x3000, 0x303F),    # CJK Symbols and Punctuation
    (0xFF00, 0xFFEF),    # Fullwidth / Halfwidth Forms
    (0xFE30, 0xFE4F),    # CJK Compatibility Forms
)

# Markdown patterns that must NOT be translated
_MD_CODE_FENCE = re.compile(r"```[\s\S]*?```")         # fenced code blocks
_MD_INLINE_CODE = re.compile(r"`[^`\n]*?`")            # inline `code` spans
_MD_LINK = re.compile(r"(!?\[[^\]]*?\])\(([^)]*?)\)")  # [text](url) / ![alt](url)
_MD_HTML_TAG = re.compile(r"<[a-zA-Z/][^>]*?>")        # raw inline HTML tags
_MD_FRONTMATTER = re.compile(r"^---[\s\S]*?^---", re.MULTILINE)  # YAML front matter
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 203 |
+
# LOGGING
|
| 204 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 205 |
+
def setup_logging(verbose: bool = False) -> logging.Logger:
    """Configure root logging to both the app log file and stdout.

    DEBUG level when `verbose` is True, INFO otherwise.  Returns the
    application logger.
    """
    _HOME.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        handlers=[
            logging.FileHandler(LOG_FILE, encoding="utf-8"),
            logging.StreamHandler(sys.stdout),
        ],
    )
    return logging.getLogger(APP_NAME)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
# Module-level logger; handlers and level are attached by setup_logging().
logger = logging.getLogger(APP_NAME)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 221 |
+
# CONFIG
|
| 222 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 223 |
+
class Config:
    """Persistent JSON configuration. CLI args override stored values."""

    # Factory defaults; any key missing from the on-disk file falls back here.
    DEFAULTS: Dict[str, Any] = {
        "backend": "google",
        "lang": "auto",
        "use_gpu": False,
        "confidence_threshold": 0.30,
        "chunk_size": 4000,
        "batch_size": 10,
        "bilingual": False,
        "preserve_whitespace": True,
        "microsoft_api_key": "",
        "microsoft_region": "eastus",
        "offline_model_dir": str(_HOME / "models"),
        "max_history": 1000,
        "output_suffix": "_translated",
        "retry_attempts": 3,
        "retry_delay_seconds": 1.5,
    }

    def __init__(self) -> None:
        """Start from DEFAULTS, then overlay whatever is stored on disk."""
        self._data: Dict[str, Any] = dict(self.DEFAULTS)
        _HOME.mkdir(parents=True, exist_ok=True)
        self._load()

    def _load(self) -> None:
        """Merge the on-disk JSON config (if present) into live settings."""
        if not CONFIG_FILE.exists():
            return
        try:
            stored = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))
            self._data.update(stored)
        except Exception as exc:
            logger.warning(f"Config load failed ({exc}). Using defaults.")

    def save(self) -> None:
        """Persist the current settings to CONFIG_FILE as pretty JSON."""
        try:
            CONFIG_FILE.write_text(
                json.dumps(self._data, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
        except Exception as exc:
            logger.error(f"Config save failed: {exc}")

    def get(self, key: str, default: Any = None) -> Any:
        """Look up `key`, falling back to DEFAULTS, then to `default`."""
        return self._data.get(key, self.DEFAULTS.get(key, default))

    def set(self, key: str, value: Any) -> None:
        """Update `key` and persist immediately."""
        self._data[key] = value
        self.save()

    def apply_args(self, args: argparse.Namespace) -> None:
        """Overlay CLI arguments onto the stored configuration.

        `--offline` is applied last so it overrides any `--backend` value.
        """
        backend = getattr(args, "backend", None)
        if backend:
            self._data["backend"] = backend
        lang = getattr(args, "lang", None)
        if lang:
            self._data["lang"] = lang
        if getattr(args, "gpu", False):
            self._data["use_gpu"] = True
        confidence = getattr(args, "confidence", None)
        if confidence is not None:
            self._data["confidence_threshold"] = confidence
        chunk = getattr(args, "chunk_size", None)
        if chunk is not None:
            self._data["chunk_size"] = chunk
        if getattr(args, "bilingual", False):
            self._data["bilingual"] = True
        if getattr(args, "offline", False):
            self._data["backend"] = "offline"
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 289 |
+
# CHINESE DETECTION UTILITIES
|
| 290 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 291 |
+
def _is_cjk(char: str) -> bool:
    """True when `char`'s code point lies inside any configured CJK block."""
    code_point = ord(char)
    for low, high in _CJK_RANGES:
        if low <= code_point <= high:
            return True
    return False
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def contains_chinese(text: str, min_ratio: float = 0.0) -> bool:
    """
    Report whether Chinese characters appear in `text`.

    With a positive `min_ratio`, at least that fraction of the
    non-whitespace characters must be CJK for the result to be True.
    """
    if not text or not text.strip():
        return False
    visible = [ch for ch in text if not ch.isspace()]
    if not visible:
        return False
    if min_ratio <= 0:
        return any(_is_cjk(ch) for ch in visible)
    cjk_total = sum(1 for ch in visible if _is_cjk(ch))
    return (cjk_total / len(visible)) >= min_ratio
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def chinese_ratio(text: str) -> float:
    """Return the fraction of non-whitespace characters that are CJK."""
    total = 0
    cjk = 0
    for ch in text:
        if ch.isspace():
            continue
        total += 1
        if _is_cjk(ch):
            cjk += 1
    return cjk / total if total else 0.0
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
def detect_script(text: str) -> str:
    """
    Guess the Chinese script variant of `text`.

    Counts marker characters typical of each script.  Returns
    'traditional' when Traditional markers dominate; otherwise
    'simplified' (ties and marker-free input fall back to 'simplified').
    """
    # Characters common in Traditional but rarely in Simplified
    trad_markers = set(
        "繁體國語臺灣學習問題開發電腦時間工作歷史語言文化"
        "經濟機會關係發展環境教育政府社會應該雖然雖然認為"
    )
    simp_markers = set(
        "简体国语台湾学习问题开发电脑时间工作历史语言文化"
        "经济机会关系发展环境教育政府社会应该虽然认为"
    )
    trad_hits = sum(ch in trad_markers for ch in text)
    simp_hits = sum(ch in simp_markers for ch in text)
    # Ties fall through to the simplified default.
    return "traditional" if trad_hits > simp_hits else "simplified"
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 345 |
+
# TRANSLATION ENGINE
|
| 346 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 347 |
+
class TranslationEngine:
    """
    Multi-backend Chinese-to-English translation.

    Vectorized batch mode is used for the offline (MarianMT) backend.
    Online backends (Google, Microsoft) chunk by character limit with
    sentence-boundary awareness and automatic retry on transient errors.
    All public entry points degrade gracefully: on total failure the
    original text is returned instead of raising.
    """

    _GOOGLE_LIMIT = 4500   # chars per Google request
    _MS_LIMIT = 10000      # chars per Microsoft request
    _OFFLINE_LIMIT = 512   # tokens; used as a character-count proxy

    def __init__(self, config: Config) -> None:
        self.cfg = config
        self._offline_model: Any = None  # MarianMTModel, loaded lazily
        self._offline_tok: Any = None    # MarianTokenizer, loaded lazily
        self._lock = threading.Lock()    # guards one-time model loading

    @staticmethod
    def _preview(text: str, limit: int = 80) -> str:
        """Shorten `text` for log output; inputs may be whole documents."""
        return text if len(text) <= limit else text[:limit] + "..."

    # ── Public API ────────────────────────────────────────────────────────

    def translate(
        self, text: str, source_lang: str = "auto"
    ) -> Tuple[str, str]:
        """
        Translate `text` to English.

        Tries the configured backend first, then Google, then the offline
        model.  Never raises: if every backend fails, the original text is
        returned with backend name "failed".

        Returns (translated_text, backend_name).
        """
        if not text or not text.strip():
            return text, "passthrough"

        backend = self.cfg.get("backend", "google")
        attempt_order: List[str] = _dedupe_list([backend, "google", "offline"])

        last_exc: Optional[Exception] = None
        for b in attempt_order:
            try:
                result = self._call_backend(b, text, source_lang)
                return result, b
            except Exception as exc:
                # Log only a short preview — inputs can be whole documents.
                logger.warning(
                    f"Backend '{b}' failed for [{self._preview(text)}]: {exc}"
                )
                last_exc = exc

        # NEVER CRASH: return original if all failed
        logger.error(
            f"All translation backends failed for [{self._preview(text)}] "
            f"(last error: {last_exc}). Returning original."
        )
        return text, "failed"

    def translate_batch(
        self,
        texts: List[str],
        source_lang: str = "auto",
    ) -> List[Tuple[str, str]]:
        """
        Translate a list of strings.

        Uses vectorized batching for the offline backend; serial calls for
        online backends (rate-limit friendly).

        Returns one (translated_text, backend_name) pair per input.
        """
        backend = self.cfg.get("backend", "google")
        if backend == "offline" and OFFLINE_AVAILABLE:
            return self._translate_batch_offline(texts)
        # Serial with progress
        results: List[Tuple[str, str]] = []
        iterable = (
            tqdm(texts, desc="Translating", unit="chunk")
            if TQDM_AVAILABLE else texts
        )
        for text in iterable:
            results.append(self.translate(text, source_lang))
            # Small delay for online backends to avoid rate limits
            if backend in ("google", "microsoft"):
                time.sleep(0.3)
        return results

    # ── Backend dispatch ──────────────────────────────────────────────────

    def _call_backend(
        self, backend: str, text: str, source_lang: str
    ) -> str:
        """
        Invoke one backend with linear-backoff retry.

        Raises RuntimeError once all `retry_attempts` are exhausted
        (wrapping the last underlying error).
        """
        retries = int(self.cfg.get("retry_attempts", 3))
        delay = float(self.cfg.get("retry_delay_seconds", 1.5))
        last_exc: Optional[Exception] = None
        for attempt in range(retries):
            try:
                if backend == "google":
                    return self._google(text, source_lang)
                elif backend == "microsoft":
                    return self._microsoft(text, source_lang)
                elif backend == "offline":
                    translated, _ = self._offline_single(text)
                    return translated
                else:
                    raise ValueError(f"Unknown backend: {backend}")
            except Exception as exc:
                last_exc = exc
                if attempt < retries - 1:
                    # Linear backoff: delay, 2*delay, ...
                    time.sleep(delay * (attempt + 1))
        raise RuntimeError(
            f"Backend '{backend}' failed after {retries} attempts: {last_exc}"
        )

    # ── Google ────────────────────────────────────────────────────────────

    def _google(self, text: str, source_lang: str) -> str:
        """Translate via Google (deep-translator); raises on any failure."""
        if not DEEP_TRANSLATOR_AVAILABLE:
            raise RuntimeError("deep-translator not installed.")

        lang_map = {"simplified": "zh-CN", "traditional": "zh-TW", "auto": "auto"}
        src = lang_map.get(source_lang, "auto")
        chunks = list(_split_text(text, self._GOOGLE_LIMIT))
        parts: List[str] = []

        for chunk in chunks:
            try:
                translated = GoogleTranslator(source=src, target="en").translate(chunk)
                # If it's None or returned original Chinese, it failed
                if not translated or (translated.strip() == chunk.strip() and contains_chinese(chunk)):
                    raise RuntimeError("Google returned original or None")
                parts.append(translated)
            except Exception as e:
                raise RuntimeError(f"Google translate error: {e}")

        return " ".join(parts)

    # ── Microsoft ─────────────────────────────────────────────────────────

    def _microsoft(self, text: str, source_lang: str) -> str:
        """Translate via Microsoft/Azure; requires an API key in config."""
        if not DEEP_TRANSLATOR_AVAILABLE:
            raise RuntimeError(
                "deep-translator not installed. Run: pip install deep-translator"
            )
        api_key = str(self.cfg.get("microsoft_api_key", ""))
        region = str(self.cfg.get("microsoft_region", "eastus"))
        if not api_key:
            raise ValueError(
                "Microsoft API key not configured. "
                "Add 'microsoft_api_key' to ~/.chinese_file_translator/config.json"
            )
        lang_map = {"simplified": "zh-Hans", "traditional": "zh-Hant", "auto": "auto"}
        src = lang_map.get(source_lang, "auto")
        chunks = list(_split_text(text, self._MS_LIMIT))
        parts = []
        for chunk in chunks:
            tr = MicrosoftTranslator(
                api_key=api_key, region=region, source=src, target="en"
            ).translate(chunk)
            # Fall back to the untranslated chunk on an empty response.
            parts.append(tr or chunk)
        return " ".join(parts)

    # ── Offline (MarianMT) ────────────────────────────────────────────────

    def _load_offline(self) -> None:
        """Download/load the MarianMT model and tokenizer (once)."""
        if not OFFLINE_AVAILABLE:
            raise RuntimeError("Offline model dependencies not installed.")
        model_dir = str(self.cfg.get("offline_model_dir", str(_HOME / "models")))
        Path(model_dir).mkdir(parents=True, exist_ok=True)
        self._offline_tok = MarianTokenizer.from_pretrained(
            OFFLINE_MODEL, cache_dir=model_dir
        )
        model = MarianMTModel.from_pretrained(
            OFFLINE_MODEL, cache_dir=model_dir
        )
        use_gpu = bool(self.cfg.get("use_gpu", False))
        device = "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu"
        self._offline_model = model.to(device)
        logger.info(f"Offline model loaded on '{device}'.")

    def _offline_single(self, text: str) -> Tuple[str, str]:
        """Translate one string offline; returns (text, "offline")."""
        with self._lock:
            if self._offline_model is None:
                self._load_offline()
        chunks = list(_split_text(text, self._OFFLINE_LIMIT))
        results = self._vectorized_translate(chunks)
        return " ".join(results), "offline"

    def _translate_batch_offline(
        self, texts: List[str]
    ) -> List[Tuple[str, str]]:
        """Vectorized: flatten all chunks, translate in one pass, reassemble."""
        with self._lock:
            if self._offline_model is None:
                self._load_offline()

        all_chunks: List[str] = []
        chunk_map: List[Tuple[int, int]] = []  # (text_idx, n_chunks)

        for t_idx, text in enumerate(texts):
            if not text or not text.strip():
                # Blank inputs pass straight through (zero chunks).
                chunk_map.append((t_idx, 0))
                continue
            chunks = list(_split_text(text, self._OFFLINE_LIMIT))
            all_chunks.extend(chunks)
            chunk_map.append((t_idx, len(chunks)))

        if not all_chunks:
            return [(t, "passthrough") for t in texts]

        # One vectorized forward pass
        translated_chunks = self._vectorized_translate(all_chunks)

        # Reassemble: each input consumes its own span of the flat list.
        results: List[Tuple[str, str]] = []
        flat_idx = 0
        for t_idx, n in chunk_map:
            if n == 0:
                results.append((texts[t_idx], "passthrough"))
            else:
                assembled = " ".join(translated_chunks[flat_idx : flat_idx + n])
                results.append((assembled, "offline"))
                flat_idx += n
        return results

    def _vectorized_translate(self, chunks: List[str]) -> List[str]:
        """Run MarianMT on a list of strings in batched forward passes."""
        if not chunks:
            return []

        tok = self._offline_tok
        model = self._offline_model
        if tok is None or model is None:
            raise RuntimeError("Offline model not loaded.")

        device = next(model.parameters()).device
        batch_size = int(self.cfg.get("batch_size", 10))
        results: List[str] = []

        # Split into mini-batches to avoid OOM on large inputs
        for i in range(0, len(chunks), batch_size):
            mini = chunks[i : i + batch_size]
            enc = tok(
                mini,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)
            with torch.no_grad():
                out = model.generate(**enc)
            decoded = tok.batch_decode(out, skip_special_tokens=True)
            results.extend(decoded)

        return results
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 594 |
+
# TEXT SPLITTING UTILITIES
|
| 595 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 596 |
+
def _split_text(text: str, max_len: int) -> Generator[str, None, None]:
|
| 597 |
+
"""Split text at sentence boundaries for chunking."""
|
| 598 |
+
if len(text) <= max_len:
|
| 599 |
+
yield text
|
| 600 |
+
return
|
| 601 |
+
|
| 602 |
+
sentence_ends = re.compile(r"[。!?\n!?\.]")
|
| 603 |
+
current: List[str] = []
|
| 604 |
+
current_len = 0
|
| 605 |
+
|
| 606 |
+
for segment in sentence_ends.split(text):
|
| 607 |
+
seg = segment.strip()
|
| 608 |
+
if not seg:
|
| 609 |
+
continue
|
| 610 |
+
if current_len + len(seg) + 1 > max_len and current:
|
| 611 |
+
yield " ".join(current)
|
| 612 |
+
current = [seg]
|
| 613 |
+
current_len = len(seg)
|
| 614 |
+
else:
|
| 615 |
+
current.append(seg)
|
| 616 |
+
current_len += len(seg) + 1
|
| 617 |
+
|
| 618 |
+
if current:
|
| 619 |
+
yield " ".join(current)
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
def _dedupe_list(lst: List[str]) -> List[str]:
|
| 623 |
+
seen: set = set()
|
| 624 |
+
out: List[str] = []
|
| 625 |
+
for item in lst:
|
| 626 |
+
if item not in seen:
|
| 627 |
+
seen.add(item)
|
| 628 |
+
out.append(item)
|
| 629 |
+
return out
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 633 |
+
# MARKDOWN PARSER / SEGMENT EXTRACTOR
|
| 634 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 635 |
+
class MarkdownProcessor:
|
| 636 |
+
"""Ultra-robust Markdown protection."""
|
| 637 |
+
_TOKEN = "___MY_PROTECT_PH_{idx}___"
|
| 638 |
+
|
| 639 |
+
def __init__(self) -> None:
|
| 640 |
+
self._protected: Dict[int, str] = {}
|
| 641 |
+
self._ph_counter = 0
|
| 642 |
+
|
| 643 |
+
def _next_placeholder(self, original: str) -> str:
|
| 644 |
+
idx = self._ph_counter
|
| 645 |
+
token = self._TOKEN.format(idx=idx)
|
| 646 |
+
self._protected[idx] = original
|
| 647 |
+
self._ph_counter += 1
|
| 648 |
+
return token
|
| 649 |
+
|
| 650 |
+
def protect(self, text: str) -> str:
|
| 651 |
+
"""Replace code/links/tags with unique tokens."""
|
| 652 |
+
self._protected.clear()
|
| 653 |
+
self._ph_counter = 0
|
| 654 |
+
|
| 655 |
+
# Protect YAML
|
| 656 |
+
text = _MD_FRONTMATTER.sub(lambda m: self._next_placeholder(m.group(0)), text)
|
| 657 |
+
|
| 658 |
+
# Protect Code Fences but leave content if it has Chinese
|
| 659 |
+
def _fence_sub(m: re.Match) -> str:
|
| 660 |
+
full = m.group(0)
|
| 661 |
+
if contains_chinese(full):
|
| 662 |
+
# Only protect the ``` lines
|
| 663 |
+
lines = full.splitlines()
|
| 664 |
+
if len(lines) >= 2:
|
| 665 |
+
p1 = self._next_placeholder(lines[0])
|
| 666 |
+
p2 = self._next_placeholder(lines[-1])
|
| 667 |
+
content = "\n".join(lines[1:-1])
|
| 668 |
+
return f"{p1}\n{content}\n{p2}"
|
| 669 |
+
return self._next_placeholder(full)
|
| 670 |
+
text = _MD_CODE_FENCE.sub(_fence_sub, text)
|
| 671 |
+
|
| 672 |
+
# Protect HTML and Inline Code and Links
|
| 673 |
+
text = _MD_HTML_TAG.sub(lambda m: self._next_placeholder(m.group(0)), text)
|
| 674 |
+
text = _MD_LINK.sub(lambda m: f"{m.group(1)}({self._next_placeholder(m.group(2))})", text)
|
| 675 |
+
text = _MD_INLINE_CODE.sub(lambda m: self._next_placeholder(m.group(0)), text)
|
| 676 |
+
|
| 677 |
+
return text
|
| 678 |
+
|
| 679 |
+
def restore(self, text: str) -> str:
|
| 680 |
+
"""Sequential replacement of all tokens."""
|
| 681 |
+
# We replace them in reverse to avoid partial matches if idx 10 and 1 exist
|
| 682 |
+
for idx in sorted(self._protected.keys(), reverse=True):
|
| 683 |
+
token = self._TOKEN.format(idx=idx)
|
| 684 |
+
original = self._protected[idx]
|
| 685 |
+
# Use regex to handle potential space mangling by Google
|
| 686 |
+
pattern = re.compile(re.escape(token).replace(r"\_", r"\s*\_*"), re.IGNORECASE)
|
| 687 |
+
text = pattern.sub(original.replace("\\", "\\\\"), text)
|
| 688 |
+
return text
|
| 689 |
+
|
| 690 |
+
|
| 691 |
+
class FileTranslator:
|
| 692 |
+
"""Orchestrates translation with 'Never Miss' strategy."""
|
| 693 |
+
def __init__(self, config: Config) -> None:
|
| 694 |
+
self.cfg = config
|
| 695 |
+
self.engine = TranslationEngine(config)
|
| 696 |
+
self._md_proc = MarkdownProcessor()
|
| 697 |
+
|
| 698 |
+
def translate_file(
|
| 699 |
+
self,
|
| 700 |
+
input_path: Path,
|
| 701 |
+
output_path: Optional[Path] = None,
|
| 702 |
+
extract_only: bool = False,
|
| 703 |
+
to_stdout: bool = False,
|
| 704 |
+
) -> Path:
|
| 705 |
+
input_path = Path(input_path).resolve()
|
| 706 |
+
if not input_path.exists(): raise FileNotFoundError(f"Missing: {input_path}")
|
| 707 |
+
|
| 708 |
+
suffix = input_path.suffix.lower()
|
| 709 |
+
if suffix not in (".txt", ".md"): raise ValueError("Unsupported type")
|
| 710 |
+
|
| 711 |
+
raw = input_path.read_text(encoding="utf-8", errors="replace")
|
| 712 |
+
if extract_only:
|
| 713 |
+
extracted = "\n".join([l for l in raw.splitlines() if contains_chinese(l)])
|
| 714 |
+
if to_stdout: print(extracted); return input_path
|
| 715 |
+
out = output_path or _default_output(input_path, self.cfg)
|
| 716 |
+
out.write_text(extracted, encoding="utf-8")
|
| 717 |
+
return out
|
| 718 |
+
|
| 719 |
+
res = self._translate_md(raw) if suffix == ".md" else self._translate_txt(raw)
|
| 720 |
+
if to_stdout: print(res); return input_path
|
| 721 |
+
out = output_path or _default_output(input_path, self.cfg)
|
| 722 |
+
out.write_text(res, encoding="utf-8")
|
| 723 |
+
return out
|
| 724 |
+
|
| 725 |
+
def _translate_txt(self, text: str) -> str:
|
| 726 |
+
lines = text.splitlines(keepends=True)
|
| 727 |
+
bilingual = bool(self.cfg.get("bilingual", False))
|
| 728 |
+
|
| 729 |
+
out_lines = []
|
| 730 |
+
for line in lines:
|
| 731 |
+
stripped = line.rstrip("\n\r")
|
| 732 |
+
if contains_chinese(stripped):
|
| 733 |
+
tr = self._translate_granular(stripped)
|
| 734 |
+
eol = "\n" if line.endswith("\n") else ""
|
| 735 |
+
out_lines.append(f"{stripped}\n{tr}{eol}" if bilingual else f"{tr}{eol}")
|
| 736 |
+
else:
|
| 737 |
+
out_lines.append(line)
|
| 738 |
+
return "".join(out_lines)
|
| 739 |
+
|
| 740 |
+
def _translate_md(self, text: str) -> str:
    """Global Surgical Batch Translation with fixed CJK regex.

    Strategy:
      1. Protect Markdown structure via the markdown processor so
         translation cannot corrupt fences, links, etc.
      2. Collect every unique CJK run with one inclusive regex.
      3. Batch-translate all unique runs in a single engine call.
      4. Substitute translations back longest-first, then restore the
         protected structure.
    """
    # 1. Protect structure
    protected = self._md_proc.protect(text)

    # 2. Extract all CJK blocks (Inclusive range for stability)
    CJK_BLOCK_RE = re.compile(
        r"["
        r"\u4e00-\u9fff"          # Basic
        r"\u3400-\u4dbf"          # Ext A
        r"\U00020000-\U0002ceaf"  # Ext B-E
        r"\uf900-\ufaff"          # Compatibility
        r"\u3000-\u303f"          # Symbols/Punctuation
        r"\uff00-\uffef"          # Fullwidth
        r"\u00b7"                 # Middle dot
        r"\u2014-\u2027"          # Punctuation ranges
        r"]+"
    )

    # Filter out blocks that are ONLY numbers or symbols: the character
    # class above also matches punctuation-only runs, so require at
    # least one real CJK ideograph per block.
    def _has_real_cjk(s):
        return any('\u4e00' <= c <= '\u9fff' or '\u3400' <= c <= '\u4dbf' or ord(c) > 0xffff for c in s)

    all_candidate_blocks = CJK_BLOCK_RE.findall(protected)
    all_blocks = _dedupe_list([b for b in all_candidate_blocks if _has_real_cjk(b)])

    if not all_blocks:
        return self._md_proc.restore(protected)

    # 3. Batch translate unique blocks
    logger.info(f"Found {len(all_blocks)} unique Chinese blocks. Batch translating...")
    translated = self.engine.translate_batch(all_blocks, source_lang="simplified")

    # 4. Global replacement: build orig -> translation mapping. When the
    # batch result is empty or unchanged, retry the block individually,
    # falling back to the original text on failure.
    mapping = {}
    for orig, (tr, _) in zip(all_blocks, translated):
        if tr.strip() and tr.strip() != orig.strip():
            mapping[orig] = tr
        else:
            try:
                t, _ = self.engine.translate(orig, source_lang="simplified")
                mapping[orig] = t
            except Exception:
                # Was a bare `except:`; narrowed so SystemExit /
                # KeyboardInterrupt are no longer swallowed.
                mapping[orig] = orig

    # Replace longest originals first so a short substring cannot
    # clobber a longer block that contains it.
    sorted_orig = sorted(mapping.keys(), key=len, reverse=True)
    final_text = protected
    for orig in sorted_orig:
        final_text = final_text.replace(orig, mapping[orig])

    # 5. Restore
    return self._md_proc.restore(final_text)
|
| 791 |
+
|
| 792 |
+
def _translate_granular(self, text: str) -> str:
|
| 793 |
+
"""Fallback for TXT or other sparse areas."""
|
| 794 |
+
CJK_BLOCK_RE = re.compile(
|
| 795 |
+
r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002ceaf\u3000-\u303f\uff00-\uffef]+"
|
| 796 |
+
)
|
| 797 |
+
def _sub(m: re.Match) -> str:
|
| 798 |
+
chunk = m.group(0)
|
| 799 |
+
if not any('\u4e00' <= c <= '\u9fff' for c in chunk): return chunk
|
| 800 |
+
try:
|
| 801 |
+
t, _ = self.engine.translate(chunk, source_lang="simplified")
|
| 802 |
+
return t
|
| 803 |
+
except:
|
| 804 |
+
return chunk
|
| 805 |
+
return CJK_BLOCK_RE.sub(_sub, text)
|
| 806 |
+
|
| 807 |
+
@staticmethod
def _extract_chinese_lines(text: str) -> List[str]:
    """Return only the lines of `text` that contain Chinese characters."""
    return list(filter(contains_chinese, text.splitlines()))
|
| 814 |
+
|
| 815 |
+
def _detect_script_bulk(self, texts: List[str]) -> str:
|
| 816 |
+
"""Detect dominant script from a list of strings."""
|
| 817 |
+
lang_mode = str(self.cfg.get("lang", "auto"))
|
| 818 |
+
if lang_mode in ("simplified", "traditional"):
|
| 819 |
+
return lang_mode
|
| 820 |
+
combined = " ".join(texts[:50]) # sample first 50 segments
|
| 821 |
+
return detect_script(combined)
|
| 822 |
+
|
| 823 |
+
# ── Batch directory translation ───────────────────────────────────────
|
| 824 |
+
|
| 825 |
+
def translate_directory(
    self,
    input_dir: Path,
    output_dir: Optional[Path] = None,
) -> List[Path]:
    """Translate all .txt and .md files in `input_dir`.

    Files that fail are logged and skipped; the output paths of all
    successful translations are returned in sorted input order.

    Raises:
        NotADirectoryError: if `input_dir` is not a directory.
    """
    input_dir = Path(input_dir).resolve()
    if not input_dir.is_dir():
        raise NotADirectoryError(f"Not a directory: {input_dir}")

    candidates = sorted(
        list(input_dir.glob("*.txt")) + list(input_dir.glob("*.md"))
    )
    if not candidates:
        logger.warning(f"No .txt or .md files found in {input_dir}")
        return []

    logger.info(f"Batch translating {len(candidates)} file(s) from {input_dir}")
    results: List[Path] = []

    # Show a progress bar only when tqdm is installed.
    progress = (
        tqdm(candidates, desc="Files", unit="file")
        if TQDM_AVAILABLE else candidates
    )
    for src in progress:
        try:
            if output_dir:
                dest = Path(output_dir) / src.name
                Path(output_dir).mkdir(parents=True, exist_ok=True)
            else:
                dest = _default_output(src, self.cfg)
            produced = self.translate_file(src, output_path=dest)
            results.append(produced)
            logger.info(f" Done: {src.name} -> {produced.name}")
        except Exception as exc:
            # One bad file must not abort the whole batch.
            logger.error(f" Failed: {src.name}: {exc}")

    return results
|
| 863 |
+
|
| 864 |
+
|
| 865 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 866 |
+
# HISTORY MANAGER
|
| 867 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 868 |
+
class HistoryManager:
    """Persist a rolling log of translation sessions as JSON on disk."""

    def __init__(self, config: Config) -> None:
        self.cfg = config
        self._items: List[Dict[str, Any]] = []
        _HOME.mkdir(parents=True, exist_ok=True)
        self._load()

    def _load(self) -> None:
        """Load prior history; a missing or corrupt file resets the log."""
        if not HISTORY_FILE.exists():
            return
        try:
            with open(HISTORY_FILE, "r", encoding="utf-8") as fh:
                self._items = json.load(fh)
        except Exception:
            self._items = []

    def save(self) -> None:
        """Write the in-memory log to disk; failures are logged, not raised."""
        try:
            with open(HISTORY_FILE, "w", encoding="utf-8") as fh:
                json.dump(self._items, fh, ensure_ascii=False, indent=2)
        except Exception as exc:
            logger.error(f"History save error: {exc}")

    def add(
        self,
        input_file: str,
        output_file: str,
        backend: str,
        script: str,
        segments_count: int,
        elapsed_seconds: float,
    ) -> None:
        """Prepend one session record, trim to `max_history`, and persist."""
        record: Dict[str, Any] = {
            "timestamp": datetime.now().isoformat(),
            "input_file": input_file,
            "output_file": output_file,
            "backend": backend,
            "script": script,
            "segments_count": segments_count,
            "elapsed_seconds": round(elapsed_seconds, 2),
        }
        self._items.insert(0, record)
        limit = int(self.cfg.get("max_history", 1000))
        # Drop the oldest entries (list tail) until within the cap.
        while len(self._items) > limit:
            self._items.pop()
        self.save()

    def export(self, path: str) -> None:
        """Dump the full history to `path` as pretty-printed JSON."""
        with open(path, "w", encoding="utf-8") as fh:
            json.dump(self._items, fh, ensure_ascii=False, indent=2)
        logger.info(f"History exported to {path}")

    def get_all(self) -> List[Dict[str, Any]]:
        """Return a shallow copy of all records, newest first."""
        return list(self._items)
|
| 923 |
+
|
| 924 |
+
|
| 925 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 926 |
+
# PATH HELPERS
|
| 927 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 928 |
+
def _default_output(input_path: Path, config: Config) -> Path:
    """Derive default output path: input_translated.ext"""
    tag = str(config.get("output_suffix", "_translated"))
    return input_path.with_name(input_path.stem + tag + input_path.suffix)
|
| 932 |
+
|
| 933 |
+
|
| 934 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 935 |
+
# CLI ARG PARSER
|
| 936 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 937 |
+
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the translator CLI.

    Returns:
        argparse.ArgumentParser: parser exposing single-file, batch,
        backend/script selection, detection tuning, and history-export
        options.
    """
    parser = argparse.ArgumentParser(
        prog="chinese_file_translator",
        description=(
            f"{APP_NAME} v{APP_VERSION} by {APP_AUTHOR}\n"
            "Translate Chinese text inside .txt or .md files to English."
        ),
        # Raw formatter preserves the newlines in the epilog examples.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent("""
            Examples:
              python chinese_file_translator.py input.txt
              python chinese_file_translator.py input.md -o translated.md
              python chinese_file_translator.py input.txt --backend offline --gpu
              python chinese_file_translator.py input.txt --bilingual
              python chinese_file_translator.py input.txt --extract-only
              python chinese_file_translator.py --batch ./docs/ --batch-out ./out/
              python chinese_file_translator.py input.txt --stdout
        """),
    )
    # Positional input is optional so --batch / --export-history can run alone.
    parser.add_argument(
        "input",
        nargs="?",
        help="Input .txt or .md file path",
    )
    parser.add_argument(
        "-o", "--output",
        dest="output",
        metavar="FILE",
        help="Output file path (default: <input>_translated.<ext>)",
    )
    # Batch mode: translate every .txt/.md file in a directory.
    parser.add_argument(
        "--batch",
        metavar="DIR",
        help="Translate all .txt and .md files in a directory",
    )
    parser.add_argument(
        "--batch-out",
        dest="batch_out",
        metavar="DIR",
        help="Output directory for batch translation",
    )
    # Backend selection; --offline is a convenience alias handled downstream.
    parser.add_argument(
        "--backend",
        choices=["google", "microsoft", "offline"],
        help="Translation backend (default: google)",
    )
    parser.add_argument(
        "--offline",
        action="store_true",
        help="Shorthand for --backend offline",
    )
    parser.add_argument(
        "--lang",
        choices=["auto", "simplified", "traditional"],
        default="auto",
        help="Chinese script mode (default: auto)",
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Use GPU (CUDA) for offline translation",
    )
    # Detection / chunking tuning knobs.
    parser.add_argument(
        "--confidence",
        type=float,
        metavar="0.0-1.0",
        help="Chinese detection confidence threshold (default: 0.05 ratio)",
    )
    parser.add_argument(
        "--chunk-size",
        dest="chunk_size",
        type=int,
        metavar="N",
        help="Max characters per translation request (default: 4000)",
    )
    parser.add_argument(
        "--bilingual",
        action="store_true",
        help="Keep original Chinese alongside English translation",
    )
    parser.add_argument(
        "--extract-only",
        dest="extract_only",
        action="store_true",
        help="Only extract and save detected Chinese lines, no translation",
    )
    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Print translated output to stdout instead of writing a file",
    )
    parser.add_argument(
        "--export-history",
        dest="export_history",
        metavar="FILE",
        help="Export translation history to a JSON file",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"{APP_NAME} {APP_VERSION}",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable DEBUG-level logging",
    )
    return parser
|
| 1045 |
+
|
| 1046 |
+
|
| 1047 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 1048 |
+
# DEPENDENCY CHECK
|
| 1049 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 1050 |
+
def check_dependencies(args: argparse.Namespace) -> None:
    """Warn on stdout about missing optional/required packages.

    Purely informational: nothing is raised, so the CLI can still run
    in whatever degraded mode remains possible.
    """
    problems: List[str] = []
    wants_offline = getattr(args, "offline", False) or getattr(args, "backend", "") == "offline"

    if not DEEP_TRANSLATOR_AVAILABLE:
        problems.append(
            "deep-translator -> pip install deep-translator"
        )
    if wants_offline and not OFFLINE_AVAILABLE:
        problems.append(
            "transformers / torch -> pip install transformers torch\n"
            " (CPU) pip install torch --index-url https://download.pytorch.org/whl/cpu\n"
            " (CUDA) pip install torch --index-url https://download.pytorch.org/whl/cu121"
        )

    if not problems:
        return
    banner = "=" * 55
    print("\n" + banner)
    print(f"[{APP_NAME}] Missing dependencies:")
    for problem in problems:
        print(f" {problem}")
    print(banner + "\n")
|
| 1069 |
+
|
| 1070 |
+
|
| 1071 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 1072 |
+
# MAIN
|
| 1073 |
+
# ════════════════════════════════════════════════════════════════════════════
|
| 1074 |
+
def main() -> None:
    """CLI entry point: parse args, configure, and dispatch by mode.

    Modes are checked in order: history export, batch directory
    translation, then single-file translation. Each completed run is
    appended to the persistent history log.
    """
    parser = _build_parser()
    args = parser.parse_args()

    setup_logging(verbose=getattr(args, "verbose", False))
    check_dependencies(args)

    cfg = Config()
    # CLI flags override persisted configuration values.
    cfg.apply_args(args)

    history = HistoryManager(cfg)
    translator = FileTranslator(cfg)

    # ── Export history shortcut ───────────────────────────────────────────
    if getattr(args, "export_history", None):
        history.export(args.export_history)
        # Export alone is a valid invocation; continue only when there is
        # also translation work to do.
        if not args.input and not args.batch:
            return

    # ── Batch mode ────────────────────────────────────────────────────────
    if getattr(args, "batch", None):
        batch_dir = Path(args.batch)
        out_dir = Path(args.batch_out) if getattr(args, "batch_out", None) else None
        t0 = time.time()
        out_paths = translator.translate_directory(batch_dir, output_dir=out_dir)
        elapsed = time.time() - t0
        print(
            f"\nBatch complete: {len(out_paths)} file(s) translated "
            f"in {elapsed:.1f}s"
        )
        for p in out_paths:
            print(f" -> {p}")
        # One aggregate history record for the whole batch run.
        history.add(
            input_file=str(batch_dir),
            output_file=str(out_dir or batch_dir),
            backend=str(cfg.get("backend")),
            script=str(cfg.get("lang")),
            segments_count=len(out_paths),
            elapsed_seconds=elapsed,
        )
        return

    # ── Single file mode ──────────────────────────────────────────────────
    if not args.input:
        # No work requested at all: show usage and exit cleanly.
        parser.print_help()
        sys.exit(0)

    input_path = Path(args.input)
    output_path = Path(args.output) if getattr(args, "output", None) else None

    t0 = time.time()
    try:
        out = translator.translate_file(
            input_path = input_path,
            output_path = output_path,
            extract_only = getattr(args, "extract_only", False),
            to_stdout = getattr(args, "stdout", False),
        )
    except (FileNotFoundError, ValueError, RuntimeError) as exc:
        # Known failure modes exit with a non-zero status after logging.
        logger.error(str(exc))
        sys.exit(1)

    elapsed = time.time() - t0

    # Suppress the summary banner when output already went to stdout.
    if not getattr(args, "stdout", False):
        print(f"\n{APP_NAME} v{APP_VERSION}")
        print(f"Input : {input_path}")
        print(f"Output : {out}")
        print(f"Backend : {cfg.get('backend')}")
        print(f"Script : {cfg.get('lang')}")
        print(f"Elapsed : {elapsed:.2f}s")
        print(f"Config : {CONFIG_FILE}")
        print(f"Log : {LOG_FILE}")

    history.add(
        input_file = str(input_path),
        output_file = str(out),
        backend = str(cfg.get("backend")),
        script = str(cfg.get("lang")),
        segments_count = 0,
        elapsed_seconds = elapsed,
    )
|
| 1156 |
+
|
| 1157 |
+
|
| 1158 |
+
# Script entry point: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
input.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
im using https://github.com/pwxcoo/chinese-xinhua/blob/master/README.md as reference to test my codebase
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
```reference
|
| 6 |
+
#!/bin/bash
|
| 7 |
+
|
| 8 |
+
# chinese-xinhua
|
| 9 |
+
|
| 10 |
+
中华新华字典数据库和 API 。收录包括 14032 条歇后语,16142 个汉字,264434 个词语,31648 个成语。
|
| 11 |
+
|
| 12 |
+
## Project Structure
|
| 13 |
+
|
| 14 |
+
```
|
| 15 |
+
chinese-xinhua/
|
| 16 |
+
|
|
| 17 |
+
+- data/ <-- 数据文件夹
|
| 18 |
+
| |
|
| 19 |
+
| +- idiom.json <-- 成语
|
| 20 |
+
| |
|
| 21 |
+
| +- word.json <-- 汉字
|
| 22 |
+
| |
|
| 23 |
+
| +- xiehouyu.json <-- 歇后语
|
| 24 |
+
| |
|
| 25 |
+
| +- ci.json <-- 词语
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Database Introduction
|
| 29 |
+
|
| 30 |
+
### 成语 (idiom.json)
|
| 31 |
+
|
| 32 |
+
```json
|
| 33 |
+
[
|
| 34 |
+
{
|
| 35 |
+
"derivation": "语出《法华经·法师功德品》下至阿鼻地狱。”",
|
| 36 |
+
"example": "但也有少数意志薄弱的……逐步上当,终至堕入~。★《上饶集中营·炼狱杂记》",
|
| 37 |
+
"explanation": "阿鼻梵语的译音,意译为无间”,即痛苦无有间断之意。常用来比喻黑暗的社会和严酷的牢狱。又比喻无法摆脱的极其痛苦的境地。",
|
| 38 |
+
"pinyin": "ā bí dì yù",
|
| 39 |
+
"word": "阿鼻地狱",
|
| 40 |
+
"abbreviation": "abdy"
|
| 41 |
+
},
|
| 42 |
+
...
|
| 43 |
+
]
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### 词语 (ci.json)
|
| 47 |
+
|
| 48 |
+
```json
|
| 49 |
+
[
|
| 50 |
+
{
|
| 51 |
+
"ci": "宸纶",
|
| 52 |
+
"explanation": "1.帝王的诏书﹑制令。"
|
| 53 |
+
},
|
| 54 |
+
...
|
| 55 |
+
]
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### 汉字 (word.json)
|
| 59 |
+
|
| 60 |
+
```json
|
| 61 |
+
[
|
| 62 |
+
{
|
| 63 |
+
"word": "嗄",
|
| 64 |
+
"oldword": "嗄",
|
| 65 |
+
"strokes": "13",
|
| 66 |
+
"pinyin": "á",
|
| 67 |
+
"radicals": "口",
|
| 68 |
+
"explanation": "嗄〈叹〉\n\n 同啊”。表示省悟或惊奇\n\n 嗄!难道这里是没有地方官的么?--宋·佚名《新编五代史平话》\n\n 嗄á叹词。在句首,〈表〉疑问或反问~,这是什么?~,你想干什么?\"嗄\"另见shà㈠。\n\n 嗄shà\n\n ⒈声音嘶哑~声。\n\n 嗄a 1.助词。表示强调﹑肯定或辩解。 2.助词。方言。表示疑问或反诘。\n\n 嗄xià 1.见\"嗄饭\"。 2.见\"嗄程\"。",
|
| 69 |
+
"more": "嗄 ga、a 部首 口 部首笔画 03 总笔画 13 嗄2\nshà\n〈形〉\n(1)\n声音嘶哑的 [hoarse]\n终日嚎而嗌不嗄。--《老子》\n(2)\n又如嗄哑,嗄嘶(嗓音嘶哑)\n嗄\nshà\n〈叹〉\n(1)\n什么 [what]--表示否定\n我要丢个干干净,看你嗄法把我治。--清·蒲松龄《聊斋俚曲集》\n(2)\n旧时仆役对主人、下级对上级的应诺声 [yes]\n带进来”。两边军士应一声嗄”,即将牛皋推至面前。--《说岳全传》\n另见á\n嗄1\ná\n〈叹〉\n同啊”(á)。表示省悟或惊奇 [ah]\n嗄!难道这里是没有地方官的么?--宋·佚名《新编五代史平话》\n另见shà\n嗄1\nshà ㄕㄚ╝\n嗓音嘶哑。\n郑码janr,u55c4,gbke0c4\n笔画数13,部首口,笔顺编号2511325111354\n嗄2\ná ㄚˊ\n同啊2”。\n郑码janr,u55c4,gbke0c4\n笔画数13,部首口,笔顺编号2511325111354"
|
| 70 |
+
},
|
| 71 |
+
...
|
| 72 |
+
]
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
### 歇后语 (xiehouyu.json)
|
| 76 |
+
|
| 77 |
+
```json
|
| 78 |
+
[
|
| 79 |
+
{
|
| 80 |
+
"riddle": "飞机上聊天",
|
| 81 |
+
"answer": "高谈阔论"
|
| 82 |
+
},
|
| 83 |
+
...
|
| 84 |
+
]
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
## Changelog
|
| 88 |
+
|
| 89 |
+
<details><summary>查看更新日志 </summary>
|
| 90 |
+
|
| 91 |
+
- 20181216: 成语数据集去重
|
| 92 |
+
- 20181216: API 功能下线
|
| 93 |
+
- 20180803: 添加词语数据集
|
| 94 |
+
- 20180206: 添加成语,歇后语,汉字数据集
|
| 95 |
+
|
| 96 |
+
</details>
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
## Copyright
|
| 100 |
+
|
| 101 |
+
本仓库的所有的数据都是我从网上收集整理的。仓库本来的目的是因为我以前想做一个成语接龙的东西,但是苦于没有现成可用的数据库,自己就从各个网站抓取整理了一份。放在 Github 是为了方便自己的使用,同时也能方便有类似需求的人不用去做这些 trival 的工作。所有抓取数据的[脚本](./scripts/README.md)都在仓库里。
|
| 102 |
+
|
| 103 |
+
**本仓库无任何商业目的!如果有侵权行为将及时删除!**
|
| 104 |
+
|
| 105 |
+
```
|
input_test_SUCCESS_DEFINITIVE.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
im using https://github.com/pwxcoo/chinese-xinhua/blob/master/README.md as reference to test my codebase
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
```reference
|
| 6 |
+
#!/bin/bash
|
| 7 |
+
|
| 8 |
+
# chinese-xinhua
|
| 9 |
+
|
| 10 |
+
Chinese Xinhua Dictionary Database and API . Included include 14032 The postscript is:16142 a Chinese character,264434 words,31648 an idiom.
|
| 11 |
+
|
| 12 |
+
## Project Structure
|
| 13 |
+
|
| 14 |
+
```
|
| 15 |
+
chinese-xinhua/
|
| 16 |
+
|
|
| 17 |
+
+- data/ <-- data folder
|
| 18 |
+
| |
|
| 19 |
+
| +- idiom.json <-- idiom
|
| 20 |
+
| |
|
| 21 |
+
| +- word.json <-- Chinese character
|
| 22 |
+
| |
|
| 23 |
+
| +- xiehouyu.json <-- idiom
|
| 24 |
+
| |
|
| 25 |
+
| +- ci.json <-- words
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Database Introduction
|
| 29 |
+
|
| 30 |
+
### idiom (idiom.json)
|
| 31 |
+
|
| 32 |
+
```json
|
| 33 |
+
[
|
| 34 |
+
{
|
| 35 |
+
"derivation": "The words "Lotus Sutra: Master's Merit and Virtue" came down to Abi Hell. "",
|
| 36 |
+
"example": "But there are also a few who are weak-willed... and gradually fall into the trap.★"Shangrao Concentration Camp·Purgatory Miscellaneous Notes"",
|
| 37 |
+
"explanation": "The transliteration of Abi in Sanskrit means "without interruption", which means pain without interruption. It is often used to describe a dark society and a harsh prison. It also refers to an extremely painful situation that cannot be escaped.",
|
| 38 |
+
"pinyin": "ā bí dì yù",
|
| 39 |
+
"word": "abi hell",
|
| 40 |
+
"abbreviation": "abdy"
|
| 41 |
+
},
|
| 42 |
+
...
|
| 43 |
+
]
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### words (ci.json)
|
| 47 |
+
|
| 48 |
+
```json
|
| 49 |
+
[
|
| 50 |
+
{
|
| 51 |
+
"ci": "Chen Lun",
|
| 52 |
+
"explanation": "1.emperor's edict﹑Make orders."
|
| 53 |
+
},
|
| 54 |
+
...
|
| 55 |
+
]
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Chinese character (word.json)
|
| 59 |
+
|
| 60 |
+
```json
|
| 61 |
+
[
|
| 62 |
+
{
|
| 63 |
+
"word": "嗄",
|
| 64 |
+
"oldword": "嗄",
|
| 65 |
+
"strokes": "13",
|
| 66 |
+
"pinyin": "á",
|
| 67 |
+
"radicals": "mouth",
|
| 68 |
+
"explanation": "sigh (sigh)\n\n "Same". It means enlightenment or surprise.\n\n 嗄!Are there no local officials here??--Song Dynasty Anonymous "Newly Compiled History of the Five Dynasties"\n\n 嗄áinterjection. At the beginning of the sentence, <expression> questions or rhetorical questions~, what is this? ~, what do you want to do?\"嗄\"See alsoshà㈠。\n\n 嗄shà\n\n ⒈Hoarse voice.\n\n 嗄a 1.particle. express emphasis﹑affirm or justify. 2.particle. dialect. Express a question or question.\n\n 嗄xià 1.See\"Eat rice\"。 2.See\"Cheng\"。",
|
| 69 |
+
"more": "嗄 ga、a radical mouth radical strokes 03 total strokes 13 嗄2\nshà\n<shape>\n(1)\nhoarse voice [hoarse]\nHowling all day long but not moaning.--"Laozi"\n(2)\nAnother example is muteness,Hiss(hoarse voice)\n嗄\nshà\n<sigh>\n(1)\nWhat [what]--express negation\nI want to throw it all away,Let me see how you treat me.--"Collection of Liaozhai Slang" by Pu Songling of the Qing Dynasty\n(2)\nIn the old days, servants made promises to their masters, and subordinates made promises to their superiors. [yes]\n"Bring them in." The sergeants on both sides responded with a cry.,Niu Gao was about to be pushed in front of him.--"The Complete Biography of Yue Yue"\nSee alsoá\n嗄1\ná\n<sigh>\n"Same"(á). express enlightenment or surprise [ah]\n嗄!Are there no local officials here??--Song Dynasty Anonymous "Newly Compiled History of the Five Dynasties"\nSee alsoshà\n嗄1\nshà ㄕㄚ╝\nHoarse voice.\nZheng Majanr,u55c4,gbke0c4\nNumber of strokes13, radical mouth, stroke order number2511325111354\n嗄2\ná ㄚˊ\nSame2”。\nZheng Majanr,u55c4,gbke0c4\nNumber of strokes13, radical mouth, stroke order number2511325111354"
|
| 70 |
+
},
|
| 71 |
+
...
|
| 72 |
+
]
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
### idiom (xiehouyu.json)
|
| 76 |
+
|
| 77 |
+
```json
|
| 78 |
+
[
|
| 79 |
+
{
|
| 80 |
+
"riddle": "Chatting on the plane",
|
| 81 |
+
"answer": "talk eloquently"
|
| 82 |
+
},
|
| 83 |
+
...
|
| 84 |
+
]
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
## Changelog
|
| 88 |
+
|
| 89 |
+
<details><summary>View changelog </summary>
|
| 90 |
+
|
| 91 |
+
- 20181216: Deduplication of idiom data sets
|
| 92 |
+
- 20181216: API Function offline
|
| 93 |
+
- 20180803: Add word dataset
|
| 94 |
+
- 20180206: Add idioms, idioms, and Chinese character data sets
|
| 95 |
+
|
| 96 |
+
</details>
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
## Copyright
|
| 100 |
+
|
| 101 |
+
All the data in this warehouse is collected and organized by me from the Internet. The original purpose of the warehouse was because I wanted to make a idiom solitaire thing before, but because there was no readily available database, I grabbed and compiled one from various websites. put on Github It is to facilitate your own use, and also to facilitate people with similar needs without having to do these things. trival work. All scraped data[script](./scripts/README.md)All in the warehouse.
|
| 102 |
+
|
| 103 |
+
**This warehouse has no commercial purpose! If there is any infringement, it will be deleted immediately!**
|
| 104 |
+
|
| 105 |
+
```
|
requirements.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ════════════════════════════════════════════════════════════════════════
|
| 2 |
+
# ChineseFileTranslator v1.0.0 — requirements.txt
|
| 3 |
+
# Author: algorembrant
|
| 4 |
+
# ════════════════════════════════════════════════════════════════════════
|
| 5 |
+
# Install all core dependencies:
|
| 6 |
+
# pip install -r requirements.txt
|
| 7 |
+
#
|
| 8 |
+
# For offline translation backend (Helsinki-NLP MarianMT):
|
| 9 |
+
# CPU: pip install torch --index-url https://download.pytorch.org/whl/cpu
|
| 10 |
+
# CUDA: pip install torch --index-url https://download.pytorch.org/whl/cu121
|
| 11 |
+
# Then:
|
| 12 |
+
# pip install transformers sentencepiece sacremoses
|
| 13 |
+
#
|
| 14 |
+
# Platform note:
|
| 15 |
+
# On Linux/Mac, keyboard events require no extra steps.
|
| 16 |
+
# On Windows, run the terminal as Administrator if hotkeys fail.
|
| 17 |
+
# ════════════════════════════════════════════════════════════════════════
|
| 18 |
+
|
| 19 |
+
# ── Core translation backend ──────────────────────────────────────────────
|
| 20 |
+
deep-translator>=1.11.4
|
| 21 |
+
|
| 22 |
+
# ── Progress bar (optional but recommended) ───────────────────────────────
|
| 23 |
+
tqdm>=4.66.0
|
| 24 |
+
|
| 25 |
+
# ── Clipboard support (optional) ─────────────────────────────────────────
|
| 26 |
+
pyperclip>=1.8.2
|
| 27 |
+
|
| 28 |
+
# ── Offline translation backend (optional) ───────────────────────────────
|
| 29 |
+
# Uncomment the lines below OR follow the install note above for PyTorch.
|
| 30 |
+
# transformers>=4.40.0
|
| 31 |
+
# sentencepiece>=0.2.0
|
| 32 |
+
# sacremoses>=0.1.1
|
| 33 |
+
# torch>=2.2.0 # <-- install separately with correct CUDA URL
|