algorembrant committed on
Commit
b03b79f
·
verified ·
1 Parent(s): 370ed5e

Upload 8 files

Browse files
Files changed (8) hide show
  1. .gitattributes +65 -35
  2. .gitignore +132 -0
  3. LICENSE +21 -0
  4. README.md +288 -0
  5. chinese_file_translator.py +1159 -0
  6. input.md +105 -0
  7. input_test_SUCCESS_DEFINITIVE.md +105 -0
  8. requirements.txt +33 -0
.gitattributes CHANGED
@@ -1,35 +1,65 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ════════════════════════════════════════════════════════════════════════
2
+ # ChineseFileTranslator .gitattributes
3
+ # Author: algorembrant
4
+ # ════════════════════════════════════════════════════════════════════════
5
+
6
+ # ── Default: normalize all text files to LF on commit ────────────────────
7
+ * text=auto eol=lf
8
+
9
+ # ── Python source files ───────────────────────────────────────────────────
10
+ *.py text eol=lf diff=python
11
+
12
+ # ── Markdown and documentation ────────────────────────────────────────────
13
+ *.md text eol=lf
14
+ *.rst text eol=lf
15
+ *.txt text eol=lf
16
+
17
+ # ── Config and data files ─────────────────────────────────────────────────
18
+ *.json text eol=lf
19
+ *.yaml text eol=lf
20
+ *.yml text eol=lf
21
+ *.toml text eol=lf
22
+ *.cfg text eol=lf
23
+ *.ini text eol=lf
24
+ *.env text eol=lf
25
+
26
+ # ── Shell scripts ─────────────────────────────────────────────────────────
27
+ *.sh text eol=lf
28
+ *.bash text eol=lf
29
+
30
+ # ── Windows batch scripts (CRLF required) ─────────────────────────────────
31
+ *.bat text eol=crlf
32
+ *.cmd text eol=crlf
33
+ *.ps1 text eol=crlf
34
+
35
+ # ── Binary files — do not modify line endings ─────────────────────────────
36
+ *.png binary
37
+ *.jpg binary
38
+ *.jpeg binary
39
+ *.gif binary
40
+ *.bmp binary
41
+ *.ico binary
42
+ *.svg binary
43
+ *.pdf binary
44
+ *.zip binary
45
+ *.tar.gz binary
46
+ *.whl binary
47
+
48
+ # ── HuggingFace Large File Storage (LFS) — model weights ──────────────────
49
+ # Uncomment if storing model checkpoints in this repo
50
+ # *.bin filter=lfs diff=lfs merge=lfs -text
51
+ # *.safetensors filter=lfs diff=lfs merge=lfs -text
52
+ # *.pt filter=lfs diff=lfs merge=lfs -text
53
+ # *.ckpt filter=lfs diff=lfs merge=lfs -text
54
+ # *.h5 filter=lfs diff=lfs merge=lfs -text
55
+
56
+ # ── Linguist overrides (GitHub language detection) ────────────────────────
57
+ *.md linguist-documentation
58
+ *.txt linguist-documentation
59
+ requirements.txt linguist-documentation
60
+
61
+ # ── Export-ignore (files excluded from git archive / release tarballs) ────
62
+ .gitattributes export-ignore
63
+ .gitignore export-ignore
64
+ .github/ export-ignore
65
+ tests/ export-ignore
.gitignore ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ════════════════════════════════════════════════════════════════════════
2
+ # ChineseFileTranslator — .gitignore
3
+ # Author: algorembrant
4
+ # ════════════════════════════════════════════════════════════════════════
5
+
6
+ # ── Python ────────────────────────────────────────────────────────────────
7
+ __pycache__/
8
+ *.py[cod]
9
+ *.pyo
10
+ *.pyd
11
+ *.pyc
12
+ *.so
13
+ *.egg
14
+ *.egg-info/
15
+ dist/
16
+ build/
17
+ wheels/
18
+ *.whl
19
+ *.spec
20
+ pip-log.txt
21
+ pip-delete-this-directory.txt
22
+ .Python
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ *.manifest
29
+ *.egg-link
30
+ .installed.cfg
31
+ MANIFEST
32
+
33
+ # ── Virtual environments ──────────────────────────────────────────────────
34
+ venv/
35
+ env/
36
+ .venv/
37
+ .env/
38
+ ENV/
39
+ env.bak/
40
+ venv.bak/
41
+ .python-version
42
+
43
+ # ── Application runtime data ──────────────────────────────────────────────
44
+ # Translation history and logs are stored in ~/.chinese_file_translator/
45
+ # Do not commit user-generated runtime files from within the project dir
46
+ history.json
47
+ app.log
48
+ config.json
49
+ *_translated.txt
50
+ *_translated.md
51
+
52
+ # ── HuggingFace / Transformers model cache ────────────────────────────────
53
+ models/
54
+ *.bin
55
+ *.safetensors
56
+ *.pt
57
+ *.ckpt
58
+ *.h5
59
+ pytorch_model*
60
+ tf_model*
61
+ flax_model*
62
+ tokenizer.json
63
+ tokenizer_config.json
64
+ vocab.json
65
+ merges.txt
66
+ special_tokens_map.json
67
+ sentencepiece.bpe.model
68
+ source.spm
69
+ target.spm
70
+
71
+ # ── Jupyter notebooks checkpoints ────────────────────────────────────────
72
+ .ipynb_checkpoints/
73
+ *.ipynb
74
+
75
+ # ── IDE / Editor ──────────────────────────────────────────────────────────
76
+ .vscode/
77
+ .idea/
78
+ *.sublime-project
79
+ *.sublime-workspace
80
+ *.suo
81
+ *.user
82
+ *.sln.docstates
83
+ .vs/
84
+ *.swp
85
+ *.swo
86
+ *~
87
+ .DS_Store
88
+ Thumbs.db
89
+ desktop.ini
90
+
91
+ # ── Testing and coverage ──────────────────────────────────────────────────
92
+ .tox/
93
+ .nox/
94
+ .coverage
95
+ .coverage.*
96
+ coverage.xml
97
+ htmlcov/
98
+ *.coveragerc
99
+ .pytest_cache/
100
+ .mypy_cache/
101
+ .dmypy.json
102
+ dmypy.json
103
+ .pytype/
104
+ .pyre/
105
+
106
+ # ── Distribution / packaging ──────────────────────────────────────────────
107
+ .eggs/
108
+ *.egg-info/
109
+ dist/
110
+ build/
111
+ RECORD
112
+
113
+ # ── Documentation builds ─────────────────────────────────────────────────
114
+ docs/_build/
115
+ site/
116
+ _site/
117
+
118
+ # ── OS temp files ─────────────────────────────────────────────────────────
119
+ *.tmp
120
+ *.bak
121
+ *.swp
122
+ *.orig
123
+ *.rej
124
+
125
+ # ── Secrets and credentials ───────────────────────────────────────────────
126
+ .env
127
+ .env.*
128
+ secrets.json
129
+ *.pem
130
+ *.key
131
+ *.p12
132
+ *.pfx
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 algorembrant
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - zh
4
+ - en
5
+ tags:
6
+ - translation
7
+ - chinese
8
+ - nlp
9
+ - text-processing
10
+ - markdown
11
+ - offline
12
+ - deep-translator
13
+ - marianmt
14
+ license: mit
15
+ library_name: transformers
16
+ pipeline_tag: translation
17
+ model-index:
18
+ - name: Helsinki-NLP/opus-mt-zh-en
19
+ results: []
20
+ ---
21
+
22
+ # ChineseFileTranslator
23
+
24
+ [![Python](https://img.shields.io/badge/Python-3.9%2B-blue?logo=python&logoColor=white)](https://www.python.org/)
25
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
26
+ [![Version](https://img.shields.io/badge/version-1.0.0-orange)](CHANGELOG.md)
27
+ [![Offline Support](https://img.shields.io/badge/offline-Helsinki--NLP%2Fopus--mt--zh--en-blueviolet)](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en)
28
+ [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-yellow?logo=huggingface)](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en)
29
+ [![Maintenance](https://img.shields.io/badge/Maintained-yes-brightgreen)](https://github.com/algorembrant/ChineseFileTranslator)
30
+ [![Code Style](https://img.shields.io/badge/code%20style-PEP8-informational)](https://peps.python.org/pep-0008/)
31
+
32
+ Translate Chinese text (Simplified, Traditional, Cantonese, Classical) inside `.txt` and `.md` files
33
+ to English. Preserves full Markdown syntax. Supports Google Translate, Microsoft Translator, and a
34
+ fully offline Helsinki-NLP MarianMT backend with vectorized batching.
35
+
36
+ ---
37
+
38
+ ### Key Features
39
+
40
+ - **'Never Miss' Global Surgical Translation**: Unique strategy to capture ALL Chinese while protecting structure.
41
+ - **Inclusive CJK Detection**: Comprehensive 32-bit Unicode coverage (Basic, Ext A-E, Symbols, Punctuation).
42
+ - **Proactive Markdown Protection**: Frontmatter, code blocks, links, and HTML are safely tokenized.
43
+ - **Robust Placeholder Restoration**: Space-lenient, case-insensitive restoration handles engine mangling.
44
+ - **Unstoppable Backend Resilience**: Explicit failure detection with automatic retries and non-crashing fallbacks.
45
+ - **Offline First Option**: Fully local Helsinki-NLP MarianMT backend with vectorized batching.
46
+ - **Bilingual Mode**: Optional side-by-side Chinese and English output.
47
+ - **Batch Processing**: Translate entire directories with recursive discovery and persistent configuration.
48
+
49
+ ---
50
+
51
+ ## Project Structure
52
+
53
+ ```
54
+ ChineseFileTranslator/
55
+ ├── chinese_file_translator.py # Main script (single-file, no extra modules)
56
+ ├── requirements.txt # Python dependencies
57
+ ├── README.md # This file
58
+ ├── .gitattributes # Git line-ending and LFS rules
59
+ ├── .gitignore # Ignored paths
60
+ └── LICENSE # MIT License
61
+ ```
62
+
63
+ ---
64
+
65
+ ## Quickstart
66
+
67
+ ### 1. Clone the repository
68
+
69
+ ```bash
70
+ git clone https://github.com/algorembrant/ChineseFileTranslator.git
71
+ cd ChineseFileTranslator
72
+ ```
73
+
74
+ ### 2. Create and activate a virtual environment (recommended)
75
+
76
+ ```bash
77
+ python -m venv venv
78
+ # Windows
79
+ venv\Scripts\activate
80
+ # Linux / macOS
81
+ source venv/bin/activate
82
+ ```
83
+
84
+ ### 3. Install core dependencies
85
+
86
+ ```bash
87
+ pip install -r requirements.txt
88
+ ```
89
+
90
+ ### 4. (Optional) Install offline translation backend
91
+
92
+ Choose the correct PyTorch build for your system:
93
+
94
+ ```bash
95
+ # CPU only
96
+ pip install torch --index-url https://download.pytorch.org/whl/cpu
97
+
98
+ # CUDA 12.1
99
+ pip install torch --index-url https://download.pytorch.org/whl/cu121
100
+
101
+ # Then install Transformers stack
102
+ pip install transformers sentencepiece sacremoses
103
+ ```
104
+
105
+ The Helsinki-NLP/opus-mt-zh-en model (~300 MB) downloads automatically on first use.
106
+
107
+ ---
108
+
109
+ ## Usage
110
+
111
+ ### Command Reference
112
+
113
+ | Command | Description |
114
+ |---|---|
115
+ | `python chinese_file_translator.py input.txt` | Translate a plain-text file (Google backend) |
116
+ | `python chinese_file_translator.py input.md` | Translate a Markdown file, preserve structure |
117
+ | `python chinese_file_translator.py input.txt -o out.txt` | Set explicit output path |
118
+ | `python chinese_file_translator.py input.txt --backend offline` | Use offline MarianMT model |
119
+ | `python chinese_file_translator.py input.txt --backend microsoft` | Use Microsoft Translator |
120
+ | `python chinese_file_translator.py input.txt --offline --gpu` | Offline + GPU (CUDA) |
121
+ | `python chinese_file_translator.py input.txt --lang simplified` | Force Simplified Chinese |
122
+ | `python chinese_file_translator.py input.txt --lang traditional` | Force Traditional Chinese |
123
+ | `python chinese_file_translator.py input.txt --bilingual` | Keep Chinese + show English |
124
+ | `python chinese_file_translator.py input.txt --extract-only` | Extract Chinese lines only |
125
+ | `python chinese_file_translator.py input.txt --stdout` | Print output to terminal |
126
+ | `python chinese_file_translator.py --batch ./docs/` | Batch translate a directory |
127
+ | `python chinese_file_translator.py --batch ./in/ --batch-out ./out/` | Batch with output dir |
128
+ | `python chinese_file_translator.py input.txt --chunk-size 2000` | Custom chunk size |
129
+ | `python chinese_file_translator.py input.txt --export-history h.json` | Export history |
130
+ | `python chinese_file_translator.py input.txt --verbose` | Debug logging |
131
+ | `python chinese_file_translator.py --version` | Print version |
132
+ | `python chinese_file_translator.py --help` | Full help |
133
+
134
+ ### Arguments
135
+
136
+ | Argument | Type | Default | Description |
137
+ |---|---|---|---|
138
+ | `input` | positional | — | Path to `.txt` or `.md` file |
139
+ | `-o / --output` | string | `<name>_translated.<ext>` | Output file path |
140
+ | `--batch DIR` | string | — | Directory to batch translate |
141
+ | `--batch-out DIR` | string | same as `--batch` | Output directory for batch |
142
+ | `--backend` | choice | `google` | `google`, `microsoft`, `offline` |
143
+ | `--offline` | flag | `false` | Shorthand for `--backend offline` |
144
+ | `--lang` | choice | `auto` | `auto`, `simplified`, `traditional` |
145
+ | `--gpu` | flag | `false` | Use CUDA for offline model |
146
+ | `--confidence` | float | `0.3` | Min Chinese character ratio for detection |
147
+ | `--chunk-size` | int | `4000` | Max chars per translation request |
148
+ | `--bilingual` | flag | `false` | Output both Chinese and English |
149
+ | `--extract-only` | flag | `false` | Save only the detected Chinese lines |
150
+ | `--stdout` | flag | `false` | Print result to stdout |
151
+ | `--export-history` | string | — | Save session history to JSON |
152
+ | `--verbose` | flag | `false` | Enable DEBUG logging |
153
+ | `--version` | flag | — | Show version and exit |
154
+
155
+ ---
156
+
157
+ ## Configuration
158
+
159
+ The tool writes a JSON config file on first run:
160
+
161
+ ```
162
+ ~/.chinese_file_translator/config.json
163
+ ```
164
+
165
+ Example `config.json`:
166
+
167
+ ```json
168
+ {
169
+ "backend": "google",
170
+ "lang": "auto",
171
+ "use_gpu": false,
172
+ "chunk_size": 4000,
173
+ "batch_size": 10,
174
+ "bilingual": false,
175
+ "microsoft_api_key": "YOUR_KEY_HERE",
176
+ "microsoft_region": "eastus",
177
+ "offline_model_dir": "~/.chinese_file_translator/models",
178
+ "output_suffix": "_translated",
179
+ "retry_attempts": 3,
180
+ "retry_delay_seconds": 1.5,
181
+ "max_history": 1000
182
+ }
183
+ ```
184
+
185
+ ---
186
+
187
+ ## Supported Chinese Variants
188
+
189
+ | Variant | Notes |
190
+ |---|---|
191
+ | Simplified Chinese | Mandarin, mainland China standard |
192
+ | Traditional Chinese | Taiwan, Hong Kong, Macau standard |
193
+ | Cantonese / Yue | Detected via CJK Unicode ranges |
194
+ | Classical Chinese | Treated as Traditional for translation |
195
+ | Mixed Chinese-English | Code-switching text handled transparently |
196
+
197
+ ---
198
+
199
+ ## Translation Backends
200
+
201
+ | Backend | Requires | Speed | Quality | Internet |
202
+ |---|---|---|---|---|
203
+ | Google Translate | `deep-translator` | Fast | High | Yes |
204
+ | Microsoft Translator | Azure API key + `deep-translator` | Fast | High | Yes |
205
+ | Helsinki-NLP MarianMT | `transformers`, `torch` | Medium | Good | No (after download) |
206
+
207
+ Google Translate is the default. If it fails, the tool falls back to the offline model automatically.
208
+
209
+ ---
210
+
211
+
212
+
213
+ ## Technical Strategy: 'Never Miss' Logic
214
+
215
+ The tool employs a sophisticated "Global Surgical" approach to ensure no Chinese fragment is overlooked, regardless of its depth in JSON, HTML, or complex Markdown.
216
+
217
+ ### 1. Surgical Block Extraction
218
+ Instead of line-by-line translation, the script identifies every continuous block of CJK characters (including ideographic symbols and punctuation) across the entire document. This ensures that contextually related characters are translated together for better accuracy.
219
+
220
+ ### 2. Structural Protection
221
+ Markdown and metadata structures are tokenized using unique, collision-resistant placeholders (`___MY_PROTECT_PH_{idx}___`).
222
+ - **YAML/TOML**: Frontmatter is protected globally.
223
+ - **Code Fences**: Backticks and language identifiers are protected; Chinese content *inside* comments or strings remains translatable.
224
+ - **Links & HTML**: URLs and tag names are guarded, while display text is surgically translated.
225
+
226
+ ### 3. Verification & Restoration
227
+ - **Longest-First Replacement**: Translated segments are restored starting from the longest strings to prevent partial match overwrites.
228
+ - **Fuzzy Restoration**: The restoration logic is space-lenient and case-insensitive to handle cases where online translation engines mangle the placeholder tokens.
229
+
230
+ ---
231
+
232
+ ## Markdown Preservation
233
+
234
+ The following elements are meticulously protected:
235
+
236
+ | Element | Example | Protection Method |
237
+ |---|---|---|
238
+ | Front Matter | `---\ntitle: ...\n---` | Full Tokenization |
239
+ | Fenced Code | ` ```python ... ``` ` | Boundary Tokenization |
240
+ | Inline Code | `` `code` `` | Full Tokenization |
241
+ | Links / Images | `[text](url)` | URL Tokenization |
242
+ | HTML Tags | `<div class="...">` | Tag Tokenization |
243
+ | Symbols | `&copy;`, `&#x...;` | Entity Tokenization |
244
+
245
+ ---
246
+
247
+ ## Microsoft Translator Setup
248
+
249
+ 1. Go to [Azure Cognitive Services](https://portal.azure.com/)
250
+ 2. Create a Translator resource (Free tier: 2M chars/month)
251
+ 3. Copy your API key and region
252
+ 4. Add them to `~/.chinese_file_translator/config.json`:
253
+
254
+ ```json
255
+ {
256
+ "microsoft_api_key": "abc123...",
257
+ "microsoft_region": "eastus"
258
+ }
259
+ ```
260
+
261
+ Then run:
262
+
263
+ ```bash
264
+ python chinese_file_translator.py input.txt --backend microsoft
265
+ ```
266
+
267
+ ---
268
+
269
+ ## Files Generated
270
+
271
+ | Path | Description |
272
+ |---|---|
273
+ | `~/.chinese_file_translator/config.json` | Persistent settings |
274
+ | `~/.chinese_file_translator/history.json` | Session history log |
275
+ | `~/.chinese_file_translator/app.log` | Application log file |
276
+ | `~/.chinese_file_translator/models/` | Offline model cache (if used) |
277
+
278
+ ---
279
+
280
+ ## Author
281
+
282
+ **algorembrant**
283
+
284
+ ---
285
+
286
+ ## License
287
+
288
+ MIT License. See [LICENSE](LICENSE) for details.
chinese_file_translator.py ADDED
@@ -0,0 +1,1159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ ChineseFileTranslator v1.0.0
5
+ ================================
6
+ Author : algorembrant
7
+ License : MIT
8
+ Version : 1.0.0
9
+
10
+ Translate Chinese text inside .txt or .md files to English.
11
+ Preserves Markdown structure (headings, bold, italics, code blocks, tables, links).
12
+ Supports batch/vectorized processing, multiple translation backends,
13
+ auto-detection of Chinese script, and history logging.
14
+
15
+ USAGE COMMANDS
16
+ --------------
17
+ Translate a single file (default: Google backend):
18
+ python chinese_file_translator.py input.txt
19
+
20
+ Translate and save to a specific output file:
21
+ python chinese_file_translator.py input.md -o translated.md
22
+
23
+ Translate using the offline Helsinki-NLP MarianMT model:
24
+ python chinese_file_translator.py input.txt --backend offline
25
+
26
+ Translate using Microsoft Translator (requires API key in config):
27
+ python chinese_file_translator.py input.txt --backend microsoft
28
+
29
+ Force Simplified Chinese OCR/detection:
30
+ python chinese_file_translator.py input.txt --lang simplified
31
+
32
+ Force Traditional Chinese:
33
+ python chinese_file_translator.py input.txt --lang traditional
34
+
35
+ Auto-detect Chinese script (default):
36
+ python chinese_file_translator.py input.txt --lang auto
37
+
38
+ Enable GPU (CUDA) for offline model:
39
+ python chinese_file_translator.py input.txt --backend offline --gpu
40
+
41
+ Set OCR confidence threshold (0.0 - 1.0, default 0.3):
42
+ python chinese_file_translator.py input.txt --confidence 0.4
43
+
44
+ Batch translate all .txt and .md files in a directory:
45
+ python chinese_file_translator.py --batch ./my_folder/
46
+
47
+ Batch translate with output directory:
48
+ python chinese_file_translator.py --batch ./input/ --batch-out ./output/
49
+
50
+ Set chunk size for large files (default 4000 chars):
51
+ python chinese_file_translator.py input.txt --chunk-size 2000
52
+
53
+ Append both Chinese source and English translation side-by-side:
54
+ python chinese_file_translator.py input.txt --bilingual
55
+
56
+ Only extract and print detected Chinese text (no translation):
57
+ python chinese_file_translator.py input.txt --extract-only
58
+
59
+ Print translated output to stdout instead of file:
60
+ python chinese_file_translator.py input.txt --stdout
61
+
62
+ Export translation history to JSON on exit:
63
+ python chinese_file_translator.py input.txt --export-history out.json
64
+
65
+ Enable verbose/debug logging:
66
+ python chinese_file_translator.py input.txt --verbose
67
+
68
+ Show version and exit:
69
+ python chinese_file_translator.py --version
70
+
71
+ Show full help:
72
+ python chinese_file_translator.py --help
73
+
74
+ SUPPORTED FILE TYPES
75
+ --------------------
76
+ - Plain text (.txt) : All Chinese detected and translated in-place
77
+ - Markdown (.md) : Chinese content translated; Markdown syntax preserved
78
+ Preserved: headings (#), bold (**), italic (*), inline code (`),
79
+ fenced code blocks (```), blockquotes (>), tables (|),
80
+ links ([text](url)), images (![alt](url)), horizontal rules
81
+
82
+ SUPPORTED CHINESE VARIANTS
83
+ ---------------------------
84
+ - Simplified Chinese (Mandarin, simplified/simp)
85
+ - Traditional Chinese (Mandarin / Hong Kong / Taiwan)
86
+ - Cantonese / Yue (detected via Unicode CJK ranges)
87
+ - Classical Chinese (Literary Chinese, treated as Traditional)
88
+ - Mixed Chinese-English (Chinglish / code-switching)
89
+
90
+ TRANSLATION BACKENDS
91
+ --------------------
92
+ 1. Google Translate (online, fast, default, no API key needed)
93
+ 2. Microsoft Translate (online, fallback, requires Azure API key)
94
+ 3. Helsinki-NLP MarianMT (offline, opus-mt-zh-en, ~300 MB download on first use)
95
+
96
+ CONFIGURATION
97
+ -------------
98
+ Config is stored at: ~/.chinese_file_translator/config.json
99
+ History is stored at: ~/.chinese_file_translator/history.json
100
+ Logs are stored at: ~/.chinese_file_translator/app.log
101
+
102
+ EXTERNAL SETUP REQUIRED
103
+ -----------------------
104
+ PyTorch (required only for offline backend):
105
+ CPU-only:
106
+ pip install torch --index-url https://download.pytorch.org/whl/cpu
107
+ CUDA 11.8:
108
+ pip install torch --index-url https://download.pytorch.org/whl/cu118
109
+ CUDA 12.1:
110
+ pip install torch --index-url https://download.pytorch.org/whl/cu121
111
+
112
+ Helsinki-NLP model is downloaded automatically on first offline run (~300 MB):
113
+ Model: Helsinki-NLP/opus-mt-zh-en
114
+ Cache: ~/.chinese_file_translator/models/
115
+
116
+ Microsoft Translator (optional):
117
+ Get a free API key from Azure Cognitive Services and add to config.json:
118
+ { "microsoft_api_key": "YOUR_KEY_HERE", "microsoft_region": "eastus" }
119
+ """
120
+
121
+ # ── Standard Library ──────────────────────────────────────────────────────────
122
+ import os
123
+ import re
124
+ import sys
125
+ import json
126
+ import time
127
+ import logging
128
+ import argparse
129
+ import textwrap
130
+ import threading
131
+ import unicodedata
132
+ from copy import deepcopy
133
+ from pathlib import Path
134
+ from datetime import datetime
135
+ from typing import (
136
+ Any, Dict, Generator, List, Optional, Sequence, Tuple
137
+ )
138
+
139
+ # ── Online Translation ────────────────────────────────────────────────────────
140
+ try:
141
+ from deep_translator import GoogleTranslator, MicrosoftTranslator
142
+ DEEP_TRANSLATOR_AVAILABLE = True
143
+ except ImportError:
144
+ DEEP_TRANSLATOR_AVAILABLE = False
145
+
146
+ # ── Offline Translation ───────────────────────────────────────────────────────
147
+ OFFLINE_AVAILABLE = False
148
+ try:
149
+ from transformers import MarianMTModel, MarianTokenizer
150
+ import torch
151
+ OFFLINE_AVAILABLE = True
152
+ except ImportError:
153
+ pass
154
+
155
+ # ── Progress bar (optional) ───────────────────────────────────────────────────
156
+ try:
157
+ from tqdm import tqdm
158
+ TQDM_AVAILABLE = True
159
+ except ImportError:
160
+ TQDM_AVAILABLE = False
161
+
162
+ # ── Clipboard (optional) ─────────────────────────────────────────────────────
163
+ try:
164
+ import pyperclip
165
+ CLIPBOARD_AVAILABLE = True
166
+ except ImportError:
167
+ CLIPBOARD_AVAILABLE = False
168
+
169
+ # ── Constants ─────────────────────────────────────────────────────────────────
170
+ APP_NAME = "ChineseFileTranslator"
171
+ APP_VERSION = "1.0.0"
172
+ APP_AUTHOR = "algorembrant"
173
+ _HOME = Path.home() / ".chinese_file_translator"
174
+ CONFIG_FILE = _HOME / "config.json"
175
+ HISTORY_FILE = _HOME / "history.json"
176
+ LOG_FILE = _HOME / "app.log"
177
+ OFFLINE_MODEL = "Helsinki-NLP/opus-mt-zh-en"
178
+ OFFLINE_MODEL_T = "Helsinki-NLP/opus-mt-zht-en"
179
+
180
+ # CJK Unicode blocks used for Chinese detection
181
+ _CJK_RANGES: Tuple[Tuple[int, int], ...] = (
182
+ (0x4E00, 0x9FFF), # CJK Unified Ideographs
183
+ (0x3400, 0x4DBF), # CJK Extension A
184
+ (0x20000, 0x2A6DF), # CJK Extension B
185
+ (0x2A700, 0x2B73F), # CJK Extension C
186
+ (0x2B740, 0x2B81F), # CJK Extension D
187
+ (0xF900, 0xFAFF), # CJK Compatibility Ideographs
188
+ (0x2F800, 0x2FA1F), # CJK Compatibility Supplement
189
+ (0x3000, 0x303F), # CJK Symbols and Punctuation
190
+ (0xFF00, 0xFFEF), # Fullwidth / Halfwidth Forms
191
+ (0xFE30, 0xFE4F), # CJK Compatibility Forms
192
+ )
193
+
194
+ # Markdown patterns that must NOT be translated
195
+ _MD_CODE_FENCE = re.compile(r"```[\s\S]*?```")
196
+ _MD_INLINE_CODE = re.compile(r"`[^`\n]*?`")
197
+ _MD_LINK = re.compile(r"(!?\[[^\]]*?\])\(([^)]*?)\)")
198
+ _MD_HTML_TAG = re.compile(r"<[a-zA-Z/][^>]*?>")
199
+ _MD_FRONTMATTER = re.compile(r"^---[\s\S]*?^---", re.MULTILINE)
200
+
201
+
202
+ # ════════════════════════════════════════════════════════════════════════════
203
+ # LOGGING
204
+ # ════════════════════════════════════════════════════════════════════════════
205
def setup_logging(verbose: bool = False) -> logging.Logger:
    """Initialise application-wide logging.

    Configures the root logger to write to both the application log file
    (LOG_FILE, UTF-8) and stdout, creating the config directory first if
    it does not exist.

    Args:
        verbose: emit DEBUG-level records when True, INFO otherwise.

    Returns:
        The logger named after APP_NAME.
    """
    _HOME.mkdir(parents=True, exist_ok=True)
    chosen_level = logging.DEBUG if verbose else logging.INFO
    record_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    sinks: List[logging.Handler] = [
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ]
    logging.basicConfig(level=chosen_level, format=record_format, handlers=sinks)
    return logging.getLogger(APP_NAME)
215
+
216
+
217
+ logger = logging.getLogger(APP_NAME)
218
+
219
+
220
+ # ════════════════════════════════════════════════════════════════════════════
221
+ # CONFIG
222
+ # ════════════════════════════════════════════════════════════════════════════
223
class Config:
    """Persistent JSON configuration. CLI args override stored values.

    Resolution order for a key: value set via apply_args() / set() >
    value loaded from CONFIG_FILE > DEFAULTS.
    """

    # Baseline settings; copied per-instance so this dict is never mutated.
    DEFAULTS: Dict[str, Any] = {
        "backend" : "google",
        "lang" : "auto",
        "use_gpu" : False,
        "confidence_threshold" : 0.30,
        "chunk_size" : 4000,
        "batch_size" : 10,
        "bilingual" : False,
        "preserve_whitespace" : True,
        "microsoft_api_key" : "",
        "microsoft_region" : "eastus",
        "offline_model_dir" : str(_HOME / "models"),
        "max_history" : 1000,
        "output_suffix" : "_translated",
        "retry_attempts" : 3,
        "retry_delay_seconds" : 1.5,
    }

    def __init__(self) -> None:
        self._data: Dict[str, Any] = dict(self.DEFAULTS)
        _HOME.mkdir(parents=True, exist_ok=True)
        self._load()

    def _load(self) -> None:
        """Merge persisted settings over the defaults; ignore a corrupt file."""
        if CONFIG_FILE.exists():
            try:
                with open(CONFIG_FILE, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # Guard: a hand-edited file may hold a non-object JSON value
                # (list/string/number); dict.update() on it would raise or
                # inject junk keys. Fall back to defaults in that case.
                if isinstance(data, dict):
                    self._data.update(data)
                else:
                    logger.warning("Config file is not a JSON object. Using defaults.")
            except Exception as exc:
                logger.warning(f"Config load failed ({exc}). Using defaults.")

    def save(self) -> None:
        """Write the current settings to CONFIG_FILE (best-effort)."""
        try:
            with open(CONFIG_FILE, "w", encoding="utf-8") as f:
                json.dump(self._data, f, indent=2, ensure_ascii=False)
        except Exception as exc:
            logger.error(f"Config save failed: {exc}")

    def get(self, key: str, default: Any = None) -> Any:
        """Return the value for `key`, falling back to DEFAULTS then `default`."""
        return self._data.get(key, self.DEFAULTS.get(key, default))

    def set(self, key: str, value: Any) -> None:
        """Set `key` and persist immediately."""
        self._data[key] = value
        self.save()

    def apply_args(self, args: argparse.Namespace) -> None:
        """Overlay explicitly-provided CLI arguments onto the config.

        Only truthy / non-None values are applied, so absent flags do not
        clobber persisted settings. Not persisted to disk (session-only).
        """
        if getattr(args, "backend", None):
            self._data["backend"] = args.backend
        if getattr(args, "lang", None):
            self._data["lang"] = args.lang
        if getattr(args, "gpu", False):
            self._data["use_gpu"] = True
        if getattr(args, "confidence", None) is not None:
            self._data["confidence_threshold"] = args.confidence
        if getattr(args, "chunk_size", None) is not None:
            self._data["chunk_size"] = args.chunk_size
        if getattr(args, "bilingual", False):
            self._data["bilingual"] = True
        if getattr(args, "offline", False):
            self._data["backend"] = "offline"
286
+
287
+
288
+ # ════════════════════════════════════════════════════════════════════════════
289
+ # CHINESE DETECTION UTILITIES
290
+ # ════════════════════════════════════════════════════════════════════════════
291
def _is_cjk(char: str) -> bool:
    """True when `char`'s code point lies inside one of the _CJK_RANGES blocks."""
    cp = ord(char)
    for lo, hi in _CJK_RANGES:
        if lo <= cp <= hi:
            return True
    return False
295
+
296
+
297
def contains_chinese(text: str, min_ratio: float = 0.0) -> bool:
    """
    True when `text` contains Chinese (CJK) characters.

    When `min_ratio` > 0, at least that fraction of the non-whitespace
    characters must be CJK for the text to count as Chinese.
    """
    if not text or not text.strip():
        return False
    visible = [ch for ch in text if not ch.isspace()]
    if not visible:
        return False
    hits = sum(1 for ch in visible if _is_cjk(ch))
    if min_ratio > 0:
        return (hits / len(visible)) >= min_ratio
    return hits > 0
311
+
312
+
313
def chinese_ratio(text: str) -> float:
    """Fraction of non-whitespace characters that are CJK (0.0 when none)."""
    total = 0
    cjk = 0
    for ch in text:
        if ch.isspace():
            continue
        total += 1
        if _is_cjk(ch):
            cjk += 1
    return cjk / total if total else 0.0
319
+
320
+
321
def detect_script(text: str) -> str:
    """
    Classify Chinese text as 'traditional' or 'simplified'.

    Heuristic: count marker characters that are common in one script but
    rare in the other; ties (including texts with no markers at all) fall
    back to 'simplified'.
    """
    # Characters common in Traditional but rarely in Simplified
    trad_markers = set(
        "繁體國語臺灣學習問題開發電腦時間工作歷史語言文化"
        "經濟機會關係發展環境教育政府社會應該雖然雖然認為"
    )
    simp_markers = set(
        "简体国语台湾学习问题开发电脑时间工作历史语言文化"
        "经济机会关系发展环境教育政府社会应该虽然认为"
    )
    trad_hits = sum(ch in trad_markers for ch in text)
    simp_hits = sum(ch in simp_markers for ch in text)
    return "traditional" if trad_hits > simp_hits else "simplified"
342
+
343
+
344
+ # ════════════════════════════════════════════════════════════════════════════
345
+ # TRANSLATION ENGINE
346
+ # ════════════════════════════════════════════════════════════════════════════
347
class TranslationEngine:
    """
    Multi-backend Chinese-to-English translation.

    Vectorized batch mode is used for the offline (MarianMT) backend.
    Online backends (Google, Microsoft) chunk by character limit with
    sentence-boundary awareness and automatic retry on transient errors.
    """

    # Per-request size caps for each backend.
    _GOOGLE_LIMIT = 4500   # chars per Google request
    _MS_LIMIT = 10000      # chars per Microsoft request
    _OFFLINE_LIMIT = 512   # tokens; use 400-char char proxy

    def __init__(self, config: Config) -> None:
        self.cfg = config
        # MarianMT model/tokenizer; loaded lazily on first offline request.
        self._offline_model: Any = None
        self._offline_tok: Any = None
        # Serializes lazy model loading across threads.
        self._lock = threading.Lock()

    # ── Public API ────────────────────────────────────────────────────────

    def translate(
        self, text: str, source_lang: str = "auto"
    ) -> Tuple[str, str]:
        """
        Translate `text` to English.
        Returns (translated_text, backend_name).

        Tries the configured backend, then falls back to "google" and
        "offline" (order deduplicated). If every backend fails, the original
        text is returned with backend name "failed" -- this method never
        raises.
        """
        if not text or not text.strip():
            return text, "passthrough"

        backend = self.cfg.get("backend", "google")
        # Fallback chain: configured backend first, then google, then offline.
        attempt_order: List[str] = _dedupe_list([backend, "google", "offline"])

        last_exc: Optional[Exception] = None
        for b in attempt_order:
            try:
                result = self._call_backend(b, text, source_lang)
                return result, b
            except Exception as exc:
                # NOTE(review): logs the full input text, which may be large.
                logger.warning(f"Backend '{b}' failed for [{text}]: {exc}")
                last_exc = exc

        # NEVER CRASH: return original if all failed
        logger.error(f"All translation backends failed for [{text}]. Returning original.")
        return text, "failed"

    def translate_batch(
        self,
        texts: List[str],
        source_lang: str = "auto",
    ) -> List[Tuple[str, str]]:
        """
        Translate a list of strings.
        Uses vectorized batching for the offline backend; serial calls for
        online backends (rate-limit friendly).
        """
        backend = self.cfg.get("backend", "google")
        if backend == "offline" and OFFLINE_AVAILABLE:
            return self._translate_batch_offline(texts)
        # Serial with progress
        results: List[Tuple[str, str]] = []
        iterable = (
            tqdm(texts, desc="Translating", unit="chunk")
            if TQDM_AVAILABLE else texts
        )
        for text in iterable:
            results.append(self.translate(text, source_lang))
            # Small delay for online backends to avoid rate limits
            if backend in ("google", "microsoft"):
                time.sleep(0.3)
        return results

    # ── Backend dispatch ──────────────────────────────────────────────────

    def _call_backend(
        self, backend: str, text: str, source_lang: str
    ) -> str:
        """Invoke one backend with configurable retries and linear backoff.

        Raises RuntimeError once all attempts are exhausted; translate()
        catches this and moves on to the next backend in its chain.
        """
        retries = int(self.cfg.get("retry_attempts", 3))
        delay = float(self.cfg.get("retry_delay_seconds", 1.5))
        last_exc2: Optional[Exception] = None
        for attempt in range(retries):
            try:
                if backend == "google":
                    return self._google(text, source_lang)
                elif backend == "microsoft":
                    return self._microsoft(text, source_lang)
                elif backend == "offline":
                    translated, _ = self._offline_single(text)
                    return translated
                else:
                    raise ValueError(f"Unknown backend: {backend}")
            except Exception as exc:
                last_exc2 = exc
                if attempt < retries - 1:
                    # Linear backoff: delay, 2*delay, 3*delay, ...
                    time.sleep(delay * (attempt + 1))
        raise RuntimeError(
            f"Backend '{backend}' failed after {retries} attempts: {last_exc2}"
        )

    # ── Google ────────────────────────────────────────────────────────────

    def _google(self, text: str, source_lang: str) -> str:
        """Translate via Google (deep-translator), chunked to _GOOGLE_LIMIT.

        A chunk that comes back None or unchanged while still containing
        Chinese is treated as a failure so the retry/fallback chain kicks in.
        """
        if not DEEP_TRANSLATOR_AVAILABLE:
            raise RuntimeError("deep-translator not installed.")

        lang_map = {"simplified": "zh-CN", "traditional": "zh-TW", "auto": "auto"}
        src = lang_map.get(source_lang, "auto")
        chunks = list(_split_text(text, self._GOOGLE_LIMIT))
        parts: List[str] = []

        for chunk in chunks:
            try:
                translated = GoogleTranslator(source=src, target="en").translate(chunk)
                # If it's None or returned original Chinese, it failed
                if not translated or (translated.strip() == chunk.strip() and contains_chinese(chunk)):
                    raise RuntimeError("Google returned original or None")
                parts.append(translated)
            except Exception as e:
                raise RuntimeError(f"Google translate error: {e}")

        return " ".join(parts)

    # ── Microsoft ─────────────────────────────────────────────────────────

    def _microsoft(self, text: str, source_lang: str) -> str:
        """Translate via Microsoft Translator; requires an API key in config.

        Raises ValueError when no key is configured, RuntimeError when the
        deep-translator package is missing.
        """
        if not DEEP_TRANSLATOR_AVAILABLE:
            raise RuntimeError(
                "deep-translator not installed. Run: pip install deep-translator"
            )
        api_key = str(self.cfg.get("microsoft_api_key", ""))
        region = str(self.cfg.get("microsoft_region", "eastus"))
        if not api_key:
            raise ValueError(
                "Microsoft API key not configured. "
                "Add 'microsoft_api_key' to ~/.chinese_file_translator/config.json"
            )
        lang_map = {"simplified": "zh-Hans", "traditional": "zh-Hant", "auto": "auto"}
        src = lang_map.get(source_lang, "auto")
        chunks = list(_split_text(text, self._MS_LIMIT))
        parts = []
        for chunk in chunks:
            tr = MicrosoftTranslator(
                api_key=api_key, region=region, source=src, target="en"
            ).translate(chunk)
            # Fall back to the original chunk if the API returns nothing.
            parts.append(tr or chunk)
        return " ".join(parts)

    # ── Offline (MarianMT) ────────────────────────────────────────────────

    def _load_offline(self) -> None:
        """Lazily download/load the MarianMT model and tokenizer.

        Model files are cached under cfg['offline_model_dir']; the model is
        moved to CUDA only when cfg['use_gpu'] is set and CUDA is available.
        Callers must hold self._lock.
        """
        if not OFFLINE_AVAILABLE:
            raise RuntimeError("Offline model dependencies not installed.")
        model_dir = str(self.cfg.get("offline_model_dir", str(_HOME / "models")))
        Path(model_dir).mkdir(parents=True, exist_ok=True)
        # NOTE(review): always loads OFFLINE_MODEL (zh->en); OFFLINE_MODEL_T
        # (traditional) is never selected here -- confirm that is intended.
        self._offline_tok = MarianTokenizer.from_pretrained(
            OFFLINE_MODEL, cache_dir=model_dir
        )
        model = MarianMTModel.from_pretrained(
            OFFLINE_MODEL, cache_dir=model_dir
        )
        use_gpu = bool(self.cfg.get("use_gpu", False))
        device = "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu"
        self._offline_model = model.to(device)
        logger.info(f"Offline model loaded on '{device}'.")

    def _offline_single(self, text: str) -> Tuple[str, str]:
        """Translate one string offline. Returns (translation, "offline")."""
        with self._lock:
            if self._offline_model is None:
                self._load_offline()
        chunks = list(_split_text(text, self._OFFLINE_LIMIT))
        results = self._vectorized_translate(chunks)
        return " ".join(results), "offline"

    def _translate_batch_offline(
        self, texts: List[str]
    ) -> List[Tuple[str, str]]:
        """Vectorized: flatten all chunks, translate in one pass, reassemble."""
        with self._lock:
            if self._offline_model is None:
                self._load_offline()

        # Build chunk index: (text_idx, chunk_idx) -> flat_idx
        all_chunks: List[str] = []
        chunk_map: List[Tuple[int, int]] = []  # (text_idx, n_chunks)

        for t_idx, text in enumerate(texts):
            if not text or not text.strip():
                # Blank entries pass through untranslated (0 chunks).
                chunk_map.append((t_idx, 0))
                continue
            chunks = list(_split_text(text, self._OFFLINE_LIMIT))
            start = len(all_chunks)  # flat start index (informational; unused)
            all_chunks.extend(chunks)
            chunk_map.append((t_idx, len(chunks)))

        if not all_chunks:
            return [(t, "passthrough") for t in texts]

        # One vectorized forward pass
        translated_chunks = self._vectorized_translate(all_chunks)

        # Reassemble: consume translated chunks in the same flat order.
        results: List[Tuple[str, str]] = []
        flat_idx = 0
        for t_idx, n in chunk_map:
            if n == 0:
                results.append((texts[t_idx], "passthrough"))
            else:
                assembled = " ".join(translated_chunks[flat_idx : flat_idx + n])
                results.append((assembled, "offline"))
                flat_idx += n
        return results

    def _vectorized_translate(self, chunks: List[str]) -> List[str]:
        """Run MarianMT on a list of strings in one batched forward pass."""
        if not chunks:
            return []

        tok = self._offline_tok
        model = self._offline_model
        if tok is None or model is None:
            raise RuntimeError("Offline model not loaded.")

        # Run on whichever device the model was moved to in _load_offline().
        device = next(model.parameters()).device
        batch_size = int(self.cfg.get("batch_size", 10))
        results: List[str] = []

        # Split into mini-batches to avoid OOM on large inputs
        for i in range(0, len(chunks), batch_size):
            mini = chunks[i : i + batch_size]
            enc = tok(
                mini,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)
            with torch.no_grad():
                out = model.generate(**enc)
            decoded = tok.batch_decode(out, skip_special_tokens=True)
            results.extend(decoded)

        return results
591
+
592
+
593
+ # ════════════════════════════════════════════════════════════════════════════
594
+ # TEXT SPLITTING UTILITIES
595
+ # ════════════════════════════════════════════════════════════════════════════
596
+ def _split_text(text: str, max_len: int) -> Generator[str, None, None]:
597
+ """Split text at sentence boundaries for chunking."""
598
+ if len(text) <= max_len:
599
+ yield text
600
+ return
601
+
602
+ sentence_ends = re.compile(r"[。!?\n!?\.]")
603
+ current: List[str] = []
604
+ current_len = 0
605
+
606
+ for segment in sentence_ends.split(text):
607
+ seg = segment.strip()
608
+ if not seg:
609
+ continue
610
+ if current_len + len(seg) + 1 > max_len and current:
611
+ yield " ".join(current)
612
+ current = [seg]
613
+ current_len = len(seg)
614
+ else:
615
+ current.append(seg)
616
+ current_len += len(seg) + 1
617
+
618
+ if current:
619
+ yield " ".join(current)
620
+
621
+
622
+ def _dedupe_list(lst: List[str]) -> List[str]:
623
+ seen: set = set()
624
+ out: List[str] = []
625
+ for item in lst:
626
+ if item not in seen:
627
+ seen.add(item)
628
+ out.append(item)
629
+ return out
630
+
631
+
632
+ # ════════════════════════════════════════════════════════════════════════════
633
+ # MARKDOWN PARSER / SEGMENT EXTRACTOR
634
+ # ════════════════════════════════════════════════════════════════════════════
635
class MarkdownProcessor:
    """Protect non-translatable Markdown structure behind placeholder tokens.

    protect() swaps front matter, code, HTML tags and link targets for
    unique tokens; restore() swaps them back after translation.
    """

    # Placeholder template; idx makes each token unique within one pass.
    _TOKEN = "___MY_PROTECT_PH_{idx}___"

    def __init__(self) -> None:
        self._protected: Dict[int, str] = {}  # idx -> original text
        self._ph_counter = 0                  # next placeholder index

    def _next_placeholder(self, original: str) -> str:
        """Register `original` and return the unique token standing in for it."""
        idx = self._ph_counter
        token = self._TOKEN.format(idx=idx)
        self._protected[idx] = original
        self._ph_counter += 1
        return token

    def protect(self, text: str) -> str:
        """Replace code/links/tags with unique tokens."""
        self._protected.clear()
        self._ph_counter = 0

        # YAML front matter first (it may itself contain fences or links).
        text = _MD_FRONTMATTER.sub(lambda m: self._next_placeholder(m.group(0)), text)

        # Code fences: fully protect, unless the body contains Chinese, in
        # which case only the ``` delimiter lines are protected so the body
        # can still be translated.
        def _fence_sub(m: re.Match) -> str:
            full = m.group(0)
            if contains_chinese(full):
                lines = full.splitlines()
                if len(lines) >= 2:
                    p1 = self._next_placeholder(lines[0])
                    p2 = self._next_placeholder(lines[-1])
                    content = "\n".join(lines[1:-1])
                    return f"{p1}\n{content}\n{p2}"
            return self._next_placeholder(full)
        text = _MD_CODE_FENCE.sub(_fence_sub, text)

        # HTML tags, link targets, and inline code.
        text = _MD_HTML_TAG.sub(lambda m: self._next_placeholder(m.group(0)), text)
        text = _MD_LINK.sub(lambda m: f"{m.group(1)}({self._next_placeholder(m.group(2))})", text)
        text = _MD_INLINE_CODE.sub(lambda m: self._next_placeholder(m.group(0)), text)

        return text

    def restore(self, text: str) -> str:
        """Replace every placeholder token with its original text.

        Tokens are restored in descending index order so token 1 can never
        partially shadow token 10. Online translators sometimes mangle
        tokens (inserted spaces, case changes), so matching is
        case-insensitive and tolerates whitespace between token characters.

        BUG FIX: the previous version built its tolerant pattern with
        ``re.escape(token).replace(r"\_", ...)``, but re.escape has not
        escaped '_' since Python 3.7, so the replace never matched and
        mangled tokens were left behind in the output.
        """
        for idx in sorted(self._protected.keys(), reverse=True):
            token = self._TOKEN.format(idx=idx)
            original = self._protected[idx]
            # Allow arbitrary whitespace between any two token characters.
            pattern = re.compile(
                r"\s*".join(re.escape(ch) for ch in token),
                re.IGNORECASE,
            )
            # Callable replacement sidesteps re.sub's backslash/group-ref
            # escaping rules for `original`.
            text = pattern.sub(lambda _m, _o=original: _o, text)
        return text
689
+
690
+
691
class FileTranslator:
    """Orchestrates translation with 'Never Miss' strategy.

    Reads .txt/.md files, routes them through the appropriate translation
    pipeline, and writes (or prints) the result. Falls back to original
    text on any per-segment failure rather than aborting the file.
    """

    def __init__(self, config: Config) -> None:
        self.cfg = config
        self.engine = TranslationEngine(config)
        self._md_proc = MarkdownProcessor()

    def translate_file(
        self,
        input_path: Path,
        output_path: Optional[Path] = None,
        extract_only: bool = False,
        to_stdout: bool = False,
    ) -> Path:
        """Translate (or extract Chinese lines from) a single .txt/.md file.

        Returns the output path written, or the input path when printing
        to stdout. Raises FileNotFoundError / ValueError on bad input.
        """
        input_path = Path(input_path).resolve()
        if not input_path.exists(): raise FileNotFoundError(f"Missing: {input_path}")

        suffix = input_path.suffix.lower()
        if suffix not in (".txt", ".md"): raise ValueError("Unsupported type")

        # errors="replace" so undecodable bytes never abort the run.
        raw = input_path.read_text(encoding="utf-8", errors="replace")
        if extract_only:
            extracted = "\n".join([l for l in raw.splitlines() if contains_chinese(l)])
            # Both statements run only in stdout mode (print, then return).
            if to_stdout: print(extracted); return input_path
            out = output_path or _default_output(input_path, self.cfg)
            out.write_text(extracted, encoding="utf-8")
            return out

        res = self._translate_md(raw) if suffix == ".md" else self._translate_txt(raw)
        if to_stdout: print(res); return input_path
        out = output_path or _default_output(input_path, self.cfg)
        out.write_text(res, encoding="utf-8")
        return out

    def _translate_txt(self, text: str) -> str:
        """Line-by-line translation for plain text; non-Chinese lines pass through.

        In bilingual mode each translated line is preceded by the original.
        """
        lines = text.splitlines(keepends=True)
        bilingual = bool(self.cfg.get("bilingual", False))

        out_lines = []
        for line in lines:
            stripped = line.rstrip("\n\r")
            if contains_chinese(stripped):
                tr = self._translate_granular(stripped)
                # Preserve the original line's trailing newline (if any).
                eol = "\n" if line.endswith("\n") else ""
                out_lines.append(f"{stripped}\n{tr}{eol}" if bilingual else f"{tr}{eol}")
            else:
                out_lines.append(line)
        return "".join(out_lines)

    def _translate_md(self, text: str) -> str:
        """Global Surgical Batch Translation with fixed CJK regex.

        Pipeline: protect Markdown structure -> find unique CJK runs ->
        batch translate -> global string replacement -> restore structure.
        """
        # 1. Protect structure
        protected = self._md_proc.protect(text)

        # 2. Extract all CJK blocks (Inclusive range for stability)
        CJK_BLOCK_RE = re.compile(
            r"["
            r"\u4e00-\u9fff"          # Basic
            r"\u3400-\u4dbf"          # Ext A
            r"\U00020000-\U0002ceaf"  # Ext B-E
            r"\uf900-\ufaff"          # Compatibility
            r"\u3000-\u303f"          # Symbols/Punctuation
            r"\uff00-\uffef"          # Fullwidth
            r"\u00b7"                 # Middle dot
            r"\u2014-\u2027"          # Punctuation ranges
            r"]+"
        )
        # Filter out blocks that are ONLY numbers or symbols if they don't have AT LEAST ONE CJK
        def _has_real_cjk(s):
            return any('\u4e00' <= c <= '\u9fff' or '\u3400' <= c <= '\u4dbf' or ord(c) > 0xffff for c in s)

        all_candidate_blocks = CJK_BLOCK_RE.findall(protected)
        all_blocks = _dedupe_list([b for b in all_candidate_blocks if _has_real_cjk(b)])

        if not all_blocks:
            return self._md_proc.restore(protected)

        # 3. Batch translate unique blocks
        logger.info(f"Found {len(all_blocks)} unique Chinese blocks. Batch translating...")
        translated = self.engine.translate_batch(all_blocks, source_lang="simplified")

        # 4. Global replacement: build block -> translation mapping, retrying
        # individually any block the batch pass left untranslated.
        mapping = {}
        for orig, (tr, _) in zip(all_blocks, translated):
            if tr.strip() and tr.strip() != orig.strip():
                mapping[orig] = tr
            else:
                try:
                    t, _ = self.engine.translate(orig, source_lang="simplified")
                    mapping[orig] = t
                except:  # best-effort: keep original text on failure
                    mapping[orig] = orig

        # Replace longest blocks first so substrings don't clobber superstrings.
        sorted_orig = sorted(mapping.keys(), key=len, reverse=True)
        final_text = protected
        for orig in sorted_orig:
            final_text = final_text.replace(orig, mapping[orig])

        # 5. Restore
        return self._md_proc.restore(final_text)

    def _translate_granular(self, text: str) -> str:
        """Fallback for TXT or other sparse areas.

        Translates each CJK run within the line in place, leaving all
        surrounding ASCII/markup untouched.
        """
        CJK_BLOCK_RE = re.compile(
            r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002ceaf\u3000-\u303f\uff00-\uffef]+"
        )
        def _sub(m: re.Match) -> str:
            chunk = m.group(0)
            # Skip runs that are only CJK punctuation/fullwidth forms.
            if not any('\u4e00' <= c <= '\u9fff' for c in chunk): return chunk
            try:
                t, _ = self.engine.translate(chunk, source_lang="simplified")
                return t
            except:  # best-effort: keep original run on failure
                return chunk
        return CJK_BLOCK_RE.sub(_sub, text)

    @staticmethod
    def _extract_chinese_lines(text: str) -> List[str]:
        """Return only lines that contain Chinese text."""
        return [
            line for line in text.splitlines()
            if contains_chinese(line)
        ]

    def _detect_script_bulk(self, texts: List[str]) -> str:
        """Detect dominant script from a list of strings."""
        lang_mode = str(self.cfg.get("lang", "auto"))
        # Explicit config setting wins over detection.
        if lang_mode in ("simplified", "traditional"):
            return lang_mode
        combined = " ".join(texts[:50])  # sample first 50 segments
        return detect_script(combined)

    # ── Batch directory translation ───────────────────────────────────────

    def translate_directory(
        self,
        input_dir: Path,
        output_dir: Optional[Path] = None,
    ) -> List[Path]:
        """Translate all .txt and .md files in `input_dir`.

        Per-file failures are logged and skipped; returns the list of
        output paths that were successfully written.
        """
        input_dir = Path(input_dir).resolve()
        if not input_dir.is_dir():
            raise NotADirectoryError(f"Not a directory: {input_dir}")

        files = sorted(
            list(input_dir.glob("*.txt")) + list(input_dir.glob("*.md"))
        )
        if not files:
            logger.warning(f"No .txt or .md files found in {input_dir}")
            return []

        logger.info(f"Batch translating {len(files)} file(s) from {input_dir}")
        out_paths: List[Path] = []

        iterable = (
            tqdm(files, desc="Files", unit="file")
            if TQDM_AVAILABLE else files
        )
        for fpath in iterable:
            try:
                if output_dir:
                    out_file = Path(output_dir) / fpath.name
                    Path(output_dir).mkdir(parents=True, exist_ok=True)
                else:
                    out_file = _default_output(fpath, self.cfg)
                result = self.translate_file(fpath, output_path=out_file)
                out_paths.append(result)
                logger.info(f" Done: {fpath.name} -> {result.name}")
            except Exception as exc:
                logger.error(f" Failed: {fpath.name}: {exc}")

        return out_paths
863
+
864
+
865
+ # ════════════════════════════════════════════════════════════════════════════
866
+ # HISTORY MANAGER
867
+ # ════════════════════════════════════════════════════════════════════════════
868
class HistoryManager:
    """Log translation sessions to a persistent JSON file (HISTORY_FILE).

    Entries are kept newest-first and trimmed to cfg['max_history'].
    """

    def __init__(self, config: Config) -> None:
        self.cfg = config
        self._items: List[Dict[str, Any]] = []
        _HOME.mkdir(parents=True, exist_ok=True)
        self._load()

    def _load(self) -> None:
        """Load persisted history; fall back to empty on any error."""
        if HISTORY_FILE.exists():
            try:
                with open(HISTORY_FILE, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # Guard: a corrupt/hand-edited file may hold a non-list JSON
                # value, which would break insert()/pop() later.
                self._items = data if isinstance(data, list) else []
            except Exception:
                self._items = []

    def save(self) -> None:
        """Persist the in-memory history to HISTORY_FILE (best-effort)."""
        try:
            with open(HISTORY_FILE, "w", encoding="utf-8") as f:
                json.dump(self._items, f, ensure_ascii=False, indent=2)
        except Exception as exc:
            logger.error(f"History save error: {exc}")

    def add(
        self,
        input_file: str,
        output_file: str,
        backend: str,
        script: str,
        segments_count: int,
        elapsed_seconds: float,
    ) -> None:
        """Prepend one session entry, trim to max_history, and persist."""
        entry: Dict[str, Any] = {
            "timestamp" : datetime.now().isoformat(),
            "input_file" : input_file,
            "output_file" : output_file,
            "backend" : backend,
            "script" : script,
            "segments_count" : segments_count,
            "elapsed_seconds": round(elapsed_seconds, 2),
        }
        self._items.insert(0, entry)
        max_h = int(self.cfg.get("max_history", 1000))
        # One slice-delete instead of popping elements one at a time.
        if len(self._items) > max_h:
            del self._items[max_h:]
        self.save()

    def export(self, path: str) -> None:
        """Write the full history to an arbitrary JSON file."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self._items, f, ensure_ascii=False, indent=2)
        logger.info(f"History exported to {path}")

    def get_all(self) -> List[Dict[str, Any]]:
        """Return a shallow copy of all history entries (newest first)."""
        return list(self._items)
923
+
924
+
925
+ # ════════════════════════════════════════════════════════════════════════════
926
+ # PATH HELPERS
927
+ # ════════════════════════════════════════════════════════════════════════════
928
def _default_output(input_path: Path, config: Config) -> Path:
    """Derive default output path: input_translated.ext"""
    tag = str(config.get("output_suffix", "_translated"))
    # Rebuild the filename as <stem><tag><original extension>.
    new_name = f"{input_path.stem}{tag}{input_path.suffix}"
    return input_path.with_name(new_name)
932
+
933
+
934
+ # ════════════════════════════════════════════════════════════════════════════
935
+ # CLI ARG PARSER
936
+ # ════════════════════════════════════════════════════════════════════════════
937
def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser.

    Flags default to None/False so Config.apply_args() only overrides the
    persisted configuration when a flag was explicitly given on the
    command line.
    """
    parser = argparse.ArgumentParser(
        prog="chinese_file_translator",
        description=(
            f"{APP_NAME} v{APP_VERSION} by {APP_AUTHOR}\n"
            "Translate Chinese text inside .txt or .md files to English."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent("""
            Examples:
              python chinese_file_translator.py input.txt
              python chinese_file_translator.py input.md -o translated.md
              python chinese_file_translator.py input.txt --backend offline --gpu
              python chinese_file_translator.py input.txt --bilingual
              python chinese_file_translator.py input.txt --extract-only
              python chinese_file_translator.py --batch ./docs/ --batch-out ./out/
              python chinese_file_translator.py input.txt --stdout
        """),
    )
    parser.add_argument(
        "input",
        nargs="?",
        help="Input .txt or .md file path",
    )
    parser.add_argument(
        "-o", "--output",
        dest="output",
        metavar="FILE",
        help="Output file path (default: <input>_translated.<ext>)",
    )
    parser.add_argument(
        "--batch",
        metavar="DIR",
        help="Translate all .txt and .md files in a directory",
    )
    parser.add_argument(
        "--batch-out",
        dest="batch_out",
        metavar="DIR",
        help="Output directory for batch translation",
    )
    parser.add_argument(
        "--backend",
        choices=["google", "microsoft", "offline"],
        help="Translation backend (default: google)",
    )
    parser.add_argument(
        "--offline",
        action="store_true",
        help="Shorthand for --backend offline",
    )
    # BUG FIX: default was "auto", which is truthy, so Config.apply_args()
    # overwrote the persisted 'lang' setting on every run even when --lang
    # was not given. default=None lets the stored value (or the config
    # default "auto") win when the flag is absent.
    parser.add_argument(
        "--lang",
        choices=["auto", "simplified", "traditional"],
        default=None,
        help="Chinese script mode (default: auto)",
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Use GPU (CUDA) for offline translation",
    )
    # BUG FIX: help text previously claimed a 0.05 default; the actual
    # config default is 0.30.
    parser.add_argument(
        "--confidence",
        type=float,
        metavar="0.0-1.0",
        help="Chinese detection confidence threshold (default: 0.30 ratio)",
    )
    parser.add_argument(
        "--chunk-size",
        dest="chunk_size",
        type=int,
        metavar="N",
        help="Max characters per translation request (default: 4000)",
    )
    parser.add_argument(
        "--bilingual",
        action="store_true",
        help="Keep original Chinese alongside English translation",
    )
    parser.add_argument(
        "--extract-only",
        dest="extract_only",
        action="store_true",
        help="Only extract and save detected Chinese lines, no translation",
    )
    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Print translated output to stdout instead of writing a file",
    )
    parser.add_argument(
        "--export-history",
        dest="export_history",
        metavar="FILE",
        help="Export translation history to a JSON file",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"{APP_NAME} {APP_VERSION}",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable DEBUG-level logging",
    )
    return parser
1045
+
1046
+
1047
+ # ════════════════════════════════════════════════════════════════════════════
1048
+ # DEPENDENCY CHECK
1049
+ # ════════════════════════════════════════════════════════════════════════════
1050
def check_dependencies(args: argparse.Namespace) -> None:
    """Print a banner listing any missing optional dependencies.

    Informational only: nothing is raised; translation later falls back or
    fails per-backend.
    """
    issues: List[str] = []
    wants_offline = (
        getattr(args, "offline", False)
        or getattr(args, "backend", "") == "offline"
    )
    if not DEEP_TRANSLATOR_AVAILABLE:
        issues.append(
            "deep-translator -> pip install deep-translator"
        )
    if wants_offline and not OFFLINE_AVAILABLE:
        issues.append(
            "transformers / torch -> pip install transformers torch\n"
            " (CPU) pip install torch --index-url https://download.pytorch.org/whl/cpu\n"
            " (CUDA) pip install torch --index-url https://download.pytorch.org/whl/cu121"
        )
    if not issues:
        return
    bar = "=" * 55
    print("\n" + bar)
    print(f"[{APP_NAME}] Missing dependencies:")
    for i in issues:
        print(f" {i}")
    print(bar + "\n")
1069
+
1070
+
1071
+ # ════════════════════════════════════════════════════════════════════════════
1072
+ # MAIN
1073
+ # ════════════════════════════════════════════════════════════════════════════
1074
def main() -> None:
    """CLI entry point: parse args, then dispatch to export-history, batch,
    or single-file translation. Exits with status 1 on translation errors."""
    parser = _build_parser()
    args = parser.parse_args()

    setup_logging(verbose=getattr(args, "verbose", False))
    check_dependencies(args)

    cfg = Config()
    cfg.apply_args(args)  # CLI flags override persisted settings

    history = HistoryManager(cfg)
    translator = FileTranslator(cfg)

    # ── Export history shortcut ───────────────────────────────────────────
    # Exports, then exits early only if no other work was requested.
    if getattr(args, "export_history", None):
        history.export(args.export_history)
        if not args.input and not args.batch:
            return

    # ── Batch mode ────────────────────────────────────────────────────────
    if getattr(args, "batch", None):
        batch_dir = Path(args.batch)
        out_dir = Path(args.batch_out) if getattr(args, "batch_out", None) else None
        t0 = time.time()
        out_paths = translator.translate_directory(batch_dir, output_dir=out_dir)
        elapsed = time.time() - t0
        print(
            f"\nBatch complete: {len(out_paths)} file(s) translated "
            f"in {elapsed:.1f}s"
        )
        for p in out_paths:
            print(f" -> {p}")
        # One history entry summarizes the whole batch run.
        history.add(
            input_file=str(batch_dir),
            output_file=str(out_dir or batch_dir),
            backend=str(cfg.get("backend")),
            script=str(cfg.get("lang")),
            segments_count=len(out_paths),
            elapsed_seconds=elapsed,
        )
        return

    # ── Single file mode ──────────────────────────────────────────────────
    if not args.input:
        parser.print_help()
        sys.exit(0)

    input_path = Path(args.input)
    output_path = Path(args.output) if getattr(args, "output", None) else None

    t0 = time.time()
    try:
        out = translator.translate_file(
            input_path = input_path,
            output_path = output_path,
            extract_only = getattr(args, "extract_only", False),
            to_stdout = getattr(args, "stdout", False),
        )
    except (FileNotFoundError, ValueError, RuntimeError) as exc:
        logger.error(str(exc))
        sys.exit(1)

    elapsed = time.time() - t0

    # Summary banner is suppressed in --stdout mode so the translated text
    # stays clean for piping.
    if not getattr(args, "stdout", False):
        print(f"\n{APP_NAME} v{APP_VERSION}")
        print(f"Input : {input_path}")
        print(f"Output : {out}")
        print(f"Backend : {cfg.get('backend')}")
        print(f"Script : {cfg.get('lang')}")
        print(f"Elapsed : {elapsed:.2f}s")
        print(f"Config : {CONFIG_FILE}")
        print(f"Log : {LOG_FILE}")

    history.add(
        input_file = str(input_path),
        output_file = str(out),
        backend = str(cfg.get("backend")),
        script = str(cfg.get("lang")),
        segments_count = 0,
        elapsed_seconds = elapsed,
    )
1156
+
1157
+
1158
+ if __name__ == "__main__":
1159
+ main()
input.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ im using https://github.com/pwxcoo/chinese-xinhua/blob/master/README.md as reference to test my codebase
3
+ """
4
+
5
+ ```reference
6
+ #!/bin/bash
7
+
8
+ # chinese-xinhua
9
+
10
+ 中华新华字典数据库和 API 。收录包括 14032 条歇后语,16142 个汉字,264434 个词语,31648 个成语。
11
+
12
+ ## Project Structure
13
+
14
+ ```
15
+ chinese-xinhua/
16
+ |
17
+ +- data/ <-- 数据文件夹
18
+ | |
19
+ | +- idiom.json <-- 成语
20
+ | |
21
+ | +- word.json <-- 汉字
22
+ | |
23
+ | +- xiehouyu.json <-- 歇后语
24
+ | |
25
+ | +- ci.json <-- 词语
26
+ ```
27
+
28
+ ## Database Introduction
29
+
30
+ ### 成语 (idiom.json)
31
+
32
+ ```json
33
+ [
34
+ {
35
+ "derivation": "语出《法华经·法师功德品》下至阿鼻地狱。”",
36
+ "example": "但也有少数意志薄弱的……逐步上当,终至堕入~。★《上饶集中营·炼狱杂记》",
37
+ "explanation": "阿鼻梵语的译音,意译为无间”,即痛苦无有间断之意。常用来比喻黑暗的社会和严酷的牢狱。又比喻无法摆脱的极其痛苦的境地。",
38
+ "pinyin": "ā bí dì yù",
39
+ "word": "阿鼻地狱",
40
+ "abbreviation": "abdy"
41
+ },
42
+ ...
43
+ ]
44
+ ```
45
+
46
+ ### 词语 (ci.json)
47
+
48
+ ```json
49
+ [
50
+ {
51
+ "ci": "宸纶",
52
+ "explanation": "1.帝王的诏书﹑制令。"
53
+ },
54
+ ...
55
+ ]
56
+ ```
57
+
58
+ ### 汉字 (word.json)
59
+
60
+ ```json
61
+ [
62
+ {
63
+ "word": "嗄",
64
+ "oldword": "嗄",
65
+ "strokes": "13",
66
+ "pinyin": "á",
67
+ "radicals": "口",
68
+ "explanation": "嗄〈叹〉\n\n 同啊”。表示省悟或惊奇\n\n 嗄!难道这里是没有地方官的么?--宋·佚名《新编五代史平话》\n\n 嗄á叹词。在句首,〈表〉疑问或反问~,这是什么?~,你想干什么?\"嗄\"另见shà㈠。\n\n 嗄shà\n\n ⒈声音嘶哑~声。\n\n 嗄a 1.助词。表示强调﹑肯定或辩解。 2.助词。方言。表示疑问或反诘。\n\n 嗄xià 1.见\"嗄饭\"。 2.见\"嗄程\"。",
69
+ "more": "嗄 ga、a 部首 口 部首笔画 03 总笔画 13 嗄2\nshà\n〈形〉\n(1)\n声音嘶哑的 [hoarse]\n终日嚎而嗌不嗄。--《老子》\n(2)\n又如嗄哑,嗄嘶(嗓音嘶哑)\n嗄\nshà\n〈叹〉\n(1)\n什么 [what]--表示否定\n我要丢个干干净,看你嗄法把我治。--清·蒲松龄《聊斋俚曲集》\n(2)\n旧时仆役对主人、下级对上级的应诺声 [yes]\n带进来”。两边军士应一声嗄”,即将牛皋推至面前。--《说岳全传》\n另见á\n嗄1\ná\n〈叹〉\n同啊”(á)。表示省悟或惊奇 [ah]\n嗄!难道这里是没有地方官的么?--宋·佚名《新编五代史平话》\n另见shà\n嗄1\nshà ㄕㄚ╝\n嗓音嘶哑。\n郑码janr,u55c4,gbke0c4\n笔画数13,部首口,笔顺编号2511325111354\n嗄2\ná ㄚˊ\n同啊2”。\n郑码janr,u55c4,gbke0c4\n笔画数13,部首口,笔顺编号2511325111354"
70
+ },
71
+ ...
72
+ ]
73
+ ```
74
+
75
+ ### 歇后语 (xiehouyu.json)
76
+
77
+ ```json
78
+ [
79
+ {
80
+ "riddle": "飞机上聊天",
81
+ "answer": "高谈阔论"
82
+ },
83
+ ...
84
+ ]
85
+ ```
86
+
87
+ ## Changelog
88
+
89
+ <details><summary>查看更新日志 </summary>
90
+
91
+ - 20181216: 成语数据集去重
92
+ - 20181216: API 功能下线
93
+ - 20180803: 添加词语数据集
94
+ - 20180206: 添加成语,歇后语,汉字数据集
95
+
96
+ </details>
97
+
98
+
99
+ ## Copyright
100
+
101
+ 本仓库的所有的数据都是我从网上收集整理的。仓库本来的目的是因为我以前想做一个成语接龙的东西,但是苦于没有现成可用的数据库,自己就从各个网站抓取整理了一份。放在 Github 是为了方便自己的使用,同时也能方便有类似需求的人不用去做这些 trival 的工作。所有抓取数据的[脚本](./scripts/README.md)都在仓库里。
102
+
103
+ **本仓库无任何商业目的!如果有侵权行为将及时删除!**
104
+
105
+ ```
input_test_SUCCESS_DEFINITIVE.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ im using https://github.com/pwxcoo/chinese-xinhua/blob/master/README.md as reference to test my codebase
3
+ """
4
+
5
+ ```reference
6
+ #!/bin/bash
7
+
8
+ # chinese-xinhua
9
+
10
+ Chinese Xinhua Dictionary database and API. Includes 14,032 xiehouyu (two-part allegorical sayings), 16,142 Chinese characters, 264,434 words, and 31,648 idioms.
11
+
12
+ ## Project Structure
13
+
14
+ ```
15
+ chinese-xinhua/
16
+ |
17
+ +- data/ <-- data folder
18
+ | |
19
+ | +- idiom.json <-- idiom
20
+ | |
21
+ | +- word.json <-- Chinese character
22
+ | |
23
+ | +- xiehouyu.json <-- xiehouyu (allegorical sayings)
24
+ | |
25
+ | +- ci.json <-- words
26
+ ```
27
+
28
+ ## Database Introduction
29
+
30
+ ### idiom (idiom.json)
31
+
32
+ ```json
33
+ [
34
+ {
35
+ "derivation": "The words "Lotus Sutra: Master's Merit and Virtue" came down to Abi Hell. "",
36
+ "example": "But there are also a few who are weak-willed... and gradually fall into the trap.★"Shangrao Concentration Camp·Purgatory Miscellaneous Notes"",
37
+ "explanation": "The transliteration of Abi in Sanskrit means "without interruption", which means pain without interruption. It is often used to describe a dark society and a harsh prison. It also refers to an extremely painful situation that cannot be escaped.",
38
+ "pinyin": "ā bí dì yù",
39
+ "word": "abi hell",
40
+ "abbreviation": "abdy"
41
+ },
42
+ ...
43
+ ]
44
+ ```
45
+
46
+ ### words (ci.json)
47
+
48
+ ```json
49
+ [
50
+ {
51
+ "ci": "Chen Lun",
52
+ "explanation": "1.emperor's edict﹑Make orders."
53
+ },
54
+ ...
55
+ ]
56
+ ```
57
+
58
+ ### Chinese character (word.json)
59
+
60
+ ```json
61
+ [
62
+ {
63
+ "word": "嗄",
64
+ "oldword": "嗄",
65
+ "strokes": "13",
66
+ "pinyin": "á",
67
+ "radicals": "mouth",
68
+ "explanation": "sigh (sigh)\n\n "Same". It means enlightenment or surprise.\n\n 嗄!Are there no local officials here??--Song Dynasty Anonymous "Newly Compiled History of the Five Dynasties"\n\n 嗄áinterjection. At the beginning of the sentence, <expression> questions or rhetorical questions~, what is this? ~, what do you want to do?\"嗄\"See alsoshà㈠。\n\n 嗄shà\n\n ⒈Hoarse voice.\n\n 嗄a 1.particle. express emphasis﹑affirm or justify. 2.particle. dialect. Express a question or question.\n\n 嗄xià 1.See\"Eat rice\"。 2.See\"Cheng\"。",
69
+ "more": "嗄 ga、a radical mouth radical strokes 03 total strokes 13 嗄2\nshà\n<shape>\n(1)\nhoarse voice [hoarse]\nHowling all day long but not moaning.--"Laozi"\n(2)\nAnother example is muteness,Hiss(hoarse voice)\n嗄\nshà\n<sigh>\n(1)\nWhat [what]--express negation\nI want to throw it all away,Let me see how you treat me.--"Collection of Liaozhai Slang" by Pu Songling of the Qing Dynasty\n(2)\nIn the old days, servants made promises to their masters, and subordinates made promises to their superiors. [yes]\n"Bring them in." The sergeants on both sides responded with a cry.,Niu Gao was about to be pushed in front of him.--"The Complete Biography of Yue Yue"\nSee alsoá\n嗄1\ná\n<sigh>\n"Same"(á). express enlightenment or surprise [ah]\n嗄!Are there no local officials here??--Song Dynasty Anonymous "Newly Compiled History of the Five Dynasties"\nSee alsoshà\n嗄1\nshà ㄕㄚ╝\nHoarse voice.\nZheng Majanr,u55c4,gbke0c4\nNumber of strokes13, radical mouth, stroke order number2511325111354\n嗄2\ná ㄚˊ\nSame2”。\nZheng Majanr,u55c4,gbke0c4\nNumber of strokes13, radical mouth, stroke order number2511325111354"
70
+ },
71
+ ...
72
+ ]
73
+ ```
74
+
75
+ ### xiehouyu (xiehouyu.json)
76
+
77
+ ```json
78
+ [
79
+ {
80
+ "riddle": "Chatting on the plane",
81
+ "answer": "talk eloquently"
82
+ },
83
+ ...
84
+ ]
85
+ ```
86
+
87
+ ## Changelog
88
+
89
+ <details><summary>View changelog </summary>
90
+
91
+ - 20181216: Deduplication of idiom data sets
92
+ - 20181216: API functionality taken offline
93
+ - 20180803: Add word dataset
94
+ - 20180206: Add idiom, xiehouyu, and Chinese character datasets
95
+
96
+ </details>
97
+
98
+
99
+ ## Copyright
100
+
101
+ All the data in this repository was collected and compiled by me from the Internet. I originally created it because I wanted to build an idiom-chaining game but could not find a ready-made database, so I scraped and organized one from various websites. It is hosted on GitHub both for my own convenience and so that others with similar needs do not have to redo this trivial work. All the data-scraping [scripts](./scripts/README.md) are included in the repository.
102
+
103
+ **This repository has no commercial purpose! Any infringing content will be removed promptly!**
104
+
105
+ ```
requirements.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ════════════════════════════════════════════════════════════════════════
2
+ # ChineseFileTranslator v1.0.0 — requirements.txt
3
+ # Author: algorembrant
4
+ # ════════════════════════════════════════════════════════════════════════
5
+ # Install all core dependencies:
6
+ # pip install -r requirements.txt
7
+ #
8
+ # For offline translation backend (Helsinki-NLP MarianMT):
9
+ # CPU: pip install torch --index-url https://download.pytorch.org/whl/cpu
10
+ # CUDA: pip install torch --index-url https://download.pytorch.org/whl/cu121
11
+ # Then:
12
+ # pip install transformers sentencepiece sacremoses
13
+ #
14
+ # Platform note:
15
+ # On Linux/Mac, keyboard events require no extra steps.
16
+ # On Windows, run the terminal as Administrator if hotkeys fail.
17
+ # ════════════════════════════════════════════════════════════════════════
18
+
19
+ # ── Core translation backend ──────────────────────────────────────────────
20
+ deep-translator>=1.11.4
21
+
22
+ # ── Progress bar (optional but recommended) ───────────────────────────────
23
+ tqdm>=4.66.0
24
+
25
+ # ── Clipboard support (optional) ─────────────────────────────────────────
26
+ pyperclip>=1.8.2
27
+
28
+ # ── Offline translation backend (optional) ───────────────────────────────
29
+ # Uncomment the lines below OR follow the install note above for PyTorch.
30
+ # transformers>=4.40.0
31
+ # sentencepiece>=0.2.0
32
+ # sacremoses>=0.1.1
33
+ # torch>=2.2.0 # <-- install separately with correct CUDA URL