"""Gradio app: translate Chinese .txt files to English with a CTranslate2 model.

Pipeline per line: HanLP-based preprocessing -> SentencePiece encode ->
CTranslate2 beam search -> SentencePiece decode.
"""
import os
import io
import tempfile
from typing import List
import re

import gradio as gr
from huggingface_hub import snapshot_download
import ctranslate2
import sentencepiece as spm
import hanlp

# ====== CONFIGURE HERE ======
# Public Hugging Face repo of your CTranslate2 model
MODEL_REPO = "ogaith/zhen-ctranslate2"

# Paths (inside the model repo) to the SentencePiece models
SRC_SPM = "source.spm"
TGT_SPM = "target.spm"

# Local example file that lives in the SAME repo as this app.py
EXAMPLE_FILE = "example_corpus.txt"
# ============================

# Download the model once into the Space cache (or local cache when running locally)
MODEL_DIR = snapshot_download(MODEL_REPO)

# Load CT2 translator + SentencePiece + HanLP once at startup
translator = ctranslate2.Translator(MODEL_DIR, device="auto")
sp_src = spm.SentencePieceProcessor(os.path.join(MODEL_DIR, SRC_SPM))
sp_tgt = spm.SentencePieceProcessor(os.path.join(MODEL_DIR, TGT_SPM))

# HanLP: Chinese segmenter (adjust to your preferred HanLP pipeline if needed)
hanlp_tok = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')

# Number of translated lines shown in the UI preview box.
_PREVIEW_LINES = 10


def preprocess_source(text: str) -> str:
    """Normalize one source line before SentencePiece encoding.

    Args:
        text: Raw source line.

    Returns:
        The stripped line with a single space inserted between a CJK
        ideograph (U+4E00-U+9FFF) and an adjacent digit, and every run of
        whitespace collapsed to one space. Returns "" for blank input.
    """
    text = text.strip()
    if not text:
        return ""

    tokens = hanlp_tok(text)
    # NOTE(review): tokens are re-joined with no separator, so the word
    # boundaries found by HanLP are not kept in the output. Left as-is because
    # it may mirror how the training data was prepared - confirm.
    text = "".join(tokens)

    # Separate ideographs from digits in both directions.
    text = re.sub(r'([\u4e00-\u9fff])(\d)', r'\1 \2', text)
    text = re.sub(r'(\d)([\u4e00-\u9fff])', r'\1 \2', text)
    return re.sub(r'\s+', ' ', text).strip()


def translate_lines(lines: List[str], beam_size: int, max_len: int, batch_size: int) -> List[str]:
    """Translate a list of CN lines -> EN lines using CT2.

    Source side: HanLP preprocessing + SentencePiece encode.
    Target side: SentencePiece decode ONLY (no HanLP).

    Args:
        lines: Source lines, one sentence per entry.
        beam_size: Beam width passed to ``translate_batch`` (coerced to int).
        max_len: Maximum decoding length (coerced to int).
        batch_size: Number of lines sent per ``translate_batch`` call
            (coerced to int, minimum 1).

    Returns:
        One decoded best hypothesis per input line, in input order.
    """
    # beam_size and max_len were already coerced with int(); batch_size gets
    # the same treatment because range() and slicing reject floats, and a
    # step of 0 would raise ValueError.
    step = max(1, int(batch_size))
    beam = int(beam_size)
    limit = int(max_len)

    out_lines: List[str] = []
    for start in range(0, len(lines), step):
        chunk = lines[start:start + step]
        src_tok = [sp_src.encode(preprocess_source(s), out_type=str) for s in chunk]
        results = translator.translate_batch(
            src_tok,
            beam_size=beam,
            max_decoding_length=limit,
        )
        # Keep only the top-ranked hypothesis of each result.
        out_lines.extend(sp_tgt.decode(r.hypotheses[0]) for r in results)
    return out_lines


def to_temp_txt(content: str) -> str:
    """Write content to a temporary .txt file and return its path for download.

    The file is created with ``delete=False`` so it outlives this call; the
    context manager guarantees the handle is closed even if the write fails.
    """
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write(content)
        return tmp.name


def _package_outputs(outs: List[str], status: str):
    """Build the (download path, preview text, status message) triple for the UI."""
    out_txt = "\n".join(outs) + ("\n" if outs else "")
    path = to_temp_txt(out_txt)
    # Short preview: first lines only, to avoid clutter
    preview = "\n".join(outs[:_PREVIEW_LINES])
    return path, preview, status


def run_on_uploaded(file_bytes, beam_size, max_len, batch_size):
    """Handle user-uploaded .txt (UTF-8). Returns: downloadable file, preview, status msg.

    Args:
        file_bytes: Raw bytes of the uploaded file, or None when nothing was uploaded.
        beam_size: Forwarded to ``translate_lines``.
        max_len: Forwarded to ``translate_lines``.
        batch_size: Forwarded to ``translate_lines``.

    Returns:
        A 3-tuple ``(path, preview, status)``. On a missing upload or a
        decoding failure, ``path`` is None and the preview is cleared.
    """
    if file_bytes is None:
        return None, gr.update(value=None), "Please upload a .txt file."

    try:
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise be glued to the first sentence as U+FEFF.
        text = file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # We expect UTF-8; if not, inform the user clearly.
        return None, gr.update(value=None), "Encoding error. Make sure the file is UTF-8."

    outs = translate_lines(text.splitlines(), beam_size, max_len, batch_size)
    return _package_outputs(outs, f"Translated {len(outs)} lines.")


def run_on_example(beam_size, max_len, batch_size):
    """Translate the local example_corpus.txt that ships with this repo.

    Returns:
        A 3-tuple ``(path, preview, status)``; ``path`` is None and the
        preview is cleared when ``EXAMPLE_FILE`` does not exist.
    """
    try:
        # Open directly instead of checking existence first: avoids the
        # check-then-open race. "utf-8-sig" tolerates a leading BOM.
        with open(EXAMPLE_FILE, "r", encoding="utf-8-sig") as f:
            lines = f.read().splitlines()
    except FileNotFoundError:
        return None, gr.update(value=None), f"File '{EXAMPLE_FILE}' not found in the repo."

    outs = translate_lines(lines, beam_size, max_len, batch_size)
    return _package_outputs(outs, f"Translated {len(outs)} lines from '{EXAMPLE_FILE}'.")
# UI definition. Three shared sliders feed both actions: translating an
# uploaded file and translating the example file bundled with the repo.
# NOTE(review): the nesting of widgets inside each gr.Row() was reconstructed
# from a whitespace-collapsed source - confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🇨🇳→🇬🇧 TXT Translation (CTranslate2 + HanLP + SentencePiece)")
    gr.Markdown(
        "Upload a UTF-8 `.txt` with **one Chinese sentence per line** and download the English `.txt` output.\n\n"
        f"Or click to translate the bundled **`{EXAMPLE_FILE}`** in this repository."
    )

    # Decoding controls shared by both buttons below.
    with gr.Row():
        beam = gr.Slider(1, 8, value=4, step=1, label="Beam size")
        max_len = gr.Slider(16, 512, value=256, step=1, label="Max decoding length")
        bs = gr.Slider(1, 128, value=32, step=1, label="Batch size")

    gr.Markdown("### Translate an uploaded file")
    with gr.Row():
        # type="binary" hands the handler the raw file bytes.
        inp = gr.File(label="Upload .txt (UTF-8)", file_count="single", type="binary")
        btn_upload = gr.Button("Translate uploaded file")
    out_file_upload = gr.File(label="Download translations (.txt)")
    out_preview_upload = gr.Textbox(label="Preview (first 10 lines)", lines=10)
    status_upload = gr.Markdown()

    btn_upload.click(
        run_on_uploaded,
        [inp, beam, max_len, bs],
        [out_file_upload, out_preview_upload, status_upload],
    )

    gr.Markdown("---")
    gr.Markdown(f"### Translate the repository example file (`{EXAMPLE_FILE}`)")
    # Label is derived from EXAMPLE_FILE so it cannot drift from the file
    # that run_on_example actually reads.
    btn_example = gr.Button(f"Translate {EXAMPLE_FILE}")
    out_file_example = gr.File(label="Download example translations (.txt)")
    out_preview_example = gr.Textbox(label="Example preview (first 10)", lines=10)
    status_example = gr.Markdown()

    btn_example.click(
        run_on_example,
        [beam, max_len, bs],
        [out_file_example, out_preview_example, status_example],
    )

# Important: Spaces call demo.launch() automatically; keeping it explicit also allows local runs.
demo.launch()