Spaces:

ogaith
/

webnovel-translator

Sleeping

App Files Files Community

webnovel-translator / app.py

ogaith

Update app.py

ade444b 5 months ago

raw

history blame contribute delete

5.84 kB

	import os
	import io
	import tempfile
	from typing import List
	import re
	import gradio as gr
	from huggingface_hub import snapshot_download
	import ctranslate2
	import sentencepiece as spm
	import hanlp

	# ---------------------------------------------------------------------------
	# Deployment settings: edit these to point the app at a different model.
	# ---------------------------------------------------------------------------
	# Hugging Face Hub repository that holds the CTranslate2 model.
	MODEL_REPO = "ogaith/zhen-ctranslate2"
	# SentencePiece model files; these names are joined onto the downloaded model directory.
	SRC_SPM = "source.spm"
	TGT_SPM = "target.spm"
	# Sample corpus shipped next to this app.py; it is opened by this relative path.
	EXAMPLE_FILE = "example_corpus.txt"

	# Fetch the model repository; every load below reads from the returned directory.
	MODEL_DIR = snapshot_download(MODEL_REPO)

	# The translator, both SentencePiece processors and the HanLP tokenizer are
	# created once here, at import time, and reused by the functions below.
	translator = ctranslate2.Translator(MODEL_DIR, device="auto")
	sp_src, sp_tgt = (
	    spm.SentencePieceProcessor(os.path.join(MODEL_DIR, spm_name))
	    for spm_name in (SRC_SPM, TGT_SPM)
	)

	# HanLP Chinese segmenter used by the source-side preprocessing; swap the
	# identifier to use a different HanLP pipeline.
	hanlp_tok = hanlp.load("PKU_NAME_MERGED_SIX_MONTHS_CONVSEG")

	def preprocess_source(text: str) -> str:
	    """Normalise one Chinese source line before SentencePiece encoding.

	    The line is trimmed, passed through the HanLP tokenizer and re-joined,
	    given a space at every boundary between a CJK ideograph (U+4E00-U+9FFF)
	    and a digit, and finally has each run of whitespace squeezed to a single
	    space. Blank input returns an empty string without calling HanLP.
	    """
	    stripped = text.strip()
	    if stripped == "":
	        return ""

	    # NOTE(review): the tokens are concatenated with an empty separator, so
	    # the segment boundaries found by HanLP do not appear in the result.
	    # Confirm this is intended.
	    rejoined = "".join(hanlp_tok(stripped))

	    # Separate ideographs from adjacent digits: ideograph->digit first, then
	    # digit->ideograph.
	    for boundary in (r'([\u4e00-\u9fff])(\d)', r'(\d)([\u4e00-\u9fff])'):
	        rejoined = re.sub(boundary, r'\1 \2', rejoined)

	    collapsed = re.sub(r'\s+', ' ', rejoined)
	    return collapsed.strip()

	def translate_lines(lines: List[str], beam_size: int, max_len: int, batch_size: int) -> List[str]:
	    """Translate a list of Chinese lines into English lines with CTranslate2.

	    Source side: ``preprocess_source`` followed by SentencePiece encoding.
	    Target side: SentencePiece decoding only (no HanLP).

	    Args:
	        lines: Source lines; one output line is produced per result returned
	            by the translator.
	        beam_size: Beam width, coerced to ``int``.
	        max_len: Maximum decoding length, coerced to ``int``.
	        batch_size: Number of lines sent to the translator per call, coerced
	            to ``int`` and raised to 1 if smaller.

	    Returns:
	        The decoded first hypothesis for every translated line, in input order.
	    """
	    # The numeric arguments come from UI controls and are not guaranteed to be
	    # ints. beam_size and max_len were already coerced; batch_size must be too,
	    # because range() and slicing reject floats and range() rejects a 0 step.
	    beam = int(beam_size)
	    max_decoding_length = int(max_len)
	    step = max(1, int(batch_size))

	    out_lines: List[str] = []
	    for start in range(0, len(lines), step):
	        chunk = lines[start:start + step]
	        src_tok = [sp_src.encode(preprocess_source(s), out_type=str) for s in chunk]
	        results = translator.translate_batch(
	            src_tok,
	            beam_size=beam,
	            max_decoding_length=max_decoding_length,
	        )
	        # Keep only the best (first) hypothesis of each result.
	        out_lines.extend(sp_tgt.decode(r.hypotheses[0]) for r in results)
	    return out_lines

	def to_temp_txt(content: str) -> str:
	    """Write ``content`` to a new temporary ``.txt`` file and return its path.

	    The file is UTF-8 encoded and created with ``delete=False`` so that it is
	    still on disk after this function returns and can be offered for download.

	    Args:
	        content: Text to store in the file.

	    Returns:
	        Filesystem path of the file that was written.
	    """
	    # The context manager closes the handle even if write() raises.
	    with tempfile.NamedTemporaryFile(
	        delete=False, suffix=".txt", mode="w", encoding="utf-8"
	    ) as tmp:
	        tmp.write(content)
	    return tmp.name

	def run_on_uploaded(file_bytes, beam_size, max_len, batch_size):
	    """Translate a user-uploaded UTF-8 text file, line by line.

	    Args:
	        file_bytes: Raw bytes of the uploaded file, or ``None`` when nothing
	            was uploaded.
	        beam_size: Passed through to ``translate_lines``.
	        max_len: Passed through to ``translate_lines``.
	        batch_size: Passed through to ``translate_lines``.

	    Returns:
	        A ``(download, preview, status)`` tuple: the path of a temporary
	        ``.txt`` file with all translations, the first 10 translated lines,
	        and a status message. When there is no upload or the bytes are not
	        valid UTF-8, the first two items clear their outputs and the status
	        explains the problem.
	    """
	    if file_bytes is None:
	        return None, gr.update(value=None), "Please upload a .txt file."
	    try:
	        # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading byte-order
	        # mark, which would otherwise become part of the first source line.
	        text = file_bytes.decode("utf-8-sig")
	    except UnicodeDecodeError:
	        # UTF-8 is expected; tell the user clearly when the file is something else.
	        return None, gr.update(value=None), "Encoding error. Make sure the file is UTF-8."

	    outs = translate_lines(text.splitlines(), beam_size, max_len, batch_size)
	    # End the download file with a newline unless it is empty.
	    out_txt = "\n".join(outs) + ("\n" if outs else "")
	    path = to_temp_txt(out_txt)

	    # Short preview: only the first 10 lines, to avoid clutter.
	    preview = "\n".join(outs[:10])
	    return path, preview, f"Translated {len(outs)} lines."

	def run_on_example(beam_size, max_len, batch_size):
	    """Translate the bundled example corpus named by ``EXAMPLE_FILE``.

	    Args:
	        beam_size: Passed through to ``translate_lines``.
	        max_len: Passed through to ``translate_lines``.
	        batch_size: Passed through to ``translate_lines``.

	    Returns:
	        A ``(download, preview, status)`` tuple: the path of a temporary
	        ``.txt`` file with all translations, the first 10 translated lines,
	        and a status message. When the example file is missing or is not
	        valid UTF-8, the first two items clear their outputs and the status
	        explains the problem.
	    """
	    try:
	        # Open directly instead of checking existence first, so there is no
	        # gap between the check and the read. "utf-8-sig" also drops a
	        # leading byte-order mark if the file has one.
	        with open(EXAMPLE_FILE, "r", encoding="utf-8-sig") as f:
	            lines = f.read().splitlines()
	    except FileNotFoundError:
	        return None, gr.update(value=None), f"File '{EXAMPLE_FILE}' not found in the repo."
	    except UnicodeDecodeError:
	        # Report a mis-encoded example file as a status message, as the upload
	        # handler does, rather than letting the exception propagate.
	        return None, gr.update(value=None), f"Encoding error. Make sure '{EXAMPLE_FILE}' is UTF-8."

	    outs = translate_lines(lines, beam_size, max_len, batch_size)
	    # End the download file with a newline unless it is empty.
	    out_txt = "\n".join(outs) + ("\n" if outs else "")
	    path = to_temp_txt(out_txt)
	    preview = "\n".join(outs[:10])
	    return path, preview, f"Translated {len(outs)} lines from '{EXAMPLE_FILE}'."

	# Gradio UI: shared decoding controls, an upload section and an example-file section.
	with gr.Blocks() as demo:
	    gr.Markdown("# 🇨🇳→🇬🇧 TXT Translation (CTranslate2 + HanLP + SentencePiece)")
	    gr.Markdown(
	        "Upload a UTF-8 `.txt` with one Chinese sentence per line and download the English `.txt` output.\n\n"
	        f"Or click to translate the bundled `{EXAMPLE_FILE}` in this repository."
	    )

	    # Decoding controls used by both buttons below.
	    with gr.Row():
	        beam = gr.Slider(1, 8, value=4, step=1, label="Beam size")
	        max_len = gr.Slider(16, 512, value=256, step=1, label="Max decoding length")
	        bs = gr.Slider(1, 128, value=32, step=1, label="Batch size")

	    gr.Markdown("### Translate an uploaded file")
	    # NOTE(review): row membership is reconstructed. Confirm that only the
	    # uploader and its button share this row and the outputs sit below it.
	    with gr.Row():
	        # type="binary" hands the handler the raw bytes of the upload.
	        inp = gr.File(label="Upload .txt (UTF-8)", file_count="single", type="binary")
	        btn_upload = gr.Button("Translate uploaded file")
	    out_file_upload = gr.File(label="Download translations (.txt)")
	    out_preview_upload = gr.Textbox(label="Preview (first 10 lines)", lines=10)
	    status_upload = gr.Markdown()

	    btn_upload.click(
	        run_on_uploaded,
	        [inp, beam, max_len, bs],
	        [out_file_upload, out_preview_upload, status_upload],
	    )

	    gr.Markdown("---")
	    gr.Markdown(f"### Translate the repository example file (`{EXAMPLE_FILE}`)")
	    # Build the label from EXAMPLE_FILE so it cannot drift from the configured
	    # file name, like the heading above.
	    btn_example = gr.Button(f"Translate {EXAMPLE_FILE}")
	    out_file_example = gr.File(label="Download example translations (.txt)")
	    out_preview_example = gr.Textbox(label="Example preview (first 10)", lines=10)
	    status_example = gr.Markdown()

	    btn_example.click(
	        run_on_example,
	        [beam, max_len, bs],
	        [out_file_example, out_preview_example, status_example],
	    )

	# Launch explicitly so the app also starts when this file is run directly
	# for local use.
	demo.launch()