# Hugging Face Spaces page header (scraped): 5AILingouCore / LingouCore — Running — File size: 21,293 Bytes

# app.py — Hugging Face Spaces (Gradio) all-in-one translation app
# - Single translate + history table
# - Batch translate (TXT/CSV) + download result CSV
# - Glossary CSV (src,tgt)
# - Model selector (m2m100 / opus-mt / nllb)
# - Safe limits for free CPU Spaces

import os
import io
import csv
import time
import json
import tempfile
from itertools import islice
from typing import Dict, Optional, List, Tuple, Any

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Default TOKENIZERS_PARALLELISM to "false" unless the environment already defines it.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Run on CUDA when torch reports it as available, otherwise on CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------
# Model registry
# -------------------------
# NOTE:
# - "opus-mt" is fast on CPU (recommended for free tier speed)
# - "m2m100-418M" matches your current project
# - "nllb-600M" can be heavier (quality often good, CPU slower)
# Registry of selectable models. The keys are the labels shown in the model
# dropdown; each value describes how to load and drive that model:
#   - "kind": short family tag (not read by the code visible in this file).
#   - "name": translation direction ("ja-en" / "en-ja") -> checkpoint id that
#     is handed to `from_pretrained`.
#   - "lang": language ("ja" / "en") -> language code used for the tokenizer's
#     `src_lang` and for the forced BOS token lookup; None when unused.
#   - "needs_forced_bos": when true, generation is given a
#     `forced_bos_token_id` for the target language.
#   - "supports_src_lang": when true, `tokenizer.src_lang` is set before
#     tokenizing.
# NOTE(review): confirm that "Helsinki-NLP/opus-mt-en-ja" is an existing
# repository id on the Hub — TODO verify.
MODEL_SPECS: Dict[str, Dict[str, Any]] = {
    "m2m100-418M (multilingual, your current)": {
        "kind": "m2m100",
        "name": {"ja-en": "facebook/m2m100_418M", "en-ja": "facebook/m2m100_418M"},
        "lang": {"ja": "ja", "en": "en"},
        "needs_forced_bos": True,
        "supports_src_lang": True,
    },
    "opus-mt (fast CPU, ja<->en)": {
        "kind": "opus",
        "name": {"ja-en": "Helsinki-NLP/opus-mt-ja-en", "en-ja": "Helsinki-NLP/opus-mt-en-ja"},
        "lang": {"ja": None, "en": None},
        "needs_forced_bos": False,
        "supports_src_lang": False,
    },
    "nllb-600M (quality, heavier)": {
        "kind": "nllb",
        "name": {"ja-en": "facebook/nllb-200-distilled-600M", "en-ja": "facebook/nllb-200-distilled-600M"},
        "lang": {"ja": "jpn_Jpan", "en": "eng_Latn"},
        "needs_forced_bos": True,
        "supports_src_lang": True,
    },
}

# Process-wide caches keyed by (model_key, direction). TOK_CACHE holds the
# tokenizer and MDL_CACHE the matching model; `_load_model` fills both
# together and reads both on a cache hit.
TOK_CACHE: Dict[Tuple[str, str], Any] = {}
MDL_CACHE: Dict[Tuple[str, str], Any] = {}

# -------------------------
# Safety limits (public space)
# -------------------------
MAX_SINGLE_CHARS = 4000        # single input max chars (checked in do_translate)
MAX_BATCH_LINES = 200          # batch line cap (applied in _read_batch_lines)
MAX_BATCH_CHARS_TOTAL = 20000  # batch total chars cap (applied in _read_batch_lines)
# Fallback for max_new_tokens when the UI value cannot be parsed; also the
# initial value of the max_new_tokens slider.
DEFAULT_MAX_NEW_TOKENS = 256

# -------------------------
# Helpers
# -------------------------
def detect_direction_by_text(text: str, prefer: str = "ja-en") -> str:
    """Guess the translation direction from the characters in *text*.

    The text is treated as Japanese as soon as it contains a hiragana or
    katakana character (U+3040-U+30FF) or a CJK unified ideograph
    (U+4E00-U+9FFF); otherwise it is treated as English.

    Args:
        text: Source text to inspect.
        prefer: Accepted for backward compatibility only; it does not affect
            the result. Previously ``prefer="en-ja"`` flipped the fallback so
            that text without any Japanese characters was reported as
            "ja-en", i.e. English input was sent to the Japanese->English
            model.

    Returns:
        "ja-en" when Japanese script is present, otherwise "en-ja".
    """
    has_japanese = any(
        "\u3040" <= ch <= "\u30ff" or "\u4e00" <= ch <= "\u9fff" for ch in text
    )
    return "ja-en" if has_japanese else "en-ja"


def read_glossary_csv(path: Optional[str]) -> Optional[List[List[str]]]:
    """Read a two-column glossary CSV (src,tgt) with no header row.

    Args:
        path: Path of the CSV file, or a falsy value when no glossary was
            supplied.

    Returns:
        A list of ``[src, tgt]`` pairs with surrounding whitespace stripped,
        or None when *path* is falsy or the file yields no usable pair. Rows
        with fewer than two columns or an empty source term are skipped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if not path:
        return None

    pairs: List[List[str]] = []
    # "utf-8-sig" drops a leading BOM (written by Excel and similar tools) that
    # would otherwise stick to the first source term and stop it from ever
    # matching; newline="" lets the csv module handle line endings itself.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        for record in csv.reader(f):
            if len(record) < 2:
                continue
            src = record[0].strip()
            tgt = record[1].strip()
            if src:
                pairs.append([src, tgt])
    return pairs or None


def apply_glossary(text: str, glossary: Optional[List[List[str]]]) -> str:
    """Apply every glossary pair to *text* as a plain substring replacement.

    Pairs are applied one after another in glossary order, each on the result
    of the previous one. Pairs with an empty source term are ignored. A falsy
    glossary returns *text* unchanged.
    """
    if not glossary:
        return text

    result = text
    for pair in glossary:
        term, replacement = pair
        if not term:
            continue
        result = result.replace(term, replacement)
    return result


def gen_kwargs_for_mode(conversation_mode: bool, base_beams: int) -> dict:
    """Build the decoding options for the requested mode.

    - Normal mode: no sampling, beam search with ``base_beams`` beams.
    - Conversation mode: sampling enabled, with the beam count limited to the
      range 1..2 to keep it small.
    """
    beams = int(base_beams)

    if not conversation_mode:
        return {
            "do_sample": False,
            "num_beams": beams,
            "repetition_penalty": 1.05,
        }

    # Keep the beam count within 1..2 for conversation mode.
    if beams > 2:
        beams = 2
    if beams < 1:
        beams = 1

    return {
        "do_sample": True,
        "temperature": 0.75,
        "top_p": 0.85,
        "top_k": 40,
        "num_beams": beams,
        "repetition_penalty": 1.08,
    }


def _get_forced_bos_id(tokenizer, lang: str) -> Optional[int]:
    # M2M100: get_lang_id
    if hasattr(tokenizer, "get_lang_id"):
        try:
            return tokenizer.get_lang_id(lang)
        except Exception:
            pass
    # NLLB: lang_code_to_id
    if hasattr(tokenizer, "lang_code_to_id") and isinstance(getattr(tokenizer, "lang_code_to_id"), dict):
        if lang in tokenizer.lang_code_to_id:
            return tokenizer.lang_code_to_id[lang]
    # Fallback: token id
    try:
        return tokenizer.convert_tokens_to_ids(lang)
    except Exception:
        return None


def _load_model(model_key: str, direction: str):
    """Return the ``(tokenizer, model)`` pair for a model/direction, loading it lazily.

    Args:
        model_key: Key into MODEL_SPECS.
        direction: "ja-en" or "en-ja"; selects the checkpoint name from the
            spec's "name" mapping.

    Returns:
        The cached or freshly loaded ``(tokenizer, model)`` tuple. The model
        is moved to DEVICE and put in eval mode.

    Raises:
        KeyError: If *model_key* or *direction* is not in MODEL_SPECS.
        Exception: Whatever ``from_pretrained`` raises when loading fails;
            nothing is cached in that case.
    """
    cache_key = (model_key, direction)
    if cache_key in TOK_CACHE and cache_key in MDL_CACHE:
        return TOK_CACHE[cache_key], MDL_CACHE[cache_key]

    spec = MODEL_SPECS[model_key]
    model_name = spec["name"][direction]

    tok = AutoTokenizer.from_pretrained(model_name)

    # Half precision only on GPU; CPU stays in float32.
    dtype = torch.float16 if DEVICE.type == "cuda" else torch.float32
    mdl = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    )
    mdl.to(DEVICE).eval()

    # Some specs use one multilingual checkpoint for both directions. Register
    # the loaded pair under every direction that maps to this checkpoint so the
    # same weights are not loaded (and held in memory) a second time when the
    # other direction is requested.
    for other_direction, other_name in spec["name"].items():
        if other_name == model_name:
            TOK_CACHE[(model_key, other_direction)] = tok
            MDL_CACHE[(model_key, other_direction)] = mdl

    return tok, mdl


@torch.inference_mode()
def translate_one(
    model_key: str,
    direction: str,
    text: str,
    max_new_tokens: int,
    num_beams: int,
    conversation: bool,
) -> str:
    """Translate a single piece of text and return the decoded output.

    The tokenizer/model pair comes from `_load_model`. When the model spec
    asks for it, the tokenizer's source language is set and the target
    language is forced as the first generated token. Input is truncated to
    512 tokens.
    """
    tok, mdl = _load_model(model_key, direction)
    spec = MODEL_SPECS[model_key]

    # Pick the language codes for the two sides of the requested direction.
    src_key, tgt_key = ("ja", "en") if direction == "ja-en" else ("en", "ja")
    lang_codes = spec["lang"]
    src_lang = lang_codes[src_key]
    tgt_lang = lang_codes[tgt_key]

    if spec.get("supports_src_lang") and hasattr(tok, "src_lang") and src_lang:
        tok.src_lang = src_lang

    encoded = tok(text, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)

    mode_options = gen_kwargs_for_mode(bool(conversation), int(num_beams))

    generate_kwargs = {
        **encoded,
        "max_new_tokens": int(max_new_tokens),
        "no_repeat_ngram_size": 3,
        "length_penalty": 1.05,
        **mode_options,
    }

    # Multilingual models need the target language forced at BOS.
    if spec.get("needs_forced_bos") and tgt_lang:
        forced_id = _get_forced_bos_id(tok, tgt_lang)
        if forced_id is not None:
            generate_kwargs["forced_bos_token_id"] = forced_id

    generated = mdl.generate(**generate_kwargs)
    decoded = tok.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]


def _clamp_int(v: Any, lo: int, hi: int, default: int) -> int:
    try:
        x = int(v)
        return max(lo, min(hi, x))
    except Exception:
        return default


def _history_to_table(history: List[Dict[str, str]]) -> List[List[str]]:
    # headers: time, direction, src, dst
    rows = []
    for item in history[-100:][::-1]:  # show latest first, cap 100 rows
        rows.append([item["time"], item["direction"], item["src"], item["dst"]])
    return rows


def _export_history(history: List[Dict[str, str]], fmt: str) -> str:
    tmpdir = tempfile.mkdtemp(prefix="history_")
    if fmt == "csv":
        path = os.path.join(tmpdir, "history.csv")
        with open(path, "w", newline="", encoding="utf-8-sig") as f:
            w = csv.writer(f)
            w.writerow(["time", "direction", "src", "dst"])
            for item in history:
                w.writerow([item["time"], item["direction"], item["src"], item["dst"]])
        return path
    else:
        path = os.path.join(tmpdir, "history.txt")
        with open(path, "w", encoding="utf-8") as f:
            for i, item in enumerate(history, 1):
                f.write(f"[{i}] {item['time']} | {item['direction']}\n")
                f.write(f"SRC: {item['src']}\n")
                f.write(f"DST: {item['dst']}\n")
                f.write("\n")
        return path


def _read_batch_lines(file_path: str) -> List[str]:
    """Read the source items of a batch file, within the public-space limits.

    Accepted input:
      - ``.csv`` (case-insensitive suffix): the first column of every row is
        one item. Leading cells that look like a header ("src", "source",
        "text", "input") are skipped.
      - anything else: treated as plain text, one line = one item.

    Items are stripped and empty ones are dropped. Reading stops once
    MAX_BATCH_LINES items have been kept, or as soon as the next item would
    push the running character total above MAX_BATCH_CHARS_TOTAL. Both file
    kinds count kept items, so blank lines do not use up the line budget.

    Args:
        file_path: Path of the uploaded file (UTF-8, with or without BOM).

    Returns:
        The kept items in file order; possibly empty.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        csv.Error: If the CSV is malformed.
    """
    is_csv = (file_path or "").lower().endswith(".csv")
    header_names = ("src", "source", "text", "input")

    lines: List[str] = []
    total_chars = 0

    # "utf-8-sig" drops a leading BOM, which would otherwise defeat the header
    # check and end up inside the first item. The csv module wants newline="".
    with open(
        file_path, "r", encoding="utf-8-sig", newline="" if is_csv else None
    ) as f:
        if is_csv:
            candidates = (row[0] for row in csv.reader(f) if row)
        else:
            candidates = f

        for raw in candidates:
            val = raw.strip()
            if not val:
                continue
            # Naive header skip: only before the first kept item.
            if is_csv and not lines and val.lower() in header_names:
                continue
            # Total-chars guard: stop at the first item that would not fit.
            if total_chars + len(val) > MAX_BATCH_CHARS_TOTAL:
                break
            lines.append(val)
            total_chars += len(val)
            if len(lines) >= MAX_BATCH_LINES:
                break

    return lines


# -------------------------
# Gradio handlers
# -------------------------
def warmup(model_key: str) -> str:
    """Load the ja-en model for *model_key* and report the outcome.

    Returns a status message with the elapsed load time on success, or the
    error text when loading raises.
    """
    started = time.time()
    try:
        _load_model(model_key, "ja-en")
    except Exception as exc:
        return f"❌ Warmup failed: {exc}"
    used = time.time() - started
    return f"✅ Warmup OK ({used:.2f}s) — model: {model_key}"


def do_translate(
    text: str,
    model_key: str,
    dir_choice: str,
    auto_on: bool,
    conversation_on: bool,
    glossary_path: Optional[str],
    max_new_tokens: int,
    num_beams: int,
    history: List[Dict[str, str]],
):
    """Translate the input box content and update the history outputs.

    Args:
        text: Text to translate; surrounding whitespace is ignored.
        model_key: Key into MODEL_SPECS.
        dir_choice: Direction selected in the UI ("ja-en" or "en-ja").
        auto_on: When true, the direction is detected from the text instead.
        conversation_on: Enables the conversation decoding mode.
        glossary_path: Optional glossary CSV applied to the source text
            before translation.
        max_new_tokens: Requested generation length, clamped to 16..512.
        num_beams: Requested beam count, clamped to 1..6.
        history: Previous translation records (may be None or empty).

    Returns:
        A 6-tuple for the outputs (translated text, status message, history
        state, history table rows, CSV button update, TXT button update).
        The two history download buttons are visible exactly when the
        history is non-empty, on every path, so a rejected or failed request
        never hides the export of earlier results.
    """
    history = history or []

    def reply(output: str, message: str, records: List[Dict[str, str]]):
        # One place decides the button visibility so all paths agree.
        has_history = bool(records)
        return (
            output,
            message,
            records,
            _history_to_table(records),
            gr.update(visible=has_history),
            gr.update(visible=has_history),
        )

    text = (text or "").strip()
    if not text:
        return reply("", "⚠️ テキストを入力してください。", history)

    if len(text) > MAX_SINGLE_CHARS:
        return reply("", f"⚠️ 入力が長すぎます（最大 {MAX_SINGLE_CHARS} 文字）。", history)

    direction = detect_direction_by_text(text, prefer=dir_choice) if auto_on else dir_choice

    max_new_tokens = _clamp_int(max_new_tokens, 16, 512, DEFAULT_MAX_NEW_TOKENS)
    num_beams = _clamp_int(num_beams, 1, 6, 4)

    t0 = time.time()
    try:
        # Reading the glossary is inside the try so that an unreadable or
        # badly encoded file is reported in the status line instead of
        # surfacing as an unhandled error.
        glossary = read_glossary_csv(glossary_path)
        src_processed = apply_glossary(text, glossary)
        out = translate_one(
            model_key=model_key,
            direction=direction,
            text=src_processed,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            conversation=bool(conversation_on),
        )
    except Exception as e:
        return reply("", f"❌ 翻訳に失敗しました: {e}", history)

    used = time.time() - t0

    item = {
        "time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "direction": direction,
        # The history keeps the text as typed, not the glossary-processed one.
        "src": text,
        "dst": out,
    }
    history = history + [item]

    info = f"✅ 完了：{used:.2f}s｜model: **{model_key}**｜方向：**{direction}**｜chars: {len(text)}"
    return reply(out, info, history)


def clear_all(history):
    """Reset the input, history and batch outputs.

    Returns a 10-tuple in this order: input text, status message, history
    state, history table rows, the two history download buttons (hidden), the
    two batch download buttons (hidden, value cleared), batch status, batch
    preview. The incoming *history* is discarded.
    """
    cleared_history: list = []
    # dl_hist_csv, dl_hist_txt
    history_buttons = [gr.update(visible=False) for _ in range(2)]
    # dl_batch_csv, dl_batch_txt
    batch_buttons = [gr.update(visible=False, value=None) for _ in range(2)]
    return (
        "",
        "🧹 クリアしました。",
        cleared_history,
        [],
        *history_buttons,
        *batch_buttons,
        "",  # batch_status
        "",  # batch_preview
    )


def export_history_csv(history: List[Dict[str, str]]):
    """Export the history as CSV and return the file path, or None when empty."""
    return _export_history(history, "csv") if history else None


def export_history_txt(history: List[Dict[str, str]]):
    """Export the history as plain text and return the file path, or None when empty."""
    return _export_history(history, "txt") if history else None


def do_batch(
    batch_file_path: Optional[str],
    model_key: str,
    conversation_on: bool,
    glossary_path: Optional[str],
    max_new_tokens: int,
    num_beams: int,
):
    """Translate every item of a batch file, streaming progress to the UI.

    This is a generator: each yielded 4-tuple is (status message, preview
    markdown, CSV download button update, TXT download button update).
    Both download buttons are hidden with their value cleared on every yield
    except the final one, so a failed or empty run never leaves a button from
    an earlier run on screen.

    Args:
        batch_file_path: Uploaded TXT/CSV file, or a falsy value when none
            was selected.
        model_key: Key into MODEL_SPECS.
        conversation_on: Enables the conversation decoding mode.
        glossary_path: Optional glossary CSV applied to each item before
            translation.
        max_new_tokens: Requested generation length, clamped to 16..512.
        num_beams: Requested beam count, clamped to 1..6.

    Yields:
        Progress updates while translating, then one final tuple carrying the
        preview of the first 50 items and the paths of the result files.
        The direction is detected per item; an item whose translation raises
        is recorded as "[ERROR] ..." and the batch continues.
    """

    def hidden():
        # A fresh update object per output component.
        return gr.update(visible=False, value=None)

    if not batch_file_path:
        yield "⚠️ バッチファイル（TXT/CSV）を選択してください。", "", hidden(), hidden()
        return

    # An unreadable or non-UTF-8 upload is reported in the status line rather
    # than raised out of the handler.
    try:
        lines = _read_batch_lines(batch_file_path)
    except (OSError, ValueError, csv.Error) as e:
        yield f"❌ バッチファイルの読み込みに失敗しました: {e}", "", hidden(), hidden()
        return

    total = len(lines)
    if total == 0:
        yield "⚠️ 読み取れる行がありません（空/制限超過の可能性）。", "", hidden(), hidden()
        return

    try:
        glossary = read_glossary_csv(glossary_path)
    except (OSError, ValueError, csv.Error) as e:
        yield f"❌ 用語集（Glossary CSV）の読み込みに失敗しました: {e}", "", hidden(), hidden()
        return

    max_new_tokens = _clamp_int(max_new_tokens, 16, 512, DEFAULT_MAX_NEW_TOKENS)
    num_beams = _clamp_int(num_beams, 1, 6, 4)

    t0 = time.time()
    rows: List[Tuple[str, str, str]] = []  # (direction, src, dst)

    yield "⏳ バッチ翻訳中… 0/..", "", hidden(), hidden()

    for i, src in enumerate(lines, 1):
        direction = detect_direction_by_text(src, prefer="ja-en")
        src_processed = apply_glossary(src, glossary)

        try:
            dst = translate_one(
                model_key=model_key,
                direction=direction,
                text=src_processed,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                conversation=bool(conversation_on),
            )
        except Exception as e:
            # Keep going: one bad item must not abort the whole batch.
            dst = f"[ERROR] {e}"

        rows.append((direction, src, dst))

        # Report on the first item, every fifth item, and the last one.
        if i == 1 or i % 5 == 0 or i == total:
            pct = int(i * 100 / total)
            yield f"⏳ バッチ翻訳中… {i}/{total} ({pct}%)", "", hidden(), hidden()

    # Preview (limited to the first 50 items)
    preview_lines = [
        f"**{idx}. ({direction})**\n- SRC: {s}\n- DST: {d}\n"
        for idx, (direction, s, d) in enumerate(rows[:50], 1)
    ]
    preview = "\n".join(preview_lines)
    if total > 50:
        preview += f"\n…（プレビューは先頭50行まで。全{total}行はCSVでダウンロード）"

    # Write the result files into a fresh temp directory.
    tmpdir = tempfile.mkdtemp(prefix="batch_")
    out_csv = os.path.join(tmpdir, "batch_result.csv")
    out_txt = os.path.join(tmpdir, "batch_result.txt")

    with open(out_csv, "w", newline="", encoding="utf-8-sig") as f:
        w = csv.writer(f)
        w.writerow(["direction", "src", "dst"])
        w.writerows([direction, s, d] for direction, s, d in rows)

    with open(out_txt, "w", encoding="utf-8") as f:
        for i, (direction, s, d) in enumerate(rows, 1):
            f.write(f"[{i}] ({direction})\n")
            f.write(f"SRC: {s}\n")
            f.write(f"DST: {d}\n\n")

    used = time.time() - t0
    done_msg = f"✅ バッチ完了：{used:.2f}s｜行数：{total}"

    yield (
        done_msg,
        preview,
        gr.update(visible=True, value=out_csv),
        gr.update(visible=True, value=out_txt),
    )


# -------------------------
# UI
# -------------------------
# Stylesheet for the page container and for the header-title / subtle / badge
# classes used by the HTML and Markdown snippets in the UI.
# NOTE(review): nothing in the visible file passes CUSTOM_CSS to Gradio, so
# these rules are presumably not applied — confirm, and wire it in using the
# mechanism of the installed Gradio version.
CUSTOM_CSS = """
.gradio-container { max-width: 1100px !important; }
.header-title { font-size: 34px; font-weight: 900; letter-spacing: .4px; margin: 6px 0 4px; }
.subtle { opacity: 0.9; }
.badge { display: inline-block; padding: 2px 10px; border-radius: 999px; border: 1px solid rgba(120,120,120,.35); font-size: 12px; }
"""

# Builds the whole page: model picker + warmup, direction/mode toggles,
# input/output boxes, and an accordion with glossary upload, generation
# sliders, the history table and the batch section; then wires the events.
# NOTE(review): CUSTOM_CSS is not passed to Gradio anywhere in this block, so
# the header-title / badge / subtle classes used below are presumably
# unstyled — confirm and wire it in for the installed Gradio version.
with gr.Blocks(title="Linguo Core — Translation Space") as demo:
    gr.HTML("<div class='header-title'>Linguo Core — Translation</div>")

    gr.Markdown(
        "<span class='badge'>HF Spaces</span> <span class='badge'>Public-safe</span> "
        "<span class='badge'>Glossary CSV</span> <span class='badge'>History</span> <span class='badge'>Batch</span>",
        elem_classes=["subtle"],
    )

    history_state = gr.State([])  # List[Dict]

    with gr.Row():
        model_key = gr.Dropdown(
            choices=list(MODEL_SPECS.keys()),
            value="m2m100-418M (multilingual, your current)",
            label="Model（無料CPUなら opus-mt が速い）",
        )
        warm = gr.Button("Warmup（初回ロード）")

    warm_info = gr.Markdown("")

    with gr.Row():
        direction = gr.Radio(["ja-en", "en-ja"], value="ja-en", label="Direction")
        auto = gr.Checkbox(value=True, label="Auto detect (日本語が含まれたら ja-en)")
        conversation = gr.Checkbox(value=False, label="Conversation mode（口語寄せ）")

    info = gr.Markdown("翻訳待機中…")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            src = gr.Textbox(lines=10, label="Input", placeholder="翻訳したい文章を入力…")
            with gr.Row():
                btn = gr.Button("Translate", variant="primary")
                btn_clear = gr.Button("Clear")
        with gr.Column(scale=1):
            dst = gr.Textbox(lines=10, label="Output")
            copy_btn = gr.Button("Copy Output")
            copy_status = gr.Markdown("")

    with gr.Accordion("Glossary / Advanced / History / Batch", open=False):
        file_gloss = gr.File(label="Glossary CSV（src,tgt）", file_count="single", type="filepath")

        with gr.Row():
            max_len = gr.Slider(16, 512, DEFAULT_MAX_NEW_TOKENS, step=16, label="max_new_tokens")
            beams = gr.Slider(1, 6, 4, step=1, label="num_beams（通常モード向け）")

        gr.Markdown("### History（直近100件表示 / エクスポート可）")
        history_table = gr.Dataframe(
            headers=["time", "direction", "src", "dst"],
            datatype=["str", "str", "str", "str"],
            row_count=0,
            column_count=(4, "fixed"),
            wrap=True,
            interactive=False,
            value=[],
            label="History",
        )
        with gr.Row():
            btn_clear_history = gr.Button("Clear history")
            dl_hist_csv = gr.DownloadButton("Download history CSV", visible=False)
            dl_hist_txt = gr.DownloadButton("Download history TXT", visible=False)

        gr.Markdown("### Batch（TXT/CSV：1行=1件 / 公開Space保護で最大200行）")
        batch_file = gr.File(label="Batch file (TXT/CSV UTF-8)", file_count="single", type="filepath")
        btn_batch = gr.Button("Run batch translate")
        batch_status = gr.Markdown("")
        batch_preview = gr.Markdown("")
        dl_batch_csv = gr.DownloadButton("Download batch_result.csv", visible=False)
        dl_batch_txt = gr.DownloadButton("Download batch_result.txt", visible=False)

    # Events
    warm.click(warmup, inputs=[model_key], outputs=[warm_info], queue=True)

    btn.click(
        do_translate,
        inputs=[src, model_key, direction, auto, conversation, file_gloss, max_len, beams, history_state],
        outputs=[dst, info, history_state, history_table, dl_hist_csv, dl_hist_txt],
        queue=True,
    )

    def _noop(x):
        return x

    # The clipboard code is given directly to the listener's `js` argument.
    # It used to be defined in a <script> tag inside a gr.HTML component;
    # markup inserted that way does not run its scripts, so the function the
    # listener called was never defined in the page.
    copy_js = """
    async (text) => {
        const value = text || "";
        try {
            await navigator.clipboard.writeText(value);
        } catch (e) {
            const ta = document.createElement("textarea");
            ta.value = value;
            document.body.appendChild(ta);
            ta.select();
            document.execCommand("copy");
            document.body.removeChild(ta);
        }
        return value;
    }
    """

    copy_btn.click(
        fn=_noop,
        inputs=[dst],
        outputs=[],
        js=copy_js,
    ).then(
        fn=lambda: "✅ Copied to clipboard.",
        inputs=None,
        outputs=copy_status
    )

    src.submit(
        do_translate,
        inputs=[src, model_key, direction, auto, conversation, file_gloss, max_len, beams, history_state],
        outputs=[dst, info, history_state, history_table, dl_hist_csv, dl_hist_txt],
        queue=True,
    )

    # "Clear" only empties the input box; the history and its buttons stay.
    btn_clear.click(
        lambda h: ("", "🧹 入力をクリアしました。", h, _history_to_table(h), gr.update(visible=bool(h)), gr.update(visible=bool(h))),
        inputs=[history_state],
        outputs=[src, info, history_state, history_table, dl_hist_csv, dl_hist_txt],
        queue=False,
    )

    btn_clear_history.click(
        clear_all,
        inputs=[history_state],
        outputs=[src, info, history_state, history_table, dl_hist_csv, dl_hist_txt, dl_batch_csv, dl_batch_txt, batch_status, batch_preview],
        queue=False,
    )

    # NOTE(review): each history download button writes the export file in its
    # own click handler and receives the path as its new value, so the file is
    # attached only after the click has fired — confirm that the first click
    # actually downloads the current history.
    dl_hist_csv.click(export_history_csv, inputs=[history_state], outputs=[dl_hist_csv], queue=False)
    dl_hist_txt.click(export_history_txt, inputs=[history_state], outputs=[dl_hist_txt], queue=False)

    btn_batch.click(
        do_batch,
        inputs=[batch_file, model_key, conversation, file_gloss, max_len, beams],
        outputs=[batch_status, batch_preview, dl_batch_csv, dl_batch_txt],
        queue=True,
    )


# Enable the request queue (max_size=16, default_concurrency_limit=1) and
# start the app.
demo.queue(max_size=16, default_concurrency_limit=1).launch()