Spaces:

jing-ju
/

AI-Translates

Runtime error

App Files Files Community

jing-ju committed on Sep 13

Commit

7a80146

verified ·

1 Parent(s): 9ecf72c

Update app.py

Browse files

Files changed (1) hide show

app.py +313 -195

app.py CHANGED Viewed

@@ -1,32 +1,13 @@
-import os
-import math
-import re
-from typing import List, Optional
-import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# ✅ Import tương thích nhiều phiên bản:
-try:
-    # Nhiều bản đặt ở đây
-    from transformers.quantizers import CompressedTensorsQuantizationConfig
-except Exception:
-    try:
-        # Một số bản export ở root (phòng hờ)
-        from transformers import CompressedTensorsQuantizationConfig  # type: ignore
-    except Exception:
-        CompressedTensorsQuantizationConfig = None  # sẽ fallback qua dict
-# =========================
-# CẤU HÌNH MẶC ĐỊNH
-# =========================
-# Model mặc định: nhẹ hơn và phù hợp hơn cho CPU Free
-DEFAULT_MODEL = "tencent/Hunyuan-MT-7B-fp8"
-MODEL_NAME = os.getenv("MODEL_NAME", DEFAULT_MODEL)
-# Tham số sinh gợi ý (giữ thấp để tránh quá tải CPU)
 GEN_KW = dict(
     max_new_tokens=256,
     top_k=20,
@@ -36,188 +17,325 @@ GEN_KW = dict(
     do_sample=True,
 )
-# Giới hạn token đầu vào mỗi lượt để tránh OOM/timeout trên CPU
-# (tổng input ≲ 900–1000 token trên CPU Free cho an toàn)
-MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "800"))
-# =========================
-# TẢI MODEL & TOKENIZER
-# =========================
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
-# Ghi đè config lượng tử hóa để tránh lỗi "ignore NoneType" trên một số bản fp8
-ctq = CompressedTensorsQuantizationConfig(
-    quantization_method="fp8",
-    ignore=[],  # chìa khóa tránh TypeError: 'NoneType' object is not iterable
-)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME,
-    trust_remote_code=True,
-    quantization_config=ctq,
-)
-DEVICE = getattr(model, "device", torch.device("cpu"))
-# =========================
-# TIỆN ÍCH CHUẨN HÓA NGÔN NGỮ
-# =========================
-# Map tên ngôn ngữ phổ biến -> tên tiếng Anh để nhúng vào prompt (đơn giản hóa)
-LANG_ALIASES = {
-    # Vietnamese
-    "vi": "Vietnamese", "vie": "Vietnamese",
-    "vietnamese": "Vietnamese", "tiếng việt": "Vietnamese",
-    # Chinese
-    "zh": "Chinese", "chi": "Chinese", "zho": "Chinese",
-    "chinese": "Chinese", "tiếng trung": "Chinese", "hán ngữ": "Chinese",
-    "mandarin": "Chinese",
-    # English
-    "en": "English", "eng": "English", "tiếng anh": "English", "english": "English",
-    # Japanese
-    "ja": "Japanese", "jpn": "Japanese", "tiếng nhật": "Japanese", "japanese": "Japanese",
-    # Korean
-    "ko": "Korean", "kor": "Korean", "tiếng hàn": "Korean", "korean": "Korean",
-    # French
-    "fr": "French", "fra": "French", "fre": "French", "tiếng pháp": "French", "french": "French",
-    # German
-    "de": "German", "deu": "German", "ger": "German", "tiếng đức": "German", "german": "German",
-    # Spanish
-    "es": "Spanish", "spa": "Spanish", "tiếng tây ban nha": "Spanish", "spanish": "Spanish",
-    # Thai
-    "th": "Thai", "tha": "Thai", "tiếng thái": "Thai", "thai": "Thai",
-    # Indonesian
-    "id": "Indonesian", "ind": "Indonesian", "tiếng indonesia": "Indonesian", "indonesian": "Indonesian",
-    # Malay
-    "ms": "Malay", "msa": "Malay", "tiếng malaysia": "Malay", "malay": "Malay",
-    # Portuguese
-    "pt": "Portuguese", "por": "Portuguese", "tiếng bồ đào nha": "Portuguese", "portuguese": "Portuguese",
-    # Russian
-    "ru": "Russian", "rus": "Russian", "tiếng nga": "Russian", "russian": "Russian",
 }
-def normalize_lang_name(s: Optional[str]) -> Optional[str]:
-    if not s:
-        return None
-    key = s.strip().lower()
-    return LANG_ALIASES.get(key, s.strip())
-# =========================
-# CHIA ĐOẠN THEO TOKEN
-# =========================
-def chunk_text_by_tokens(text: str, max_tokens: int) -> List[str]:
-    """
-    Chia văn bản thành các đoạn dựa vào số token của tokenizer để tránh vượt ngưỡng input.
-    Ưu tiên cắt theo dấu câu. Nếu đoạn vẫn dài, cắt tiếp theo token.
-    """
-    # Tách theo các dấu câu lớn trước
-    rough_parts = re.split(r"(?<=[\.!?。！？])\s+", text.strip())
-    chunks = []
-    buf = ""
-    def token_len(s: str) -> int:
-        return tokenizer(s, add_special_tokens=False, return_length=True)["length"]
-    for part in rough_parts:
-        candidate = (buf + " " + part).strip() if buf else part
-        if token_len(candidate) <= max_tokens:
-            buf = candidate
         else:
-            if buf:
-                chunks.append(buf)
-                buf = ""
-            # Nếu part tự thân đã quá dài, cắt tiếp theo token
-            if token_len(part) <= max_tokens:
-                buf = part
             else:
-                # Cắt theo token “cứng”
-                ids = tokenizer(part, add_special_tokens=False)["input_ids"]
-                for i in range(0, len(ids), max_tokens):
-                    piece_ids = ids[i:i + max_tokens]
-                    piece = tokenizer.decode(piece_ids, skip_special_tokens=True)
-                    chunks.append(piece)
-                buf = ""
-    if buf:
-        chunks.append(buf)
-    # Loại bỏ rỗng
-    return [c for c in chunks if c.strip()]
-# =========================
-# CORE TRANSLATION (SỬ DỤNG CHAT TEMPLATE)
-# =========================
-@torch.inference_mode()
-def translate_text(
-    text: str,
-    target_lang: str,
-    source_lang: Optional[str] = None,
-) -> str:
-    target = normalize_lang_name(target_lang) or "Vietnamese"
-    src = normalize_lang_name(source_lang)
-    # Xây prompt: có thể thêm nguồn nếu người dùng cung cấp, còn không để model tự đoán
-    if src:
-        sys_prompt = f"Translate the following segment from {src} into {target}, without additional explanation."
     else:
-        sys_prompt = f"Translate the following segment into {target}, without additional explanation."
-    pieces = chunk_text_by_tokens(text, MAX_INPUT_TOKENS)
-    outputs = []
-    for piece in pieces:
-        messages = [{"role": "user", "content": f"{sys_prompt}\n\n{piece}"}]
-        inputs = tokenizer.apply_chat_template(
-            messages, tokenize=True, add_generation_prompt=False, return_tensors="pt"
         )
-        out_ids = model.generate(inputs.to(DEVICE), **GEN_KW)
-        out_text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
-        outputs.append(out_text.strip())
-    return "\n".join(outputs).strip()
-def translate_batch(
-    texts: List[str],
-    target_lang: str,
-    source_lang: Optional[str] = None,
-) -> List[str]:
-    return [translate_text(t, target_lang, source_lang) for t in texts]
-# =========================
-# GRADIO UI + API
-# =========================
-LANG_CHOICES = sorted(list(set(LANG_ALIASES.values())))
-with gr.Blocks() as demo:
-    gr.Markdown(
-        "## Hunyuan-MT (fp8) — Multilingual Translation (Trial on CPU)\n"
-        "Bản HF Spaces Free (CPU) — tốc độ chậm, đã có chia đoạn tự động theo token."
-    )
-    with gr.Tab("Single"):
-        src = gr.Textbox(label="Văn bản nguồn", lines=10, placeholder="Dán văn bản cần dịch…")
-        with gr.Row():
-            src_lang = gr.Textbox(label="Ngôn ngữ nguồn (tùy chọn, ví dụ: Vietnamese/Chinese/English…)", placeholder="Để trống nếu không chắc")
-            tgt_lang = gr.Dropdown(label="Ngôn ngữ đích", choices=LANG_CHOICES, value="Vietnamese")
-        out = gr.Textbox(label="Bản dịch", lines=10)
-        btn = gr.Button("Dịch")
-        btn.click(fn=translate_text, inputs=[src, tgt_lang, src_lang], outputs=out, api_name="translate_text")
-    with gr.Tab("Batch"):
-        src_list = gr.Textbox(
-            label="Danh sách câu (mỗi dòng 1 câu/đoạn ngắn)",
-            lines=10,
-            placeholder="Mỗi dòng là một câu/đoạn…"
-        )
-        with gr.Row():
-            src_lang_b = gr.Textbox(label="Ngôn ngữ nguồn (tuỳ chọn)", placeholder="Để trống nếu không chắc")
-            tgt_lang_b = gr.Dropdown(label="Ngôn ngữ đích", choices=LANG_CHOICES, value="Vietnamese")
-        out_list = gr.Textbox(label="Kết quả (mỗi dòng tương ứng 1 đầu vào)", lines=10)
-        def _batch_wrapper(texts_raw: str, tgt: str, src_: Optional[str]):
-            texts = [x for x in texts_raw.splitlines() if x.strip()]
-            results = translate_batch(texts, tgt, src_)
-            return "\n".join(results)
-        btn_b = gr.Button("Dịch Batch")
-        btn_b.click(fn=_batch_wrapper, inputs=[src_list, tgt_lang_b, src_lang_b], outputs=out_list, api_name="translate_batch")
-# Giới hạn tải cho demo
-demo.queue(concurrency_count=1, max_size=2).launch()

 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
+import gradio as gr
+import re
+import os  # required by the os.getenv() calls below; this commit removed the import, so they raised NameError at import time
+# Environment variables: both settings can be overridden through the process environment.
+# MODEL_NAME is the Hugging Face model id handed to from_pretrained() in load_model().
+MODEL_NAME = os.getenv("MODEL_NAME", "tencent/Hunyuan-MT-7B-fp8")
+# MAX_INPUT_TOKENS is the per-chunk token budget passed to chunk_text_by_tokens().
+MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "800"))
+# Generation parameters optimized for CPU
 GEN_KW = dict(
     max_new_tokens=256,
     top_k=20,
     do_sample=True,
 )
+# Language mapping for normalization.
+# Keys are lowercase aliases (short codes such as "vi", English names,
+# Vietnamese names and a few native-script names); values are the English
+# language names that end up in the translation prompt.
+# normalize_language() lowercases its input before looking it up here, so
+# every key must stay lowercase.
+LANGUAGE_MAPPING = {
+    "vi": "Vietnamese",
+    "vietnamese": "Vietnamese",
+    "tiếng việt": "Vietnamese",
+    "zh": "Chinese",
+    "chinese": "Chinese",
+    "tiếng trung": "Chinese",
+    "中文": "Chinese",
+    "en": "English",
+    "english": "English",
+    "tiếng anh": "English",
+    "ja": "Japanese",
+    "japanese": "Japanese",
+    "tiếng nhật": "Japanese",
+    "日本語": "Japanese",
+    "ko": "Korean",
+    "korean": "Korean",
+    "tiếng hàn": "Korean",
+    "한국어": "Korean",
+    "fr": "French",
+    "french": "French",
+    "tiếng pháp": "French",
+    "de": "German",
+    "german": "German",
+    "tiếng đức": "German",
+    "es": "Spanish",
+    "spanish": "Spanish",
+    "tiếng tây ban nha": "Spanish",
+    "th": "Thai",
+    "thai": "Thai",
+    "tiếng thái": "Thai",
+    "id": "Indonesian",
+    "indonesian": "Indonesian",
+    "tiếng indonesia": "Indonesian",
+    "ms": "Malay",
+    "malay": "Malay",
+    "tiếng malaysia": "Malay",
+    "pt": "Portuguese",
+    "portuguese": "Portuguese",
+    "tiếng bồ đào nha": "Portuguese",
+    "ru": "Russian",
+    "russian": "Russian",
+    "tiếng nga": "Russian",
 }
+# Choices offered by the "Target Language" dropdowns in the Gradio UI.
+SUPPORTED_LANGUAGES = [
+    "Vietnamese", "Chinese", "English", "Japanese", "Korean",
+    "French", "German", "Spanish", "Thai", "Indonesian",
+    "Malay", "Portuguese", "Russian"
+]
+def normalize_language(lang):
+    """Return the canonical English name for a language alias.
+    Args:
+        lang: Alias, code or free-form language name; may be None or empty.
+    Returns:
+        The mapped name from LANGUAGE_MAPPING when the lowercased, stripped
+        input is a known alias; otherwise the stripped input unchanged.
+        None when the input is None, empty or whitespace-only.
+    """
+    if not lang:
+        return None
+    cleaned = lang.strip()
+    # A whitespace-only value carries no language; treat it like an empty one
+    # instead of returning "" and letting it leak into a prompt.
+    if not cleaned:
+        return None
+    return LANGUAGE_MAPPING.get(cleaned.lower(), cleaned)
+def load_model():
+    """Load the tokenizer and causal-LM model named by MODEL_NAME.
+    A quantization config requesting "fp8" with an empty ``ignore`` list is
+    passed to ``from_pretrained``; a plain dict is used instead when the
+    config class cannot be imported.
+    Returns:
+        Tuple ``(tokenizer, model)``.
+    """
+    print(f"Loading model: {MODEL_NAME}")
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        MODEL_NAME,
+        trust_remote_code=True
+    )
+    # Create quantization config for fp8
+    # NOTE(review): confirm that `transformers.quantizers` really exports
+    # `CompressedTensorsQuantizationConfig` in the installed transformers
+    # version; if it does not, the ImportError branch below is always taken.
+    try:
+        from transformers.quantizers import CompressedTensorsQuantizationConfig
+        quantization_config = CompressedTensorsQuantizationConfig(
+            quantization_method="fp8",
+            ignore=[]
+        )
+    except ImportError:
+        # Fallback to dict format
+        # NOTE(review): verify these keys are the ones from_pretrained expects
+        # for a dict-style quantization config - TODO confirm against the docs.
+        quantization_config = {
+            "quantization_method": "fp8",
+            "ignore": []
+        }
+    # Load model with quantization config
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        trust_remote_code=True,
+        quantization_config=quantization_config,
+        # float16 when CUDA is available, float32 otherwise
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    )
+    return tokenizer, model
+def chunk_text_by_tokens(text, tokenizer, max_tokens):
+    """Split ``text`` into chunks of at most ``max_tokens`` tokens.
+    Sentences are kept whole where possible and keep their closing
+    punctuation. A single sentence that exceeds ``max_tokens`` on its own is
+    cut on hard token boundaries.
+    Args:
+        text: Source text to split.
+        tokenizer: Object providing ``encode(str, add_special_tokens=False)``
+            and ``decode(ids, skip_special_tokens=True)``.
+        max_tokens: Maximum number of tokens allowed per chunk.
+    Returns:
+        List of non-empty chunk strings in source order; empty list for
+        blank input.
+    """
+    if not text.strip():
+        return []
+    def count_tokens(piece):
+        # Best effort: fall back to a rough word-based estimate if the
+        # tokenizer cannot encode this piece.
+        try:
+            return len(tokenizer.encode(piece, add_special_tokens=False))
+        except Exception:
+            return len(piece.split()) * 1.3
+    # Split *after* a terminator (lookbehind) so the punctuation stays with
+    # its sentence instead of being discarded. ASCII terminators must be
+    # followed by whitespace, which keeps tokens such as "3.14" intact; CJK
+    # terminators are normally not followed by a space, so none is required.
+    sentences = re.split(r'(?<=[.!?])\s+|(?<=[。！？])\s*', text.strip())
+    chunks = []
+    current_chunk = ""
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+        candidate = current_chunk + " " + sentence if current_chunk else sentence
+        if count_tokens(candidate) <= max_tokens:
+            current_chunk = candidate
+            continue
+        # The candidate is too long: flush what has been collected so far.
+        if current_chunk:
+            chunks.append(current_chunk)
+            current_chunk = ""
+        token_ids = tokenizer.encode(sentence, add_special_tokens=False)
+        if len(token_ids) <= max_tokens:
+            current_chunk = sentence
+            continue
+        # A single sentence over the budget: cut it on token boundaries.
+        for start in range(0, len(token_ids), max_tokens):
+            piece = tokenizer.decode(token_ids[start:start + max_tokens], skip_special_tokens=True)
+            if piece.strip():
+                chunks.append(piece)
+    if current_chunk:
+        chunks.append(current_chunk)
+    return chunks
+def translate_text_chunk(text, target_lang, source_lang, tokenizer, model):
+    """Translate one already-sized chunk of text with the chat model.
+    Args:
+        text: Chunk to translate.
+        target_lang: Target language name or alias.
+        source_lang: Optional source language name or alias.
+        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
+        model: Model providing ``generate`` and a ``device`` attribute.
+    Returns:
+        The stripped generated text, or an error string when the target
+        language normalizes to an empty value.
+    """
+    target_lang = normalize_language(target_lang)
+    source_lang = normalize_language(source_lang) if source_lang else None
+    if not target_lang:
+        return "Error: Invalid target language"
+    # Mention the source language only when the caller supplied one.
+    prompt = (
+        f"Translate the following segment from {source_lang} into {target_lang}, without additional explanation.\n\n{text}"
+        if source_lang
+        else f"Translate the following segment into {target_lang}, without additional explanation.\n\n{text}"
+    )
+    # Render the single-turn conversation to text, then tokenize it.
+    rendered = tokenizer.apply_chat_template(
+        [{"role": "user", "content": prompt}],
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        generated = model.generate(
+            **encoded,
+            **GEN_KW,
+            pad_token_id=tokenizer.eos_token_id
+        )
+    # Drop the prompt tokens so only the newly generated part is decoded.
+    prompt_length = len(encoded.input_ids[0])
+    answer = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
+    return answer.strip()
+def translate_single(text, target_lang, source_lang, tokenizer, model):
+    """Translate ``text``, splitting it into token-bounded chunks first.
+    Returns the chunk translations joined by single spaces, a prompt-style
+    message when an input is missing, or ``"Translation error: ..."`` if
+    anything raises while chunking or translating.
+    """
+    if not text.strip():
+        return "Please enter text to translate."
+    if not target_lang:
+        return "Please select a target language."
+    try:
+        pieces = chunk_text_by_tokens(text, tokenizer, MAX_INPUT_TOKENS)
+        if not pieces:
+            return "No valid text to translate."
+        # Translate chunk by chunk, in order, and stitch the results together.
+        translated = [
+            translate_text_chunk(piece, target_lang, source_lang, tokenizer, model)
+            for piece in pieces
+        ]
+        return " ".join(translated)
+    except Exception as e:
+        return f"Translation error: {str(e)}"
+def translate_batch(text_lines, target_lang, source_lang, tokenizer, model):
+    """Translate a newline-separated block of text, one line at a time.
+    Blank lines are skipped. Returns the per-line translations joined by
+    newlines, a prompt-style message when an input is missing, or
+    ``"Batch translation error: ..."`` if anything raises.
+    """
+    if not text_lines.strip():
+        return "Please enter text lines to translate."
+    if not target_lang:
+        return "Please select a target language."
+    stripped_lines = (raw.strip() for raw in text_lines.split('\n'))
+    lines = [entry for entry in stripped_lines if entry]
+    if not lines:
+        return "No valid text lines to translate."
+    try:
+        # Each line goes through the single-text path, including chunking.
+        results = [
+            translate_single(entry, target_lang, source_lang, tokenizer, model)
+            for entry in lines
+        ]
+        return '\n'.join(results)
+    except Exception as e:
+        return f"Batch translation error: {str(e)}"
+# Load model and tokenizer
+# This runs at import time: the model is loaded once and the module-level
+# `tokenizer` / `model` are shared by both click handlers below.
+print("Initializing model...")
+tokenizer, model = load_model()
+device = model.device
+print(f"Model loaded on device: {device}")
+# Create Gradio interface
+# Two tabs with the same layout: inputs in the left column, read-only output
+# in the right column.
+with gr.Blocks(title="Hunyuan-MT Multi-language Translation") as demo:
+    gr.Markdown("# 🌍 Hunyuan-MT Multi-language Translation")
+    gr.Markdown(f"**Model**: {MODEL_NAME}")
+    gr.Markdown("⚠️ **Note**: Running on Free CPU - translation may be slow and length is limited.")
+    with gr.Tabs():
+        # Tab 1: translate one block of text.
+        with gr.TabItem("Single Translation"):
+            with gr.Row():
+                with gr.Column():
+                    input_text = gr.Textbox(
+                        label="Text to translate",
+                        placeholder="Enter your text here...",
+                        lines=5
+                    )
+                    target_lang = gr.Dropdown(
+                        choices=SUPPORTED_LANGUAGES,
+                        label="Target Language",
+                        value="Vietnamese"
+                    )
+                    source_lang = gr.Textbox(
+                        label="Source Language (optional)",
+                        placeholder="Leave empty for auto-detection"
+                    )
+                    translate_btn = gr.Button("Translate", variant="primary")
+                with gr.Column():
+                    output_text = gr.Textbox(
+                        label="Translation",
+                        lines=5,
+                        interactive=False
+                    )
+            # The lambda binds the shared tokenizer/model so the handler only
+            # receives the three UI inputs. api_name matches the endpoint
+            # shown in the "API Usage" text below.
+            translate_btn.click(
+                fn=lambda text, tgt, src: translate_single(text, tgt, src, tokenizer, model),
+                inputs=[input_text, target_lang, source_lang],
+                outputs=output_text,
+                api_name="translate_text"
+            )
+        # Tab 2: translate several lines, one translation per input line.
+        with gr.TabItem("Batch Translation"):
+            with gr.Row():
+                with gr.Column():
+                    batch_input = gr.Textbox(
+                        label="Text lines to translate (one per line)",
+                        placeholder="Line 1\nLine 2\nLine 3...",
+                        lines=8
+                    )
+                    batch_target_lang = gr.Dropdown(
+                        choices=SUPPORTED_LANGUAGES,
+                        label="Target Language",
+                        value="Vietnamese"
+                    )
+                    batch_source_lang = gr.Textbox(
+                        label="Source Language (optional)",
+                        placeholder="Leave empty for auto-detection"
+                    )
+                    batch_translate_btn = gr.Button("Translate Batch", variant="primary")
+                with gr.Column():
+                    batch_output = gr.Textbox(
+                        label="Batch Translation Results",
+                        lines=8,
+                        interactive=False
+                    )
+            # Same wiring as the single tab, routed through translate_batch.
+            batch_translate_btn.click(
+                fn=lambda text, tgt, src: translate_batch(text, tgt, src, tokenizer, model),
+                inputs=[batch_input, batch_target_lang, batch_source_lang],
+                outputs=batch_output,
+                api_name="translate_batch"
+            )
+    # Static usage example rendered at the bottom of the page.
+    gr.Markdown("### API Usage")
+    gr.Markdown("""
+    ```python
+    from gradio_client import Client
+    client = Client("YOUR_SPACE_URL")
+    # Single translation
+    result = client.predict("你好", "Vietnamese", None, api_name="/translate_text")
+    # Batch translation
+    result = client.predict("你好\\n再见", "Vietnamese", None, api_name="/translate_batch")
+    ```
+    """)
+# Launch the app with a small queue: one request processed at a time and at
+# most two waiting, to keep load bounded.
+if __name__ == "__main__":
+    try:
+        # Gradio 4 and later: `concurrency_count` is no longer accepted by
+        # queue(); the equivalent knob is `default_concurrency_limit`.
+        queued_demo = demo.queue(default_concurrency_limit=1, max_size=2)
+    except TypeError:
+        # Gradio 3.x does not know `default_concurrency_limit`; fall back to
+        # the legacy keyword so older pinned versions keep working.
+        queued_demo = demo.queue(concurrency_count=1, max_size=2)
+    queued_demo.launch()