jing-ju committed
Commit 3a03ca4 · verified · 1 Parent(s): 9e876fc

Update app.py

Files changed (1)
  1. app.py +111 -262
app.py CHANGED
@@ -1,273 +1,122 @@
  import gradio as gr
  import torch
- from transformers import (
-     AutoTokenizer,
-     AutoModelForSeq2SeqLM,
-     BitsAndBytesConfig
- )
- import logging
- import gc
- import psutil
- import os
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Global variables
- tokenizer = None
- model = None
-
- def get_memory_usage():
-     """Get current memory usage"""
-     process = psutil.Process(os.getpid())
-     return process.memory_info().rss / 1024 / 1024 / 1024  # GB
-
- def load_model_optimized():
-     """Load model with maximum optimization for CPU"""
-     global tokenizer, model
-
-     if model is not None:
-         return model, tokenizer
-
-     model_name = "Tencent/Hunyuan-MT-7B-FS8"
-     logger.info(f"Loading {model_name} with optimizations...")
-     logger.info(f"Memory before loading: {get_memory_usage():.2f} GB")
-
-     try:
-         # Load tokenizer first
-         tokenizer = AutoTokenizer.from_pretrained(
-             model_name,
-             trust_remote_code=True
-         )
-
-         # Load model with aggressive optimizations
-         model = AutoModelForSeq2SeqLM.from_pretrained(
-             model_name,
-             torch_dtype=torch.float16,  # Half precision
-             device_map="cpu",
-             low_cpu_mem_usage=True,  # Reduce memory usage
-             trust_remote_code=True,
-             use_cache=False,  # Disable KV cache
-             offload_folder="./offload",  # Offload to disk if needed
-         )
-
-         # Additional optimizations
-         model.eval()  # Set to evaluation mode
-
-         # Enable torch optimizations
-         torch.set_num_threads(2)  # Limit threads
-
-         logger.info(f"Memory after loading: {get_memory_usage():.2f} GB")
-         logger.info("Model loaded successfully!")
-
-         return model, tokenizer
-
-     except Exception as e:
-         logger.error(f"Error loading model: {e}")
-         # Try fallback with 8-bit quantization
-         try:
-             logger.info("Trying 8-bit quantization...")
-             quantization_config = BitsAndBytesConfig(
-                 load_in_8bit=True,
-                 llm_int8_enable_fp32_cpu_offload=True
-             )
-
-             model = AutoModelForSeq2SeqLM.from_pretrained(
-                 model_name,
-                 quantization_config=quantization_config,
-                 device_map="auto",
-                 trust_remote_code=True,
-                 low_cpu_mem_usage=True
-             )
-
-             logger.info("8-bit model loaded!")
-             return model, tokenizer
-
-         except Exception as e2:
-             logger.error(f"8-bit loading also failed: {e2}")
-             raise e2
-
- def translate_text_optimized(
-     text: str,
-     source_lang: str = "auto",
-     target_lang: str = "en"
- ) -> str:
-     """Optimized translation function"""
-
-     if not text.strip():
-         return "Please enter text to translate"
-
-     # Memory cleanup before translation
-     gc.collect()
-     torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
-     try:
-         model, tokenizer = load_model_optimized()
-
-         # Format input
-         if source_lang == "auto":
-             input_text = f"Translate to {target_lang}: {text}"
          else:
-             input_text = f"Translate from {source_lang} to {target_lang}: {text}"
-
-         logger.info(f"Translating: {input_text[:50]}...")
-         start_memory = get_memory_usage()
-
-         # Tokenize with truncation
-         inputs = tokenizer(
-             input_text,
-             return_tensors="pt",
-             max_length=512,  # Limit input length
-             truncation=True,
-             padding=False  # No padding for single input
-         )
-
-         # Generate with minimal settings
-         with torch.no_grad():
-             outputs = model.generate(
-                 **inputs,
-                 max_new_tokens=256,  # Limit output length
-                 min_length=1,
-                 num_beams=2,  # Reduce beams for speed
-                 early_stopping=True,
-                 do_sample=False,
-                 pad_token_id=tokenizer.pad_token_id,
-                 eos_token_id=tokenizer.eos_token_id,
-                 use_cache=False  # Disable cache
-             )
-
-         # Decode output
-         translated_text = tokenizer.decode(
-             outputs[0],
-             skip_special_tokens=True
-         )
-
-         # Clean output
-         if ":" in translated_text:
-             translated_text = translated_text.split(":", 1)[-1].strip()
-
-         # Memory cleanup after translation
-         del inputs, outputs
-         gc.collect()
-
-         end_memory = get_memory_usage()
-         logger.info(f"Translation completed. Memory: {start_memory:.2f}GB -> {end_memory:.2f}GB")
-
-         return translated_text
-
-     except Exception as e:
-         logger.error(f"Translation error: {e}")
-         gc.collect()  # Cleanup on error
-         return f"Translation failed: {str(e)}"
-
- # Language mapping
- LANGUAGES = {
-     "auto": "Auto Detect",
-     "en": "English",
-     "zh": "Chinese",
-     "vi": "Vietnamese",
-     "ja": "Japanese",
-     "ko": "Korean",
-     "th": "Thai",
-     "id": "Indonesian",
-     "ms": "Malay",
-     "fil": "Filipino"
- }
-
- # Create Gradio interface
- with gr.Blocks(
-     title="Hunyuan-MT Translation (CPU Optimized)",
-     theme=gr.themes.Monochrome(),
- ) as demo:
-
-     gr.HTML("""
-     <div style="text-align: center; margin: 20px;">
-         <h1>🧠 Hunyuan-MT-7B Translation</h1>
-         <p><strong>CPU Optimized Version</strong></p>
-         <p><em>⚠️ First translation may take 1-2 minutes to load model</em></p>
-     </div>
-     """)
-
-     with gr.Row():
-         with gr.Column():
-             input_text = gr.Textbox(
-                 label="Input Text",
-                 placeholder="Enter text to translate (max 200 words for best performance)...",
-                 lines=4,
-                 max_lines=8
-             )
-
-             with gr.Row():
-                 source_lang = gr.Dropdown(
-                     choices=list(LANGUAGES.items()),
-                     label="From",
-                     value="auto"
-                 )
-                 target_lang = gr.Dropdown(
-                     choices=[(k, v) for k, v in LANGUAGES.items() if k != "auto"],
-                     label="To",
-                     value="en"
-                 )
-
-             translate_btn = gr.Button(
-                 "🔄 Translate",
-                 variant="primary",
-                 size="lg"
-             )
-
-         with gr.Column():
-             output_text = gr.Textbox(
-                 label="Translation",
-                 lines=4,
-                 max_lines=8,
-                 interactive=False
-             )
-
-             memory_display = gr.Textbox(
-                 label="System Status",
-                 value="Ready",
-                 interactive=False
-             )
-
-     # Memory monitoring
-     def update_memory():
-         return f"Memory: {get_memory_usage():.1f}GB / 16GB"
-
-     def translate_with_status(text, src, tgt):
-         if len(text.split()) > 100:  # Limit word count
-             return "Please limit input to 100 words for optimal performance", update_memory()
-
-         result = translate_text_optimized(text, src, tgt)
-         return result, update_memory()
-
-     # Examples for testing
-     gr.Examples(
-         examples=[
-             ["Hello, how are you?", "en", "vi"],
-             ["Xin chào", "vi", "en"],
-             ["Good morning", "en", "zh"],
-             ["Thank you very much", "en", "ja"],
-         ],
-         inputs=[input_text, source_lang, target_lang],
-         outputs=[output_text, memory_display],
-         fn=translate_with_status
-     )
-
-     translate_btn.click(
-         fn=translate_with_status,
-         inputs=[input_text, source_lang, target_lang],
-         outputs=[output_text, memory_display]
-     )
-
-     # Auto-update memory display
-     demo.load(fn=update_memory, outputs=memory_display)
-
- # Launch with specific settings for HF Spaces
- if __name__ == "__main__":
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         show_api=True,
-         enable_monitoring=False  # Disable to save resources
-     )
+ # app.py — HF Spaces Free (CPU), Hunyuan-MT 7B-fp8, multilingual, chunked translation, UI + API
+ import os, re
+ from typing import List, Optional
+
  import gradio as gr
  import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # ===== Configuration =====
+ DEFAULT_MODEL = "tencent/Hunyuan-MT-7B-fp8"  # override via the MODEL_NAME env var if desired
+ MODEL_NAME = os.getenv("MODEL_NAME", DEFAULT_MODEL)
+
+ GEN_KW = dict(  # lightweight generation parameters for CPU
+     max_new_tokens=256,
+     top_k=20,
+     top_p=0.6,
+     repetition_penalty=1.05,
+     temperature=0.7,
+     do_sample=True,
+ )
+
+ MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "800"))  # input-token limit per chunk
+
+ # ===== Load tokenizer & model (fp8 via a quantization_config dict) =====
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+ quant_cfg = {"quantization_method": "fp8", "ignore": []}  # avoids an ignore=None error
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     trust_remote_code=True,
+     quantization_config=quant_cfg,
+ )
+ DEVICE = getattr(model, "device", torch.device("cpu"))
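+ # Fallback sketch (an assumption, not exercised in this commit): if the installed
+ # transformers rejects the plain fp8 dict above, an unquantized CPU load should
+ # still work, at a higher memory cost:
+ #     model = AutoModelForCausalLM.from_pretrained(
+ #         MODEL_NAME, trust_remote_code=True, torch_dtype=torch.float32)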
 
+ # ===== Normalize language names =====
+ LANG_ALIASES = {
+     "vi": "Vietnamese", "vie": "Vietnamese", "vietnamese": "Vietnamese", "tiếng việt": "Vietnamese",
+     "zh": "Chinese", "chi": "Chinese", "zho": "Chinese", "chinese": "Chinese", "tiếng trung": "Chinese", "hán ngữ": "Chinese", "mandarin": "Chinese",
+     "en": "English", "eng": "English", "tiếng anh": "English", "english": "English",
+     "ja": "Japanese", "jpn": "Japanese", "tiếng nhật": "Japanese", "japanese": "Japanese",
+     "ko": "Korean", "kor": "Korean", "tiếng hàn": "Korean", "korean": "Korean",
+     "fr": "French", "fra": "French", "fre": "French", "tiếng pháp": "French", "french": "French",
+     "de": "German", "deu": "German", "ger": "German", "tiếng đức": "German", "german": "German",
+     "es": "Spanish", "spa": "Spanish", "tiếng tây ban nha": "Spanish", "spanish": "Spanish",
+     "th": "Thai", "tha": "Thai", "tiếng thái": "Thai", "thai": "Thai",
+     "id": "Indonesian", "ind": "Indonesian", "tiếng indonesia": "Indonesian", "indonesian": "Indonesian",
+     "ms": "Malay", "msa": "Malay", "tiếng malaysia": "Malay", "malay": "Malay",
+     "pt": "Portuguese", "por": "Portuguese", "tiếng bồ đào nha": "Portuguese", "portuguese": "Portuguese",
+     "ru": "Russian", "rus": "Russian", "tiếng nga": "Russian", "russian": "Russian",
+ }
+ LANG_CHOICES = sorted(set(LANG_ALIASES.values()))
+
+ def norm_lang(s: Optional[str]) -> Optional[str]:
+     if not s:
+         return None
+     k = s.strip().lower()
+     return LANG_ALIASES.get(k, s.strip())
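+ # Illustrative behavior: norm_lang("vi") and norm_lang("Tiếng Việt") both return
+ # "Vietnamese"; unknown names pass through stripped, e.g. norm_lang(" Czech ") -> "Czech".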
 
+ # ===== Split text into token-bounded chunks =====
+ def chunk_by_tokens(text: str, max_tokens: int) -> List[str]:
+     text = text.strip()
+     if not text:
+         return []
+     # Rough sentence split on Western and CJK sentence terminators
+     rough = re.split(r"(?<=[\.!?。!?])\s+", text)
+     chunks, buf = [], ""
+     def tok_len(s: str) -> int:
+         # Count input_ids directly; return_length=True yields a list, not an int
+         return len(tokenizer(s, add_special_tokens=False)["input_ids"])
+     for part in rough:
+         cand = (buf + " " + part).strip() if buf else part
+         if tok_len(cand) <= max_tokens:
+             buf = cand
          else:
+             if buf:
+                 chunks.append(buf)
+                 buf = ""
+             if tok_len(part) <= max_tokens:
+                 buf = part
+             else:
+                 # A single oversized sentence: hard-split on raw token ids
+                 ids = tokenizer(part, add_special_tokens=False)["input_ids"]
+                 for i in range(0, len(ids), max_tokens):
+                     piece = tokenizer.decode(ids[i:i+max_tokens], skip_special_tokens=True)
+                     if piece.strip():
+                         chunks.append(piece.strip())
+     if buf:
+         chunks.append(buf)
+     return [c for c in chunks if c.strip()]
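+ # Illustrative: a short input stays a single chunk, e.g.
+ # chunk_by_tokens("One. Two. Three.", 800) -> ["One. Two. Three."];
+ # only sentences that exceed the token budget are hard-split.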
 
+ # ===== Core translation (chat template) =====
+ @torch.inference_mode()
+ def translate_text(text: str, target_lang: str, source_lang: Optional[str] = None) -> str:
+     tgt = norm_lang(target_lang) or "Vietnamese"
+     src = norm_lang(source_lang)
+     sys_prompt = (f"Translate the following segment from {src} into {tgt}, without additional explanation."
+                   if src else
+                   f"Translate the following segment into {tgt}, without additional explanation.")
+     outs = []
+     for piece in chunk_by_tokens(text, MAX_INPUT_TOKENS):
+         msgs = [{"role": "user", "content": f"{sys_prompt}\n\n{piece}"}]
+         inputs = tokenizer.apply_chat_template(msgs, tokenize=True, add_generation_prompt=False, return_tensors="pt")
+         out_ids = model.generate(inputs.to(DEVICE), **GEN_KW)
+         # Decode only the newly generated tokens, not the echoed prompt
+         new_ids = out_ids[0][inputs.shape[-1]:]
+         outs.append(tokenizer.decode(new_ids, skip_special_tokens=True).strip())
+     return "\n".join(outs).strip()
+
+ def translate_batch(texts: List[str], target_lang: str, source_lang: Optional[str] = None) -> List[str]:
+     return [translate_text(t, target_lang, source_lang) for t in texts]
+
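+ # Illustrative: translate_text("Hello world", "vi") normalizes "vi" to
+ # "Vietnamese", prompts the model once per chunk, and joins the chunk
+ # translations with newlines.
+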
+ # ===== Gradio UI + API =====
+ with gr.Blocks() as demo:
+     gr.Markdown("## Hunyuan-MT 7B-fp8 — Multilingual Translation (HF Free CPU)\nToken-based chunking, UI + API (Gradio).")
+
+     with gr.Tab("Single"):
+         src = gr.Textbox(label="Source text", lines=10, placeholder="Paste the text to translate…")
+         with gr.Row():
+             src_lang = gr.Textbox(label="Source language (optional)", placeholder="e.g. Vietnamese/Chinese/English…")
+             tgt_lang = gr.Dropdown(label="Target language", choices=LANG_CHOICES, value="Vietnamese")
+         out = gr.Textbox(label="Translation", lines=10)
+         gr.Button("Translate").click(translate_text, inputs=[src, tgt_lang, src_lang], outputs=out, api_name="translate_text")
+
+     with gr.Tab("Batch"):
+         src_list = gr.Textbox(label="One sentence/paragraph per line", lines=10)
+         with gr.Row():
+             src_lang_b = gr.Textbox(label="Source language (optional)")
+             tgt_lang_b = gr.Dropdown(label="Target language", choices=LANG_CHOICES, value="Vietnamese")
+         out_list = gr.Textbox(label="Results (one line per input)", lines=10)
+         def _batch(txts_raw: str, tgt: str, src_: Optional[str]):
+             texts = [x for x in txts_raw.splitlines() if x.strip()]
+             return "\n".join(translate_batch(texts, tgt, src_))
+         gr.Button("Translate batch").click(_batch, inputs=[src_list, tgt_lang_b, src_lang_b], outputs=out_list, api_name="translate_batch")
+
+ demo.queue(default_concurrency_limit=1, max_size=2).launch()  # one worker, small queue (Gradio 4+ naming)
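+
+ # The named endpoints are also callable remotely with gradio_client
+ # (a sketch; the Space id below is a placeholder):
+ #     from gradio_client import Client
+ #     client = Client("jing-ju/SPACE_NAME")
+ #     client.predict("Hello world", "Vietnamese", "", api_name="/translate_text")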