jing-ju committed
Commit 6bb64e2 · verified · 1 Parent(s): 0fdedb6

Update app.py

Files changed (1)
  1. app.py +232 -349
app.py CHANGED
@@ -1,390 +1,273 @@
- import os
- import torch
- import re
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import gradio as gr
-
- # Environment variables
- MODEL_NAME = os.getenv("MODEL_NAME", "tencent/Hunyuan-MT-7B-fp8")
- MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "800"))
-
- # Generation parameters optimized for CPU
- GEN_KW = dict(
-     max_new_tokens=256,
-     top_k=20,
-     top_p=0.6,
-     repetition_penalty=1.05,
-     temperature=0.7,
-     do_sample=True,
  )

- # Language mapping for normalization
- LANGUAGE_MAPPING = {
-     "vi": "Vietnamese",
-     "vietnamese": "Vietnamese",
-     "tiếng việt": "Vietnamese",
-     "zh": "Chinese",
-     "chinese": "Chinese",
-     "tiếng trung": "Chinese",
-     "中文": "Chinese",
-     "en": "English",
-     "english": "English",
-     "tiếng anh": "English",
-     "ja": "Japanese",
-     "japanese": "Japanese",
-     "tiếng nhật": "Japanese",
-     "日本語": "Japanese",
-     "ko": "Korean",
-     "korean": "Korean",
-     "tiếng hàn": "Korean",
-     "한국어": "Korean",
-     "fr": "French",
-     "french": "French",
-     "tiếng pháp": "French",
-     "de": "German",
-     "german": "German",
-     "tiếng đức": "German",
-     "es": "Spanish",
-     "spanish": "Spanish",
-     "tiếng tây ban nha": "Spanish",
-     "th": "Thai",
-     "thai": "Thai",
-     "tiếng thái": "Thai",
-     "id": "Indonesian",
-     "indonesian": "Indonesian",
-     "tiếng indonesia": "Indonesian",
-     "ms": "Malay",
-     "malay": "Malay",
-     "tiếng malaysia": "Malay",
-     "pt": "Portuguese",
-     "portuguese": "Portuguese",
-     "tiếng bồ đào nha": "Portuguese",
-     "ru": "Russian",
-     "russian": "Russian",
-     "tiếng nga": "Russian",
- }
-
- SUPPORTED_LANGUAGES = [
-     "Vietnamese", "Chinese", "English", "Japanese", "Korean",
-     "French", "German", "Spanish", "Thai", "Indonesian",
-     "Malay", "Portuguese", "Russian"
- ]
-
- def normalize_language(lang):
-     """Normalize language name"""
-     if not lang:
-         return None
-     lang_lower = lang.strip().lower()
-     return LANGUAGE_MAPPING.get(lang_lower, lang.strip())
-
- def load_model():
-     """Load model and tokenizer with fp8 quantization config"""
-     print(f"Loading model: {MODEL_NAME}")
-
-     # Load tokenizer
-     tokenizer = AutoTokenizer.from_pretrained(
-         MODEL_NAME,
-         trust_remote_code=True
-     )
-
-     # Create quantization config for fp8 - must use the actual class
      try:
-         from compressed_tensors import CompressedTensorsConfig
-         quantization_config = CompressedTensorsConfig(
-             quantization_method="fp8",
-             ignore=[]
          )
-         print("Using CompressedTensorsConfig")
-     except ImportError:
          try:
-             from transformers.quantizers import CompressedTensorsQuantizationConfig
-             quantization_config = CompressedTensorsQuantizationConfig(
-                 quantization_method="fp8",
-                 ignore=[]
              )
-             print("Using CompressedTensorsQuantizationConfig")
-         except ImportError:
-             # If both fail, load without custom quantization config
-             print("Loading model without custom quantization config")
-             quantization_config = None
-
-     # Load model with quantization config
-     model_kwargs = {
-         "trust_remote_code": True,
-         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
-     }
-
-     if quantization_config is not None:
-         model_kwargs["quantization_config"] = quantization_config
-
-     model = AutoModelForCausalLM.from_pretrained(
-         MODEL_NAME,
-         **model_kwargs
-     )
-
-     return tokenizer, model
-
- def chunk_text_by_tokens(text, tokenizer, max_tokens):
-     """Split text into chunks based on token count"""
-     if not text.strip():
-         return []
-
-     # First, try splitting by sentence delimiters
-     sentences = re.split(r'[.!?。!?]', text)
-     chunks = []
-     current_chunk = ""
-
-     for sentence in sentences:
-         sentence = sentence.strip()
-         if not sentence:
-             continue
-
-         test_chunk = current_chunk + " " + sentence if current_chunk else sentence
-
-         # Estimate token length
-         try:
-             token_count = len(tokenizer.encode(test_chunk, add_special_tokens=False))
-         except:
-             token_count = len(test_chunk.split()) * 1.3  # rough estimation
-
-         if token_count <= max_tokens:
-             current_chunk = test_chunk
-         else:
-             if current_chunk:
-                 chunks.append(current_chunk.strip())
-
-             # If single sentence is too long, split it forcefully
-             try:
-                 sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
-                 if len(sentence_tokens) > max_tokens:
-                     for i in range(0, len(sentence_tokens), max_tokens):
-                         chunk_tokens = sentence_tokens[i:i + max_tokens]
-                         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
-                         chunks.append(chunk_text)
-                     current_chunk = ""
-                 else:
-                     current_chunk = sentence
-             except:
-                 current_chunk = sentence
-
-     if current_chunk:
-         chunks.append(current_chunk.strip())
-
-     return chunks
-
- def translate_text_chunk(text, target_lang, source_lang, tokenizer, model):
-     """Translate a single chunk of text"""
-     target_lang = normalize_language(target_lang)
-     source_lang = normalize_language(source_lang) if source_lang else None
-
-     if not target_lang:
-         return "Error: Invalid target language"
-
-     # Create prompt
-     if source_lang:
-         prompt = f"Translate the following segment from {source_lang} into {target_lang}, without additional explanation.\n\n{text}"
-     else:
-         prompt = f"Translate the following segment into {target_lang}, without additional explanation.\n\n{text}"
-
-     # Apply chat template
      try:
-         messages = [{"role": "user", "content": prompt}]
-         input_text = tokenizer.apply_chat_template(
-             messages,
-             tokenize=False,
-             add_generation_prompt=True
          )
-     except:
-         # Fallback if chat template fails
-         input_text = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
-     # Tokenize
-     inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
-
-     # Generate
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             **GEN_KW,
-             pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else tokenizer.pad_token_id
          )
-
-     # Decode
-     response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
-     return response.strip()
-
- def translate_single(text, target_lang, source_lang, tokenizer, model):
-     """Translate text with automatic chunking"""
-     if not text.strip():
-         return "Please enter text to translate."
-
-     if not target_lang:
-         return "Please select a target language."
-
-     try:
-         # Split into chunks
-         chunks = chunk_text_by_tokens(text, tokenizer, MAX_INPUT_TOKENS)
-
-         if not chunks:
-             return "No valid text to translate."
-
-         # Translate each chunk
-         translations = []
-         for chunk in chunks:
-             translation = translate_text_chunk(chunk, target_lang, source_lang, tokenizer, model)
-             translations.append(translation)
-
-         return " ".join(translations)
-
-     except Exception as e:
-         return f"Translation error: {str(e)}"
-
- def translate_batch(text_lines, target_lang, source_lang, tokenizer, model):
-     """Translate multiple lines of text"""
-     if not text_lines.strip():
-         return "Please enter text lines to translate."
-
-     if not target_lang:
-         return "Please select a target language."
-
-     lines = [line.strip() for line in text_lines.split('\n') if line.strip()]
-
-     if not lines:
-         return "No valid text lines to translate."
-
-     try:
-         results = []
-         for line in lines:
-             translation = translate_single(line, target_lang, source_lang, tokenizer, model)
-             results.append(translation)
-
-         return '\n'.join(results)
-
      except Exception as e:
-         return f"Batch translation error: {str(e)}"
-
- # Load model and tokenizer
- print("Initializing model...")
- try:
-     tokenizer, model = load_model()
-     device = model.device
-     print(f"Model loaded successfully on device: {device}")
- except Exception as e:
-     print(f"Error loading model: {e}")
-     # Create dummy functions for interface
-     tokenizer = None
-     model = None
-
-     def dummy_translate(text, target_lang, source_lang):
-         return f"Model loading failed: {e}"
-
-     translate_single = dummy_translate
-     translate_batch = lambda text_lines, target_lang, source_lang, *args: dummy_translate(text_lines, target_lang, source_lang)

  # Create Gradio interface
- with gr.Blocks(title="Hunyuan-MT Multi-language Translation") as demo:
-     gr.Markdown("# 🌍 Hunyuan-MT Multi-language Translation")
-     gr.Markdown(f"**Model**: {MODEL_NAME}")
-     gr.Markdown("⚠️ **Note**: Running on Free CPU - translation may be slow and length is limited.")
-
-     with gr.Tabs():
-         with gr.TabItem("Single Translation"):
-             with gr.Row():
-                 with gr.Column():
-                     input_text = gr.Textbox(
-                         label="Text to translate",
-                         placeholder="Enter your text here...",
-                         lines=5
-                     )
-                     target_lang = gr.Dropdown(
-                         choices=SUPPORTED_LANGUAGES,
-                         label="Target Language",
-                         value="Vietnamese"
-                     )
-                     source_lang = gr.Textbox(
-                         label="Source Language (optional)",
-                         placeholder="Leave empty for auto-detection"
-                     )
-                     translate_btn = gr.Button("Translate", variant="primary")
-
-                 with gr.Column():
-                     output_text = gr.Textbox(
-                         label="Translation",
-                         lines=5,
-                         interactive=False
-                     )
-
-             if tokenizer and model:
-                 translate_btn.click(
-                     fn=lambda text, tgt, src: translate_single(text, tgt, src, tokenizer, model),
-                     inputs=[input_text, target_lang, source_lang],
-                     outputs=output_text,
-                     api_name="translate_text"
                  )
-             else:
-                 translate_btn.click(
-                     fn=lambda text, tgt, src: translate_single(text, tgt, src),
-                     inputs=[input_text, target_lang, source_lang],
-                     outputs=output_text,
-                     api_name="translate_text"
                  )
-
-         with gr.TabItem("Batch Translation"):
-             with gr.Row():
-                 with gr.Column():
-                     batch_input = gr.Textbox(
-                         label="Text lines to translate (one per line)",
-                         placeholder="Line 1\nLine 2\nLine 3...",
-                         lines=8
-                     )
-                     batch_target_lang = gr.Dropdown(
-                         choices=SUPPORTED_LANGUAGES,
-                         label="Target Language",
-                         value="Vietnamese"
-                     )
-                     batch_source_lang = gr.Textbox(
-                         label="Source Language (optional)",
-                         placeholder="Leave empty for auto-detection"
-                     )
-                     batch_translate_btn = gr.Button("Translate Batch", variant="primary")
-
-                 with gr.Column():
-                     batch_output = gr.Textbox(
-                         label="Batch Translation Results",
-                         lines=8,
-                         interactive=False
-                     )
-
-             if tokenizer and model:
-                 batch_translate_btn.click(
-                     fn=lambda text, tgt, src: translate_batch(text, tgt, src, tokenizer, model),
-                     inputs=[batch_input, batch_target_lang, batch_source_lang],
-                     outputs=batch_output,
-                     api_name="translate_batch"
-                 )
-             else:
-                 batch_translate_btn.click(
-                     fn=lambda text, tgt, src: translate_batch(text, tgt, src),
-                     inputs=[batch_input, batch_target_lang, batch_source_lang],
-                     outputs=batch_output,
-                     api_name="translate_batch"
-                 )
-
-     gr.Markdown("### API Usage")
-     gr.Markdown("""
-     ```python
-     from gradio_client import Client
-
-     client = Client("YOUR_SPACE_URL")
-
-     # Single translation
-     result = client.predict("你好", "Vietnamese", None, api_name="/translate_text")
-
-     # Batch translation
-     result = client.predict("你好\\n再见", "Vietnamese", None, api_name="/translate_batch")
-     ```
-     """)
-
- # Launch the app
  if __name__ == "__main__":
-     demo.queue(concurrency_count=1, max_size=2).launch()
  import gradio as gr
+ import torch
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     BitsAndBytesConfig
  )
+ import logging
+ import gc
+ import psutil
+ import os
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Global variables
+ tokenizer = None
+ model = None
+
+ def get_memory_usage():
+     """Get current memory usage"""
+     process = psutil.Process(os.getpid())
+     return process.memory_info().rss / 1024 / 1024 / 1024  # GB
+
+ def load_model_optimized():
+     """Load model with maximum optimization for CPU"""
+     global tokenizer, model
+
+     if model is not None:
+         return model, tokenizer
+
+     model_name = "tencent/Hunyuan-MT-7B-fp8"
+     logger.info(f"Loading {model_name} with optimizations...")
+     logger.info(f"Memory before loading: {get_memory_usage():.2f} GB")
+
      try:
+         # Load tokenizer first
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_name,
+             trust_remote_code=True
          )
+
+         # Load model with aggressive optimizations
+         model = AutoModelForSeq2SeqLM.from_pretrained(
+             model_name,
+             torch_dtype=torch.float16,  # Half precision
+             device_map="cpu",
+             low_cpu_mem_usage=True,  # Reduce memory usage
+             trust_remote_code=True,
+             use_cache=False,  # Disable KV cache
+             offload_folder="./offload",  # Offload to disk if needed
+         )
+
+         # Additional optimizations
+         model.eval()  # Set to evaluation mode
+
+         # Enable torch optimizations
+         torch.set_num_threads(2)  # Limit threads
+
+         logger.info(f"Memory after loading: {get_memory_usage():.2f} GB")
+         logger.info("Model loaded successfully!")
+
+         return model, tokenizer
+
+     except Exception as e:
+         logger.error(f"Error loading model: {e}")
+         # Try fallback with 8-bit quantization
          try:
+             logger.info("Trying 8-bit quantization...")
+             quantization_config = BitsAndBytesConfig(
+                 load_in_8bit=True,
+                 llm_int8_enable_fp32_cpu_offload=True
              )
+
+             model = AutoModelForSeq2SeqLM.from_pretrained(
+                 model_name,
+                 quantization_config=quantization_config,
+                 device_map="auto",
+                 trust_remote_code=True,
+                 low_cpu_mem_usage=True
+             )
+
+             logger.info("8-bit model loaded!")
+             return model, tokenizer
+
+         except Exception as e2:
+             logger.error(f"8-bit loading also failed: {e2}")
+             raise e2
+
+ def translate_text_optimized(
+     text: str,
+     source_lang: str = "auto",
+     target_lang: str = "en"
+ ) -> str:
+     """Optimized translation function"""
+
+     if not text.strip():
+         return "Please enter text to translate"
+
+     # Memory cleanup before translation
+     gc.collect()
+     if torch.cuda.is_available(): torch.cuda.empty_cache()
+
      try:
+         model, tokenizer = load_model_optimized()
+
+         # Format input
+         if source_lang == "auto":
+             input_text = f"Translate to {target_lang}: {text}"
+         else:
+             input_text = f"Translate from {source_lang} to {target_lang}: {text}"
+
+         logger.info(f"Translating: {input_text[:50]}...")
+         start_memory = get_memory_usage()
+
+         # Tokenize with truncation
+         inputs = tokenizer(
+             input_text,
+             return_tensors="pt",
+             max_length=512,  # Limit input length
+             truncation=True,
+             padding=False  # No padding for single input
          )
+
+         # Generate with minimal settings
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=256,  # Limit output length
+                 min_length=1,
+                 num_beams=2,  # Reduce beams for speed
+                 early_stopping=True,
+                 do_sample=False,
+                 pad_token_id=tokenizer.pad_token_id,
+                 eos_token_id=tokenizer.eos_token_id,
+                 use_cache=False  # Disable cache
+             )
+
+         # Decode output
+         translated_text = tokenizer.decode(
+             outputs[0],
+             skip_special_tokens=True
          )
+
+         # Clean output
+         if ":" in translated_text:
+             translated_text = translated_text.split(":", 1)[-1].strip()
+
+         # Memory cleanup after translation
+         del inputs, outputs
+         gc.collect()
+
+         end_memory = get_memory_usage()
+         logger.info(f"Translation completed. Memory: {start_memory:.2f}GB -> {end_memory:.2f}GB")
+
+         return translated_text
+
      except Exception as e:
+         logger.error(f"Translation error: {e}")
+         gc.collect()  # Cleanup on error
+         return f"Translation failed: {str(e)}"
+
+ # Language mapping
+ LANGUAGES = {
+     "auto": "Auto Detect",
+     "en": "English",
+     "zh": "Chinese",
+     "vi": "Vietnamese",
+     "ja": "Japanese",
+     "ko": "Korean",
+     "th": "Thai",
+     "id": "Indonesian",
+     "ms": "Malay",
+     "fil": "Filipino"
+ }
+
  # Create Gradio interface
+ with gr.Blocks(
+     title="Hunyuan-MT Translation (CPU Optimized)",
+     theme=gr.themes.Monochrome(),
+ ) as demo:
+
+     gr.HTML("""
+     <div style="text-align: center; margin: 20px;">
+         <h1>🧠 Hunyuan-MT-7B Translation</h1>
+         <p><strong>CPU Optimized Version</strong></p>
+         <p><em>⚠️ First translation may take 1-2 minutes to load model</em></p>
+     </div>
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             input_text = gr.Textbox(
+                 label="Input Text",
+                 placeholder="Enter text to translate (max 200 words for best performance)...",
+                 lines=4,
+                 max_lines=8
+             )
+
+             with gr.Row():
+                 source_lang = gr.Dropdown(
+                     choices=[(name, code) for code, name in LANGUAGES.items()],
+                     label="From",
+                     value="auto"
                  )
+                 target_lang = gr.Dropdown(
+                     choices=[(name, code) for code, name in LANGUAGES.items() if code != "auto"],
+                     label="To",
+                     value="en"
                  )
+
+             translate_btn = gr.Button(
+                 "🔄 Translate",
+                 variant="primary",
+                 size="lg"
+             )
+
+         with gr.Column():
+             output_text = gr.Textbox(
+                 label="Translation",
+                 lines=4,
+                 max_lines=8,
+                 interactive=False
+             )
+
+             memory_display = gr.Textbox(
+                 label="System Status",
+                 value="Ready",
+                 interactive=False
+             )
+
+     # Memory monitoring
+     def update_memory():
+         return f"Memory: {get_memory_usage():.1f}GB / 16GB"
+
+     def translate_with_status(text, src, tgt):
+         if len(text.split()) > 100:  # Limit word count
+             return "Please limit input to 100 words for optimal performance", update_memory()
+
+         result = translate_text_optimized(text, src, tgt)
+         return result, update_memory()
+
+     # Examples for testing
+     gr.Examples(
+         examples=[
+             ["Hello, how are you?", "en", "vi"],
+             ["Xin chào", "vi", "en"],
+             ["Good morning", "en", "zh"],
+             ["Thank you very much", "en", "ja"],
+         ],
+         inputs=[input_text, source_lang, target_lang],
+         outputs=[output_text, memory_display],
+         fn=translate_with_status
+     )
+
+     translate_btn.click(
+         fn=translate_with_status,
+         inputs=[input_text, source_lang, target_lang],
+         outputs=[output_text, memory_display]
+     )
+
+     # Auto-update memory display
+     demo.load(fn=update_memory, outputs=memory_display)
+
+ # Launch with specific settings for HF Spaces
  if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_api=True,
+         enable_monitoring=False  # Disable to save resources
+     )
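
This commit removes the documented `/translate_text` and `/translate_batch` endpoints but keeps `show_api=True`, and the new click handler is registered without an explicit `api_name`, so Gradio exposes it under the handler's function name. A minimal client sketch, assuming the default `/translate_with_status` endpoint name and a placeholder Space URL:

```python
from gradio_client import Client

# Placeholder URL; substitute the actual Space ID.
client = Client("YOUR_SPACE_URL")

# With no explicit api_name on translate_btn.click, the endpoint defaults to
# the handler's function name. The handler returns two outputs:
# the translation and the memory-status string.
translation, status = client.predict(
    "Hello, how are you?",  # text to translate
    "en",                   # source language code ("auto" to let the model infer)
    "vi",                   # target language code
    api_name="/translate_with_status",
)
print(translation, status)
```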