from typing import Generator, Any, Dict
import os
import subprocess
import tempfile
import zipfile
import urllib.request
import shutil
from pathlib import Path
from .base import LlmClient


TRANSLATION_SYSTEM_INSTRUCTIONS = """You are a professional translator. Translate the user's text to English. Preserve the meaning, tone, and intent exactly. Return only the English translation, no additional commentary or explanation."""


class QwenTranslatorClient(LlmClient):
    """
    Translation client that runs Qwen3-0.6B GGUF models through a pre-built llama.cpp binary.
    Translates non-English text to English so it can be processed by the English-only classifier.

    Uses pre-quantized GGUF models from unsloth/Qwen3-0.6B-GGUF, so bitsandbytes is not needed,
    and a pre-built llama.cpp binary (llama-b6995-bin-ubuntu-x64.zip) from the GitHub releases,
    so nothing has to be compiled; the binary is downloaded and extracted automatically on first use.
    Optimized for Hugging Face Spaces with lazy loading and efficient CPU inference.
    """

    # Class-level cache for the binary path
    _binary_path = None
    _binary_dir = None

    def __init__(self, config_dict: Dict[str, Any], system_prompt: str):
        super().__init__(config_dict, system_prompt)
        self.repo_id = self.config.get("repo_id", "unsloth/Qwen3-0.6B-GGUF")
        self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")
        self.temperature = float(self.config.get("temperature", 0.3))
        self.top_p = float(self.config.get("top_p", 0.9))
        self.top_k = int(self.config.get("top_k", 40))
        self.max_tokens = int(self.config.get("max_tokens", 256))
        self.context_size = int(self.config.get("context_size", 512))
        self.n_threads = int(self.config.get("n_threads", 0))  # 0 = auto-detect CPU threads
        self.n_gpu_layers = int(self.config.get("n_gpu_layers", 0))  # 0 = CPU only, >0 for GPU
        self.n_batch = int(self.config.get("n_batch", 256))
        
        # Model path will be set on first use
        self.model_path = None
        self._model_downloaded = False
        
        print(f"βœ… Qwen GGUF translator client initialized (repo: {self.repo_id}, model: {self.model_file}, will load on first use)")

    @classmethod
    def _download_binary(cls) -> str:
        """Download and extract the pre-built llama.cpp binary from GitHub releases."""
        if cls._binary_path and os.path.exists(cls._binary_path):
            return cls._binary_path
        
        print("πŸ“₯ Downloading pre-built llama.cpp binary...")
        
        # Create a temporary directory for the binary
        if cls._binary_dir is None:
            cls._binary_dir = tempfile.mkdtemp(prefix="llama_cpp_binary_")
        
        binary_dir = Path(cls._binary_dir)
        
        # Try common binary names (recent llama.cpp releases ship "llama-cli"; older builds used "main")
        possible_binary_names = ["llama-cli", "main", "llama"]
        binary_path = None
        
        # Check if any binary already exists
        for name in possible_binary_names:
            path = binary_dir / name
            if path.exists() and os.access(path, os.X_OK):
                cls._binary_path = str(path)
                print(f"βœ… Using existing binary at: {cls._binary_path}")
                return cls._binary_path
        
        # If not found, we'll search after extraction
        binary_path = binary_dir / "main"  # Default to 'main' (standard llama.cpp binary name)
        
        # Download the zip file
        zip_url = "https://github.com/ggml-org/llama.cpp/releases/download/b6995/llama-b6995-bin-ubuntu-x64.zip"
        zip_path = binary_dir / "llama-binary.zip"
        
        try:
            print(f"   Downloading from: {zip_url}")
            urllib.request.urlretrieve(zip_url, zip_path)
            print(f"   βœ… Downloaded to: {zip_path}")
            
            # Extract the zip file
            print(f"   πŸ“¦ Extracting zip file...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(binary_dir)
            
            # Find the binary in the extracted files
            # The binary might be called 'main', 'llama-cli', or 'llama'
            # It might be in the root or in a subdirectory
            found_binary = None
            
            # First, try common locations and names
            for name in possible_binary_names:
                possible_paths = [
                    binary_dir / name,
                    binary_dir / "bin" / name,
                    binary_dir / "llama-b6995-bin-ubuntu-x64" / name,
                ]
                for path in possible_paths:
                    if path.exists():
                        found_binary = path
                        break
                if found_binary:
                    break
            
            # Also search recursively for any executable file matching our names
            if found_binary is None:
                for root, dirs, files in os.walk(binary_dir):
                    for file in files:
                        if file in possible_binary_names or file.startswith("llama"):
                            candidate = Path(root) / file
                            # Check if it's executable (or at least a regular file)
                            if candidate.is_file() and os.access(candidate, os.X_OK):
                                found_binary = candidate
                                break
                    if found_binary:
                        break
            
            if found_binary is None:
                raise RuntimeError(
                    f"Could not find llama.cpp binary in extracted zip. "
                    f"Searched for: {possible_binary_names}. "
                    f"Please check the zip file structure."
                )
            
            # Make it executable
            os.chmod(found_binary, 0o755)
            
            # Move to expected location if needed (use 'main' as standard name)
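            # NOTE (assumption): if the release archive ships shared libraries (libllama,
            # libggml, ...) alongside the executable, moving only the executable could break
            # its relative rpath; in that case the sibling .so files would have to be moved
            # or copied next to it as well.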
            if found_binary != binary_path:
                if binary_path.exists():
                    binary_path.unlink()  # Remove old binary if exists
                shutil.move(str(found_binary), str(binary_path))
            
            cls._binary_path = str(binary_path)
            print(f"   βœ… Binary extracted and ready at: {cls._binary_path}")
            
            # Clean up zip file
            zip_path.unlink()
            
            return cls._binary_path
            
        except Exception as e:
            raise RuntimeError(
                f"Failed to download/extract llama.cpp binary from {zip_url}. "
                f"Error: {e}"
            ) from e

    def _download_model_if_needed(self) -> str:
        """Download GGUF model file from HuggingFace if not already cached."""
        from huggingface_hub import hf_hub_download
        
        if self._model_downloaded and self.model_path and os.path.exists(self.model_path):
            return self.model_path
        
        # Set up cache directory
        cache_dir = os.environ.get('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
        os.makedirs(cache_dir, exist_ok=True)
        
        try:
            print(f"   πŸ“₯ Downloading GGUF model: {self.model_file} from {self.repo_id}...")
            model_path = hf_hub_download(
                repo_id=self.repo_id,
                filename=self.model_file,
                cache_dir=cache_dir,
            )
            print(f"   βœ… Model downloaded/cached at: {model_path}")
            self.model_path = model_path
            self._model_downloaded = True
            return model_path
        except Exception as e:
            error_msg = (
                f"Failed to download GGUF model '{self.model_file}' from '{self.repo_id}'. "
                f"Error: {e}\n"
                f"Please verify:\n"
                f"1. The repository exists: https://huggingface.co/{self.repo_id}\n"
                f"2. The model file name is correct\n"
                f"3. You have internet connectivity"
            )
            raise RuntimeError(error_msg) from e

    def _build_translation_prompt(self, user_text: str) -> str:
        """Build a prompt for translation to English using Qwen's chat format."""
        prompt = f"""<|im_start|>system
{TRANSLATION_SYSTEM_INSTRUCTIONS}<|im_end|>
<|im_start|>user
{user_text}<|im_end|>
<|im_start|>assistant
"""
        return prompt

    def generate_content(self, prompt: str) -> str:
        """
        Translate the input text to English using the pre-built llama.cpp binary.
        Returns the English translation as a plain string.
        """
        # Download binary and model if needed
        binary_path = self._download_binary()
        model_path = self._download_model_if_needed()
        
        # Build translation prompt
        translation_prompt = self._build_translation_prompt(prompt)
        
        # Prepare command-line arguments for the llama.cpp binary, e.g.:
        #   ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 256 -c 512
        cmd = [
            binary_path,
            "-m", model_path,
            "-p", translation_prompt,
            "--temp", str(self.temperature),
            "--top-p", str(self.top_p),
            "--top-k", str(self.top_k),
            "-n", str(self.max_tokens),  # Number of tokens to generate
            "-c", str(self.context_size),  # Context size
        ]
        
        # Add thread count if specified (0 means auto-detect, which is default)
        if self.n_threads > 0:
            cmd.extend(["-t", str(self.n_threads)])
        
        # Add GPU layers if specified
        if self.n_gpu_layers > 0:
            cmd.extend(["-ngl", str(self.n_gpu_layers)])
        
        # No explicit stop sequences are passed: for Qwen chat models <|im_end|> is the
        # end-of-turn/EOS token, so generation stops there on its own, and any stray
        # chat-format tokens are stripped from the output below.
        
        try:
            # Run the binary and capture output
            print(f"   πŸ”„ Running translation with llama.cpp binary...")
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=60,  # 60 second timeout
                check=True
            )
            
            # Parse the output
            output = result.stdout.strip()
            
            # The output might include the prompt, so we need to extract just the generated part
            # Look for the assistant response after the prompt
            if "<|im_start|>assistant" in output:
                # Extract everything after the assistant tag
                output = output.split("<|im_start|>assistant")[-1].strip()
            
            # Remove any remaining chat format tokens
            translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
            
        except subprocess.TimeoutExpired:
            raise RuntimeError("Translation timed out after 60 seconds")
        except subprocess.CalledProcessError as e:
            error_output = e.stderr if e.stderr else e.stdout
            raise RuntimeError(
                f"Translation failed with llama.cpp binary. "
                f"Exit code: {e.returncode}, Error: {error_output}"
            ) from e
        except Exception as e:
            raise RuntimeError(f"Translation generation failed: {e}") from e
        
        # Clean up the response
        translated_text = translated_text.strip()
        
        # Remove common prefixes that might be added by the model
        prefixes_to_remove = [
            "English translation:",
            "Translation:",
            "English:",
            "Here is the translation:",
            "The translation is:",
            "Assistant:"
        ]
        for prefix in prefixes_to_remove:
            if translated_text.lower().startswith(prefix.lower()):
                translated_text = translated_text[len(prefix):].strip()
        
        # Remove leading/trailing quotes if present
        translated_text = translated_text.strip('"').strip("'").strip()
        
        # If translation is empty or suspiciously short, return original
        if not translated_text or len(translated_text) < len(prompt) * 0.1:
            print(f"⚠️  Translation may have failed (too short or empty), returning original text")
            return prompt
        
        return translated_text

    def generate_content_stream(self, prompt: str) -> Generator[str, None, None]:
        """
        Stream translation using llama.cpp binary.
        For simplicity, we'll collect the full response and yield it.
        True streaming can be added later if needed.
        """
        # For now, just yield the full translation (streaming can be optimized later)
        translation = self.generate_content(prompt)
        yield translation

    def _generate_content_impl(self, prompt: str) -> str:
        return self.generate_content(prompt)

    def _generate_content_stream_impl(self, prompt: str) -> Generator[Any, None, None]:
        return self.generate_content_stream(prompt)
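

# --- Usage sketch -------------------------------------------------------------
# A minimal, illustrative driver for this client. It assumes the LlmClient base
# class needs nothing beyond (config_dict, system_prompt) at construction and
# that network access is available, since the first call downloads both the
# llama.cpp binary and the GGUF model. The config values below simply mirror
# the defaults read in __init__ and can be omitted. Run with
# `python -m <package>.<this_module>` so the relative import of LlmClient resolves.
if __name__ == "__main__":
    example_config = {
        "repo_id": "unsloth/Qwen3-0.6B-GGUF",
        "model_file": "Qwen3-0.6B-IQ4_XS.gguf",
        "temperature": 0.3,
        "max_tokens": 256,
    }
    translator = QwenTranslatorClient(example_config, TRANSLATION_SYSTEM_INSTRUCTIONS)
    # Translate a short French sentence into English.
    print(translator.generate_content("Bonjour, comment allez-vous ?"))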