zazaman committed
Commit c26a471 · Parent: b2635ed

Replace llama-cpp-python with pre-built llama.cpp binary for Qwen translator

Files changed (4)
  1. Dockerfile +3 -3
  2. config.py +1 -1
  3. llm_clients/qwen_translator.py +188 -92
  4. requirements.txt +1 -2
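
In short: the Qwen translator no longer loads GGUF models through llama-cpp-python; it downloads a pre-built llama.cpp release binary (b6995, ubuntu-x64) on first use and shells out to it for each translation. An illustrative sketch of the invocation the new client assembles — the paths and prompt text below are hypothetical, and the flags simply mirror the command built in qwen_translator.py in this diff:

import subprocess

# Hypothetical paths: the client resolves both at runtime by downloading the
# b6995 release zip and the GGUF model from Hugging Face.
binary_path = "/tmp/llama_cpp_binary_example/main"
model_path = "/data/models/Qwen3-0.6B-IQ4_XS.gguf"

prompt = (
    "<|im_start|>system\nTranslate the user's text to English.<|im_end|>\n"  # stand-in for TRANSLATION_SYSTEM_INSTRUCTIONS
    "<|im_start|>user\nBonjour tout le monde<|im_end|>\n"
    "<|im_start|>assistant\n"
)

cmd = [
    binary_path,
    "-m", model_path,
    "-p", prompt,
    "--temp", "0.3",
    "--top-p", "0.9",
    "--top-k", "40",
    "-n", "256",   # max tokens to generate
    "-c", "512",   # context window
    "--stop", "<|im_end|>",
    "--stop", "<|im_start|>",
]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=60, check=True)
print(result.stdout.strip())

This trades an in-process API for one subprocess per call, which is what lets the Dockerfile drop cmake/make: nothing has to be compiled in the image.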
Dockerfile CHANGED
@@ -3,13 +3,13 @@ FROM python:3.10-slim
 # Set working directory
 WORKDIR /app

-# Install system dependencies for PDF processing, llama-cpp-python compilation, and other requirements
+# Install system dependencies for PDF processing and other requirements
+# Note: llama.cpp binary is downloaded at runtime, no compilation needed
 RUN apt-get update && apt-get install -y \
     gcc \
     g++ \
-    cmake \
-    make \
     git \
+    unzip \
     && rm -rf /var/lib/apt/lists/*

 # Create a user to avoid running as root
config.py CHANGED
@@ -28,7 +28,7 @@ AI_DETECTION_MODE = {
 # Uses pre-quantized GGUF models from unsloth - no bitsandbytes needed. Works on Hugging Face Spaces.
 NON_ENGLISH_TRANSLATOR = {
     "enabled": True,
-    "provider": "qwen_translator",  # Translation client using GGUF models via llama-cpp-python
+    "provider": "qwen_translator",  # Translation client using GGUF models via pre-built llama.cpp binary
     "config": {
         # GGUF model repository and file from unsloth (pre-quantized)
         "repo_id": "unsloth/Qwen3-0.6B-GGUF",
llm_clients/qwen_translator.py CHANGED
@@ -1,5 +1,11 @@
 from typing import Generator, Any, Dict
 import os
+import subprocess
+import tempfile
+import zipfile
+import urllib.request
+import shutil
+from pathlib import Path
 from .base import LlmClient


@@ -8,17 +14,23 @@ TRANSLATION_SYSTEM_INSTRUCTIONS = """You are a professional translator. Translat

 class QwenTranslatorClient(LlmClient):
     """
-    Translation client using Qwen3-0.6B-GGUF pre-quantized models via llama-cpp-python.
+    Translation client using Qwen3-0.6B-GGUF pre-quantized models via pre-built llama.cpp binary.
     Translates non-English text to English so it can be processed by the English-only classifier.

     Uses GGUF format models from unsloth/Qwen3-0.6B-GGUF - already quantized, no bitsandbytes needed.
+    Uses pre-built llama.cpp binary (llama-b6995-bin-ubuntu-x64.zip) from GitHub releases - no compilation needed.
+    The binary is automatically downloaded and extracted on first use.
     Optimized for Hugging Face Spaces with lazy loading and efficient CPU inference.
     """

+    # Class-level cache for the binary path
+    _binary_path = None
+    _binary_dir = None
+
     def __init__(self, config_dict: Dict[str, Any], system_prompt: str):
         super().__init__(config_dict, system_prompt)
         self.repo_id = self.config.get("repo_id", "unsloth/Qwen3-0.6B-GGUF")
-        self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")  # Default to IQ4_XS for good balance
+        self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")
         self.temperature = float(self.config.get("temperature", 0.3))
         self.top_p = float(self.config.get("top_p", 0.9))
         self.top_k = int(self.config.get("top_k", 40))
@@ -26,31 +38,131 @@ class QwenTranslatorClient(LlmClient):
         self.context_size = int(self.config.get("context_size", 512))
         self.n_threads = int(self.config.get("n_threads", 0))  # 0 = auto-detect CPU threads
         self.n_gpu_layers = int(self.config.get("n_gpu_layers", 0))  # 0 = CPU only, >0 for GPU
-        self.n_batch = int(self.config.get("n_batch", 256))  # Batch size for prompt processing
+        self.n_batch = int(self.config.get("n_batch", 256))

-        # Model will be loaded lazily on first use
-        self.llm = None
-        self._model_loaded = False
+        # Model path will be set on first use
+        self.model_path = None
+        self._model_downloaded = False

         print(f"✅ Qwen GGUF translator client initialized (repo: {self.repo_id}, model: {self.model_file}, will load on first use)")

+    @classmethod
+    def _download_binary(cls) -> str:
+        """Download and extract the pre-built llama.cpp binary from GitHub releases."""
+        if cls._binary_path and os.path.exists(cls._binary_path):
+            return cls._binary_path
+
+        print("📥 Downloading pre-built llama.cpp binary...")
+
+        # Create a temporary directory for the binary
+        if cls._binary_dir is None:
+            cls._binary_dir = tempfile.mkdtemp(prefix="llama_cpp_binary_")
+
+        binary_dir = Path(cls._binary_dir)
+
+        # Try common binary names (main is the standard, but some releases use llama-cli)
+        possible_binary_names = ["main", "llama-cli", "llama"]
+        binary_path = None
+
+        # Check if any binary already exists
+        for name in possible_binary_names:
+            path = binary_dir / name
+            if path.exists() and os.access(path, os.X_OK):
+                cls._binary_path = str(path)
+                print(f"✅ Using existing binary at: {cls._binary_path}")
+                return cls._binary_path
+
+        # If not found, we'll search after extraction
+        binary_path = binary_dir / "main"  # Default to 'main' (standard llama.cpp binary name)
+
+        # Download the zip file
+        zip_url = "https://github.com/ggml-org/llama.cpp/releases/download/b6995/llama-b6995-bin-ubuntu-x64.zip"
+        zip_path = binary_dir / "llama-binary.zip"
+
+        try:
+            print(f" Downloading from: {zip_url}")
+            urllib.request.urlretrieve(zip_url, zip_path)
+            print(f" ✅ Downloaded to: {zip_path}")
+
+            # Extract the zip file
+            print(f" 📦 Extracting zip file...")
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(binary_dir)
+
+            # Find the binary in the extracted files
+            # The binary might be called 'main', 'llama-cli', or 'llama'
+            # It might be in the root or in a subdirectory
+            found_binary = None
+
+            # First, try common locations and names
+            for name in possible_binary_names:
+                possible_paths = [
+                    binary_dir / name,
+                    binary_dir / "bin" / name,
+                    binary_dir / "llama-b6995-bin-ubuntu-x64" / name,
+                ]
+                for path in possible_paths:
+                    if path.exists():
+                        found_binary = path
+                        break
+                if found_binary:
+                    break
+
+            # Also search recursively for any executable file matching our names
+            if found_binary is None:
+                for root, dirs, files in os.walk(binary_dir):
+                    for file in files:
+                        if file in possible_binary_names or file.startswith("llama"):
+                            candidate = Path(root) / file
+                            # Check if it's executable (or at least a regular file)
+                            if candidate.is_file() and os.access(candidate, os.X_OK):
+                                found_binary = candidate
+                                break
+                    if found_binary:
+                        break
+
+            if found_binary is None:
+                raise RuntimeError(
+                    f"Could not find llama.cpp binary in extracted zip. "
+                    f"Searched for: {possible_binary_names}. "
+                    f"Please check the zip file structure."
+                )
+
+            # Make it executable
+            os.chmod(found_binary, 0o755)
+
+            # Move to expected location if needed (use 'main' as standard name)
+            if found_binary != binary_path:
+                if binary_path.exists():
+                    binary_path.unlink()  # Remove old binary if exists
+                shutil.move(str(found_binary), str(binary_path))
+
+            cls._binary_path = str(binary_path)
+            print(f" ✅ Binary extracted and ready at: {cls._binary_path}")
+
+            # Clean up zip file
+            zip_path.unlink()
+
+            return cls._binary_path
+
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to download/extract llama.cpp binary from {zip_url}. "
+                f"Error: {e}"
+            ) from e
+
     def _download_model_if_needed(self) -> str:
         """Download GGUF model file from HuggingFace if not already cached."""
-        from huggingface_hub import hf_hub_download, list_repo_files
-        import os
+        from huggingface_hub import hf_hub_download
+
+        if self._model_downloaded and self.model_path and os.path.exists(self.model_path):
+            return self.model_path

         # Set up cache directory
         cache_dir = os.environ.get('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
         os.makedirs(cache_dir, exist_ok=True)

         try:
-            # First, try to list available files to help with debugging
-            try:
-                repo_files = list_repo_files(repo_id=self.repo_id, repo_type="model")
-                print(f" 📋 Available files in {self.repo_id}: {[f for f in repo_files if f.endswith('.gguf')][:5]}...")
-            except Exception:
-                pass  # Ignore if we can't list files
-
             print(f" 📥 Downloading GGUF model: {self.model_file} from {self.repo_id}...")
             model_path = hf_hub_download(
                 repo_id=self.repo_id,
@@ -59,6 +171,8 @@ class QwenTranslatorClient(LlmClient):
                 resume_download=True
             )
             print(f" ✅ Model downloaded/cached at: {model_path}")
+            self.model_path = model_path
+            self._model_downloaded = True
             return model_path
         except Exception as e:
             error_msg = (
@@ -66,62 +180,13 @@ class QwenTranslatorClient(LlmClient):
                 f"Error: {e}\n"
                 f"Please verify:\n"
                 f"1. The repository exists: https://huggingface.co/{self.repo_id}\n"
-                f"2. The model file name is correct (check available .gguf files in the repo)\n"
-                f"3. You have internet connectivity\n"
-                f"Common file names: Qwen3-0.6B-Base-Q4_K_M.gguf, qwen3-0.6b-base-q4_k_m.gguf, etc."
+                f"2. The model file name is correct\n"
+                f"3. You have internet connectivity"
             )
             raise RuntimeError(error_msg) from e

-    def _load_model(self):
-        """Lazy load the GGUF model on first use."""
-        if self._model_loaded:
-            return
-
-        try:
-            from llama_cpp import Llama
-
-            print(f"🔄 Loading GGUF translation model: {self.model_file}")
-
-            # Download model if needed
-            model_path = self._download_model_if_needed()
-
-            # Load the GGUF model with llama-cpp-python
-            print(f" 📥 Loading model from: {model_path}")
-
-            # Optimize for speed: use mmap for faster loading, no memory locking
-            self.llm = Llama(
-                model_path=model_path,
-                n_ctx=self.context_size,  # Context window size (smaller = faster)
-                n_threads=self.n_threads if self.n_threads > 0 else None,  # Auto-detect if 0
-                n_gpu_layers=self.n_gpu_layers,  # 0 = CPU only, >0 for GPU layers
-                verbose=False,  # Suppress verbose output
-                use_mlock=False,  # Don't lock memory (faster, better for Spaces)
-                use_mmap=True,  # Use memory mapping for faster loading
-                n_batch=self.n_batch,  # Batch size (smaller = faster for short prompts)
-                n_predict=self.max_tokens,  # Max tokens to predict
-            )
-
-            self._model_loaded = True
-            actual_threads = self.llm.n_threads if hasattr(self.llm, 'n_threads') else self.n_threads
-            print(f"✅ GGUF translation model loaded successfully")
-            print(f" Context size: {self.context_size} (reduced for faster inference)")
-            print(f" CPU threads: {actual_threads} ({'auto-detected' if self.n_threads == 0 else 'manual'})")
-            print(f" GPU layers: {self.n_gpu_layers} (0 = CPU only, >0 for GPU acceleration)")
-            print(f" Batch size: {self.n_batch}")
-
-        except ImportError as e:
-            raise ImportError(
-                f"llama-cpp-python library is required for QwenTranslatorClient with GGUF models. "
-                f"Install it with: pip install llama-cpp-python\n"
-                f"Original error: {e}"
-            ) from e
-        except Exception as e:
-            raise RuntimeError(f"Failed to load GGUF translation model {self.model_file}: {e}") from e
-
     def _build_translation_prompt(self, user_text: str) -> str:
         """Build a prompt for translation to English using Qwen's chat format."""
-        # Qwen3 uses a specific chat template format: <|im_start|>role\ncontent<|im_end|>
-        # System prompt handles the translation instruction, user just provides the text
         prompt = f"""<|im_start|>system
 {TRANSLATION_SYSTEM_INSTRUCTIONS}<|im_end|>
 <|im_start|>user
@@ -132,44 +197,76 @@ class QwenTranslatorClient(LlmClient):

     def generate_content(self, prompt: str) -> str:
         """
-        Translate the input text to English.
+        Translate the input text to English using the pre-built llama.cpp binary.
         Returns the English translation as a plain string.
         """
-        # Load model if not already loaded (lazy loading)
-        if not self._model_loaded:
-            self._load_model()
+        # Download binary and model if needed
+        binary_path = self._download_binary()
+        model_path = self._download_model_if_needed()

         # Build translation prompt
         translation_prompt = self._build_translation_prompt(prompt)

-        # Generate translation using llama-cpp-python
+        # Prepare command-line arguments for llama.cpp binary
+        # Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 256 -c 512 -t 0
+        cmd = [
+            binary_path,
+            "-m", model_path,
+            "-p", translation_prompt,
+            "--temp", str(self.temperature),
+            "--top-p", str(self.top_p),
+            "--top-k", str(self.top_k),
+            "-n", str(self.max_tokens),  # Number of tokens to generate
+            "-c", str(self.context_size),  # Context size
+        ]
+
+        # Add thread count if specified (0 means auto-detect, which is default)
+        if self.n_threads > 0:
+            cmd.extend(["-t", str(self.n_threads)])
+
+        # Add GPU layers if specified
+        if self.n_gpu_layers > 0:
+            cmd.extend(["-ngl", str(self.n_gpu_layers)])
+
+        # Add stop sequences (llama.cpp uses --stop for each stop token)
+        cmd.extend(["--stop", "<|im_end|>", "--stop", "<|im_start|>"])
+
         try:
-            # Optimize generation for speed
-            response = self.llm(
-                translation_prompt,
-                max_tokens=self.max_tokens,
-                temperature=self.temperature,
-                top_p=self.top_p,
-                top_k=self.top_k,
-                stop=["<|im_end|>", "<|im_start|>"],  # Stop at chat format tokens
-                echo=False,  # Don't echo the prompt
-                repeat_penalty=1.1,  # Slight penalty to avoid repetition (faster)
+            # Run the binary and capture output
+            print(f" 🔄 Running translation with llama.cpp binary...")
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=60,  # 60 second timeout
+                check=True
             )

-            # Extract the generated text
-            if 'choices' in response and len(response['choices']) > 0:
-                generated_text = response['choices'][0]['text'].strip()
-            else:
-                raise ValueError("Empty response from GGUF model")
+            # Parse the output
+            output = result.stdout.strip()
+
+            # The output might include the prompt, so we need to extract just the generated part
+            # Look for the assistant response after the prompt
+            if "<|im_start|>assistant" in output:
+                # Extract everything after the assistant tag
+                output = output.split("<|im_start|>assistant")[-1].strip()
+
+            # Remove any remaining chat format tokens
+            translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()

+        except subprocess.TimeoutExpired:
+            raise RuntimeError("Translation timed out after 60 seconds")
+        except subprocess.CalledProcessError as e:
+            error_output = e.stderr if e.stderr else e.stdout
+            raise RuntimeError(
+                f"Translation failed with llama.cpp binary. "
+                f"Exit code: {e.returncode}, Error: {error_output}"
+            ) from e
         except Exception as e:
             raise RuntimeError(f"Translation generation failed: {e}") from e

         # Clean up the response
-        translated_text = generated_text.strip()
-
-        # Remove any remaining chat format tokens
-        translated_text = translated_text.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
+        translated_text = translated_text.strip()

         # Remove common prefixes that might be added by the model
         prefixes_to_remove = [
@@ -189,7 +286,6 @@ class QwenTranslatorClient(LlmClient):

         # If translation is empty or suspiciously short, return original
         if not translated_text or len(translated_text) < len(prompt) * 0.1:
-            # Model might not have translated properly, return original
             print(f"⚠️ Translation may have failed (too short or empty), returning original text")
             return prompt

@@ -197,7 +293,7 @@ class QwenTranslatorClient(LlmClient):

     def generate_content_stream(self, prompt: str) -> Generator[str, None, None]:
         """
-        Stream translation using llama-cpp-python streaming.
+        Stream translation using llama.cpp binary.
         For simplicity, we'll collect the full response and yield it.
         True streaming can be added later if needed.
         """
requirements.txt CHANGED
@@ -10,5 +10,4 @@ sentence-transformers
 accelerate
 PyMuPDF
 python-docx
-huggingface-hub
-llama-cpp-python>=0.2.0
+huggingface-hub