from typing import Generator, Any, Dict import os import sys import platform import subprocess import tempfile import zipfile import urllib.request import shutil import re from pathlib import Path from .base import LlmClient TRANSLATION_SYSTEM_INSTRUCTIONS = """Translate the text to English. Output only the translation, nothing else.""" class QwenTranslatorClient(LlmClient): """ Translation client using Qwen3-0.6B-GGUF pre-quantized models via pre-built llama.cpp binary. Translates non-English text to English so it can be processed by the English-only classifier. Uses GGUF format models from unsloth/Qwen3-0.6B-GGUF - already quantized, no bitsandbytes needed. Uses pre-built llama.cpp binary (llama-b6995-bin-ubuntu-x64.zip) from GitHub releases - no compilation needed. The binary is automatically downloaded and extracted on first use. Optimized for Hugging Face Spaces with lazy loading and efficient CPU inference. """ # Class-level cache for the binary path _binary_path = None _binary_dir = None def __init__(self, config_dict: Dict[str, Any], system_prompt: str): super().__init__(config_dict, system_prompt) self.repo_id = self.config.get("repo_id", "unsloth/Qwen3-0.6B-GGUF") self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf") self.temperature = float(self.config.get("temperature", 0.3)) self.top_p = float(self.config.get("top_p", 0.9)) self.top_k = int(self.config.get("top_k", 40)) self.max_tokens = int(self.config.get("max_tokens", 256)) self.context_size = int(self.config.get("context_size", 512)) self.n_threads = int(self.config.get("n_threads", 0)) # 0 = auto-detect CPU threads self.n_gpu_layers = int(self.config.get("n_gpu_layers", 0)) # 0 = CPU only, >0 for GPU self.n_batch = int(self.config.get("n_batch", 256)) # Model path will be set on first use self.model_path = None self._model_downloaded = False print(f"✅ Qwen GGUF translator client initialized (repo: {self.repo_id}, model: {self.model_file}, will load on first use)") @classmethod 
def _download_binary(cls) -> str:
    """Locate (or fetch) a llama.cpp binary and return its path.

    Resolution order:
      1. the class-level cached path (re-chmod'ed if it lost exec bits),
      2. a binary compiled into the image (Dockerfile locations),
      3. a pre-built Ubuntu x64 release zip downloaded from GitHub.

    Raises RuntimeError on Windows, on download/extract failure, or when the
    downloaded binary's architecture does not match this machine.
    """
    # The pre-built binary is Linux-only; fail fast on Windows.
    if sys.platform == "win32":
        raise RuntimeError(
            "Translation with llama.cpp binary is not supported on Windows. "
            "The binary is for Linux only. "
            "Please use this feature on Linux or Hugging Face Spaces."
        )

    # 1) Cached path from a previous call in this process.
    if cls._binary_path and os.path.exists(cls._binary_path):
        if os.access(cls._binary_path, os.X_OK):
            return cls._binary_path
        # Lost its exec bit (e.g. copied volume) — try to restore it.
        try:
            os.chmod(cls._binary_path, 0o755)
            if os.access(cls._binary_path, os.X_OK):
                return cls._binary_path
        except Exception:
            pass
        # Unusable — forget it and re-resolve below.
        cls._binary_path = None

    # 2) Binary compiled into the image (preferred: matches the host arch).
    compiled_binary_paths = [
        "/usr/local/bin/llama-main",       # Standard location from Dockerfile
        "/app/llama.cpp/build/bin/main",   # Alternative location
        "/usr/bin/llama-main",             # Another possible location
    ]
    for compiled_path in compiled_binary_paths:
        if os.path.exists(compiled_path) and os.access(compiled_path, os.X_OK):
            cls._binary_path = compiled_path
            print(f"✅ Using compiled llama.cpp binary at: {compiled_path}", flush=True)
            return cls._binary_path

    # 3) Fall back to downloading the pre-built release binary.
    print("📥 No compiled binary found, downloading pre-built llama.cpp binary...", flush=True)

    if cls._binary_dir is None:
        cls._binary_dir = tempfile.mkdtemp(prefix="llama_cpp_binary_")
    binary_dir = Path(cls._binary_dir)

    # Release binaries have been named differently over time.
    possible_binary_names = ["main", "llama-cli", "llama"]

    # A previous download may already be extracted in this temp dir.
    for name in possible_binary_names:
        path = binary_dir / name
        if path.exists() and os.access(path, os.X_OK):
            cls._binary_path = str(path)
            print(f"✅ Using existing binary at: {cls._binary_path}")
            return cls._binary_path

    # Canonical location we will move the extracted binary to.
    binary_path = binary_dir / "main"

    zip_url = "https://github.com/ggml-org/llama.cpp/releases/download/b6995/llama-b6995-bin-ubuntu-x64.zip"
    zip_path = binary_dir / "llama-binary.zip"

    try:
        print(f" Downloading from: {zip_url}")
        try:
            urllib.request.urlretrieve(zip_url, str(zip_path))
        except Exception as download_error:
            raise RuntimeError(f"Failed to download binary from {zip_url}: {download_error}") from download_error

        if not zip_path.exists():
            raise RuntimeError(f"Downloaded file not found at {zip_path}")
        print(f" ✅ Downloaded to: {zip_path} ({zip_path.stat().st_size / 1024 / 1024:.1f} MB)")

        print(f" 📦 Extracting zip file...")
        try:
            with zipfile.ZipFile(str(zip_path), 'r') as zip_ref:
                zip_ref.extractall(str(binary_dir))
        except Exception as extract_error:
            raise RuntimeError(f"Failed to extract zip file {zip_path}: {extract_error}") from extract_error

        # Find the binary among the extracted files: try known locations first.
        found_binary = None
        for name in possible_binary_names:
            possible_paths = [
                binary_dir / name,
                binary_dir / "bin" / name,
                binary_dir / "llama-b6995-bin-ubuntu-x64" / name,
            ]
            for path in possible_paths:
                if path.exists():
                    found_binary = path
                    break
            if found_binary:
                break

        # Otherwise search recursively for anything llama-ish.
        if found_binary is None:
            for root, dirs, files in os.walk(str(binary_dir)):
                for file in files:
                    # BUGFIX: the downloaded archive itself is named
                    # "llama-binary.zip" and lives in this directory, so the
                    # old `file.startswith("llama")` check could select the
                    # zip (or a shared library) as the "binary". Skip those.
                    if file.endswith((".zip", ".so")):
                        continue
                    if file in possible_binary_names or file.startswith("llama"):
                        candidate = Path(root) / file
                        if candidate.is_file():
                            found_binary = candidate
                            break
                if found_binary:
                    break

        if found_binary is None:
            # List what we found for debugging.
            found_files = []
            for root, dirs, files in os.walk(str(binary_dir)):
                for file in files:
                    found_files.append(str(Path(root) / file))
            raise RuntimeError(
                f"Could not find llama.cpp binary in extracted zip. "
                f"Searched for: {possible_binary_names}. "
                f"Found files: {found_files[:10]}"
            )

        # Make it executable (Linux/Unix only).
        try:
            os.chmod(found_binary, 0o755)
        except Exception as chmod_error:
            print(f" ⚠️ Warning: Could not set executable permissions: {chmod_error}", flush=True)

        # Normalize to the canonical 'main' path.
        if found_binary != binary_path:
            if binary_path.exists():
                binary_path.unlink()
            shutil.move(str(found_binary), str(binary_path))

        cls._binary_path = str(binary_path)
        print(f" ✅ Binary extracted and ready at: {cls._binary_path}", flush=True)

        if not os.access(cls._binary_path, os.X_OK):
            print(f" ⚠️ Warning: Binary may not be executable. Attempting to fix...", flush=True)
            try:
                os.chmod(cls._binary_path, 0o755)
            except Exception as e:
                print(f" ⚠️ Could not set permissions: {e}", flush=True)

        # Sanity-run the downloaded binary to catch architecture mismatches
        # (the release zip is Ubuntu x64; the host may be arm64 etc.).
        is_compiled = cls._binary_path.startswith("/usr/local/bin") or cls._binary_path.startswith("/app/llama.cpp")
        if not is_compiled:
            print(f" 🔍 Testing binary compatibility...", flush=True)
            machine = platform.machine()
            print(f" System architecture: {machine}", flush=True)
            try:
                test_result = subprocess.run(
                    [cls._binary_path, "--help"],
                    capture_output=True,
                    text=True,
                    timeout=5
                )
                if test_result.returncode == 0 or "usage" in test_result.stdout.lower() or "options" in test_result.stdout.lower():
                    print(f" ✅ Binary is compatible and executable", flush=True)
                else:
                    print(f" ⚠️ Binary test returned code {test_result.returncode}", flush=True)
                    if test_result.stderr:
                        print(f" Stderr: {test_result.stderr[:200]}", flush=True)
            except subprocess.TimeoutExpired:
                print(f" ⚠️ Binary test timed out", flush=True)
            except OSError as os_error:
                error_msg = str(os_error)
                errno = getattr(os_error, 'errno', None)
                # ENOEXEC / "Exec format error" => wrong architecture.
                if errno == 8 or "Exec format error" in error_msg or "cannot execute" in error_msg.lower():
                    # Ask `file` what the binary actually is, if available.
                    file_info = "unknown"
                    try:
                        file_result = subprocess.run(
                            ["file", cls._binary_path],
                            capture_output=True,
                            text=True,
                            timeout=2
                        )
                        if file_result.returncode == 0:
                            file_info = file_result.stdout.strip()
                    except Exception:
                        pass
                    raise RuntimeError(
                        f"Binary architecture mismatch. The downloaded binary is not compatible with this system.\n"
                        f"System architecture: {machine}\n"
                        f"Binary info: {file_info}\n"
                        f"Error: {error_msg}\n"
                        f"The Ubuntu x64 binary may not be compatible with this system. "
                        f"Translation feature requires a compatible llama.cpp binary for this architecture."
                    ) from os_error
                else:
                    raise
            except Exception as test_error:
                # Non-fatal: the real run below will surface hard failures.
                print(f" ⚠️ Binary test warning: {test_error}", flush=True)
        else:
            print(f" ✅ Using compiled binary (no compatibility test needed)", flush=True)

        # Best-effort cleanup of the downloaded archive.
        try:
            zip_path.unlink()
        except Exception:
            pass

        return cls._binary_path

    except Exception as e:
        error_msg = (
            f"Failed to download/extract llama.cpp binary from {zip_url}. "
            f"Error: {e}"
        )
        print(f" ❌ {error_msg}")
        raise RuntimeError(error_msg) from e
def _download_model_if_needed(self) -> str:
    """Ensure the GGUF model file exists locally and return its path.

    Downloads from HuggingFace on first use; subsequent calls hit the
    in-process cache. Raises RuntimeError when the download fails.
    """
    from huggingface_hub import hf_hub_download

    # Fast path: already fetched during this process and still on disk.
    if self._model_downloaded and self.model_path and os.path.exists(self.model_path):
        return self.model_path

    # Honor HF_HOME if set, otherwise use the default HF cache location.
    cache_dir = os.environ.get('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
    os.makedirs(cache_dir, exist_ok=True)

    try:
        print(f" 📥 Downloading GGUF model: {self.model_file} from {self.repo_id}...")
        resolved_path = hf_hub_download(
            repo_id=self.repo_id,
            filename=self.model_file,
            cache_dir=cache_dir,
            resume_download=True
        )
    except Exception as e:
        error_msg = (
            f"Failed to download GGUF model '{self.model_file}' from '{self.repo_id}'. "
            f"Error: {e}\n"
            f"Please verify:\n"
            f"1. The repository exists: https://huggingface.co/{self.repo_id}\n"
            f"2. The model file name is correct\n"
            f"3. You have internet connectivity"
        )
        raise RuntimeError(error_msg) from e

    print(f" ✅ Model downloaded/cached at: {resolved_path}")
    self.model_path = resolved_path
    self._model_downloaded = True
    return resolved_path


def _build_translation_prompt(self, user_text: str) -> str:
    """Wrap *user_text* in Qwen's ChatML format with the translation system prompt.

    Kept deliberately minimal to discourage the model from emitting reasoning
    before the translation.
    """
    segments = [
        "<|im_start|>system",
        f"{TRANSLATION_SYSTEM_INSTRUCTIONS}<|im_end|>",
        "<|im_start|>user",
        f"{user_text}<|im_end|>",
        "<|im_start|>assistant",
        "",  # trailing newline after the assistant tag
    ]
    return "\n".join(segments)
def generate_content(self, prompt: str) -> str:
    """
    Translate the input text to English using the pre-built llama.cpp binary.

    Returns the English translation as a plain string; falls back to returning
    the original *prompt* when the parsed translation is empty or suspiciously
    short. Raises RuntimeError on binary failure, empty output, or timeout.
    """
    # Resolve the binary and model lazily (both are cached after first use).
    binary_path = self._download_binary()
    model_path = self._download_model_if_needed()

    translation_prompt = self._build_translation_prompt(prompt)

    # Standard llama.cpp CLI invocation:
    # ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 128 -c 256
    cmd = [
        binary_path,
        "-m", model_path,
        "-p", translation_prompt,
        "--temp", str(self.temperature),
        "--top-p", str(self.top_p),
        "--top-k", str(self.top_k),
        "-n", str(self.max_tokens),       # Number of tokens to generate
        "-c", str(self.context_size),     # Context size
        "--no-display-prompt",            # Don't echo the prompt in output (if supported)
    ]
    # 0 means auto-detect, which is the binary's default — only pass when set.
    if self.n_threads > 0:
        cmd.extend(["-t", str(self.n_threads)])
    if self.n_gpu_layers > 0:
        cmd.extend(["-ngl", str(self.n_gpu_layers)])
    # Note: stop sequences are handled in post-processing since --stop may not
    # be available in all llama.cpp versions.

    try:
        print(f" 🔄 Running translation with llama.cpp binary...", flush=True)
        print(f" Command: {' '.join(cmd[:3])}... (model: {os.path.basename(model_path)})", flush=True)
        print(f" Input prompt length: {len(translation_prompt)} chars", flush=True)

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=30,   # 30 second timeout (should be enough for short translations)
            check=False   # Don't raise on non-zero exit, we'll check manually
        )

        print(f" Binary exit code: {result.returncode}", flush=True)
        print(f" Stdout length: {len(result.stdout)} chars", flush=True)
        print(f" Stderr length: {len(result.stderr)} chars", flush=True)

        if result.returncode != 0:
            error_msg = f"llama.cpp binary exited with code {result.returncode}"
            if result.stderr:
                error_msg += f"\nStderr: {result.stderr[:500]}"
                print(f" Stderr: {result.stderr[:500]}", flush=True)
            if result.stdout:
                error_msg += f"\nStdout: {result.stdout[:500]}"
                print(f" Stdout: {result.stdout[:500]}", flush=True)
            print(f" ❌ {error_msg}", flush=True)
            raise RuntimeError(error_msg)

        output = result.stdout.strip()
        if not output:
            print(f" ❌ Empty output from binary", flush=True)
            raise RuntimeError("llama.cpp binary returned empty output")

        print(f" Raw output (first 200 chars): {output[:200]}", flush=True)

        # BUGFIX: these three substitutions were `re.sub(r'.*?', '', ...)` —
        # a lazy pattern that only ever matches the empty string, i.e. a
        # no-op. The comments show the angle-bracket tags were lost; restore
        # the intended removal of Qwen reasoning blocks.
        # Remove <think>...</think> tags and content (Qwen3's reasoning tag).
        output = re.sub(r'<think>.*?</think>', '', output, flags=re.DOTALL)
        # Remove <thinking>...</thinking> tags if present.
        output = re.sub(r'<thinking>.*?</thinking>', '', output, flags=re.DOTALL)
        # Remove any other reasoning tags.
        output = re.sub(r'<reasoning>.*?</reasoning>', '', output, flags=re.DOTALL)

        # The output might still include the prompt — keep only the text after
        # the assistant tag.
        if "<|im_start|>assistant" in output:
            output = output.split("<|im_start|>assistant")[-1].strip()
            print(f" Extracted after assistant tag: {output[:200]}", flush=True)

        # Strip any remaining chat-format tokens.
        translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()

        # If the model narrated ("The translation is: ..."), try to pull out
        # just the translation itself.
        if "translation" in translated_text.lower() and len(translated_text) > 100:
            translation_patterns = [
                r'(?:translation|translated|english):\s*(.+?)(?:\n|$)',
                r'(?:the translation is|here is the translation|english translation):\s*(.+?)(?:\n|$)',
            ]
            for pattern in translation_patterns:
                match = re.search(pattern, translated_text, re.IGNORECASE | re.DOTALL)
                if match:
                    translated_text = match.group(1).strip()
                    print(f" Extracted translation from pattern: {translated_text[:200]}", flush=True)
                    break

        # Collapse newlines into spaces.
        translated_text = re.sub(r'\n+', ' ', translated_text)
        translated_text = translated_text.strip()

        if not translated_text:
            print(f" ❌ Translation output is empty after parsing", flush=True)
            print(f" Original output was: {result.stdout[:500]}", flush=True)
            raise RuntimeError("Translation output is empty after parsing")

        print(f" ✅ Translation completed ({len(translated_text)} chars): '{translated_text[:200]}...'", flush=True)

    except subprocess.TimeoutExpired:
        error_msg = "Translation timed out after 30 seconds"
        print(f" ❌ {error_msg}", flush=True)
        print(f" This may indicate the model is too slow. Consider using a smaller model or enabling GPU.", flush=True)
        raise RuntimeError(error_msg)
    except subprocess.CalledProcessError as e:
        error_output = e.stderr if e.stderr else e.stdout
        error_msg = (
            f"Translation failed with llama.cpp binary. "
            f"Exit code: {e.returncode}, Error: {error_output[:500]}"
        )
        print(f" ❌ {error_msg}")
        raise RuntimeError(error_msg) from e
    except RuntimeError:
        # Our own diagnostic errors raised above — re-raise untouched instead
        # of letting the generic handler below wrap them a second time.
        raise
    except Exception as e:
        error_msg = f"Translation generation failed: {e}"
        print(f" ❌ {error_msg}")
        raise RuntimeError(error_msg) from e

    # Final cleanup of the successful translation.
    translated_text = translated_text.strip()

    # Drop common narration prefixes the model sometimes adds.
    prefixes_to_remove = [
        "English translation:",
        "Translation:",
        "English:",
        "Here is the translation:",
        "The translation is:",
        "Assistant:"
    ]
    for prefix in prefixes_to_remove:
        if translated_text.lower().startswith(prefix.lower()):
            translated_text = translated_text[len(prefix):].strip()

    # Remove leading/trailing quotes if present.
    translated_text = translated_text.strip('"').strip("'").strip()

    # If the translation is empty or suspiciously short relative to the
    # input, prefer returning the original text over garbage.
    if not translated_text or len(translated_text) < len(prompt) * 0.1:
        print(f"⚠️ Translation may have failed (too short or empty), returning original text")
        return prompt

    return translated_text


def generate_content_stream(self, prompt: str) -> Generator[str, None, None]:
    """
    Stream translation using llama.cpp binary.

    For simplicity the full response is collected and yielded as a single
    chunk; true incremental streaming can be added later if needed.
    """
    translation = self.generate_content(prompt)
    yield translation


def _generate_content_impl(self, prompt: str) -> str:
    # Thin adapter for the LlmClient template-method interface.
    return self.generate_content(prompt)


def _generate_content_stream_impl(self, prompt: str) -> Generator[Any, None, None]:
    # Thin adapter for the LlmClient template-method interface.
    return self.generate_content_stream(prompt)