Spaces:

zazaman
/

guardrails-final

Sleeping

App Files Files Community

guardrails-final / llm_clients /qwen_translator.py

zazaman

Fix translation: add OS detection, better error handling and logging

1af1f14 about 1 month ago

raw

history blame

16.4 kB

	from typing import Generator, Any, Dict
	import os
	import sys
	import subprocess
	import tempfile
	import zipfile
	import urllib.request
	import shutil
	from pathlib import Path
	from .base import LlmClient


	TRANSLATION_SYSTEM_INSTRUCTIONS = """You are a professional translator. Translate the user's text to English. Preserve the meaning, tone, and intent exactly. Return only the English translation, no additional commentary or explanation."""


	class QwenTranslatorClient(LlmClient):
	"""
	Translation client using Qwen3-0.6B-GGUF pre-quantized models via pre-built llama.cpp binary.
	Translates non-English text to English so it can be processed by the English-only classifier.

	Uses GGUF format models from unsloth/Qwen3-0.6B-GGUF - already quantized, no bitsandbytes needed.
	Uses pre-built llama.cpp binary (llama-b6995-bin-ubuntu-x64.zip) from GitHub releases - no compilation needed.
	The binary is automatically downloaded and extracted on first use.
	Optimized for Hugging Face Spaces with lazy loading and efficient CPU inference.
	"""

	# Class-level cache for the binary path
	_binary_path = None
	_binary_dir = None

	def __init__(self, config_dict: Dict[str, Any], system_prompt: str):
	super().__init__(config_dict, system_prompt)
	self.repo_id = self.config.get("repo_id", "unsloth/Qwen3-0.6B-GGUF")
	self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")
	self.temperature = float(self.config.get("temperature", 0.3))
	self.top_p = float(self.config.get("top_p", 0.9))
	self.top_k = int(self.config.get("top_k", 40))
	self.max_tokens = int(self.config.get("max_tokens", 256))
	self.context_size = int(self.config.get("context_size", 512))
	self.n_threads = int(self.config.get("n_threads", 0)) # 0 = auto-detect CPU threads
	self.n_gpu_layers = int(self.config.get("n_gpu_layers", 0)) # 0 = CPU only, >0 for GPU
	self.n_batch = int(self.config.get("n_batch", 256))

	# Model path will be set on first use
	self.model_path = None
	self._model_downloaded = False

	print(f"✅ Qwen GGUF translator client initialized (repo: {self.repo_id}, model: {self.model_file}, will load on first use)")

	@classmethod
	def _download_binary(cls) -> str:
	"""Download and extract the pre-built llama.cpp binary from GitHub releases."""
	# Check OS - the Ubuntu binary only works on Linux
	if sys.platform == "win32":
	raise RuntimeError(
	"Translation with llama.cpp binary is not supported on Windows. "
	"The pre-built binary is for Linux only. "
	"Please use this feature on Linux or Hugging Face Spaces."
	)

	if cls._binary_path and os.path.exists(cls._binary_path):
	return cls._binary_path

	print("📥 Downloading pre-built llama.cpp binary...")

	# Create a temporary directory for the binary
	if cls._binary_dir is None:
	cls._binary_dir = tempfile.mkdtemp(prefix="llama_cpp_binary_")

	binary_dir = Path(cls._binary_dir)

	# Try common binary names (main is the standard, but some releases use llama-cli)
	possible_binary_names = ["main", "llama-cli", "llama"]
	binary_path = None

	# Check if any binary already exists
	for name in possible_binary_names:
	path = binary_dir / name
	if path.exists() and os.access(path, os.X_OK):
	cls._binary_path = str(path)
	print(f"✅ Using existing binary at: {cls._binary_path}")
	return cls._binary_path

	# If not found, we'll search after extraction
	binary_path = binary_dir / "main" # Default to 'main' (standard llama.cpp binary name)

	# Download the zip file
	zip_url = "https://github.com/ggml-org/llama.cpp/releases/download/b6995/llama-b6995-bin-ubuntu-x64.zip"
	zip_path = binary_dir / "llama-binary.zip"

	try:
	print(f" Downloading from: {zip_url}")
	# Use a more robust download method
	try:
	urllib.request.urlretrieve(zip_url, str(zip_path))
	except Exception as download_error:
	raise RuntimeError(f"Failed to download binary from {zip_url}: {download_error}") from download_error

	if not zip_path.exists():
	raise RuntimeError(f"Downloaded file not found at {zip_path}")

	print(f" ✅ Downloaded to: {zip_path} ({zip_path.stat().st_size / 1024 / 1024:.1f} MB)")

	# Extract the zip file
	print(f" 📦 Extracting zip file...")
	try:
	with zipfile.ZipFile(str(zip_path), 'r') as zip_ref:
	zip_ref.extractall(str(binary_dir))
	except Exception as extract_error:
	raise RuntimeError(f"Failed to extract zip file {zip_path}: {extract_error}") from extract_error

	# Find the binary in the extracted files
	# The binary might be called 'main', 'llama-cli', or 'llama'
	# It might be in the root or in a subdirectory
	found_binary = None

	# First, try common locations and names
	for name in possible_binary_names:
	possible_paths = [
	binary_dir / name,
	binary_dir / "bin" / name,
	binary_dir / "llama-b6995-bin-ubuntu-x64" / name,
	]
	for path in possible_paths:
	if path.exists():
	found_binary = path
	break
	if found_binary:
	break

	# Also search recursively for any executable file matching our names
	if found_binary is None:
	for root, dirs, files in os.walk(str(binary_dir)):
	for file in files:
	if file in possible_binary_names or file.startswith("llama"):
	candidate = Path(root) / file
	# Check if it's executable (or at least a regular file)
	if candidate.is_file():
	found_binary = candidate
	break
	if found_binary:
	break

	if found_binary is None:
	# List what we found for debugging
	found_files = []
	for root, dirs, files in os.walk(str(binary_dir)):
	for file in files:
	found_files.append(str(Path(root) / file))
	raise RuntimeError(
	f"Could not find llama.cpp binary in extracted zip. "
	f"Searched for: {possible_binary_names}. "
	f"Found files: {found_files[:10]}"
	)

	# Make it executable (Linux/Unix only)
	try:
	os.chmod(found_binary, 0o755)
	except Exception as chmod_error:
	print(f" ⚠️ Warning: Could not set executable permissions: {chmod_error}")

	# Move to expected location if needed (use 'main' as standard name)
	if found_binary != binary_path:
	if binary_path.exists():
	binary_path.unlink() # Remove old binary if exists
	shutil.move(str(found_binary), str(binary_path))

	cls._binary_path = str(binary_path)
	print(f" ✅ Binary extracted and ready at: {cls._binary_path}")

	# Verify binary is executable
	if not os.access(cls._binary_path, os.X_OK):
	print(f" ⚠️ Warning: Binary may not be executable. Attempting to fix...")
	try:
	os.chmod(cls._binary_path, 0o755)
	except Exception:
	pass

	# Clean up zip file
	try:
	zip_path.unlink()
	except Exception:
	pass # Ignore cleanup errors

	return cls._binary_path

	except Exception as e:
	error_msg = (
	f"Failed to download/extract llama.cpp binary from {zip_url}. "
	f"Error: {e}"
	)
	print(f" ❌ {error_msg}")
	raise RuntimeError(error_msg) from e

	def _download_model_if_needed(self) -> str:
	"""Download GGUF model file from HuggingFace if not already cached."""
	from huggingface_hub import hf_hub_download

	if self._model_downloaded and self.model_path and os.path.exists(self.model_path):
	return self.model_path

	# Set up cache directory
	cache_dir = os.environ.get('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
	os.makedirs(cache_dir, exist_ok=True)

	try:
	print(f" 📥 Downloading GGUF model: {self.model_file} from {self.repo_id}...")
	model_path = hf_hub_download(
	repo_id=self.repo_id,
	filename=self.model_file,
	cache_dir=cache_dir,
	resume_download=True
	)
	print(f" ✅ Model downloaded/cached at: {model_path}")
	self.model_path = model_path
	self._model_downloaded = True
	return model_path
	except Exception as e:
	error_msg = (
	f"Failed to download GGUF model '{self.model_file}' from '{self.repo_id}'. "
	f"Error: {e}\n"
	f"Please verify:\n"
	f"1. The repository exists: https://huggingface.co/{self.repo_id}\n"
	f"2. The model file name is correct\n"
	f"3. You have internet connectivity"
	)
	raise RuntimeError(error_msg) from e

	def _build_translation_prompt(self, user_text: str) -> str:
	"""Build a prompt for translation to English using Qwen's chat format."""
	prompt = f"""<\|im_start\|>system
	{TRANSLATION_SYSTEM_INSTRUCTIONS}<\|im_end\|>
	<\|im_start\|>user
	{user_text}<\|im_end\|>
	<\|im_start\|>assistant
	"""
	return prompt

	def generate_content(self, prompt: str) -> str:
	"""
	Translate the input text to English using the pre-built llama.cpp binary.
	Returns the English translation as a plain string.
	"""
	# Download binary and model if needed
	binary_path = self._download_binary()
	model_path = self._download_model_if_needed()

	# Build translation prompt
	translation_prompt = self._build_translation_prompt(prompt)

	# Prepare command-line arguments for llama.cpp binary
	# Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 256 -c 512 -t 0
	cmd = [
	binary_path,
	"-m", model_path,
	"-p", translation_prompt,
	"--temp", str(self.temperature),
	"--top-p", str(self.top_p),
	"--top-k", str(self.top_k),
	"-n", str(self.max_tokens), # Number of tokens to generate
	"-c", str(self.context_size), # Context size
	]

	# Add thread count if specified (0 means auto-detect, which is default)
	if self.n_threads > 0:
	cmd.extend(["-t", str(self.n_threads)])

	# Add GPU layers if specified
	if self.n_gpu_layers > 0:
	cmd.extend(["-ngl", str(self.n_gpu_layers)])

	# Add stop sequences (llama.cpp uses --stop for each stop token)
	cmd.extend(["--stop", "<\|im_end\|>", "--stop", "<\|im_start\|>"])

	try:
	# Run the binary and capture output
	print(f" 🔄 Running translation with llama.cpp binary...")
	print(f" Command: {' '.join(cmd[:3])}... (model: {os.path.basename(model_path)})")

	result = subprocess.run(
	cmd,
	capture_output=True,
	text=True,
	timeout=60, # 60 second timeout
	check=False # Don't raise on non-zero exit, we'll check manually
	)

	# Check if command succeeded
	if result.returncode != 0:
	error_msg = f"llama.cpp binary exited with code {result.returncode}"
	if result.stderr:
	error_msg += f"\nStderr: {result.stderr[:500]}"
	if result.stdout:
	error_msg += f"\nStdout: {result.stdout[:500]}"
	print(f" ❌ {error_msg}")
	raise RuntimeError(error_msg)

	# Parse the output
	output = result.stdout.strip()

	if not output:
	raise RuntimeError("llama.cpp binary returned empty output")

	# The output might include the prompt, so we need to extract just the generated part
	# Look for the assistant response after the prompt
	if "<\|im_start\|>assistant" in output:
	# Extract everything after the assistant tag
	output = output.split("<\|im_start\|>assistant")[-1].strip()

	# Remove any remaining chat format tokens
	translated_text = output.replace("<\|im_start\|>", "").replace("<\|im_end\|>", "").strip()

	if not translated_text:
	raise RuntimeError("Translation output is empty after parsing")

	print(f" ✅ Translation completed: '{translated_text[:100]}...'")

	except subprocess.TimeoutExpired:
	error_msg = "Translation timed out after 60 seconds"
	print(f" ❌ {error_msg}")
	raise RuntimeError(error_msg)
	except subprocess.CalledProcessError as e:
	error_output = e.stderr if e.stderr else e.stdout
	error_msg = (
	f"Translation failed with llama.cpp binary. "
	f"Exit code: {e.returncode}, Error: {error_output[:500]}"
	)
	print(f" ❌ {error_msg}")
	raise RuntimeError(error_msg) from e
	except Exception as e:
	error_msg = f"Translation generation failed: {e}"
	print(f" ❌ {error_msg}")
	raise RuntimeError(error_msg) from e

	# Clean up the response
	translated_text = translated_text.strip()

	# Remove common prefixes that might be added by the model
	prefixes_to_remove = [
	"English translation:",
	"Translation:",
	"English:",
	"Here is the translation:",
	"The translation is:",
	"Assistant:"
	]
	for prefix in prefixes_to_remove:
	if translated_text.lower().startswith(prefix.lower()):
	translated_text = translated_text[len(prefix):].strip()

	# Remove leading/trailing quotes if present
	translated_text = translated_text.strip('"').strip("'").strip()

	# If translation is empty or suspiciously short, return original
	if not translated_text or len(translated_text) < len(prompt) * 0.1:
	print(f"⚠️ Translation may have failed (too short or empty), returning original text")
	return prompt

	return translated_text

	def generate_content_stream(self, prompt: str) -> Generator[str, None, None]:
	"""
	Stream translation using llama.cpp binary.
	For simplicity, we'll collect the full response and yield it.
	True streaming can be added later if needed.
	"""
	# For now, just yield the full translation (streaming can be optimized later)
	translation = self.generate_content(prompt)
	yield translation

	def _generate_content_impl(self, prompt: str) -> str:
	return self.generate_content(prompt)

	def _generate_content_stream_impl(self, prompt: str) -> Generator[Any, None, None]:
	return self.generate_content_stream(prompt)