from typing import Generator, Any, Dict
import os
import subprocess
import tempfile
import zipfile
import urllib.request
import shutil
from pathlib import Path

from .base import LlmClient

TRANSLATION_SYSTEM_INSTRUCTIONS = """You are a professional translator. Translate the user's text to English. Preserve the meaning, tone, and intent exactly. Return only the English translation, no additional commentary or explanation."""


class QwenTranslatorClient(LlmClient):
    """
    Translation client that runs pre-quantized Qwen3-0.6B GGUF models through
    a pre-built llama.cpp binary.

    Translates non-English text to English so it can be processed by the
    English-only classifier.

    Uses GGUF models from unsloth/Qwen3-0.6B-GGUF (already quantized, so no
    bitsandbytes is needed) and the pre-built llama.cpp binary
    (llama-b6995-bin-ubuntu-x64.zip) from GitHub releases, so no compilation
    is needed. The binary is automatically downloaded and extracted on first
    use.

    Optimized for Hugging Face Spaces with lazy loading and efficient CPU
    inference.
    """

    # Class-level cache for the binary path
    _binary_path = None
    _binary_dir = None

    def __init__(self, config_dict: Dict[str, Any], system_prompt: str):
        super().__init__(config_dict, system_prompt)
        self.repo_id = self.config.get("repo_id", "unsloth/Qwen3-0.6B-GGUF")
        self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")
        self.temperature = float(self.config.get("temperature", 0.3))
        self.top_p = float(self.config.get("top_p", 0.9))
        self.top_k = int(self.config.get("top_k", 40))
        self.max_tokens = int(self.config.get("max_tokens", 256))
        self.context_size = int(self.config.get("context_size", 512))
        self.n_threads = int(self.config.get("n_threads", 0))  # 0 = auto-detect CPU threads
        self.n_gpu_layers = int(self.config.get("n_gpu_layers", 0))  # 0 = CPU only, >0 for GPU offload
        self.n_batch = int(self.config.get("n_batch", 256))
        # Model path will be set on first use
        self.model_path = None
        self._model_downloaded = False
        print(f"✅ Qwen GGUF translator client initialized (repo: {self.repo_id}, model: {self.model_file}, will load on first use)")

    @classmethod  # the cache is class-level, so bind to the class, not an instance
    def _download_binary(cls) -> str:
        """Download and extract the pre-built llama.cpp binary from GitHub releases."""
        if cls._binary_path and os.path.exists(cls._binary_path):
            return cls._binary_path
| print("π₯ Downloading pre-built llama.cpp binary...") | |
| # Create a temporary directory for the binary | |
| if cls._binary_dir is None: | |
| cls._binary_dir = tempfile.mkdtemp(prefix="llama_cpp_binary_") | |
| binary_dir = Path(cls._binary_dir) | |
| # Try common binary names (main is the standard, but some releases use llama-cli) | |
| possible_binary_names = ["main", "llama-cli", "llama"] | |
| binary_path = None | |
| # Check if any binary already exists | |
| for name in possible_binary_names: | |
| path = binary_dir / name | |
| if path.exists() and os.access(path, os.X_OK): | |
| cls._binary_path = str(path) | |
| print(f"β Using existing binary at: {cls._binary_path}") | |
| return cls._binary_path | |
        # Not found yet; download the release zip and search after extraction
        zip_url = "https://github.com/ggml-org/llama.cpp/releases/download/b6995/llama-b6995-bin-ubuntu-x64.zip"
        zip_path = binary_dir / "llama-binary.zip"
        try:
            print(f" Downloading from: {zip_url}")
            urllib.request.urlretrieve(zip_url, zip_path)
            print(f" ✅ Downloaded to: {zip_path}")
            # Extract the zip file
            print(" 📦 Extracting zip file...")
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(binary_dir)
            # Find the binary among the extracted files. It may be named
            # 'main', 'llama-cli', or 'llama', and may sit in the archive
            # root or in a subdirectory.
            found_binary = None
            # First, try common locations and names
            for name in possible_binary_names:
                possible_paths = [
                    binary_dir / name,
                    binary_dir / "bin" / name,
                    binary_dir / "llama-b6995-bin-ubuntu-x64" / name,
                ]
                for path in possible_paths:
                    if path.exists():
                        found_binary = path
                        break
                if found_binary:
                    break
            # Otherwise, search recursively for an executable matching our names
            if found_binary is None:
                for root, dirs, files in os.walk(binary_dir):
                    for file in files:
                        if file in possible_binary_names or file.startswith("llama"):
                            candidate = Path(root) / file
                            # Check that it is a regular, executable file
                            if candidate.is_file() and os.access(candidate, os.X_OK):
                                found_binary = candidate
                                break
                    if found_binary:
                        break
            if found_binary is None:
                raise RuntimeError(
                    f"Could not find llama.cpp binary in extracted zip. "
                    f"Searched for: {possible_binary_names}. "
                    f"Please check the zip file structure."
                )
            # Make it executable
            os.chmod(found_binary, 0o755)
            # Use the binary in place: the release zip ships shared libraries
            # (e.g. libllama.so) alongside the executable, and moving only the
            # executable would break dynamic linking.
            cls._binary_path = str(found_binary)
            print(f" ✅ Binary extracted and ready at: {cls._binary_path}")
            # Clean up the zip file
            zip_path.unlink()
            return cls._binary_path
        except Exception as e:
            raise RuntimeError(
                f"Failed to download/extract llama.cpp binary from {zip_url}. "
                f"Error: {e}"
            ) from e
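
    # Sanity check (illustrative): a freshly extracted binary can be probed
    # with its --version flag, which llama.cpp binaries answer with build
    # information, e.g.:
    #
    #   subprocess.run([QwenTranslatorClient._download_binary(), "--version"])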

    def _download_model_if_needed(self) -> str:
        """Download the GGUF model file from Hugging Face if not already cached."""
        from huggingface_hub import hf_hub_download

        if self._model_downloaded and self.model_path and os.path.exists(self.model_path):
            return self.model_path
        # Set up the cache directory
        cache_dir = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
        os.makedirs(cache_dir, exist_ok=True)
        try:
            print(f" 📥 Downloading GGUF model: {self.model_file} from {self.repo_id}...")
            model_path = hf_hub_download(
                repo_id=self.repo_id,
                filename=self.model_file,
                cache_dir=cache_dir,
                resume_download=True,
            )
            print(f" ✅ Model downloaded/cached at: {model_path}")
            self.model_path = model_path
            self._model_downloaded = True
            return model_path
        except Exception as e:
            error_msg = (
                f"Failed to download GGUF model '{self.model_file}' from '{self.repo_id}'. "
                f"Error: {e}\n"
                f"Please verify:\n"
                f"1. The repository exists: https://huggingface.co/{self.repo_id}\n"
                f"2. The model file name is correct\n"
                f"3. You have internet connectivity"
            )
            raise RuntimeError(error_msg) from e

    def _build_translation_prompt(self, user_text: str) -> str:
        """Build a translation-to-English prompt using Qwen's ChatML chat format."""
        prompt = f"""<|im_start|>system
{TRANSLATION_SYSTEM_INSTRUCTIONS}<|im_end|>
<|im_start|>user
{user_text}<|im_end|>
<|im_start|>assistant
"""
        return prompt
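
    # For reference, the rendered prompt for user_text "Bonjour le monde"
    # would look like this (an illustrative sketch of the ChatML layout, with
    # the system instructions abbreviated):
    #
    #   <|im_start|>system
    #   You are a professional translator. ...<|im_end|>
    #   <|im_start|>user
    #   Bonjour le monde<|im_end|>
    #   <|im_start|>assistant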

    def generate_content(self, prompt: str) -> str:
        """
        Translate the input text to English using the pre-built llama.cpp binary.
        Returns the English translation as a plain string.
        """
        # Download the binary and model if needed
        binary_path = self._download_binary()
        model_path = self._download_model_if_needed()
        # Build the translation prompt
        translation_prompt = self._build_translation_prompt(prompt)
        # Assemble the command line for the llama.cpp binary, e.g.:
        # ./llama-cli -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 256 -c 512 -b 256
        cmd = [
            binary_path,
            "-m", model_path,
            "-p", translation_prompt,
            "--temp", str(self.temperature),
            "--top-p", str(self.top_p),
            "--top-k", str(self.top_k),
            "-n", str(self.max_tokens),    # number of tokens to generate
            "-c", str(self.context_size),  # context size
            "-b", str(self.n_batch),       # batch size (wires the otherwise-unused n_batch setting)
        ]
        # Add the thread count if specified (0 means auto-detect, the default)
        if self.n_threads > 0:
            cmd.extend(["-t", str(self.n_threads)])
        # Offload layers to the GPU if requested
        if self.n_gpu_layers > 0:
            cmd.extend(["-ngl", str(self.n_gpu_layers)])
        # Halt generation at chat-format boundary tokens. llama-cli exposes
        # stop sequences via -r/--reverse-prompt (repeatable), not --stop.
        cmd.extend(["-r", "<|im_end|>", "-r", "<|im_start|>"])
        try:
            # Run the binary and capture output
            print(" 🚀 Running translation with llama.cpp binary...")
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=60,  # 60-second timeout
                check=True,
            )
            # Parse the output. It may echo the prompt, so extract only the
            # generated part after the assistant tag.
            output = result.stdout.strip()
            if "<|im_start|>assistant" in output:
                output = output.split("<|im_start|>assistant")[-1].strip()
            # Remove any remaining chat-format tokens
            translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
        except subprocess.TimeoutExpired:
            raise RuntimeError("Translation timed out after 60 seconds")
        except subprocess.CalledProcessError as e:
            error_output = e.stderr if e.stderr else e.stdout
            raise RuntimeError(
                f"Translation failed with llama.cpp binary. "
                f"Exit code: {e.returncode}, Error: {error_output}"
            ) from e
        except Exception as e:
            raise RuntimeError(f"Translation generation failed: {e}") from e
        # Clean up the response
        translated_text = translated_text.strip()
        # Remove common prefixes the model may prepend
        prefixes_to_remove = [
            "English translation:",
            "Translation:",
            "English:",
            "Here is the translation:",
            "The translation is:",
            "Assistant:",
        ]
        for prefix in prefixes_to_remove:
            if translated_text.lower().startswith(prefix.lower()):
                translated_text = translated_text[len(prefix):].strip()
        # Strip leading/trailing quotes if present
        translated_text = translated_text.strip('"').strip("'").strip()
        # If the translation is empty or suspiciously short, return the original
        if not translated_text or len(translated_text) < len(prompt) * 0.1:
            print("⚠️ Translation may have failed (too short or empty), returning original text")
            return prompt
        return translated_text

    def generate_content_stream(self, prompt: str) -> Generator[str, None, None]:
        """
        Stream a translation using the llama.cpp binary.
        For simplicity, this collects the full response and yields it in one
        chunk; true token-by-token streaming can be added later if needed.
        """
        translation = self.generate_content(prompt)
        yield translation

    def _generate_content_impl(self, prompt: str) -> str:
        return self.generate_content(prompt)

    def _generate_content_stream_impl(self, prompt: str) -> Generator[Any, None, None]:
        return self.generate_content_stream(prompt)
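

if __name__ == "__main__":
    # Minimal usage sketch (illustrative). Because of the relative import of
    # LlmClient above, run this as a module, e.g. `python -m <package>.<module>`.
    # An empty config_dict falls back to the defaults listed in __init__; the
    # first call triggers the binary and model downloads.
    client = QwenTranslatorClient({}, TRANSLATION_SYSTEM_INSTRUCTIONS)
    print(client.generate_content("Bonjour le monde, comment allez-vous ?"))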