Spaces:
Sleeping
Sleeping
File size: 13,291 Bytes
a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 c26a471 a2e1879 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 |
from typing import Generator, Any, Dict
import os
import subprocess
import tempfile
import zipfile
import urllib.request
import shutil
from pathlib import Path
from .base import LlmClient
# System message placed at the top of every chat prompt sent to the model.
TRANSLATION_SYSTEM_INSTRUCTIONS = (
    "You are a professional translator. "
    "Translate the user's text to English. "
    "Preserve the meaning, tone, and intent exactly. "
    "Return only the English translation, no additional commentary or explanation."
)
class QwenTranslatorClient(LlmClient):
    """
    Translation client using Qwen3-0.6B-GGUF pre-quantized models via a pre-built llama.cpp binary.

    Translates non-English text to English so it can be processed by the
    English-only classifier.

    Uses GGUF format models from unsloth/Qwen3-0.6B-GGUF - already quantized,
    no bitsandbytes needed.
    Uses a pre-built llama.cpp binary (llama-b6995-bin-ubuntu-x64.zip) from
    GitHub releases - no compilation needed. The binary is downloaded and
    extracted on first use, and the model file is fetched lazily as well.

    Recognised configuration keys (all optional): ``repo_id``, ``model_file``,
    ``temperature``, ``top_p``, ``top_k``, ``max_tokens``, ``context_size``,
    ``n_threads``, ``n_gpu_layers`` and ``n_batch``.
    """

    # Class-level cache shared by every instance: the resolved binary path and
    # the temporary directory the release archive was extracted into.
    _binary_path = None
    _binary_dir = None

    # Release archive holding the pre-built binary.
    _BINARY_ZIP_URL = (
        "https://github.com/ggml-org/llama.cpp/releases/download/"
        "b6995/llama-b6995-bin-ubuntu-x64.zip"
    )
    # Accepted executable names, in order of preference.
    _BINARY_NAMES = ("main", "llama-cli", "llama")
    # Boilerplate lead-ins the model sometimes puts in front of the translation.
    _PREFIXES_TO_REMOVE = (
        "English translation:",
        "Translation:",
        "English:",
        "Here is the translation:",
        "The translation is:",
        "Assistant:",
    )

    def __init__(self, config_dict: Dict[str, Any], system_prompt: str):
        """
        Read generation settings from the configuration; nothing is downloaded here.

        Args:
            config_dict: Client configuration, forwarded to the base class.
            system_prompt: System prompt, forwarded to the base class.
        """
        super().__init__(config_dict, system_prompt)
        self.repo_id = self.config.get("repo_id", "unsloth/Qwen3-0.6B-GGUF")
        self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")
        self.temperature = float(self.config.get("temperature", 0.3))
        self.top_p = float(self.config.get("top_p", 0.9))
        self.top_k = int(self.config.get("top_k", 40))
        self.max_tokens = int(self.config.get("max_tokens", 256))
        self.context_size = int(self.config.get("context_size", 512))
        self.n_threads = int(self.config.get("n_threads", 0))  # 0 = auto-detect CPU threads
        self.n_gpu_layers = int(self.config.get("n_gpu_layers", 0))  # 0 = CPU only, >0 for GPU
        self.n_batch = int(self.config.get("n_batch", 256))

        # Model path is resolved on first use.
        self.model_path = None
        self._model_downloaded = False

        print(
            f"✅ Qwen GGUF translator client initialized (repo: {self.repo_id}, "
            f"model: {self.model_file}, will load on first use)"
        )

    @classmethod
    def _find_binary(cls, binary_dir: Path):
        """
        Locate the llama.cpp executable below ``binary_dir``.

        Names are tried in the order given by ``_BINARY_NAMES``; for each name
        the shallowest match wins so the result is deterministic. The file only
        has to be a regular file, not executable: the caller sets the mode
        afterwards.

        Returns:
            The ``Path`` of the match, or ``None`` when nothing matches.
        """
        for name in cls._BINARY_NAMES:
            matches = sorted(
                (p for p in binary_dir.rglob(name) if p.is_file()),
                key=lambda p: (len(p.parts), str(p)),
            )
            if matches:
                return matches[0]
        return None

    @classmethod
    def _download_binary(cls) -> str:
        """
        Download and extract the pre-built llama.cpp binary from GitHub releases.

        The resolved path is cached on the class, so the download happens at
        most once per process as long as the file still exists.

        Returns:
            Absolute path of the executable.

        Raises:
            RuntimeError: If the download or extraction fails, or no known
                binary name is present in the archive.
        """
        if cls._binary_path and os.path.exists(cls._binary_path):
            return cls._binary_path

        print("📥 Downloading pre-built llama.cpp binary...")

        # (Re)create the working directory if it was never made or has vanished.
        if cls._binary_dir is None or not os.path.isdir(cls._binary_dir):
            cls._binary_dir = tempfile.mkdtemp(prefix="llama_cpp_binary_")
        binary_dir = Path(cls._binary_dir)

        # Reuse a binary left behind by an earlier extraction.
        existing = cls._find_binary(binary_dir)
        if existing is not None and os.access(existing, os.X_OK):
            cls._binary_path = str(existing)
            print(f"✅ Using existing binary at: {cls._binary_path}")
            return cls._binary_path

        zip_url = cls._BINARY_ZIP_URL
        zip_path = binary_dir / "llama-binary.zip"
        try:
            print(f" Downloading from: {zip_url}")
            urllib.request.urlretrieve(zip_url, zip_path)
            print(f" ✅ Downloaded to: {zip_path}")

            print(" 📦 Extracting zip file...")
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(binary_dir)

            found_binary = cls._find_binary(binary_dir)
            if found_binary is None:
                raise RuntimeError(
                    f"Could not find llama.cpp binary in extracted zip. "
                    f"Searched for: {list(cls._BINARY_NAMES)}. "
                    f"Please check the zip file structure."
                )

            # zipfile.extractall() does not restore Unix permission bits, so the
            # execute bit has to be set explicitly.
            os.chmod(found_binary, 0o755)

            # The binary is deliberately left where it was extracted so that any
            # files shipped next to it stay beside it.
            # NOTE(review): presumably the release bundles shared libraries in
            # the same directory - confirm against the archive contents.
            cls._binary_path = str(found_binary)
            print(f" ✅ Binary extracted and ready at: {cls._binary_path}")
            return cls._binary_path
        except Exception as e:
            raise RuntimeError(
                f"Failed to download/extract llama.cpp binary from {zip_url}. "
                f"Error: {e}"
            ) from e
        finally:
            # The archive is useless after extraction and after a failure alike.
            if zip_path.exists():
                zip_path.unlink()

    def _download_model_if_needed(self) -> str:
        """
        Download the GGUF model file from HuggingFace if not already cached.

        Returns:
            Local path of the model file.

        Raises:
            RuntimeError: If the download fails for any reason.
        """
        if self._model_downloaded and self.model_path and os.path.exists(self.model_path):
            return self.model_path

        # Imported lazily so the dependency is only needed when a download happens.
        from huggingface_hub import hf_hub_download

        cache_dir = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
        os.makedirs(cache_dir, exist_ok=True)

        try:
            print(f" 📥 Downloading GGUF model: {self.model_file} from {self.repo_id}...")
            # `resume_download` is intentionally not passed: it is deprecated in
            # current huggingface_hub releases, which resume automatically.
            model_path = hf_hub_download(
                repo_id=self.repo_id,
                filename=self.model_file,
                cache_dir=cache_dir,
            )
            print(f" ✅ Model downloaded/cached at: {model_path}")
            self.model_path = model_path
            self._model_downloaded = True
            return model_path
        except Exception as e:
            error_msg = (
                f"Failed to download GGUF model '{self.model_file}' from '{self.repo_id}'. "
                f"Error: {e}\n"
                f"Please verify:\n"
                f"1. The repository exists: https://huggingface.co/{self.repo_id}\n"
                f"2. The model file name is correct\n"
                f"3. You have internet connectivity"
            )
            raise RuntimeError(error_msg) from e

    def _build_translation_prompt(self, user_text: str) -> str:
        """Wrap ``user_text`` in a system/user/assistant chat prompt using ``<|im_start|>`` markers."""
        return (
            "<|im_start|>system\n"
            f"{TRANSLATION_SYSTEM_INSTRUCTIONS}<|im_end|>\n"
            "<|im_start|>user\n"
            f"{user_text}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

    @classmethod
    def _clean_translation(cls, raw_output: str) -> str:
        """
        Reduce raw binary output to the bare translation.

        Keeps only what follows the last assistant marker (the output may echo
        the prompt), drops chat-format tokens and known lead-in phrases, and
        removes quotes only when they wrap the whole text as a matching pair.
        """
        text = raw_output.strip()

        assistant_marker = "<|im_start|>assistant"
        if assistant_marker in text:
            text = text.split(assistant_marker)[-1].strip()
        text = text.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()

        for prefix in cls._PREFIXES_TO_REMOVE:
            if text[:len(prefix)].lower() == prefix.lower():
                text = text[len(prefix):].strip()

        # Only strip balanced wrapping quotes; a lone quote at one end belongs
        # to the translation itself (e.g. `He said "hi"`).
        while len(text) >= 2 and text[0] == text[-1] and text[0] in ("'", '"'):
            text = text[1:-1].strip()
        return text

    def generate_content(self, prompt: str) -> str:
        """
        Translate the input text to English using the pre-built llama.cpp binary.

        Args:
            prompt: Text to translate.

        Returns:
            The English translation as a plain string. The original ``prompt``
            is returned unchanged when it is blank, or when the result is empty
            or shorter than 10% of the input length.

        Raises:
            RuntimeError: If the binary or model cannot be obtained, the
                process exits with a non-zero status, or it runs longer than
                60 seconds.
        """
        # Nothing to translate; skip the downloads and the subprocess.
        if not prompt or not prompt.strip():
            return prompt

        binary_path = self._download_binary()
        model_path = self._download_model_if_needed()
        translation_prompt = self._build_translation_prompt(prompt)

        # Standard format:
        # ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 256 -c 512
        cmd = [
            binary_path,
            "-m", model_path,
            "-p", translation_prompt,
            "--temp", str(self.temperature),
            "--top-p", str(self.top_p),
            "--top-k", str(self.top_k),
            "-n", str(self.max_tokens),  # Number of tokens to generate
            "-c", str(self.context_size),  # Context size
        ]
        # Batch size from the configuration.
        if self.n_batch > 0:
            cmd.extend(["-b", str(self.n_batch)])
        # Thread count only when set (0 leaves the binary's default in place).
        if self.n_threads > 0:
            cmd.extend(["-t", str(self.n_threads)])
        # GPU offload only when requested.
        if self.n_gpu_layers > 0:
            cmd.extend(["-ngl", str(self.n_gpu_layers)])
        # Stop sequences.
        # NOTE(review): confirm the downloaded build accepts `--stop`; check its
        # `--help` output (llama.cpp may expect `-r/--reverse-prompt` instead).
        cmd.extend(["--stop", "<|im_end|>", "--stop", "<|im_start|>"])

        try:
            print(" 🔄 Running translation with llama.cpp binary...")
            result = subprocess.run(
                cmd,
                stdin=subprocess.DEVNULL,  # never block waiting for terminal input
                capture_output=True,
                text=True,
                encoding="utf-8",  # do not depend on the process locale
                errors="replace",
                timeout=60,  # 60 second timeout
                check=True,
            )
        except subprocess.TimeoutExpired as e:
            raise RuntimeError("Translation timed out after 60 seconds") from e
        except subprocess.CalledProcessError as e:
            error_output = e.stderr if e.stderr else e.stdout
            raise RuntimeError(
                f"Translation failed with llama.cpp binary. "
                f"Exit code: {e.returncode}, Error: {error_output}"
            ) from e
        except Exception as e:
            raise RuntimeError(f"Translation generation failed: {e}") from e

        translated_text = self._clean_translation(result.stdout)

        # If translation is empty or suspiciously short, return original.
        if not translated_text or len(translated_text) < len(prompt) * 0.1:
            print("⚠️ Translation may have failed (too short or empty), returning original text")
            return prompt
        return translated_text

    def generate_content_stream(self, prompt: str) -> Generator[str, None, None]:
        """
        Yield the translation of ``prompt`` as a single chunk.

        The full response is collected by ``generate_content`` and yielded
        once; there is no incremental streaming.
        """
        yield self.generate_content(prompt)

    def _generate_content_impl(self, prompt: str) -> str:
        """Delegate to ``generate_content``."""
        return self.generate_content(prompt)

    def _generate_content_stream_impl(self, prompt: str) -> Generator[Any, None, None]:
        """Delegate to ``generate_content_stream``."""
        return self.generate_content_stream(prompt)
|