zazaman committed
Commit c26a471 · Parent: b2635ed

Replace llama-cpp-python with pre-built llama.cpp binary for Qwen translator

Files changed (4)
  1. Dockerfile +3 -3
  2. config.py +1 -1
  3. llm_clients/qwen_translator.py +188 -92
  4. requirements.txt +1 -2
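
In short: the Qwen translator no longer loads GGUF models through llama-cpp-python; it downloads a pre-built llama.cpp release binary (b6995, ubuntu-x64) on first use and shells out to it for each translation. An illustrative sketch of the invocation the new client assembles — the paths and prompt text below are hypothetical, and the flags simply mirror the command built in qwen_translator.py in this diff:

import subprocess

# Hypothetical paths: the client resolves both at runtime by downloading the
# b6995 release zip and the GGUF model from Hugging Face.
binary_path = "/tmp/llama_cpp_binary_example/main"
model_path = "/data/models/Qwen3-0.6B-IQ4_XS.gguf"

prompt = (
    "<|im_start|>system\nTranslate the user's text to English.<|im_end|>\n"  # stand-in for TRANSLATION_SYSTEM_INSTRUCTIONS
    "<|im_start|>user\nBonjour tout le monde<|im_end|>\n"
    "<|im_start|>assistant\n"
)

cmd = [
    binary_path,
    "-m", model_path,
    "-p", prompt,
    "--temp", "0.3",
    "--top-p", "0.9",
    "--top-k", "40",
    "-n", "256",   # max tokens to generate
    "-c", "512",   # context window
    "--stop", "<|im_end|>",
    "--stop", "<|im_start|>",
]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=60, check=True)
print(result.stdout.strip())

This trades an in-process API for one subprocess per call, which is what lets the Dockerfile drop cmake/make: nothing has to be compiled in the image.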
Dockerfile CHANGED
@@ -3,13 +3,13 @@ FROM python:3.10-slim
 # Set working directory
 WORKDIR /app

-# Install system dependencies for PDF processing, llama-cpp-python compilation, and other requirements
+# Install system dependencies for PDF processing and other requirements
+# Note: llama.cpp binary is downloaded at runtime, no compilation needed
 RUN apt-get update && apt-get install -y \
     gcc \
     g++ \
-    cmake \
-    make \
     git \
+    unzip \
     && rm -rf /var/lib/apt/lists/*

 # Create a user to avoid running as root
config.py CHANGED
@@ -28,7 +28,7 @@ AI_DETECTION_MODE = {
 # Uses pre-quantized GGUF models from unsloth - no bitsandbytes needed. Works on Hugging Face Spaces.
 NON_ENGLISH_TRANSLATOR = {
     "enabled": True,
-    "provider": "qwen_translator",  # Translation client using GGUF models via llama-cpp-python
+    "provider": "qwen_translator",  # Translation client using GGUF models via pre-built llama.cpp binary
     "config": {
         # GGUF model repository and file from unsloth (pre-quantized)
         "repo_id": "unsloth/Qwen3-0.6B-GGUF",
llm_clients/qwen_translator.py CHANGED
@@ -1,5 +1,11 @@
 from typing import Generator, Any, Dict
 import os
+import subprocess
+import tempfile
+import zipfile
+import urllib.request
+import shutil
+from pathlib import Path
 from .base import LlmClient


@@ -8,17 +14,23 @@ TRANSLATION_SYSTEM_INSTRUCTIONS = """You are a professional translator. Translat

 class QwenTranslatorClient(LlmClient):
     """
-    Translation client using Qwen3-0.6B-GGUF pre-quantized models via llama-cpp-python.
+    Translation client using Qwen3-0.6B-GGUF pre-quantized models via pre-built llama.cpp binary.
     Translates non-English text to English so it can be processed by the English-only classifier.

     Uses GGUF format models from unsloth/Qwen3-0.6B-GGUF - already quantized, no bitsandbytes needed.
+    Uses pre-built llama.cpp binary (llama-b6995-bin-ubuntu-x64.zip) from GitHub releases - no compilation needed.
+    The binary is automatically downloaded and extracted on first use.
     Optimized for Hugging Face Spaces with lazy loading and efficient CPU inference.
     """

+    # Class-level cache for the binary path
+    _binary_path = None
+    _binary_dir = None
+
     def __init__(self, config_dict: Dict[str, Any], system_prompt: str):
         super().__init__(config_dict, system_prompt)
         self.repo_id = self.config.get("repo_id", "unsloth/Qwen3-0.6B-GGUF")
-        self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")  # Default to IQ4_XS for good balance
+        self.model_file = self.config.get("model_file", "Qwen3-0.6B-IQ4_XS.gguf")
         self.temperature = float(self.config.get("temperature", 0.3))
         self.top_p = float(self.config.get("top_p", 0.9))
         self.top_k = int(self.config.get("top_k", 40))
@@ -26,31 +38,131 @@ class QwenTranslatorClient(LlmClient):
         self.context_size = int(self.config.get("context_size", 512))
         self.n_threads = int(self.config.get("n_threads", 0))  # 0 = auto-detect CPU threads
         self.n_gpu_layers = int(self.config.get("n_gpu_layers", 0))  # 0 = CPU only, >0 for GPU
-        self.n_batch = int(self.config.get("n_batch", 256))  # Batch size for prompt processing
+        self.n_batch = int(self.config.get("n_batch", 256))

-        # Model will be loaded lazily on first use
-        self.llm = None
-        self._model_loaded = False
+        # Model path will be set on first use
+        self.model_path = None
+        self._model_downloaded = False

         print(f"✅ Qwen GGUF translator client initialized (repo: {self.repo_id}, model: {self.model_file}, will load on first use)")

+    @classmethod
+    def _download_binary(cls) -> str:
+        """Download and extract the pre-built llama.cpp binary from GitHub releases."""
+        if cls._binary_path and os.path.exists(cls._binary_path):
+            return cls._binary_path
+
+        print("📥 Downloading pre-built llama.cpp binary...")
+
+        # Create a temporary directory for the binary
+        if cls._binary_dir is None:
+            cls._binary_dir = tempfile.mkdtemp(prefix="llama_cpp_binary_")
+
+        binary_dir = Path(cls._binary_dir)
+
+        # Try common binary names (main is the standard, but some releases use llama-cli)
+        possible_binary_names = ["main", "llama-cli", "llama"]
+        binary_path = None
+
+        # Check if any binary already exists
+        for name in possible_binary_names:
+            path = binary_dir / name
+            if path.exists() and os.access(path, os.X_OK):
+                cls._binary_path = str(path)
+                print(f"✅ Using existing binary at: {cls._binary_path}")
+                return cls._binary_path
+
+        # If not found, we'll search after extraction
+        binary_path = binary_dir / "main"  # Default to 'main' (standard llama.cpp binary name)
+
+        # Download the zip file
+        zip_url = "https://github.com/ggml-org/llama.cpp/releases/download/b6995/llama-b6995-bin-ubuntu-x64.zip"
+        zip_path = binary_dir / "llama-binary.zip"
+
+        try:
+            print(f" Downloading from: {zip_url}")
+            urllib.request.urlretrieve(zip_url, zip_path)
+            print(f" ✅ Downloaded to: {zip_path}")
+
+            # Extract the zip file
+            print(f" 📦 Extracting zip file...")
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(binary_dir)
+
+            # Find the binary in the extracted files
+            # The binary might be called 'main', 'llama-cli', or 'llama'
+            # It might be in the root or in a subdirectory
+            found_binary = None
+
+            # First, try common locations and names
+            for name in possible_binary_names:
+                possible_paths = [
+                    binary_dir / name,
+                    binary_dir / "bin" / name,
+                    binary_dir / "llama-b6995-bin-ubuntu-x64" / name,
+                ]
+                for path in possible_paths:
+                    if path.exists():
+                        found_binary = path
+                        break
+                if found_binary:
+                    break
+
+            # Also search recursively for any executable file matching our names
+            if found_binary is None:
+                for root, dirs, files in os.walk(binary_dir):
+                    for file in files:
+                        if file in possible_binary_names or file.startswith("llama"):
+                            candidate = Path(root) / file
+                            # Check if it's executable (or at least a regular file)
+                            if candidate.is_file() and os.access(candidate, os.X_OK):
+                                found_binary = candidate
+                                break
+                    if found_binary:
+                        break
+
+            if found_binary is None:
+                raise RuntimeError(
+                    f"Could not find llama.cpp binary in extracted zip. "
+                    f"Searched for: {possible_binary_names}. "
+                    f"Please check the zip file structure."
+                )
+
+            # Make it executable
+            os.chmod(found_binary, 0o755)
+
+            # Move to expected location if needed (use 'main' as standard name)
+            if found_binary != binary_path:
+                if binary_path.exists():
+                    binary_path.unlink()  # Remove old binary if exists
+                shutil.move(str(found_binary), str(binary_path))
+
+            cls._binary_path = str(binary_path)
+            print(f" ✅ Binary extracted and ready at: {cls._binary_path}")
+
+            # Clean up zip file
+            zip_path.unlink()
+
+            return cls._binary_path
+
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to download/extract llama.cpp binary from {zip_url}. "
+                f"Error: {e}"
+            ) from e
+
     def _download_model_if_needed(self) -> str:
         """Download GGUF model file from HuggingFace if not already cached."""
-        from huggingface_hub import hf_hub_download, list_repo_files
-        import os
+        from huggingface_hub import hf_hub_download
+
+        if self._model_downloaded and self.model_path and os.path.exists(self.model_path):
+            return self.model_path

         # Set up cache directory
         cache_dir = os.environ.get('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
         os.makedirs(cache_dir, exist_ok=True)

         try:
-            # First, try to list available files to help with debugging
-            try:
-                repo_files = list_repo_files(repo_id=self.repo_id, repo_type="model")
-                print(f" 📋 Available files in {self.repo_id}: {[f for f in repo_files if f.endswith('.gguf')][:5]}...")
-            except Exception:
-                pass  # Ignore if we can't list files
-
             print(f" 📥 Downloading GGUF model: {self.model_file} from {self.repo_id}...")
             model_path = hf_hub_download(
                 repo_id=self.repo_id,
@@ -59,6 +171,8 @@ class QwenTranslatorClient(LlmClient):
                 resume_download=True
             )
             print(f" ✅ Model downloaded/cached at: {model_path}")
+            self.model_path = model_path
+            self._model_downloaded = True
             return model_path
         except Exception as e:
             error_msg = (
@@ -66,62 +180,13 @@ class QwenTranslatorClient(LlmClient):
                 f"Error: {e}\n"
                 f"Please verify:\n"
                 f"1. The repository exists: https://huggingface.co/{self.repo_id}\n"
-                f"2. The model file name is correct (check available .gguf files in the repo)\n"
-                f"3. You have internet connectivity\n"
-                f"Common file names: Qwen3-0.6B-Base-Q4_K_M.gguf, qwen3-0.6b-base-q4_k_m.gguf, etc."
+                f"2. The model file name is correct\n"
+                f"3. You have internet connectivity"
             )
             raise RuntimeError(error_msg) from e

-    def _load_model(self):
-        """Lazy load the GGUF model on first use."""
-        if self._model_loaded:
-            return
-
-        try:
-            from llama_cpp import Llama
-
-            print(f"🔄 Loading GGUF translation model: {self.model_file}")
-
-            # Download model if needed
-            model_path = self._download_model_if_needed()
-
-            # Load the GGUF model with llama-cpp-python
-            print(f" 📥 Loading model from: {model_path}")
-
-            # Optimize for speed: use mmap for faster loading, no memory locking
-            self.llm = Llama(
-                model_path=model_path,
-                n_ctx=self.context_size,  # Context window size (smaller = faster)
-                n_threads=self.n_threads if self.n_threads > 0 else None,  # Auto-detect if 0
-                n_gpu_layers=self.n_gpu_layers,  # 0 = CPU only, >0 for GPU layers
-                verbose=False,  # Suppress verbose output
-                use_mlock=False,  # Don't lock memory (faster, better for Spaces)
-                use_mmap=True,  # Use memory mapping for faster loading
-                n_batch=self.n_batch,  # Batch size (smaller = faster for short prompts)
-                n_predict=self.max_tokens,  # Max tokens to predict
-            )
-
-            self._model_loaded = True
-            actual_threads = self.llm.n_threads if hasattr(self.llm, 'n_threads') else self.n_threads
-            print(f"✅ GGUF translation model loaded successfully")
-            print(f" Context size: {self.context_size} (reduced for faster inference)")
-            print(f" CPU threads: {actual_threads} ({'auto-detected' if self.n_threads == 0 else 'manual'})")
-            print(f" GPU layers: {self.n_gpu_layers} (0 = CPU only, >0 for GPU acceleration)")
-            print(f" Batch size: {self.n_batch}")
-
-        except ImportError as e:
-            raise ImportError(
-                f"llama-cpp-python library is required for QwenTranslatorClient with GGUF models. "
-                f"Install it with: pip install llama-cpp-python\n"
-                f"Original error: {e}"
-            ) from e
-        except Exception as e:
-            raise RuntimeError(f"Failed to load GGUF translation model {self.model_file}: {e}") from e
-
     def _build_translation_prompt(self, user_text: str) -> str:
         """Build a prompt for translation to English using Qwen's chat format."""
-        # Qwen3 uses a specific chat template format: <|im_start|>role\ncontent<|im_end|>
-        # System prompt handles the translation instruction, user just provides the text
         prompt = f"""<|im_start|>system
 {TRANSLATION_SYSTEM_INSTRUCTIONS}<|im_end|>
 <|im_start|>user
@@ -132,44 +197,76 @@ class QwenTranslatorClient(LlmClient):

     def generate_content(self, prompt: str) -> str:
         """
-        Translate the input text to English.
+        Translate the input text to English using the pre-built llama.cpp binary.
         Returns the English translation as a plain string.
         """
-        # Load model if not already loaded (lazy loading)
-        if not self._model_loaded:
-            self._load_model()
+        # Download binary and model if needed
+        binary_path = self._download_binary()
+        model_path = self._download_model_if_needed()

         # Build translation prompt
         translation_prompt = self._build_translation_prompt(prompt)

-        # Generate translation using llama-cpp-python
+        # Prepare command-line arguments for llama.cpp binary
+        # Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 256 -c 512 -t 0
+        cmd = [
+            binary_path,
+            "-m", model_path,
+            "-p", translation_prompt,
+            "--temp", str(self.temperature),
+            "--top-p", str(self.top_p),
+            "--top-k", str(self.top_k),
+            "-n", str(self.max_tokens),  # Number of tokens to generate
+            "-c", str(self.context_size),  # Context size
+        ]
+
+        # Add thread count if specified (0 means auto-detect, which is default)
+        if self.n_threads > 0:
+            cmd.extend(["-t", str(self.n_threads)])
+
+        # Add GPU layers if specified
+        if self.n_gpu_layers > 0:
+            cmd.extend(["-ngl", str(self.n_gpu_layers)])
+
+        # Add stop sequences (llama.cpp uses --stop for each stop token)
+        cmd.extend(["--stop", "<|im_end|>", "--stop", "<|im_start|>"])
+
         try:
-            # Optimize generation for speed
-            response = self.llm(
-                translation_prompt,
-                max_tokens=self.max_tokens,
-                temperature=self.temperature,
-                top_p=self.top_p,
-                top_k=self.top_k,
-                stop=["<|im_end|>", "<|im_start|>"],  # Stop at chat format tokens
-                echo=False,  # Don't echo the prompt
-                repeat_penalty=1.1,  # Slight penalty to avoid repetition (faster)
+            # Run the binary and capture output
+            print(f" 🔄 Running translation with llama.cpp binary...")
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=60,  # 60 second timeout
+                check=True
             )

-            # Extract the generated text
-            if 'choices' in response and len(response['choices']) > 0:
-                generated_text = response['choices'][0]['text'].strip()
-            else:
-                raise ValueError("Empty response from GGUF model")
+            # Parse the output
+            output = result.stdout.strip()
+
+            # The output might include the prompt, so we need to extract just the generated part
+            # Look for the assistant response after the prompt
+            if "<|im_start|>assistant" in output:
+                # Extract everything after the assistant tag
+                output = output.split("<|im_start|>assistant")[-1].strip()
+
+            # Remove any remaining chat format tokens
+            translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()

+        except subprocess.TimeoutExpired:
+            raise RuntimeError("Translation timed out after 60 seconds")
+        except subprocess.CalledProcessError as e:
+            error_output = e.stderr if e.stderr else e.stdout
+            raise RuntimeError(
+                f"Translation failed with llama.cpp binary. "
+                f"Exit code: {e.returncode}, Error: {error_output}"
+            ) from e
         except Exception as e:
             raise RuntimeError(f"Translation generation failed: {e}") from e

         # Clean up the response
-        translated_text = generated_text.strip()
-
-        # Remove any remaining chat format tokens
-        translated_text = translated_text.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
+        translated_text = translated_text.strip()

         # Remove common prefixes that might be added by the model
         prefixes_to_remove = [
@@ -189,7 +286,6 @@ class QwenTranslatorClient(LlmClient):

         # If translation is empty or suspiciously short, return original
         if not translated_text or len(translated_text) < len(prompt) * 0.1:
-            # Model might not have translated properly, return original
             print(f"⚠️ Translation may have failed (too short or empty), returning original text")
             return prompt

@@ -197,7 +293,7 @@ class QwenTranslatorClient(LlmClient):

     def generate_content_stream(self, prompt: str) -> Generator[str, None, None]:
         """
-        Stream translation using llama-cpp-python streaming.
+        Stream translation using llama.cpp binary.
         For simplicity, we'll collect the full response and yield it.
         True streaming can be added later if needed.
         """
requirements.txt CHANGED
@@ -10,5 +10,4 @@ sentence-transformers
 accelerate
 PyMuPDF
 python-docx
-huggingface-hub
-llama-cpp-python>=0.2.0
+huggingface-hub