Optimize translation: reduce max_tokens and context_size, add no-display-prompt flag
- config.py (+3 -3)
- llm_clients/qwen_translator.py (+4 -2)
config.py (CHANGED)

@@ -43,10 +43,10 @@ NON_ENGLISH_TRANSLATOR = {
     "temperature": 0.3,  # Lower temperature for more accurate, consistent translations
     "top_p": 0.9,
     "top_k": 40,
-    # Max tokens to generate (reduced for faster inference)
-    "max_tokens":
+    # Max tokens to generate (reduced for faster inference - translation is usually short)
+    "max_tokens": 128,
     # Context window size (reduced for faster inference - translation doesn't need large context)
-    "context_size":
+    "context_size": 256,
     # CPU threads for inference (use more threads for faster inference)
     # Set to 0 to auto-detect, or specify number of CPU cores
     "n_threads": 0,  # 0 = auto-detect (uses all available cores)
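For orientation, a minimal sketch (not code from this commit) of how the updated NON_ENGLISH_TRANSLATOR settings presumably reach the translator client is shown below. The attribute names top_k, max_tokens and context_size are confirmed by the qwen_translator.py diff further down (it references self.top_k, self.max_tokens and self.context_size); the class name, the helper, and the remaining attributes are assumptions.

# Sketch only: mirrors the post-change values from config.py; surrounding
# keys of NON_ENGLISH_TRANSLATOR (model path, binary path, etc.) are not
# shown in this diff and are omitted here.
from dataclasses import dataclass

@dataclass
class TranslatorSettings:
    temperature: float = 0.3
    top_p: float = 0.9
    top_k: int = 40
    max_tokens: int = 128     # reduced by this commit
    context_size: int = 256   # reduced by this commit
    n_threads: int = 0        # 0 = auto-detect CPU threads

def settings_from_config(cfg: dict) -> TranslatorSettings:
    # Hypothetical helper: fall back to the new defaults for any missing key.
    keys = ("temperature", "top_p", "top_k", "max_tokens", "context_size", "n_threads")
    return TranslatorSettings(**{k: cfg[k] for k in keys if k in cfg})

Note that with context_size at 256, the prompt and the generated tokens together have to fit in a 256-token window, so this trade-off assumes short, single-segment translation inputs.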
llm_clients/qwen_translator.py (CHANGED)

@@ -336,7 +336,8 @@ class QwenTranslatorClient(LlmClient):
         translation_prompt = self._build_translation_prompt(prompt)

         # Prepare command-line arguments for llama.cpp binary
-        # Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n
+        # Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 128 -c 256 -t 0
+        # Add --no-display-prompt to avoid printing the prompt in output
         cmd = [
             binary_path,
             "-m", model_path,
@@ -346,6 +347,7 @@ class QwenTranslatorClient(LlmClient):
             "--top-k", str(self.top_k),
             "-n", str(self.max_tokens),  # Number of tokens to generate
             "-c", str(self.context_size),  # Context size
+            "--no-display-prompt",  # Don't echo the prompt in output (if supported)
         ]

         # Add thread count if specified (0 means auto-detect, which is default)
@@ -368,7 +370,7 @@ class QwenTranslatorClient(LlmClient):
             cmd,
             capture_output=True,
             text=True,
-            timeout=
+            timeout=30,  # 30 second timeout (should be enough for short translations)
             check=False  # Don't raise on non-zero exit, we'll check manually
         )
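Putting the hunks together, the llama.cpp invocation after this commit should look roughly like the sketch below. It is an illustration under assumptions, not the file's actual code: run_translation is a hypothetical helper, binary_path, model_path and translation_prompt stand in for values the real client resolves elsewhere, and the flag names follow the "Standard format" comment in the diff (-p, --temp, --top-p, --top-k, -n, -c, -t, --no-display-prompt).

import subprocess

def run_translation(binary_path, model_path, translation_prompt, settings):
    # settings could be the TranslatorSettings sketch above (hypothetical).
    cmd = [
        binary_path,
        "-m", model_path,
        "-p", translation_prompt,
        "--temp", str(settings.temperature),
        "--top-p", str(settings.top_p),
        "--top-k", str(settings.top_k),
        "-n", str(settings.max_tokens),    # 128: short generations for translations
        "-c", str(settings.context_size),  # 256: small context window
        "--no-display-prompt",             # don't echo the prompt back (if the binary supports it)
    ]
    if settings.n_threads:                 # 0 means let llama.cpp auto-detect threads
        cmd += ["-t", str(settings.n_threads)]
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=30,   # new 30-second ceiling for short translations
        check=False,  # inspect the return code manually instead of raising
    )
    return result.stdout  # the real client presumably post-processes this output

Using check=False keeps subprocess.run from raising on a non-zero exit, so the caller can inspect result.returncode and result.stderr itself, while the 30-second timeout bounds how long a stuck llama.cpp process can block a translation.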