zazaman committed
Commit b45369f · 1 Parent(s): bb181f0

Optimize translation: reduce max_tokens and context_size, add no-display-prompt flag

Files changed (2)
  1. config.py +3 -3
  2. llm_clients/qwen_translator.py +4 -2
config.py CHANGED
@@ -43,10 +43,10 @@ NON_ENGLISH_TRANSLATOR = {
     "temperature": 0.3,  # Lower temperature for more accurate, consistent translations
     "top_p": 0.9,
     "top_k": 40,
-    # Max tokens to generate (reduced for faster inference)
-    "max_tokens": 256,
+    # Max tokens to generate (reduced for faster inference - translation is usually short)
+    "max_tokens": 128,
     # Context window size (reduced for faster inference - translation doesn't need large context)
-    "context_size": 512,
+    "context_size": 256,
     # CPU threads for inference (use more threads for faster inference)
     # Set to 0 to auto-detect, or specify number of CPU cores
     "n_threads": 0,  # 0 = auto-detect (uses all available cores)
llm_clients/qwen_translator.py CHANGED
@@ -336,7 +336,8 @@ class QwenTranslatorClient(LlmClient):
         translation_prompt = self._build_translation_prompt(prompt)

         # Prepare command-line arguments for llama.cpp binary
-        # Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 256 -c 512 -t 0
+        # Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 128 -c 256 -t 0
+        # Add --no-display-prompt to avoid printing the prompt in output
         cmd = [
             binary_path,
             "-m", model_path,
@@ -346,6 +347,7 @@ class QwenTranslatorClient(LlmClient):
             "--top-k", str(self.top_k),
             "-n", str(self.max_tokens),  # Number of tokens to generate
             "-c", str(self.context_size),  # Context size
+            "--no-display-prompt",  # Don't echo the prompt in output (if supported)
         ]

         # Add thread count if specified (0 means auto-detect, which is default)
@@ -368,7 +370,7 @@ class QwenTranslatorClient(LlmClient):
             cmd,
             capture_output=True,
             text=True,
-            timeout=60,  # 60 second timeout
+            timeout=30,  # 30 second timeout (should be enough for short translations)
             check=False  # Don't raise on non-zero exit, we'll check manually
         )

376