Optimize translation: reduce max_tokens and context_size, add no-display-prompt flag
- config.py (+3 -3)
- llm_clients/qwen_translator.py (+4 -2)
config.py (CHANGED)

@@ -43,10 +43,10 @@ NON_ENGLISH_TRANSLATOR = {
     "temperature": 0.3,  # Lower temperature for more accurate, consistent translations
     "top_p": 0.9,
     "top_k": 40,
-    # Max tokens to generate (reduced for faster inference)
-    "max_tokens":
+    # Max tokens to generate (reduced for faster inference - translation is usually short)
+    "max_tokens": 128,
     # Context window size (reduced for faster inference - translation doesn't need large context)
-    "context_size":
+    "context_size": 256,
     # CPU threads for inference (use more threads for faster inference)
     # Set to 0 to auto-detect, or specify number of CPU cores
     "n_threads": 0,  # 0 = auto-detect (uses all available cores)
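For orientation, a minimal sketch (not code from this commit) of how the updated NON_ENGLISH_TRANSLATOR settings presumably reach the translator client is shown below. The attribute names top_k, max_tokens and context_size are confirmed by the qwen_translator.py diff further down (it references self.top_k, self.max_tokens and self.context_size); the class name, the helper, and the remaining attributes are assumptions.

# Sketch only: mirrors the post-change values from config.py; surrounding
# keys of NON_ENGLISH_TRANSLATOR (model path, binary path, etc.) are not
# shown in this diff and are omitted here.
from dataclasses import dataclass

@dataclass
class TranslatorSettings:
    temperature: float = 0.3
    top_p: float = 0.9
    top_k: int = 40
    max_tokens: int = 128     # reduced by this commit
    context_size: int = 256   # reduced by this commit
    n_threads: int = 0        # 0 = auto-detect CPU threads

def settings_from_config(cfg: dict) -> TranslatorSettings:
    # Hypothetical helper: fall back to the new defaults for any missing key.
    keys = ("temperature", "top_p", "top_k", "max_tokens", "context_size", "n_threads")
    return TranslatorSettings(**{k: cfg[k] for k in keys if k in cfg})

Note that with context_size at 256, the prompt and the generated tokens together have to fit in a 256-token window, so this trade-off assumes short, single-segment translation inputs.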
llm_clients/qwen_translator.py (CHANGED)

@@ -336,7 +336,8 @@ class QwenTranslatorClient(LlmClient):
         translation_prompt = self._build_translation_prompt(prompt)

         # Prepare command-line arguments for llama.cpp binary
-        # Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n
+        # Standard format: ./main -m model.gguf -p "prompt" --temp 0.3 --top-p 0.9 --top-k 40 -n 128 -c 256 -t 0
+        # Add --no-display-prompt to avoid printing the prompt in output
         cmd = [
             binary_path,
             "-m", model_path,
@@ -346,6 +347,7 @@ class QwenTranslatorClient(LlmClient):
             "--top-k", str(self.top_k),
             "-n", str(self.max_tokens),  # Number of tokens to generate
             "-c", str(self.context_size),  # Context size
+            "--no-display-prompt",  # Don't echo the prompt in output (if supported)
         ]

         # Add thread count if specified (0 means auto-detect, which is default)
@@ -368,7 +370,7 @@ class QwenTranslatorClient(LlmClient):
             cmd,
             capture_output=True,
             text=True,
-            timeout=
+            timeout=30,  # 30 second timeout (should be enough for short translations)
             check=False  # Don't raise on non-zero exit, we'll check manually
         )
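Putting the hunks together, the llama.cpp invocation after this commit should look roughly like the sketch below. It is an illustration under assumptions, not the file's actual code: run_translation is a hypothetical helper, binary_path, model_path and translation_prompt stand in for values the real client resolves elsewhere, and the flag names follow the "Standard format" comment in the diff (-p, --temp, --top-p, --top-k, -n, -c, -t, --no-display-prompt).

import subprocess

def run_translation(binary_path, model_path, translation_prompt, settings):
    # settings could be the TranslatorSettings sketch above (hypothetical).
    cmd = [
        binary_path,
        "-m", model_path,
        "-p", translation_prompt,
        "--temp", str(settings.temperature),
        "--top-p", str(settings.top_p),
        "--top-k", str(settings.top_k),
        "-n", str(settings.max_tokens),    # 128: short generations for translations
        "-c", str(settings.context_size),  # 256: small context window
        "--no-display-prompt",             # don't echo the prompt back (if the binary supports it)
    ]
    if settings.n_threads:                 # 0 means let llama.cpp auto-detect threads
        cmd += ["-t", str(settings.n_threads)]
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=30,   # new 30-second ceiling for short translations
        check=False,  # inspect the return code manually instead of raising
    )
    return result.stdout  # the real client presumably post-processes this output

Using check=False keeps subprocess.run from raising on a non-zero exit, so the caller can inspect result.returncode and result.stderr itself, while the 30-second timeout bounds how long a stuck llama.cpp process can block a translation.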