Spaces:

zazaman
/

guardrails-final

Sleeping

App Files Community

zazaman committed on Nov 9

Commit

08546b6

1 Parent(s): cee104f

Filter out reasoning tokens and extract actual translation from Qwen output

Browse files

Files changed (1) hide show

llm_clients/qwen_translator.py +28 -0

llm_clients/qwen_translator.py CHANGED Viewed

@@ -399,6 +399,15 @@ class QwenTranslatorClient(LlmClient):
             print(f"   Raw output (first 200 chars): {output[:200]}", flush=True)
             # The output might include the prompt, so we need to extract just the generated part
             # Look for the assistant response after the prompt
             if "<|im_start|>assistant" in output:
@@ -409,6 +418,25 @@ class QwenTranslatorClient(LlmClient):
             # Remove any remaining chat format tokens
             translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
             if not translated_text:
                 print(f"   ❌ Translation output is empty after parsing", flush=True)
                 print(f"   Original output was: {result.stdout[:500]}", flush=True)

             print(f"   Raw output (first 200 chars): {output[:200]}", flush=True)
+            # Remove reasoning/thinking tags and their content (Qwen models sometimes output these)
+            import re
+            # Remove <think>...</think> tags and content
+            output = re.sub(r'<think>.*?</think>', '', output, flags=re.DOTALL)
+            # Remove <thinking>...</thinking> tags if present
+            output = re.sub(r'<thinking>.*?</thinking>', '', output, flags=re.DOTALL)
+            # Remove any other reasoning tags
+            output = re.sub(r'<reasoning>.*?</reasoning>', '', output, flags=re.DOTALL)
             # The output might include the prompt, so we need to extract just the generated part
             # Look for the assistant response after the prompt
             if "<|im_start|>assistant" in output:
             # Remove any remaining chat format tokens
             translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
+            # If the text still contains reasoning-like patterns, try to extract just the translation
+            # Look for patterns like "The translation is:" or "English translation:" or just clean text
+            if "translation" in translated_text.lower() and len(translated_text) > 100:
+                # Try to find the actual translation after common prefixes
+                translation_patterns = [
+                    r'(?:translation|translated|english):\s*(.+?)(?:\n|$)',
+                    r'(?:the translation is|here is the translation|english translation):\s*(.+?)(?:\n|$)',
+                ]
+                for pattern in translation_patterns:
+                    match = re.search(pattern, translated_text, re.IGNORECASE | re.DOTALL)
+                    if match:
+                        translated_text = match.group(1).strip()
+                        print(f"   Extracted translation from pattern: {translated_text[:200]}", flush=True)
+                        break
+            # Clean up any remaining artifacts
+            translated_text = re.sub(r'\n+', ' ', translated_text)  # Collapse each run of one or more newlines into a single space
+            translated_text = translated_text.strip()
             if not translated_text:
                 print(f"   ❌ Translation output is empty after parsing", flush=True)
                 print(f"   Original output was: {result.stdout[:500]}", flush=True)