Spaces:
Sleeping
Sleeping
Filter out reasoning tokens and extract actual translation from Qwen output
Browse files
llm_clients/qwen_translator.py
CHANGED
|
@@ -399,6 +399,15 @@ class QwenTranslatorClient(LlmClient):
|
|
| 399 |
|
| 400 |
print(f" Raw output (first 200 chars): {output[:200]}", flush=True)
|
| 401 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
# The output might include the prompt, so we need to extract just the generated part
|
| 403 |
# Look for the assistant response after the prompt
|
| 404 |
if "<|im_start|>assistant" in output:
|
|
@@ -409,6 +418,25 @@ class QwenTranslatorClient(LlmClient):
|
|
| 409 |
# Remove any remaining chat format tokens
|
| 410 |
translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
if not translated_text:
|
| 413 |
print(f" ❌ Translation output is empty after parsing", flush=True)
|
| 414 |
print(f" Original output was: {result.stdout[:500]}", flush=True)
|
|
|
|
| 399 |
|
| 400 |
print(f" Raw output (first 200 chars): {output[:200]}", flush=True)
|
| 401 |
|
| 402 |
+
# Remove reasoning/thinking tags and their content (Qwen models sometimes output these)
|
| 403 |
+
import re
|
| 404 |
+
# Remove <think>...</think> tags and content
|
| 405 |
+
output = re.sub(r'<think>.*?</think>', '', output, flags=re.DOTALL)
|
| 406 |
+
# Remove <thinking>...</thinking> tags if present
|
| 407 |
+
output = re.sub(r'<thinking>.*?</thinking>', '', output, flags=re.DOTALL)
|
| 408 |
+
# Remove any other reasoning tags
|
| 409 |
+
output = re.sub(r'<reasoning>.*?</reasoning>', '', output, flags=re.DOTALL)
|
| 410 |
+
|
| 411 |
# The output might include the prompt, so we need to extract just the generated part
|
| 412 |
# Look for the assistant response after the prompt
|
| 413 |
if "<|im_start|>assistant" in output:
|
|
|
|
| 418 |
# Remove any remaining chat format tokens
|
| 419 |
translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
|
| 420 |
|
| 421 |
+
# If the text still contains reasoning-like patterns, try to extract just the translation
|
| 422 |
+
# Look for patterns like "The translation is:" or "English translation:" or just clean text
|
| 423 |
+
if "translation" in translated_text.lower() and len(translated_text) > 100:
|
| 424 |
+
# Try to find the actual translation after common prefixes
|
| 425 |
+
translation_patterns = [
|
| 426 |
+
r'(?:translation|translated|english):\s*(.+?)(?:\n|$)',
|
| 427 |
+
r'(?:the translation is|here is the translation|english translation):\s*(.+?)(?:\n|$)',
|
| 428 |
+
]
|
| 429 |
+
for pattern in translation_patterns:
|
| 430 |
+
match = re.search(pattern, translated_text, re.IGNORECASE | re.DOTALL)
|
| 431 |
+
if match:
|
| 432 |
+
translated_text = match.group(1).strip()
|
| 433 |
+
print(f" Extracted translation from pattern: {translated_text[:200]}", flush=True)
|
| 434 |
+
break
|
| 435 |
+
|
| 436 |
+
# Clean up any remaining artifacts
|
| 437 |
+
translated_text = re.sub(r'\n+', ' ', translated_text) # Replace multiple newlines with space
|
| 438 |
+
translated_text = translated_text.strip()
|
| 439 |
+
|
| 440 |
if not translated_text:
|
| 441 |
print(f" ❌ Translation output is empty after parsing", flush=True)
|
| 442 |
print(f" Original output was: {result.stdout[:500]}", flush=True)
|