zazaman committed on
Commit
08546b6
·
1 Parent(s): cee104f

Filter out reasoning tokens and extract actual translation from Qwen output

Browse files
Files changed (1) hide show
  1. llm_clients/qwen_translator.py +28 -0
llm_clients/qwen_translator.py CHANGED
@@ -399,6 +399,15 @@ class QwenTranslatorClient(LlmClient):
399
 
400
  print(f" Raw output (first 200 chars): {output[:200]}", flush=True)
401
 
 
 
 
 
 
 
 
 
 
402
  # The output might include the prompt, so we need to extract just the generated part
403
  # Look for the assistant response after the prompt
404
  if "<|im_start|>assistant" in output:
@@ -409,6 +418,25 @@ class QwenTranslatorClient(LlmClient):
409
  # Remove any remaining chat format tokens
410
  translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  if not translated_text:
413
  print(f" ❌ Translation output is empty after parsing", flush=True)
414
  print(f" Original output was: {result.stdout[:500]}", flush=True)
 
399
 
400
  print(f" Raw output (first 200 chars): {output[:200]}", flush=True)
401
 
402
+ # Remove reasoning/thinking tags and their content (Qwen models sometimes output these)
403
+ import re
404
+ # Remove <think>...</think> tags and content
405
+ output = re.sub(r'<think>.*?</think>', '', output, flags=re.DOTALL)
406
+ # Remove <thinking>...</thinking> tags if present
407
+ output = re.sub(r'<thinking>.*?</thinking>', '', output, flags=re.DOTALL)
408
+ # Remove <reasoning>...</reasoning> tags if present
409
+ output = re.sub(r'<reasoning>.*?</reasoning>', '', output, flags=re.DOTALL)
410
+
411
  # The output might include the prompt, so we need to extract just the generated part
412
  # Look for the assistant response after the prompt
413
  if "<|im_start|>assistant" in output:
 
418
  # Remove any remaining chat format tokens
419
  translated_text = output.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
420
 
421
+ # If the text still contains reasoning-like patterns, try to extract just the translation
422
+ # Look for patterns like "The translation is:" or "English translation:" or just clean text
423
+ if "translation" in translated_text.lower() and len(translated_text) > 100:
424
+ # Try to find the actual translation after common prefixes
425
+ translation_patterns = [
426
+ r'(?:translation|translated|english):\s*(.+?)(?:\n|$)',
427
+ r'(?:the translation is|here is the translation|english translation):\s*(.+?)(?:\n|$)',
428
+ ]
429
+ for pattern in translation_patterns:
430
+ match = re.search(pattern, translated_text, re.IGNORECASE | re.DOTALL)
431
+ if match:
432
+ translated_text = match.group(1).strip()
433
+ print(f" Extracted translation from pattern: {translated_text[:200]}", flush=True)
434
+ break
435
+
436
+ # Clean up any remaining artifacts
437
+ translated_text = re.sub(r'\n+', ' ', translated_text) # Collapse each run of one or more newlines into a single space
438
+ translated_text = translated_text.strip()
439
+
440
  if not translated_text:
441
  print(f" ❌ Translation output is empty after parsing", flush=True)
442
  print(f" Original output was: {result.stdout[:500]}", flush=True)