# worship/compare_translation_methods.py
# Author: Peter Yang
# Commit 9720182: Improve Qwen2.5 prompting with chat template and optimized
# parameters, add detailed comparison analysis
#!/usr/bin/env python3
"""
Compare OPUS-MT vs Qwen2.5 LLM Translation
Run both methods on the same texts and compare quality
"""
import asyncio
import os
import sys
import logging
from pathlib import Path
# Add current directory to path so sibling modules (e.g.
# document_processing_agent) resolve when the script is run from elsewhere.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Set up logging: timestamped INFO-level messages to stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Test cases - same as before.
# Each case pairs a Chinese source sentence with English keywords a faithful
# translation should contain, plus a short label describing the register.
TEST_CASES = [
    {
        "chinese": "今天我们要学习神的话语,让我们一起来祷告。",
        "expected_keywords": ["today", "learn", "word", "god", "pray"],
        "context": "Religious/formal language"
    },
    {
        "chinese": "感谢主,让我们能够聚集在一起敬拜。",
        "expected_keywords": ["thank", "lord", "gather", "worship"],
        "context": "Worship context"
    },
    {
        "chinese": "我们要为教会的事工祷告,求神赐福。",
        "expected_keywords": ["church", "ministry", "pray", "bless"],
        "context": "Church ministry"
    },
    {
        "chinese": "这段经文告诉我们,神爱世人,甚至将他的独生子赐给他们。",
        "expected_keywords": ["scripture", "god", "love", "world", "son"],
        "context": "Biblical reference"
    },
    {
        "chinese": "耶稣说:'我就是道路、真理、生命。'",
        "expected_keywords": ["jesus", "way", "truth", "life"],
        "context": "Biblical quote"
    }
]
async def translate_with_opus_mt(text: str):
    """Translate Chinese text to English via OPUS-MT (the current pipeline).

    Spins up a DocumentProcessingAgent pointed at the local service and
    delegates to its zh->en translator. Returns the translation string, or
    None when the agent is unavailable or raises.
    """
    try:
        from document_processing_agent import DocumentProcessingAgent

        agent = DocumentProcessingAgent("http://localhost:8080")
        translated = await agent._translate_text(text, 'zh', 'en')
    except Exception as exc:
        logger.error(f"OPUS-MT error: {exc}")
        return None
    return translated
def _extract_qwen_response(full_response: str, prompt: str) -> str:
    """Pull the assistant's reply out of a decoded Qwen chat transcript.

    Tries, in order: the explicit <|im_start|>assistant marker, any
    case-insensitive "assistant" marker, an "English:" label from the
    fallback prompt, and finally whatever follows the prompt text.
    Always returns a string (the old inline logic could leave the result
    unbound and raise NameError when the marker was capitalized).
    """
    # Qwen2.5 chat format: <|im_start|>assistant\n{translation}<|im_end|>
    # NOTE(review): decode uses skip_special_tokens=True, so this branch may
    # rarely fire in practice — kept for robustness.
    if "<|im_start|>assistant" in full_response:
        reply = full_response.split("<|im_start|>assistant")[-1].strip()
        return reply.split("<|im_end|>")[0].strip()
    lowered = full_response.lower()
    if "assistant" in lowered:
        # Fix: the old code checked case-insensitively but split
        # case-sensitively, so "Assistant" left `translation` undefined.
        # Take everything after the LAST occurrence, matching any casing.
        idx = lowered.rindex("assistant")
        return full_response[idx + len("assistant"):].strip()
    if "English:" in full_response:
        return full_response.split("English:")[-1].strip()
    if len(full_response) > len(prompt):
        return full_response[len(prompt):].strip()
    return full_response.strip()


def _clean_translation(translation: str) -> str:
    """Aggressively clean a raw model reply down to one translated sentence.

    Cuts at common continuation markers, clamps to the first sentence
    (preserving a fully quoted sentence plus its trailing punctuation),
    drops trailing ';'/':' and strips a matching outer quote pair.
    """
    # Stop at common continuation markers the model sometimes appends.
    stop_markers = [
        "\n\n", "Chinese:", "English:", "Human:", "User:", "翻译", "Translation:",
        "The translation", "Here is", "Note:", "If you", "You are", "I am"
    ]
    for marker in stop_markers:
        if marker in translation:
            translation = translation.split(marker)[0].strip()
    # Keep only the first sentence. Quoted sentences (like "I am the way,
    # the truth, the life.") are kept whole, including punctuation after
    # the closing quote.
    if translation.count('"') >= 2:
        first_quote = translation.find('"')
        second_quote = translation.find('"', first_quote + 1)
        if second_quote > first_quote:
            if second_quote + 1 < len(translation) and translation[second_quote + 1] in ['.', '!', '?']:
                translation = translation[:second_quote + 2].strip()
            else:
                translation = translation[:second_quote + 1].strip()
    else:
        # Regular sentence: cut at the earliest ., ! or ? (ignoring index 0).
        sentence_ends = [
            i for i in (translation.find('.'), translation.find('!'), translation.find('?'))
            if i > 0
        ]
        if sentence_ends:
            translation = translation[:min(sentence_ends) + 1].strip()
    # Drop dangling clause punctuation but keep sentence-ending punctuation.
    translation = translation.rstrip(';:')
    # Remove outer quotes if the entire translation is wrapped in them.
    if len(translation) > 2:
        if translation.startswith('"') and translation.endswith('"'):
            translation = translation[1:-1].strip()
        elif translation.startswith("'") and translation.endswith("'"):
            translation = translation[1:-1].strip()
    return translation


async def translate_with_qwen(text: str):
    """Translate Chinese text to English with the Qwen2.5-1.5B-Instruct LLM.

    The tokenizer/model are loaded lazily on first call and cached as
    attributes on this function, so subsequent calls reuse them. Returns the
    cleaned English translation, or None on failure or when the cleaned
    output is implausibly short (<= 5 characters).
    """
    try:
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import platform

        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
        # Load model once; cache on the function object for later calls.
        if not hasattr(translate_with_qwen, 'model'):
            logger.info("Loading Qwen2.5 model (first time may take a while)...")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            # Force CPU/float32 on macOS; elsewhere let HF place the model and
            # use half precision when CUDA is available.
            if platform.system() == "Darwin":
                device_map = "cpu"
                torch_dtype = torch.float32
            else:
                device_map = "auto"
                torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                device_map=device_map
            )
            model.eval()
            translate_with_qwen.model = model
            translate_with_qwen.tokenizer = tokenizer
            translate_with_qwen.device = "cpu" if platform.system() == "Darwin" else ("cuda" if torch.cuda.is_available() else "cpu")
            logger.info("✅ Qwen2.5 model loaded")

        model = translate_with_qwen.model
        tokenizer = translate_with_qwen.tokenizer
        device = translate_with_qwen.device

        # Use Qwen2.5's chat template for better results.
        messages = [
            {
                "role": "system",
                "content": "You are a professional translator specializing in Christian religious texts. Translate Chinese to English accurately. Output only the English translation, nothing else."
            },
            {
                "role": "user",
                "content": f"Translate this Chinese text to English:\n\n{text}"
            }
        ]
        try:
            prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        except Exception:
            # Fix: was a bare `except:` (also caught KeyboardInterrupt/SystemExit).
            # Fallback plain prompt if the tokenizer lacks a chat template.
            prompt = f"""Translate this Chinese text to English. Output only the translation.
Chinese: {text}
English:"""

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
        model = model.to(device)
        # Pick a token id generate() can use for both padding and EOS.
        eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,      # enough for complete sentences
                temperature=0.1,         # low temperature → near-greedy sampling
                do_sample=True,
                top_p=0.9,               # nucleus sampling
                top_k=40,                # limit to top 40 tokens
                repetition_penalty=1.2,  # penalty to avoid repetition
                pad_token_id=eos_token_id,
                eos_token_id=eos_token_id,
                no_repeat_ngram_size=2,  # avoid repeating 2-grams
            )
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translation = _clean_translation(_extract_qwen_response(full_response, prompt))
        # Reject empty / implausibly short output so callers see a clear failure.
        return translation if translation and len(translation) > 5 else None
    except Exception as e:
        logger.error(f"Qwen2.5 error: {e}", exc_info=True)
        return None
def check_keywords(translation: str, expected_keywords: list) -> tuple:
    """Count the expected keywords present in a translation.

    Returns a (count, matched_keywords) pair; (0, []) when the translation
    is empty or None. Matching is case-insensitive substring containment.
    """
    if not translation:
        return 0, []
    lowered = translation.lower()
    matched = []
    for keyword in expected_keywords:
        if keyword in lowered:
            matched.append(keyword)
    return len(matched), matched
def rate_translation_quality(translation: str, chinese: str) -> dict:
    """Score a translation with cheap heuristics.

    Returns a dict with: character-length ratio vs the Chinese source
    (English is typically longer), whether any punctuation appears, the
    word count, and "natural_score" — the fraction of words that are
    common English function words, a crude fluency proxy.
    """
    if not translation:
        # Missing translation → all-zero metrics.
        return {
            "length_ratio": 0,
            "has_punctuation": False,
            "word_count": 0,
            "natural_score": 0
        }

    ratio = len(translation) / len(chinese) if chinese else 0
    punctuated = any(mark in translation for mark in ['.', '!', '?', ','])
    n_words = len(translation.split())

    common_words = ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'and', 'or',
                    'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with']
    hits = sum(1 for token in translation.lower().split() if token in common_words)
    natural = hits / max(n_words, 1)

    return {
        "length_ratio": ratio,
        "has_punctuation": punctuated,
        "word_count": n_words,
        "natural_score": natural
    }
async def compare_methods():
    """Run every TEST_CASE through both translators and print a comparison.

    For each case: translate with OPUS-MT and Qwen2.5, score keyword
    coverage and heuristic quality, print a side-by-side report, then print
    an aggregate summary. Returns the list of per-test result dicts.

    Fixes vs the previous version: a tie in total keyword matches is now
    reported as a tie (it used to be misreported as an OPUS-MT win), and
    the percentage math is guarded against an empty test set.
    """
    print("\n" + "=" * 80)
    print("Translation Method Comparison: OPUS-MT vs Qwen2.5")
    print("=" * 80)
    print()
    results = []
    for i, test_case in enumerate(TEST_CASES, 1):
        print(f"\n{'='*80}")
        print(f"Test {i}: {test_case['context']}")
        print(f"{'='*80}")
        print(f"Chinese: {test_case['chinese']}")
        print()
        # Translate with both methods.
        print("Translating with OPUS-MT...")
        opus_result = await translate_with_opus_mt(test_case['chinese'])
        print("Translating with Qwen2.5...")
        qwen_result = await translate_with_qwen(test_case['chinese'])
        # Score keyword coverage and heuristic quality for both.
        opus_found, opus_keywords = check_keywords(opus_result, test_case['expected_keywords'])
        qwen_found, qwen_keywords = check_keywords(qwen_result, test_case['expected_keywords'])
        opus_quality = rate_translation_quality(opus_result, test_case['chinese'])
        qwen_quality = rate_translation_quality(qwen_result, test_case['chinese'])
        # Display results.
        print(f"\n{'─'*80}")
        print("OPUS-MT Translation:")
        print(f" {opus_result if opus_result else '❌ Translation failed'}")
        print(f" Keywords: {opus_found}/{len(test_case['expected_keywords'])} ({opus_found/len(test_case['expected_keywords'])*100:.0f}%)")
        print(f" Found: {opus_keywords}")
        print(f" Quality: Length ratio={opus_quality['length_ratio']:.2f}, Natural={opus_quality['natural_score']:.2f}")
        print(f"\n{'─'*80}")
        print("Qwen2.5 Translation:")
        print(f" {qwen_result if qwen_result else '❌ Translation failed'}")
        print(f" Keywords: {qwen_found}/{len(test_case['expected_keywords'])} ({qwen_found/len(test_case['expected_keywords'])*100:.0f}%)")
        print(f" Found: {qwen_keywords}")
        print(f" Quality: Length ratio={qwen_quality['length_ratio']:.2f}, Natural={qwen_quality['natural_score']:.2f}")
        # Per-test comparison.
        print(f"\n{'─'*80}")
        print("Comparison:")
        if opus_found > qwen_found:
            print(f" ✅ OPUS-MT found more keywords ({opus_found} vs {qwen_found})")
        elif qwen_found > opus_found:
            print(f" ✅ Qwen2.5 found more keywords ({qwen_found} vs {opus_found})")
        else:
            print(f" ⚖️ Both found same number of keywords ({opus_found})")
        if qwen_quality['natural_score'] > opus_quality['natural_score']:
            print(f" ✅ Qwen2.5 has better naturalness ({qwen_quality['natural_score']:.2f} vs {opus_quality['natural_score']:.2f})")
        elif opus_quality['natural_score'] > qwen_quality['natural_score']:
            print(f" ✅ OPUS-MT has better naturalness ({opus_quality['natural_score']:.2f} vs {qwen_quality['natural_score']:.2f})")
        results.append({
            "test": i,
            "chinese": test_case['chinese'],
            "context": test_case['context'],
            "opus": {
                "translation": opus_result,
                "keywords_found": opus_found,
                "keywords_total": len(test_case['expected_keywords']),
                "quality": opus_quality
            },
            "qwen": {
                "translation": qwen_result,
                "keywords_found": qwen_found,
                "keywords_total": len(test_case['expected_keywords']),
                "quality": qwen_quality
            }
        })
    # Aggregate summary across all tests.
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    opus_total_keywords = sum(r['opus']['keywords_found'] for r in results)
    qwen_total_keywords = sum(r['qwen']['keywords_found'] for r in results)
    total_possible = sum(r['opus']['keywords_total'] for r in results)
    opus_avg_keywords = opus_total_keywords / len(results) if results else 0
    qwen_avg_keywords = qwen_total_keywords / len(results) if results else 0
    opus_avg_natural = sum(r['opus']['quality']['natural_score'] for r in results) / len(results) if results else 0
    qwen_avg_natural = sum(r['qwen']['quality']['natural_score'] for r in results) / len(results) if results else 0
    # Guard against division by zero when there are no keywords to find.
    if total_possible:
        opus_pct = opus_total_keywords / total_possible * 100
        qwen_pct = qwen_total_keywords / total_possible * 100
    else:
        opus_pct = qwen_pct = 0.0
    print(f"\nKeyword Matching:")
    print(f" OPUS-MT: {opus_total_keywords}/{total_possible} ({opus_pct:.1f}%) - Avg: {opus_avg_keywords:.1f} per test")
    print(f" Qwen2.5: {qwen_total_keywords}/{total_possible} ({qwen_pct:.1f}%) - Avg: {qwen_avg_keywords:.1f} per test")
    print(f"\nNaturalness Score:")
    print(f" OPUS-MT: {opus_avg_natural:.2f}")
    print(f" Qwen2.5: {qwen_avg_natural:.2f}")
    print(f"\nOverall Winner:")
    if qwen_total_keywords > opus_total_keywords and qwen_avg_natural > opus_avg_natural:
        print(" 🏆 Qwen2.5 wins on both metrics!")
    elif opus_total_keywords > qwen_total_keywords and opus_avg_natural > qwen_avg_natural:
        print(" 🏆 OPUS-MT wins on both metrics!")
    elif qwen_total_keywords > opus_total_keywords:
        print(" 🏆 Qwen2.5 wins on keyword matching, OPUS-MT wins on naturalness")
    elif opus_total_keywords > qwen_total_keywords:
        print(" 🏆 OPUS-MT wins on keyword matching, Qwen2.5 wins on naturalness")
    else:
        # Fix: a tie used to fall into the OPUS-MT branch above.
        print(" ⚖️ Keyword matching tied overall")
    return results
async def main():
    """Entry point: run the full comparison and print closing guidance.

    Swallows Ctrl-C with a notice; any other exception is printed with its
    traceback instead of propagating.
    """
    try:
        await compare_methods()
        print("\n" + "=" * 80)
        print("Comparison Complete!")
        print("=" * 80)
        print("\nReview the results above to decide which method to use.")
        print("Consider:")
        print(" - Translation quality (keyword matching, naturalness)")
        print(" - Speed (OPUS-MT is faster)")
        print(" - Resource usage (OPUS-MT uses less memory)")
        print(" - Context awareness (Qwen2.5 understands context better)")
    except KeyboardInterrupt:
        print("\n\n⚠️ Comparison interrupted by user")
    except Exception as e:
        print(f"\n\n❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    # Script entry point: run the async comparison to completion.
    asyncio.run(main())