# worship/compare_translation_methods.py
# Author: Peter Yang
# Commit 9720182: Improve Qwen2.5 prompting with chat template and optimized
# parameters, add detailed comparison analysis
#!/usr/bin/env python3
"""
Compare OPUS-MT vs Qwen2.5 LLM Translation
Run both methods on the same texts and compare quality
"""
import asyncio
import os
import sys
import logging
from pathlib import Path
# Add current directory to path so sibling modules (e.g.
# document_processing_agent) resolve when the script is run from elsewhere.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Set up logging: timestamped INFO-level messages to stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Test cases - same as before.
# Each case pairs a Chinese source sentence with English keywords a faithful
# translation should contain, plus a short label describing the register.
TEST_CASES = [
    {
        "chinese": "今天我们要学习神的话语,让我们一起来祷告。",
        "expected_keywords": ["today", "learn", "word", "god", "pray"],
        "context": "Religious/formal language"
    },
    {
        "chinese": "感谢主,让我们能够聚集在一起敬拜。",
        "expected_keywords": ["thank", "lord", "gather", "worship"],
        "context": "Worship context"
    },
    {
        "chinese": "我们要为教会的事工祷告,求神赐福。",
        "expected_keywords": ["church", "ministry", "pray", "bless"],
        "context": "Church ministry"
    },
    {
        "chinese": "这段经文告诉我们,神爱世人,甚至将他的独生子赐给他们。",
        "expected_keywords": ["scripture", "god", "love", "world", "son"],
        "context": "Biblical reference"
    },
    {
        "chinese": "耶稣说:'我就是道路、真理、生命。'",
        "expected_keywords": ["jesus", "way", "truth", "life"],
        "context": "Biblical quote"
    }
]
async def translate_with_opus_mt(text: str):
    """Translate Chinese text to English via OPUS-MT (the current pipeline).

    Spins up a DocumentProcessingAgent pointed at the local service and
    delegates to its zh->en translator. Returns the translation string, or
    None when the agent is unavailable or raises.
    """
    try:
        from document_processing_agent import DocumentProcessingAgent

        agent = DocumentProcessingAgent("http://localhost:8080")
        translated = await agent._translate_text(text, 'zh', 'en')
    except Exception as exc:
        logger.error(f"OPUS-MT error: {exc}")
        return None
    return translated
def _extract_qwen_response(full_response: str, prompt: str) -> str:
    """Pull the assistant's reply out of a decoded Qwen chat transcript.

    Tries, in order: the explicit <|im_start|>assistant marker, any
    case-insensitive "assistant" marker, an "English:" label from the
    fallback prompt, and finally whatever follows the prompt text.
    Always returns a string (the old inline logic could leave the result
    unbound and raise NameError when the marker was capitalized).
    """
    # Qwen2.5 chat format: <|im_start|>assistant\n{translation}<|im_end|>
    # NOTE(review): decode uses skip_special_tokens=True, so this branch may
    # rarely fire in practice — kept for robustness.
    if "<|im_start|>assistant" in full_response:
        reply = full_response.split("<|im_start|>assistant")[-1].strip()
        return reply.split("<|im_end|>")[0].strip()
    lowered = full_response.lower()
    if "assistant" in lowered:
        # Fix: the old code checked case-insensitively but split
        # case-sensitively, so "Assistant" left `translation` undefined.
        # Take everything after the LAST occurrence, matching any casing.
        idx = lowered.rindex("assistant")
        return full_response[idx + len("assistant"):].strip()
    if "English:" in full_response:
        return full_response.split("English:")[-1].strip()
    if len(full_response) > len(prompt):
        return full_response[len(prompt):].strip()
    return full_response.strip()


def _clean_translation(translation: str) -> str:
    """Aggressively clean a raw model reply down to one translated sentence.

    Cuts at common continuation markers, clamps to the first sentence
    (preserving a fully quoted sentence plus its trailing punctuation),
    drops trailing ';'/':' and strips a matching outer quote pair.
    """
    # Stop at common continuation markers the model sometimes appends.
    stop_markers = [
        "\n\n", "Chinese:", "English:", "Human:", "User:", "翻译", "Translation:",
        "The translation", "Here is", "Note:", "If you", "You are", "I am"
    ]
    for marker in stop_markers:
        if marker in translation:
            translation = translation.split(marker)[0].strip()
    # Keep only the first sentence. Quoted sentences (like "I am the way,
    # the truth, the life.") are kept whole, including punctuation after
    # the closing quote.
    if translation.count('"') >= 2:
        first_quote = translation.find('"')
        second_quote = translation.find('"', first_quote + 1)
        if second_quote > first_quote:
            if second_quote + 1 < len(translation) and translation[second_quote + 1] in ['.', '!', '?']:
                translation = translation[:second_quote + 2].strip()
            else:
                translation = translation[:second_quote + 1].strip()
    else:
        # Regular sentence: cut at the earliest ., ! or ? (ignoring index 0).
        sentence_ends = [
            i for i in (translation.find('.'), translation.find('!'), translation.find('?'))
            if i > 0
        ]
        if sentence_ends:
            translation = translation[:min(sentence_ends) + 1].strip()
    # Drop dangling clause punctuation but keep sentence-ending punctuation.
    translation = translation.rstrip(';:')
    # Remove outer quotes if the entire translation is wrapped in them.
    if len(translation) > 2:
        if translation.startswith('"') and translation.endswith('"'):
            translation = translation[1:-1].strip()
        elif translation.startswith("'") and translation.endswith("'"):
            translation = translation[1:-1].strip()
    return translation


async def translate_with_qwen(text: str):
    """Translate Chinese text to English with the Qwen2.5-1.5B-Instruct LLM.

    The tokenizer/model are loaded lazily on first call and cached as
    attributes on this function, so subsequent calls reuse them. Returns the
    cleaned English translation, or None on failure or when the cleaned
    output is implausibly short (<= 5 characters).
    """
    try:
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import platform

        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
        # Load model once; cache on the function object for later calls.
        if not hasattr(translate_with_qwen, 'model'):
            logger.info("Loading Qwen2.5 model (first time may take a while)...")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            # Force CPU/float32 on macOS; elsewhere let HF place the model and
            # use half precision when CUDA is available.
            if platform.system() == "Darwin":
                device_map = "cpu"
                torch_dtype = torch.float32
            else:
                device_map = "auto"
                torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                device_map=device_map
            )
            model.eval()
            translate_with_qwen.model = model
            translate_with_qwen.tokenizer = tokenizer
            translate_with_qwen.device = "cpu" if platform.system() == "Darwin" else ("cuda" if torch.cuda.is_available() else "cpu")
            logger.info("✅ Qwen2.5 model loaded")

        model = translate_with_qwen.model
        tokenizer = translate_with_qwen.tokenizer
        device = translate_with_qwen.device

        # Use Qwen2.5's chat template for better results.
        messages = [
            {
                "role": "system",
                "content": "You are a professional translator specializing in Christian religious texts. Translate Chinese to English accurately. Output only the English translation, nothing else."
            },
            {
                "role": "user",
                "content": f"Translate this Chinese text to English:\n\n{text}"
            }
        ]
        try:
            prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        except Exception:
            # Fix: was a bare `except:` (also caught KeyboardInterrupt/SystemExit).
            # Fallback plain prompt if the tokenizer lacks a chat template.
            prompt = f"""Translate this Chinese text to English. Output only the translation.
Chinese: {text}
English:"""

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
        model = model.to(device)
        # Pick a token id generate() can use for both padding and EOS.
        eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,      # enough for complete sentences
                temperature=0.1,         # low temperature → near-greedy sampling
                do_sample=True,
                top_p=0.9,               # nucleus sampling
                top_k=40,                # limit to top 40 tokens
                repetition_penalty=1.2,  # penalty to avoid repetition
                pad_token_id=eos_token_id,
                eos_token_id=eos_token_id,
                no_repeat_ngram_size=2,  # avoid repeating 2-grams
            )
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translation = _clean_translation(_extract_qwen_response(full_response, prompt))
        # Reject empty / implausibly short output so callers see a clear failure.
        return translation if translation and len(translation) > 5 else None
    except Exception as e:
        logger.error(f"Qwen2.5 error: {e}", exc_info=True)
        return None
def check_keywords(translation: str, expected_keywords: list) -> tuple:
    """Count the expected keywords present in a translation.

    Returns a (count, matched_keywords) pair; (0, []) when the translation
    is empty or None. Matching is case-insensitive substring containment.
    """
    if not translation:
        return 0, []
    lowered = translation.lower()
    matched = []
    for keyword in expected_keywords:
        if keyword in lowered:
            matched.append(keyword)
    return len(matched), matched
def rate_translation_quality(translation: str, chinese: str) -> dict:
    """Score a translation with cheap heuristics.

    Returns a dict with: character-length ratio vs the Chinese source
    (English is typically longer), whether any punctuation appears, the
    word count, and "natural_score" — the fraction of words that are
    common English function words, a crude fluency proxy.
    """
    if not translation:
        # Missing translation → all-zero metrics.
        return {
            "length_ratio": 0,
            "has_punctuation": False,
            "word_count": 0,
            "natural_score": 0
        }

    ratio = len(translation) / len(chinese) if chinese else 0
    punctuated = any(mark in translation for mark in ['.', '!', '?', ','])
    n_words = len(translation.split())

    common_words = ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'and', 'or',
                    'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with']
    hits = sum(1 for token in translation.lower().split() if token in common_words)
    natural = hits / max(n_words, 1)

    return {
        "length_ratio": ratio,
        "has_punctuation": punctuated,
        "word_count": n_words,
        "natural_score": natural
    }
async def compare_methods():
    """Run every TEST_CASE through both translators and print a comparison.

    For each case: translate with OPUS-MT and Qwen2.5, score keyword
    coverage and heuristic quality, print a side-by-side report, then print
    an aggregate summary. Returns the list of per-test result dicts.

    Fixes vs the previous version: a tie in total keyword matches is now
    reported as a tie (it used to be misreported as an OPUS-MT win), and
    the percentage math is guarded against an empty test set.
    """
    print("\n" + "=" * 80)
    print("Translation Method Comparison: OPUS-MT vs Qwen2.5")
    print("=" * 80)
    print()
    results = []
    for i, test_case in enumerate(TEST_CASES, 1):
        print(f"\n{'='*80}")
        print(f"Test {i}: {test_case['context']}")
        print(f"{'='*80}")
        print(f"Chinese: {test_case['chinese']}")
        print()
        # Translate with both methods.
        print("Translating with OPUS-MT...")
        opus_result = await translate_with_opus_mt(test_case['chinese'])
        print("Translating with Qwen2.5...")
        qwen_result = await translate_with_qwen(test_case['chinese'])
        # Score keyword coverage and heuristic quality for both.
        opus_found, opus_keywords = check_keywords(opus_result, test_case['expected_keywords'])
        qwen_found, qwen_keywords = check_keywords(qwen_result, test_case['expected_keywords'])
        opus_quality = rate_translation_quality(opus_result, test_case['chinese'])
        qwen_quality = rate_translation_quality(qwen_result, test_case['chinese'])
        # Display results.
        print(f"\n{'─'*80}")
        print("OPUS-MT Translation:")
        print(f" {opus_result if opus_result else '❌ Translation failed'}")
        print(f" Keywords: {opus_found}/{len(test_case['expected_keywords'])} ({opus_found/len(test_case['expected_keywords'])*100:.0f}%)")
        print(f" Found: {opus_keywords}")
        print(f" Quality: Length ratio={opus_quality['length_ratio']:.2f}, Natural={opus_quality['natural_score']:.2f}")
        print(f"\n{'─'*80}")
        print("Qwen2.5 Translation:")
        print(f" {qwen_result if qwen_result else '❌ Translation failed'}")
        print(f" Keywords: {qwen_found}/{len(test_case['expected_keywords'])} ({qwen_found/len(test_case['expected_keywords'])*100:.0f}%)")
        print(f" Found: {qwen_keywords}")
        print(f" Quality: Length ratio={qwen_quality['length_ratio']:.2f}, Natural={qwen_quality['natural_score']:.2f}")
        # Per-test comparison.
        print(f"\n{'─'*80}")
        print("Comparison:")
        if opus_found > qwen_found:
            print(f" ✅ OPUS-MT found more keywords ({opus_found} vs {qwen_found})")
        elif qwen_found > opus_found:
            print(f" ✅ Qwen2.5 found more keywords ({qwen_found} vs {opus_found})")
        else:
            print(f" ⚖️ Both found same number of keywords ({opus_found})")
        if qwen_quality['natural_score'] > opus_quality['natural_score']:
            print(f" ✅ Qwen2.5 has better naturalness ({qwen_quality['natural_score']:.2f} vs {opus_quality['natural_score']:.2f})")
        elif opus_quality['natural_score'] > qwen_quality['natural_score']:
            print(f" ✅ OPUS-MT has better naturalness ({opus_quality['natural_score']:.2f} vs {qwen_quality['natural_score']:.2f})")
        results.append({
            "test": i,
            "chinese": test_case['chinese'],
            "context": test_case['context'],
            "opus": {
                "translation": opus_result,
                "keywords_found": opus_found,
                "keywords_total": len(test_case['expected_keywords']),
                "quality": opus_quality
            },
            "qwen": {
                "translation": qwen_result,
                "keywords_found": qwen_found,
                "keywords_total": len(test_case['expected_keywords']),
                "quality": qwen_quality
            }
        })
    # Aggregate summary across all tests.
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    opus_total_keywords = sum(r['opus']['keywords_found'] for r in results)
    qwen_total_keywords = sum(r['qwen']['keywords_found'] for r in results)
    total_possible = sum(r['opus']['keywords_total'] for r in results)
    opus_avg_keywords = opus_total_keywords / len(results) if results else 0
    qwen_avg_keywords = qwen_total_keywords / len(results) if results else 0
    opus_avg_natural = sum(r['opus']['quality']['natural_score'] for r in results) / len(results) if results else 0
    qwen_avg_natural = sum(r['qwen']['quality']['natural_score'] for r in results) / len(results) if results else 0
    # Guard against division by zero when there are no keywords to find.
    if total_possible:
        opus_pct = opus_total_keywords / total_possible * 100
        qwen_pct = qwen_total_keywords / total_possible * 100
    else:
        opus_pct = qwen_pct = 0.0
    print(f"\nKeyword Matching:")
    print(f" OPUS-MT: {opus_total_keywords}/{total_possible} ({opus_pct:.1f}%) - Avg: {opus_avg_keywords:.1f} per test")
    print(f" Qwen2.5: {qwen_total_keywords}/{total_possible} ({qwen_pct:.1f}%) - Avg: {qwen_avg_keywords:.1f} per test")
    print(f"\nNaturalness Score:")
    print(f" OPUS-MT: {opus_avg_natural:.2f}")
    print(f" Qwen2.5: {qwen_avg_natural:.2f}")
    print(f"\nOverall Winner:")
    if qwen_total_keywords > opus_total_keywords and qwen_avg_natural > opus_avg_natural:
        print(" 🏆 Qwen2.5 wins on both metrics!")
    elif opus_total_keywords > qwen_total_keywords and opus_avg_natural > qwen_avg_natural:
        print(" 🏆 OPUS-MT wins on both metrics!")
    elif qwen_total_keywords > opus_total_keywords:
        print(" 🏆 Qwen2.5 wins on keyword matching, OPUS-MT wins on naturalness")
    elif opus_total_keywords > qwen_total_keywords:
        print(" 🏆 OPUS-MT wins on keyword matching, Qwen2.5 wins on naturalness")
    else:
        # Fix: a tie used to fall into the OPUS-MT branch above.
        print(" ⚖️ Keyword matching tied overall")
    return results
async def main():
    """Entry point: run the full comparison and print closing guidance.

    Swallows Ctrl-C with a notice; any other exception is printed with its
    traceback instead of propagating.
    """
    try:
        await compare_methods()
        print("\n" + "=" * 80)
        print("Comparison Complete!")
        print("=" * 80)
        print("\nReview the results above to decide which method to use.")
        print("Consider:")
        print(" - Translation quality (keyword matching, naturalness)")
        print(" - Speed (OPUS-MT is faster)")
        print(" - Resource usage (OPUS-MT uses less memory)")
        print(" - Context awareness (Qwen2.5 understands context better)")
    except KeyboardInterrupt:
        print("\n\n⚠️ Comparison interrupted by user")
    except Exception as e:
        print(f"\n\n❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    # Script entry point: run the async comparison to completion.
    asyncio.run(main())