# Provenance (Hugging Face Space listing metadata, kept for reference):
#   Space status: Sleeping · Author: Peter Yang
#   Commit: "Improve Qwen2.5 prompting with chat template and optimized
#   parameters, add detailed comparison analysis"
#!/usr/bin/env python3
"""
Compare OPUS-MT vs Qwen2.5 LLM Translation
Run both methods on the same texts and compare quality
"""
import asyncio
import os
import sys
import logging
from pathlib import Path  # NOTE(review): currently unused — verify before removing
# Add current directory to path so sibling modules
# (e.g. document_processing_agent) resolve when run as a script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# Set up logging: timestamped INFO-level messages to stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger, shared by both translation backends below.
logger = logging.getLogger(__name__)
# Test cases - same as before.
# Each entry: a Chinese sentence from a church-service register,
# the lowercase English keywords a good translation should contain
# (scored by check_keywords), and a human-readable context label.
TEST_CASES = [
    {
        "chinese": "今天我们要学习神的话语,让我们一起来祷告。",
        "expected_keywords": ["today", "learn", "word", "god", "pray"],
        "context": "Religious/formal language"
    },
    {
        "chinese": "感谢主,让我们能够聚集在一起敬拜。",
        "expected_keywords": ["thank", "lord", "gather", "worship"],
        "context": "Worship context"
    },
    {
        "chinese": "我们要为教会的事工祷告,求神赐福。",
        "expected_keywords": ["church", "ministry", "pray", "bless"],
        "context": "Church ministry"
    },
    {
        "chinese": "这段经文告诉我们,神爱世人,甚至将他的独生子赐给他们。",
        "expected_keywords": ["scripture", "god", "love", "world", "son"],
        "context": "Biblical reference"
    },
    {
        "chinese": "耶稣说:'我就是道路、真理、生命。'",
        "expected_keywords": ["jesus", "way", "truth", "life"],
        "context": "Biblical quote"
    }
]
async def translate_with_opus_mt(text: str):
    """Translate using OPUS-MT (current method)"""
    try:
        # Imported lazily so the module loads even if the agent is absent.
        from document_processing_agent import DocumentProcessingAgent

        agent = DocumentProcessingAgent("http://localhost:8080")
        return await agent._translate_text(text, 'zh', 'en')
    except Exception as e:
        logger.error(f"OPUS-MT error: {e}")
        return None
async def translate_with_qwen(text: str):
    """Translate Chinese to English using the Qwen2.5-1.5B-Instruct LLM.

    The tokenizer/model are loaded lazily on the first call and cached as
    attributes on this function, so later calls skip the (slow) load.

    Args:
        text: Chinese source text to translate.

    Returns:
        The cleaned English translation string, or None when generation
        fails or produces a translation of 5 characters or fewer.
    """
    try:
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import platform

        model_name = "Qwen/Qwen2.5-1.5B-Instruct"

        # Load model once; cache it on the function object.
        if not hasattr(translate_with_qwen, 'model'):
            logger.info("Loading Qwen2.5 model (first time may take a while)...")
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Force CPU on macOS; elsewhere let HF place the model, using
            # fp16 only when CUDA is available.
            if platform.system() == "Darwin":
                device_map = "cpu"
                torch_dtype = torch.float32
            else:
                device_map = "auto"
                torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                device_map=device_map
            )
            model.eval()

            translate_with_qwen.model = model
            translate_with_qwen.tokenizer = tokenizer
            translate_with_qwen.device = "cpu" if platform.system() == "Darwin" else ("cuda" if torch.cuda.is_available() else "cpu")
            logger.info("✅ Qwen2.5 model loaded")

        model = translate_with_qwen.model
        tokenizer = translate_with_qwen.tokenizer
        device = translate_with_qwen.device

        # Use Qwen2.5's chat template (system + user turns) for better results.
        messages = [
            {
                "role": "system",
                "content": "You are a professional translator specializing in Christian religious texts. Translate Chinese to English accurately. Output only the English translation, nothing else."
            },
            {
                "role": "user",
                "content": f"Translate this Chinese text to English:\n\n{text}"
            }
        ]

        try:
            prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Fallback plain prompt when the
            # tokenizer ships no chat template.
            prompt = f"""Translate this Chinese text to English. Output only the translation.
Chinese: {text}
English:"""

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
        model = model.to(device)

        # Prefer EOS for padding; fall back to PAD if the tokenizer lacks EOS.
        eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,      # Enough for complete sentences
                temperature=0.1,         # Very low temperature for near-deterministic output
                do_sample=True,
                top_p=0.9,               # Nucleus sampling
                top_k=40,                # Limit to top 40 tokens
                repetition_penalty=1.2,  # Penalty to avoid repetition
                pad_token_id=eos_token_id,
                eos_token_id=eos_token_id,
                no_repeat_ngram_size=2,  # Avoid repeating 2-grams
            )

        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translation = _extract_qwen_translation(full_response, prompt)
        translation = _clean_qwen_translation(translation)

        return translation if translation and len(translation) > 5 else None
    except Exception as e:
        logger.error(f"Qwen2.5 error: {e}", exc_info=True)
        return None


def _extract_qwen_translation(full_response: str, prompt: str) -> str:
    """Extract the assistant reply from the raw decoded generation.

    Tries, in order: the Qwen2.5 chat markers
    (<|im_start|>assistant ... <|im_end|>), a loose "assistant" marker,
    the plain-prompt "English:" label, and finally stripping the prompt
    prefix from the response.
    """
    if "<|im_start|>assistant" in full_response:
        translation = full_response.split("<|im_start|>assistant")[-1].strip()
        return translation.split("<|im_end|>")[0].strip()
    if "assistant" in full_response.lower():
        # BUGFIX: the original checked case-insensitively but then split
        # case-sensitively, leaving `translation` unbound (NameError) when
        # the response contained e.g. "Assistant". Locate the last
        # occurrence case-insensitively instead.
        idx = full_response.lower().rfind("assistant")
        return full_response[idx + len("assistant"):].strip()
    if "English:" in full_response:
        return full_response.split("English:")[-1].strip()
    if len(full_response) > len(prompt):
        return full_response[len(prompt):].strip()
    return full_response.strip()


def _clean_qwen_translation(translation: str) -> str:
    """Aggressively trim model chatter down to one clean sentence.

    Cuts at continuation markers, keeps only the first sentence (with
    special handling for fully quoted sentences), and strips wrapping
    quotes / trailing ';:'.
    """
    # Stop at common continuation markers.
    # NOTE(review): "I am" / "You are" can also truncate legitimate
    # translations (e.g. "I am the way...") — confirm this is intended.
    stop_markers = [
        "\n\n", "Chinese:", "English:", "Human:", "User:", "翻译", "Translation:",
        "The translation", "Here is", "Note:", "If you", "You are", "I am"
    ]
    for marker in stop_markers:
        if marker in translation:
            translation = translation.split(marker)[0].strip()

    if translation.count('"') >= 2:
        # Complete quoted sentence: keep through the closing quote,
        # plus one trailing sentence-ending punctuation mark if present.
        first_quote = translation.find('"')
        second_quote = translation.find('"', first_quote + 1)
        if second_quote > first_quote:
            if second_quote + 1 < len(translation) and translation[second_quote + 1] in ['.', '!', '?']:
                translation = translation[:second_quote + 2].strip()
            else:
                translation = translation[:second_quote + 1].strip()
    else:
        # Regular sentence: cut at the earliest of '.', '!', '?'.
        sentence_ends = [i for i in (translation.find('.'), translation.find('!'), translation.find('?')) if i > 0]
        if sentence_ends:
            translation = translation[:min(sentence_ends) + 1].strip()

    # Drop stray trailing ';' / ':' (sentence-ending punctuation is kept).
    translation = translation.rstrip(';:')

    # Remove outer quotes when the entire translation is quoted.
    if len(translation) > 2:
        if translation.startswith('"') and translation.endswith('"'):
            translation = translation[1:-1].strip()
        elif translation.startswith("'") and translation.endswith("'"):
            translation = translation[1:-1].strip()

    return translation
def check_keywords(translation: str, expected_keywords: list) -> tuple:
    """Count which of the expected keywords occur in the translation.

    Returns a (count, found_keywords) tuple; (0, []) when the
    translation is missing or empty.
    """
    if not translation:
        return 0, []
    lowered = translation.lower()
    hits = []
    for keyword in expected_keywords:
        if keyword in lowered:
            hits.append(keyword)
    return len(hits), hits
def rate_translation_quality(translation: str, chinese: str) -> dict:
    """Score a translation on cheap heuristic metrics.

    Returns a dict with:
      length_ratio   -- len(translation) / len(chinese) (English is
                        typically longer than Chinese); 0 on empty input
      has_punctuation -- whether any of .!?, appears
      word_count     -- whitespace-separated token count
      natural_score  -- fraction of tokens that are common English words
    """
    if not translation:
        return {
            "length_ratio": 0,
            "has_punctuation": False,
            "word_count": 0,
            "natural_score": 0
        }

    length_ratio = len(translation) / len(chinese) if chinese else 0
    has_punctuation = any(ch in translation for ch in '.!?,')
    word_count = len(translation.split())

    # Heuristic naturalness: share of tokens that are frequent English words.
    common_words = ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with']
    tokens = translation.lower().split()
    natural_score = sum(1 for token in tokens if token in common_words) / max(word_count, 1)

    return {
        "length_ratio": length_ratio,
        "has_punctuation": has_punctuation,
        "word_count": word_count,
        "natural_score": natural_score
    }
async def compare_methods():
    """Run OPUS-MT and Qwen2.5 on every TEST_CASE and print a comparison.

    For each case: translates with both backends, scores keyword coverage
    (check_keywords) and heuristic quality (rate_translation_quality),
    prints a per-test comparison, then a summary with an overall verdict.

    Returns:
        A list of per-test result dicts with both methods' translations
        and scores.
    """
    print("\n" + "=" * 80)
    print("Translation Method Comparison: OPUS-MT vs Qwen2.5")
    print("=" * 80)
    print()
    results = []
    for i, test_case in enumerate(TEST_CASES, 1):
        print(f"\n{'='*80}")
        print(f"Test {i}: {test_case['context']}")
        print(f"{'='*80}")
        print(f"Chinese: {test_case['chinese']}")
        print()
        # Translate with both methods (each returns None on failure)
        print("Translating with OPUS-MT...")
        opus_result = await translate_with_opus_mt(test_case['chinese'])
        print("Translating with Qwen2.5...")
        qwen_result = await translate_with_qwen(test_case['chinese'])
        # Check keywords
        opus_found, opus_keywords = check_keywords(opus_result, test_case['expected_keywords'])
        qwen_found, qwen_keywords = check_keywords(qwen_result, test_case['expected_keywords'])
        # Rate quality
        opus_quality = rate_translation_quality(opus_result, test_case['chinese'])
        qwen_quality = rate_translation_quality(qwen_result, test_case['chinese'])
        # Display results
        print(f"\n{'─'*80}")
        print("OPUS-MT Translation:")
        print(f" {opus_result if opus_result else '❌ Translation failed'}")
        print(f" Keywords: {opus_found}/{len(test_case['expected_keywords'])} ({opus_found/len(test_case['expected_keywords'])*100:.0f}%)")
        print(f" Found: {opus_keywords}")
        print(f" Quality: Length ratio={opus_quality['length_ratio']:.2f}, Natural={opus_quality['natural_score']:.2f}")
        print(f"\n{'─'*80}")
        print("Qwen2.5 Translation:")
        print(f" {qwen_result if qwen_result else '❌ Translation failed'}")
        print(f" Keywords: {qwen_found}/{len(test_case['expected_keywords'])} ({qwen_found/len(test_case['expected_keywords'])*100:.0f}%)")
        print(f" Found: {qwen_keywords}")
        print(f" Quality: Length ratio={qwen_quality['length_ratio']:.2f}, Natural={qwen_quality['natural_score']:.2f}")
        # Compare the two methods head-to-head on keywords and naturalness
        print(f"\n{'─'*80}")
        print("Comparison:")
        if opus_found > qwen_found:
            print(f" ✅ OPUS-MT found more keywords ({opus_found} vs {qwen_found})")
        elif qwen_found > opus_found:
            print(f" ✅ Qwen2.5 found more keywords ({qwen_found} vs {opus_found})")
        else:
            print(f" ⚖️ Both found same number of keywords ({opus_found})")
        # NOTE: an exact naturalness tie prints nothing here
        if qwen_quality['natural_score'] > opus_quality['natural_score']:
            print(f" ✅ Qwen2.5 has better naturalness ({qwen_quality['natural_score']:.2f} vs {opus_quality['natural_score']:.2f})")
        elif opus_quality['natural_score'] > qwen_quality['natural_score']:
            print(f" ✅ OPUS-MT has better naturalness ({opus_quality['natural_score']:.2f} vs {qwen_quality['natural_score']:.2f})")
        results.append({
            "test": i,
            "chinese": test_case['chinese'],
            "context": test_case['context'],
            "opus": {
                "translation": opus_result,
                "keywords_found": opus_found,
                "keywords_total": len(test_case['expected_keywords']),
                "quality": opus_quality
            },
            "qwen": {
                "translation": qwen_result,
                "keywords_found": qwen_found,
                "keywords_total": len(test_case['expected_keywords']),
                "quality": qwen_quality
            }
        })
    # Summary: aggregate keyword hits and average naturalness across tests
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    opus_total_keywords = sum(r['opus']['keywords_found'] for r in results)
    qwen_total_keywords = sum(r['qwen']['keywords_found'] for r in results)
    total_possible = sum(r['opus']['keywords_total'] for r in results)
    opus_avg_keywords = opus_total_keywords / len(results) if results else 0
    qwen_avg_keywords = qwen_total_keywords / len(results) if results else 0
    opus_avg_natural = sum(r['opus']['quality']['natural_score'] for r in results) / len(results) if results else 0
    qwen_avg_natural = sum(r['qwen']['quality']['natural_score'] for r in results) / len(results) if results else 0
    print(f"\nKeyword Matching:")
    print(f" OPUS-MT: {opus_total_keywords}/{total_possible} ({opus_total_keywords/total_possible*100:.1f}%) - Avg: {opus_avg_keywords:.1f} per test")
    print(f" Qwen2.5: {qwen_total_keywords}/{total_possible} ({qwen_total_keywords/total_possible*100:.1f}%) - Avg: {qwen_avg_keywords:.1f} per test")
    print(f"\nNaturalness Score:")
    print(f" OPUS-MT: {opus_avg_natural:.2f}")
    print(f" Qwen2.5: {qwen_avg_natural:.2f}")
    print(f"\nOverall Winner:")
    # NOTE(review): the final else also covers exact ties, which it
    # reports as OPUS-MT winning keywords — confirm this is acceptable.
    if qwen_total_keywords > opus_total_keywords and qwen_avg_natural > opus_avg_natural:
        print(" 🏆 Qwen2.5 wins on both metrics!")
    elif opus_total_keywords > qwen_total_keywords and opus_avg_natural > qwen_avg_natural:
        print(" 🏆 OPUS-MT wins on both metrics!")
    elif qwen_total_keywords > opus_total_keywords:
        print(" 🏆 Qwen2.5 wins on keyword matching, OPUS-MT wins on naturalness")
    else:
        print(" 🏆 OPUS-MT wins on keyword matching, Qwen2.5 wins on naturalness")
    return results
async def main():
    """Entry point: run the comparison and print guidance on which method to pick."""
    try:
        await compare_methods()  # results are already printed by compare_methods

        separator = "=" * 80
        print("\n" + separator)
        print("Comparison Complete!")
        print(separator)
        print("\nReview the results above to decide which method to use.")
        print("Consider:")
        for consideration in (
            " - Translation quality (keyword matching, naturalness)",
            " - Speed (OPUS-MT is faster)",
            " - Resource usage (OPUS-MT uses less memory)",
            " - Context awareness (Qwen2.5 understands context better)",
        ):
            print(consideration)
    except KeyboardInterrupt:
        print("\n\n⚠️ Comparison interrupted by user")
    except Exception as e:
        print(f"\n\n❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()
# Script entry: drive the async comparison with asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())