Spaces:
Runtime error
Runtime error
| import json | |
| import re | |
| from pathlib import Path | |
| from anthropic import Anthropic | |
| from glossary_checker import GlossaryChecker | |
| class TranslationValidator: | |
| def __init__(self, glossary_checker, anthropic_api_key): | |
| """Initialize validator with glossary checker and API key.""" | |
| self.checker = glossary_checker | |
| self.client = Anthropic(api_key=anthropic_api_key) | |
| def analyze_terms(self, source_text, target_text, found_terms): | |
| """Analyze terms using Claude to assess their usage and translation in context.""" | |
| if not found_terms: | |
| return [] | |
| prompt = f"""Analyze each term found in this Tibetan text and its translation: | |
| Tibetan text: {source_text} | |
| English translation: {target_text} | |
| For each term, I'll provide: | |
| - The term | |
| - Expected translations from glossary | |
| Please analyze:""" | |
| # Add term details to prompt | |
| for term in found_terms: | |
| prompt += f"\n\nTerm: {term['source_term']}" | |
| for cat_name, cat_data in term['categories'].items(): | |
| prompt += f"\nCategory '{cat_name}':" | |
| prompt += f"\n- Expected translations: {', '.join(cat_data['translations'])}" | |
| if 'definitions' in cat_data: | |
| prompt += f"\n- Definitions: {', '.join(cat_data['definitions'])}" | |
| prompt += """\n | |
| For each term, provide analysis in JSON format: | |
| [{ | |
| "term": "term1", | |
| "analysis": { | |
| "translated_as": "how it appears in the target translation", | |
| "glossary_translation": "how it should be translated according to the glossary", | |
| "matching_categories": ["category1", "category2"], | |
| "translation_assessment": { | |
| "translated_correctly": true/false, | |
| "should_be_counted": true/false | |
| } | |
| } | |
| }] | |
| Key points for analysis: | |
| 2. should_be_counted: true if the term's usage matches any of the glossary definitions | |
| 3. glossary_translation: choose the most appropriate translation from glossary based on the context and definitions | |
| 4. Consider both the definitions and provided translations when analyzing the term's usage | |
| 5. translated_correctly: true if the term matches the glossary definition with these specific conditions: | |
| 5.1. If the Tibetan term is translated with an English word that differs from the glossary's Sanskrit/English term: | |
| - NOT correct, even if semantically equivalent | |
| Example: | |
| - ལུང་། translated as "scriptures" but glossary shows "Āgama" → incorrect | |
| - རྒྱུད། translated as "continuum" but glossary shows "tantra" → incorrect | |
| 5.2. If the Tibetan term is translated with the same word as in glossary but with grammatical variations: | |
| - Correct if only differs in: | |
| * Singular/plural forms (sugata/sugatas) | |
| * Case variations (buddha/buddha's) | |
| * Common derived forms (dharma/dharmic) | |
| Example: | |
| - བདེ་གཤེགས། translated as "sugatas" with glossary showing "sugata" → correct | |
| - སངས་རྒྱས། translated as "buddha's" with glossary showing "buddha" → correct | |
| 5.3 The translation must use the exact word given in the glossary (allowing only for basic grammatical variations) rather than synonyms or semantic equivalents.""" | |
| try: | |
| message = self.client.messages.create( | |
| model="claude-3-sonnet-20240229", | |
| max_tokens=2000, | |
| messages=[{"role": "user", "content": prompt}], | |
| ) | |
| json_match = re.search(r"\[.*\]", message.content[0].text, re.DOTALL) | |
| if not json_match: | |
| return [] | |
| analysis = json.loads(json_match.group()) | |
| # Add analysis to each term | |
| analyzed_terms = [] | |
| for term in found_terms: | |
| for item in analysis: | |
| if item["term"] == term["source_term"]: | |
| # Preserve original term data and add analysis | |
| analyzed_term = { | |
| "source_term": term["source_term"], | |
| "categories": {}, # Keep original categories | |
| "analysis": item["analysis"] | |
| } | |
| # Only include matching categories | |
| for cat_name, cat_data in term["categories"].items(): | |
| if cat_name in item["analysis"]["matching_categories"]: | |
| analyzed_term["categories"][cat_name] = cat_data | |
| analyzed_terms.append(analyzed_term) | |
| break | |
| return analyzed_terms | |
| except (json.JSONDecodeError, KeyError) as e: | |
| print(f"Error parsing LLM response: {e}") | |
| return [] | |
| def calculate_translation_score(self, found_terms): | |
| """Calculate translation score based on correct translations.""" | |
| if not found_terms: | |
| return 0.0 | |
| total_countable_terms = 0 | |
| correctly_translated = 0 | |
| for term in found_terms: | |
| analysis = term["analysis"] | |
| assessment = analysis["translation_assessment"] | |
| # Only count terms that should be counted and match glossary | |
| if assessment["should_be_counted"]: | |
| total_countable_terms += 1 | |
| if assessment["translated_correctly"]: | |
| correctly_translated += 1 | |
| return (correctly_translated / total_countable_terms * 100) if total_countable_terms > 0 else 100.0 | |
| def validate_translation(self, aligned_file_path): | |
| """Process aligned file and validate translations.""" | |
| aligned_pairs = self.load_aligned_file(aligned_file_path) | |
| results = [] | |
| for line_num, (source, target) in enumerate(aligned_pairs, 1): | |
| # Check against glossary | |
| check_results = self.checker.check(source, target) | |
| # Analyze terms | |
| analyzed_terms = self.analyze_terms(source, target, check_results) | |
| # Calculate score | |
| score = self.calculate_translation_score(analyzed_terms) | |
| results.append({ | |
| "line_number": line_num, | |
| "source": source, | |
| "target": target, | |
| "terms": analyzed_terms, | |
| "score": score, | |
| }) | |
| return results | |
| def load_aligned_file(self, file_path): | |
| """Load tab-separated source and target segments.""" | |
| aligned_pairs = [] | |
| with open(file_path, "r", encoding="utf-8") as f: | |
| for line in f: | |
| line = line.strip() | |
| if not line: | |
| continue | |
| parts = line.split("\t") | |
| if len(parts) != 2: | |
| print(f"Warning: Skipping malformed line: {line}") | |
| continue | |
| source, target = parts | |
| aligned_pairs.append((source.strip(), target.strip())) | |
| return aligned_pairs | |
| def save_results(self, results, output_path): | |
| """Save validation results to JSON file.""" | |
| with open(output_path, "w", encoding="utf-8") as f: | |
| json.dump( | |
| { | |
| "summary": { | |
| "total_lines": len(results), | |
| "average_score": ( | |
| sum(r["score"] for r in results) / len(results) | |
| if results | |
| else 0 | |
| ), | |
| }, | |
| "lines": results, | |
| }, | |
| f, | |
| ensure_ascii=False, | |
| indent=2, | |
| ) | |
| # Example usage: | |
| if __name__ == "__main__": | |
| import os | |
| data_path = Path(__file__).parent / "data" | |
| # Initialize components | |
| glossary_path = data_path / "84000_glossary.json" | |
| checker = GlossaryChecker(glossary_path) | |
| validator = TranslationValidator(checker, os.getenv("ANTHROPIC_API_KEY")) | |
| # Process aligned file | |
| aligned_file = data_path / "example_translations.txt" | |
| results = validator.validate_translation(aligned_file) | |
| # Save results | |
| validator.save_results(results, data_path / "validation_results.json") | |
| print("Validation completed. Results saved to 'data/validation_results.json'.") | |