Spaces:
Sleeping
Sleeping
| import json | |
| from dataclasses import dataclass | |
| from datetime import datetime | |
| from enum import Enum | |
| from typing import Dict, List, Optional | |
| from claudette import Chat, models | |
@dataclass
class Context:
    """One passage supplying evidence for a term, in its parallel versions.

    Declared as a dataclass so the annotated fields below become constructor
    arguments; without the decorator the annotations create no ``__init__``
    and the class cannot be populated.

    NOTE(review): nothing in this file constructs a ``Context``; the field
    meanings below are read off the field names only. Confirm against callers.
    """

    # Tibetan version of the passage.
    tibetan: str
    # English version of the passage.
    english: str
    # Commentary excerpts that accompany the passage.
    commentaries: List[str]
    # Sanskrit version of the passage; None when not supplied.
    sanskrit: Optional[str] = None
class AnalysisType(Enum):
    """Identifies one stage of the term-analysis pipeline.

    Members serve as dictionary keys for the per-stage system prompts and
    chat sessions held by ``BuddhistTermAnalyzer``.
    """

    # Philological analysis of the term across its sources.
    SEMANTIC = "semantic"
    # Proposal of candidate English renderings.
    TERM_GENERATION = "term_generation"
    # Scoring of the proposed candidates.
    EVALUATION = "evaluation"
class BuddhistTermAnalyzer:
    """Three-stage LLM pipeline that recommends English renderings of a Tibetan term.

    The stages run in order, each in its own chat session with its own
    system prompt:

    1. semantic analysis of the term from the supplied contexts,
    2. generation of candidate translations for three audiences,
    3. evaluation of those candidates.

    Every stage is instructed to answer with a single JSON object, which is
    parsed and passed on to the next stage.

    Attributes:
        model: Model identifier handed to every chat.
        total_api_calls_cost: Combined cost reported by all stage chats.
        token_usage: Per-stage usage record, keyed by ``str(AnalysisType)``.
        system_prompts: System prompt for each ``AnalysisType``.
        chats: Chat session for each ``AnalysisType``.
    """

    # Prompt text is kept flush-left so that no code indentation leaks into
    # what the model receives.
    _SEMANTIC_SYSTEM_PROMPT = """You are an expert in Buddhist terminology analysis with deep knowledge of Sanskrit and Tibetan.
Analyze the given term through a systematic philological approach.
You must ONLY respond with a valid JSON object, no other text.
Never include any explanatory text before or after the JSON.
Required JSON structure:
{
"sanskrit_analysis": {
"term": "string", # Sanskrit equivalent
"morphology": "string", # Morphological breakdown
"literal_meaning": "string", # Literal meaning in Sanskrit
"technical_usage": "string" # Technical usage in Sanskrit Buddhist literature
},
"tibetan_mapping": {
"term": "string", # Tibetan term
"morphology": "string", # Morphological breakdown of Tibetan
"translation_strategy": "string", # How Tibetan translates the Sanskrit
"semantic_extension": "string" # Any semantic changes or extensions in Tibetan
},
"commentary_insights": [
{
"source": "string", # Which commentary
"explanation": "string", # Key explanation
"technical_points": ["string"] # Technical clarifications
}
],
"english_renderings": [
{
"translation": "string",
"accuracy_score": number, # 1-10
"captures_sanskrit": boolean,
"captures_tibetan": boolean,
"notes": "string"
}
],
"semantic_synthesis": {
"core_meaning": "string", # Core meaning synthesized from all sources
"technical_usage": ["string"], # List of technical usages found in context
"connotative_aspects": ["string"] # Important connotations and implications
},
"usage_examples": [
{
"source_text": "string", # Original context
"usage_type": "string", # How term is used here
"commentary_explanation": "string" # What commentary says about this usage
}
]
}"""

    _TERM_GENERATION_SYSTEM_PROMPT = """You are an expert Buddhist translator.
You must ONLY respond with a valid JSON object, no other text.
Never include any explanatory text before or after the JSON.
Required JSON structure:
{
"academic": {
"terms": ["term1", "term2"],
"reasoning": "string"
},
"practitioner": {
"terms": ["term1", "term2"],
"reasoning": "string"
},
"general": {
"terms": ["term1", "term2"],
"reasoning": "string"
}
}"""

    _EVALUATION_SYSTEM_PROMPT = """You are an expert evaluator of Buddhist translations.
You must ONLY respond with a valid JSON object, no other text.
Never include any explanatory text before or after the JSON.
Required JSON structure:
{
"evaluations": {
"term": {
"technical_score": 0.0,
"cultural_score": 0.0,
"audience_score": 0.0,
"reasoning": "string"
}
}
}"""

    # (label in the final report, key in the generation-stage JSON)
    _AUDIENCES = (
        ("Academic", "academic"),
        ("Practitioner", "practitioner"),
        ("General", "general"),
    )

    def __init__(self):
        """Select the model, reset usage counters, and open one chat per stage."""
        # Use Claude 3.5 Sonnet
        # NOTE(review): relies on index 1 of claudette's ``models`` list being
        # claude-3-5-sonnet -- confirm for the installed claudette version.
        self.model = models[1]  # claude-3-5-sonnet
        self.total_api_calls_cost = 0
        self.token_usage = {}

        self.system_prompts = {
            AnalysisType.SEMANTIC: self._SEMANTIC_SYSTEM_PROMPT,
            AnalysisType.TERM_GENERATION: self._TERM_GENERATION_SYSTEM_PROMPT,
            AnalysisType.EVALUATION: self._EVALUATION_SYSTEM_PROMPT,
        }

        # One chat per stage, each bound to that stage's system prompt.
        self.chats = {
            analysis_type: Chat(self.model, sp=system_prompt)
            for analysis_type, system_prompt in self.system_prompts.items()
        }

    def create_semantic_prompt(self, tibetan_term: str, contexts: List[Dict]) -> str:
        """Build the user prompt for the semantic-analysis stage.

        Args:
            tibetan_term: The term to analyse.
            contexts: JSON-serialisable context records, embedded verbatim.

        Returns:
            The prompt text, with ``contexts`` rendered as indented JSON.
        """
        contexts_json = json.dumps(contexts, indent=2, ensure_ascii=False)
        return f"""
Analyze this Buddhist term following these steps:
Target Term: {tibetan_term}
Analysis Process:
1. First analyze the Sanskrit source:
- Identify the Sanskrit equivalent
- Break down its morphology
- Understand its literal and technical meanings
2. Map to Tibetan:
- Analyze how Tibetan translates the Sanskrit
- Note any semantic extensions or modifications
- Understand the translation strategy
3. Study the commentaries:
- Extract key explanations
- Note technical clarifications
- Identify special usages explained
4. Evaluate English translations:
- Compare against Sanskrit and Tibetan meanings
- Assess accuracy and completeness
- Note which aspects are captured/missed
5. Synthesize understanding:
- Combine insights from all sources
- Document technical usage patterns
- Note important connotations
Contexts:
{contexts_json}
Important:
- Base analysis strictly on provided contexts
- Use commentaries to resolve ambiguities
- Pay special attention to technical terms in commentaries
- Note when English translations diverge from Sanskrit/Tibetan
- Document specific usage examples from the context
Remember: Return ONLY the JSON object with no other text."""

    def create_generation_prompt(
        self, tibetan_term: str, semantic_analysis: Dict
    ) -> str:
        """Build the user prompt for the term-generation stage.

        Args:
            tibetan_term: The term being translated.
            semantic_analysis: Parsed output of the semantic stage.

        Returns:
            The prompt text, with the analysis rendered as indented JSON.
        """
        analysis_json = json.dumps(semantic_analysis, indent=2, ensure_ascii=False)
        return f"""
Respond ONLY with a JSON object containing translation candidates:
Term: {tibetan_term}
Semantic Analysis:
{analysis_json}
Remember: Return ONLY the JSON object with no other text."""

    def create_evaluation_prompt(
        self, tibetan_term: str, candidates: Dict, semantic_analysis: Dict
    ) -> str:
        """Build the user prompt for the evaluation stage.

        Args:
            tibetan_term: The term being translated.
            candidates: Parsed output of the generation stage.
            semantic_analysis: Parsed output of the semantic stage.

        Returns:
            The prompt text, with both inputs rendered as indented JSON.
        """
        candidates_json = json.dumps(candidates, indent=2, ensure_ascii=False)
        analysis_json = json.dumps(semantic_analysis, indent=2, ensure_ascii=False)
        return f"""
Respond ONLY with a JSON object evaluating these candidates:
Term: {tibetan_term}
Candidates:
{candidates_json}
Semantic Analysis:
{analysis_json}
Remember: Return ONLY the JSON object with no other text."""

    @staticmethod
    def _parse_json_response(text: str) -> Dict:
        """Parse a model reply that is supposed to be a single JSON object.

        The text is first parsed as-is. If that fails -- for instance because
        the reply is wrapped in a Markdown code fence or has stray prose
        around it -- the span from the first ``{`` to the last ``}`` is
        parsed instead.

        Raises:
            json.JSONDecodeError: If neither attempt yields valid JSON.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start = text.find("{")
            end = text.rfind("}")
            if start == -1 or end <= start:
                # Nothing that looks like an object: surface the original error.
                raise
            return json.loads(text[start : end + 1])

    def _track_usage(self, analysis_type: "AnalysisType", response) -> None:
        """Record usage for ``analysis_type`` and refresh the combined cost.

        Args:
            analysis_type: Key of the chat that produced ``response``.
            response: The chat's reply; only its ``usage`` attribute is read.
        """
        cost = self.chats[analysis_type].cost
        # NOTE(review): claudette's ``Chat.cost`` is understood to cover all
        # calls made on that chat so far (confirm for the installed version).
        # Adding it to a running total on every call would therefore count
        # earlier calls again, so the total is recomputed from the chats.
        self.total_api_calls_cost = sum(chat.cost for chat in self.chats.values())
        self.token_usage[str(analysis_type)] = {
            "token_usage": repr(response.usage),
            "api_call_cost": cost,
        }

    def _run_stage(self, analysis_type: "AnalysisType", prompt: str) -> Dict:
        """Send ``prompt`` to one stage's chat, record usage, return parsed JSON."""
        response = self.chats[analysis_type](prompt)
        self._track_usage(analysis_type, response)
        return self._parse_json_response(response.content[0].text)

    def analyze_term(self, tibetan_term: str, contexts: List[Dict]) -> Dict:
        """Run the full pipeline for one term.

        Args:
            tibetan_term: The term to analyse.
            contexts: JSON-serialisable context records for the term.

        Returns:
            The combined report produced by ``format_results``.

        Raises:
            json.JSONDecodeError: If a stage's reply cannot be parsed as JSON.
            KeyError, IndexError: If the parsed replies lack the fields
                ``format_results`` reads.
        """
        # 1. Semantic analysis
        semantic_analysis = self._run_stage(
            AnalysisType.SEMANTIC,
            self.create_semantic_prompt(tibetan_term, contexts),
        )

        # 2. Term generation, informed by the semantic analysis
        candidates = self._run_stage(
            AnalysisType.TERM_GENERATION,
            self.create_generation_prompt(tibetan_term, semantic_analysis),
        )

        # 3. Evaluation of the generated candidates
        evaluations = self._run_stage(
            AnalysisType.EVALUATION,
            self.create_evaluation_prompt(tibetan_term, candidates, semantic_analysis),
        )

        # Combine results
        return self.format_results(
            tibetan_term,
            semantic_analysis,
            candidates,
            evaluations,
        )

    def format_results(
        self,
        tibetan_term: str,
        semantic_analysis: Dict,
        candidates: Dict,
        evaluations: Dict,
    ) -> Dict:
        """Assemble the final report from the three stage outputs.

        For each audience the first entry of its ``terms`` list is reported
        as the recommendation, together with that audience's reasoning.

        Args:
            tibetan_term: The analysed term.
            semantic_analysis: Parsed semantic-stage output, included as-is.
            candidates: Parsed generation-stage output.
            evaluations: Parsed evaluation-stage output; its ``"evaluations"``
                entry is included.

        Returns:
            A dict with the term, per-audience recommendations, the analysis,
            the evaluations, and the current cost and usage records.

        Raises:
            KeyError: If an expected key is missing from the stage outputs.
            IndexError: If an audience's ``terms`` list is empty.
        """
        recommendations = {
            label: {
                "term": candidates[key]["terms"][0],
                "reasoning": candidates[key]["reasoning"],
            }
            for label, key in self._AUDIENCES
        }
        return {
            "tibetan_term": tibetan_term,
            "recommendations": recommendations,
            "analysis": semantic_analysis,
            "evaluations": evaluations["evaluations"],
            "total_api_calls_cost": self.total_api_calls_cost,
            "token_usage": self.token_usage,
        }
class TermStandardizationAgent:
    """Thin facade that owns a ``BuddhistTermAnalyzer`` and forwards requests to it."""

    def __init__(self):
        """Create the underlying analyzer."""
        self.analyzer = BuddhistTermAnalyzer()

    def select_best_terms(self, tibetan_term: str, contexts: List[Dict]) -> Dict:
        """Main entry point for term standardization.

        Delegates to the analyzer's ``analyze_term`` and returns its result
        unchanged.
        """
        return self.analyzer.analyze_term(tibetan_term, contexts)
# Example usage
def main():
    """Analyse one sample term and save the report next to this script.

    Reads the contexts from ``data/<term>.json``, runs the standardization
    agent, and writes the result to ``results/<term>_<timestamp>.json``
    (the ``results`` directory is created if needed). Both paths are
    relative to this file's directory.
    """
    from pathlib import Path

    base_dir = Path(__file__).parent

    # Initialize agent
    agent = TermStandardizationAgent()

    # Test input
    tibetan_term = "བྱང་ཆུབ་སེམས་"
    contexts_fn = base_dir / "data" / f"{tibetan_term}.json"
    # Explicit UTF-8 and a context manager: the data is Tibetan text, and the
    # handle must be closed rather than left to the garbage collector.
    with contexts_fn.open(encoding="utf-8") as contexts_file:
        contexts = json.load(contexts_file)

    # Process term
    results = agent.select_best_terms(tibetan_term, contexts)

    date_time = datetime.now().strftime("%Y%m%d%H%M%S")
    results_path = base_dir / "results"
    results_path.mkdir(exist_ok=True, parents=True)
    result_fn = results_path / f"{tibetan_term}_{date_time}.json"
    # ensure_ascii=False emits raw Tibetan characters, so the file must be
    # opened as UTF-8 regardless of the platform's default encoding.
    with result_fn.open("w", encoding="utf-8") as result_file:
        json.dump(results, result_file, indent=2, ensure_ascii=False)
    print(f"Results saved to: {result_fn}")


if __name__ == "__main__":
    main()