Spaces:

openpecha
/

translation_term_analyser

Sleeping

translation_term_analyser / term_standarization.py

test

commit

20dc456 about 1 year ago

11.4 kB

	import json
	from dataclasses import dataclass
	from datetime import datetime
	from enum import Enum
	from typing import Dict, List, Optional

	from claudette import Chat, models


	@dataclass
	class Context:
	    """One usage context for a term: parallel texts plus commentaries.

	    NOTE(review): nothing in the visible part of this file constructs this
	    class -- the analyzer methods take ``List[Dict]`` contexts -- so the
	    field meanings below are inferred from the field names; confirm against
	    the data files.
	    """

	    # Presumably the Tibetan passage in which the term occurs.
	    tibetan: str
	    # Presumably the English translation of the same passage.
	    english: str
	    # Commentary excerpts associated with the passage.
	    commentaries: List[str]
	    # Optional Sanskrit parallel; defaults to None when not supplied.
	    sanskrit: Optional[str] = None


	class AnalysisType(Enum):
	    """Stages of the term-analysis pipeline.

	    Members are used as keys of the analyzer's system-prompt and chat
	    dictionaries, and ``str(member)`` (e.g. ``"AnalysisType.SEMANTIC"``) is
	    used as the key of its per-stage token-usage report.
	    """

	    SEMANTIC = "semantic"
	    TERM_GENERATION = "term_generation"
	    EVALUATION = "evaluation"


	class BuddhistTermAnalyzer:
	    """Three-stage LLM pipeline for standardizing a Buddhist term.

	    The stages run in order: semantic analysis of the term in its contexts,
	    generation of candidate English terms per audience, and evaluation of
	    those candidates. Each stage has its own system prompt and its own
	    ``Chat`` object, and every stage is expected to answer with a single
	    JSON object.

	    Attributes:
	        model: Model identifier taken from ``claudette.models``.
	        total_api_calls_cost: Running sum of the costs recorded by
	            ``_track_usage``.
	        token_usage: Latest usage record per stage, keyed by
	            ``str(AnalysisType member)``.
	        system_prompts: System prompt text per ``AnalysisType``.
	        chats: One ``Chat`` per ``AnalysisType``.
	    """

	    def __init__(self):
	        # Use Claude 3.5 Sonnet
	        # NOTE(review): this relies on the ordering of claudette's ``models``
	        # list; confirm that index 1 is still claude-3-5-sonnet for the
	        # installed claudette version.
	        self.model = models[1]  # claude-3-5-sonnet
	        self.total_api_calls_cost = 0
	        self.token_usage = {}

	        # One system prompt per analysis stage. Each prompt spells out the
	        # JSON structure that the stage's response must follow.
	        self.system_prompts = {
	            AnalysisType.SEMANTIC: """You are an expert in Buddhist terminology analysis with deep knowledge of Sanskrit and Tibetan.
	Analyze the given term through a systematic philological approach.
	You must ONLY respond with a valid JSON object, no other text.
	Never include any explanatory text before or after the JSON.

	Required JSON structure:
	{
	"sanskrit_analysis": {
	"term": "string", # Sanskrit equivalent
	"morphology": "string", # Morphological breakdown
	"literal_meaning": "string", # Literal meaning in Sanskrit
	"technical_usage": "string" # Technical usage in Sanskrit Buddhist literature
	},
	"tibetan_mapping": {
	"term": "string", # Tibetan term
	"morphology": "string", # Morphological breakdown of Tibetan
	"translation_strategy": "string", # How Tibetan translates the Sanskrit
	"semantic_extension": "string" # Any semantic changes or extensions in Tibetan
	},
	"commentary_insights": [
	{
	"source": "string", # Which commentary
	"explanation": "string", # Key explanation
	"technical_points": ["string"] # Technical clarifications
	}
	],
	"english_renderings": [
	{
	"translation": "string",
	"accuracy_score": number, # 1-10
	"captures_sanskrit": boolean,
	"captures_tibetan": boolean,
	"notes": "string"
	}
	],
	"semantic_synthesis": {
	"core_meaning": "string", # Core meaning synthesized from all sources
	"technical_usage": ["string"], # List of technical usages found in context
	"connotative_aspects": ["string"] # Important connotations and implications
	},
	"usage_examples": [
	{
	"source_text": "string", # Original context
	"usage_type": "string", # How term is used here
	"commentary_explanation": "string" # What commentary says about this usage
	}
	]
	}""",
	            AnalysisType.TERM_GENERATION: """You are an expert Buddhist translator.
	You must ONLY respond with a valid JSON object, no other text.
	Never include any explanatory text before or after the JSON.

	Required JSON structure:
	{
	"academic": {
	"terms": ["term1", "term2"],
	"reasoning": "string"
	},
	"practitioner": {
	"terms": ["term1", "term2"],
	"reasoning": "string"
	},
	"general": {
	"terms": ["term1", "term2"],
	"reasoning": "string"
	}
	}""",
	            AnalysisType.EVALUATION: """You are an expert evaluator of Buddhist translations.
	You must ONLY respond with a valid JSON object, no other text.
	Never include any explanatory text before or after the JSON.

	Required JSON structure:
	{
	"evaluations": {
	"term": {
	"technical_score": 0.0,
	"cultural_score": 0.0,
	"audience_score": 0.0,
	"reasoning": "string"
	}
	}
	}""",
	        }

	        # Initialize chats with respective system prompts
	        self.chats = {
	            analysis_type: Chat(self.model, sp=system_prompt)
	            for analysis_type, system_prompt in self.system_prompts.items()
	        }

	    def create_semantic_prompt(self, tibetan_term: str, contexts: List[Dict]) -> str:
	        """Build the user prompt for the semantic-analysis stage.

	        Args:
	            tibetan_term: The term to analyze.
	            contexts: Context records; embedded in the prompt as indented
	                JSON with non-ASCII characters left unescaped.

	        Returns:
	            The prompt text.
	        """
	        return f"""
	Analyze this Buddhist term following these steps:

	Target Term: {tibetan_term}

	Analysis Process:
	1. First analyze the Sanskrit source:
	- Identify the Sanskrit equivalent
	- Break down its morphology
	- Understand its literal and technical meanings

	2. Map to Tibetan:
	- Analyze how Tibetan translates the Sanskrit
	- Note any semantic extensions or modifications
	- Understand the translation strategy

	3. Study the commentaries:
	- Extract key explanations
	- Note technical clarifications
	- Identify special usages explained

	4. Evaluate English translations:
	- Compare against Sanskrit and Tibetan meanings
	- Assess accuracy and completeness
	- Note which aspects are captured/missed

	5. Synthesize understanding:
	- Combine insights from all sources
	- Document technical usage patterns
	- Note important connotations

	Contexts:
	{json.dumps(contexts, indent=2, ensure_ascii=False)}

	Important:
	- Base analysis strictly on provided contexts
	- Use commentaries to resolve ambiguities
	- Pay special attention to technical terms in commentaries
	- Note when English translations diverge from Sanskrit/Tibetan
	- Document specific usage examples from the context

	Remember: Return ONLY the JSON object with no other text."""

	    def create_generation_prompt(
	        self, tibetan_term: str, semantic_analysis: Dict
	    ) -> str:
	        """Build the user prompt for the term-generation stage.

	        Args:
	            tibetan_term: The term being standardized.
	            semantic_analysis: Parsed output of the semantic stage; embedded
	                in the prompt as indented JSON.

	        Returns:
	            The prompt text.
	        """
	        return f"""
	Respond ONLY with a JSON object containing translation candidates:

	Term: {tibetan_term}

	Semantic Analysis:
	{json.dumps(semantic_analysis, indent=2, ensure_ascii=False)}

	Remember: Return ONLY the JSON object with no other text."""

	    def create_evaluation_prompt(
	        self, tibetan_term: str, candidates: Dict, semantic_analysis: Dict
	    ) -> str:
	        """Build the user prompt for the evaluation stage.

	        Args:
	            tibetan_term: The term being standardized.
	            candidates: Parsed output of the term-generation stage.
	            semantic_analysis: Parsed output of the semantic stage.

	        Returns:
	            The prompt text.
	        """
	        return f"""
	Respond ONLY with a JSON object evaluating these candidates:

	Term: {tibetan_term}

	Candidates:
	{json.dumps(candidates, indent=2, ensure_ascii=False)}

	Semantic Analysis:
	{json.dumps(semantic_analysis, indent=2, ensure_ascii=False)}

	Remember: Return ONLY the JSON object with no other text."""

	    def _track_usage(self, analysis_type: "AnalysisType", response):
	        """Record the cost and token usage of one stage.

	        Reads ``cost`` from the stage's chat, adds it to
	        ``total_api_calls_cost`` and overwrites the stage's entry in
	        ``token_usage``.

	        NOTE(review): if ``Chat.cost`` is cumulative over the lifetime of the
	        chat (the chats are created once in ``__init__`` and reused), calling
	        ``analyze_term`` more than once on the same analyzer would over-count
	        the total -- confirm against the claudette documentation.
	        """
	        cost = self.chats[analysis_type].cost
	        self.total_api_calls_cost += cost
	        self.token_usage[str(analysis_type)] = {
	            "token_usage": repr(response.usage),
	            "api_call_cost": cost,
	        }

	    @staticmethod
	    def _parse_json_response(text: str) -> Dict:
	        """Parse a model reply that is expected to contain one JSON object.

	        Strict JSON is parsed as-is. If that fails, the text between the
	        first ``{`` and the last ``}`` is parsed instead, so a reply wrapped
	        in a Markdown code fence or surrounded by stray prose is still
	        accepted.

	        Raises:
	            json.JSONDecodeError: If no parseable JSON can be recovered.
	        """
	        try:
	            return json.loads(text)
	        except json.JSONDecodeError:
	            start, end = text.find("{"), text.rfind("}")
	            if start == -1 or end <= start:
	                # Nothing that looks like an object: surface the original error.
	                raise
	            return json.loads(text[start : end + 1])

	    def _run_stage(self, analysis_type: "AnalysisType", prompt: str) -> Dict:
	        """Send ``prompt`` to the stage's chat, track usage, return parsed JSON."""
	        response = self.chats[analysis_type](prompt)
	        # Usage is recorded before parsing so that a malformed reply is still
	        # accounted for.
	        self._track_usage(analysis_type, response)
	        return self._parse_json_response(response.content[0].text)

	    def analyze_term(self, tibetan_term: str, contexts: List[Dict]) -> Dict:
	        """Run the full pipeline for one term.

	        Args:
	            tibetan_term: The term to standardize.
	            contexts: Context records passed to the semantic stage.

	        Returns:
	            The combined result dictionary built by ``format_results``.

	        Raises:
	            json.JSONDecodeError: If a stage reply contains no parseable JSON.
	            KeyError, IndexError: If a reply lacks the fields that
	                ``format_results`` reads.
	        """
	        # 1. Semantic analysis
	        semantic_analysis = self._run_stage(
	            AnalysisType.SEMANTIC,
	            self.create_semantic_prompt(tibetan_term, contexts),
	        )

	        # 2. Term generation, driven by the semantic analysis
	        candidates = self._run_stage(
	            AnalysisType.TERM_GENERATION,
	            self.create_generation_prompt(tibetan_term, semantic_analysis),
	        )

	        # 3. Evaluation of the generated candidates
	        evaluations = self._run_stage(
	            AnalysisType.EVALUATION,
	            self.create_evaluation_prompt(
	                tibetan_term, candidates, semantic_analysis
	            ),
	        )

	        # Combine results
	        return self.format_results(
	            tibetan_term,
	            semantic_analysis,
	            candidates,
	            evaluations,
	        )

	    def format_results(
	        self,
	        tibetan_term: str,
	        semantic_analysis: Dict,
	        candidates: Dict,
	        evaluations: Dict,
	    ) -> Dict:
	        """Assemble the final result dictionary.

	        For each audience the first candidate term is taken as the
	        recommendation, together with that audience's reasoning.

	        Args:
	            tibetan_term: The analyzed term.
	            semantic_analysis: Parsed output of the semantic stage.
	            candidates: Parsed output of the term-generation stage; must
	                contain ``academic``, ``practitioner`` and ``general``
	                entries, each with a non-empty ``terms`` list and a
	                ``reasoning`` value.
	            evaluations: Parsed output of the evaluation stage; must contain
	                an ``evaluations`` key.

	        Returns:
	            Dictionary with the term, per-audience recommendations, the
	            semantic analysis, the evaluations and the usage accounting.

	        Raises:
	            KeyError: If an expected key is missing.
	            IndexError: If an audience has an empty ``terms`` list.
	        """
	        # (label in the output, key in the model's reply)
	        audiences = (
	            ("Academic", "academic"),
	            ("Practitioner", "practitioner"),
	            ("General", "general"),
	        )
	        recommendations = {
	            label: {
	                "term": candidates[key]["terms"][0],
	                "reasoning": candidates[key]["reasoning"],
	            }
	            for label, key in audiences
	        }
	        return {
	            "tibetan_term": tibetan_term,
	            "recommendations": recommendations,
	            "analysis": semantic_analysis,
	            "evaluations": evaluations["evaluations"],
	            "total_api_calls_cost": self.total_api_calls_cost,
	            "token_usage": self.token_usage,
	        }


	class TermStandardizationAgent:
	    """Thin entry point that delegates term standardization to an analyzer."""

	    def __init__(self, analyzer: Optional["BuddhistTermAnalyzer"] = None):
	        """Create the agent.

	        Args:
	            analyzer: Object providing ``analyze_term(tibetan_term, contexts)``.
	                When omitted, a new ``BuddhistTermAnalyzer`` is created, which
	                is the original behavior.
	        """
	        self.analyzer = BuddhistTermAnalyzer() if analyzer is None else analyzer

	    def select_best_terms(self, tibetan_term: str, contexts: List[Dict]) -> Dict:
	        """Main entry point for term standardization.

	        Args:
	            tibetan_term: The term to standardize.
	            contexts: Context records forwarded to the analyzer.

	        Returns:
	            Whatever the analyzer's ``analyze_term`` returns.
	        """
	        return self.analyzer.analyze_term(tibetan_term, contexts)


	# Example usage
	def main():
	    """Analyze one sample term and write the result next to this file.

	    Reads ``data/<term>.json`` relative to this file, runs the
	    standardization agent on it, and writes the result to
	    ``results/<term>_<YYYYmmddHHMMSS>.json``, creating ``results`` if needed.
	    """
	    from pathlib import Path

	    base_dir = Path(__file__).parent

	    # Initialize agent
	    agent = TermStandardizationAgent()

	    # Test input
	    tibetan_term = "བྱང་ཆུབ་སེམས་"
	    contexts_fn = base_dir / "data" / f"{tibetan_term}.json"
	    # Explicit UTF-8: the data is non-ASCII, so the locale default encoding
	    # must not be relied on. The context manager closes the handle.
	    with contexts_fn.open(encoding="utf-8") as contexts_file:
	        contexts = json.load(contexts_file)

	    # Process term
	    results = agent.select_best_terms(tibetan_term, contexts)
	    date_time = datetime.now().strftime("%Y%m%d%H%M%S")
	    results_path = base_dir / "results"
	    results_path.mkdir(exist_ok=True, parents=True)
	    result_fn = results_path / f"{tibetan_term}_{date_time}.json"
	    # ensure_ascii=False writes raw Tibetan characters, which requires a
	    # UTF-8 file; closing the handle guarantees the output is flushed.
	    with result_fn.open("w", encoding="utf-8") as result_file:
	        json.dump(results, result_file, indent=2, ensure_ascii=False)
	    print(f"Results saved to: {result_fn}")


	# Run the example only when this file is executed as a script.
	if __name__ == "__main__":
	    main()