Spaces:
Runtime error
Runtime error
| import json | |
| import re | |
| from typing import Any, Dict | |
| from graphgen.bases import BaseLLMWrapper, BaseTripleEvaluator | |
| from graphgen.templates import ACCURACY_EVALUATION_PROMPT | |
| from graphgen.utils import detect_main_language, logger | |
class AccuracyEvaluator(BaseTripleEvaluator):
    """Evaluate entity and relation extraction quality using LLM-as-a-Judge.

    For each chunk, an LLM is prompted to judge the extracted entities and
    the extracted relations against the original chunk content. The LLM reply
    is expected to be a JSON object; when it cannot be parsed, a zero-score
    placeholder (accuracy, completeness, precision, overall_score) is used
    instead.
    """

    def __init__(
        self,
        llm_client: BaseLLMWrapper,
    ):
        self.llm_client = llm_client

    @staticmethod
    def _default_evaluation() -> Dict[str, Any]:
        """Return a fresh zero-score result used when the LLM reply is unparseable."""
        return {
            "accuracy": 0.0,
            "completeness": 0.0,
            "precision": 0.0,
            "overall_score": 0.0,
            "accuracy_reasoning": "Failed to parse LLM response",
            "completeness_reasoning": "",
            "precision_reasoning": "",
            "issues": ["LLM response parsing failed"],
        }

    @classmethod
    def _parse_evaluation(cls, response: str) -> Dict[str, Any]:
        """Parse an LLM reply into an evaluation result.

        The reply is first parsed as JSON as-is. If that fails, the span from
        the first ``{`` to the last ``}`` is tried, which covers replies that
        wrap the JSON in a markdown code block or surrounding prose. If both
        attempts fail, a warning is logged and the default zero-score result
        is returned.
        """
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            pass

        # Greedy match: everything between the first "{" and the last "}".
        json_match = re.search(r"\{.*\}", response, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(0))
            except json.JSONDecodeError:
                # The braces may enclose text that is not valid JSON; fall
                # through to the default instead of propagating the error.
                pass

        logger.warning("Failed to parse LLM response.")
        return cls._default_evaluation()

    async def evaluate(self, unit: tuple) -> Dict[str, Any]:
        """Evaluate entity and relation extraction quality using LLM-as-a-Judge.

        Args:
            unit: A 3-tuple ``(chunk_content, nodes, edges)``. ``nodes`` and
                ``edges`` are serialized with ``json.dumps`` before being
                inserted into the prompts, so they must be JSON-serializable.

        Returns:
            Dictionary with two keys, ``entity_accuracy`` and
            ``relation_accuracy``, each holding the parsed LLM evaluation (or
            the default zero-score result when the reply could not be parsed).
        """
        chunk_content, nodes, edges = unit
        lang = detect_main_language(chunk_content)
        prompts = ACCURACY_EVALUATION_PROMPT[lang]

        # Entities: ask the LLM to judge the extracted nodes.
        entity_prompt = prompts["ENTITY"].format(
            chunk_content=chunk_content,
            extracted_entities=json.dumps(nodes, ensure_ascii=False, indent=2),
        )
        entity_response = await self.llm_client.generate_answer(entity_prompt)
        node_evaluation_result = self._parse_evaluation(entity_response)

        # Relations: ask the LLM to judge the extracted edges.
        relation_prompt = prompts["RELATION"].format(
            chunk_content=chunk_content,
            extracted_relations=json.dumps(edges, ensure_ascii=False, indent=2),
        )
        relation_response = await self.llm_client.generate_answer(relation_prompt)
        edge_evaluation_result = self._parse_evaluation(relation_response)

        return {
            "entity_accuracy": node_evaluation_result,
            "relation_accuracy": edge_evaluation_result,
        }