| | import json |
| |
|
| | from util.assistants import GPTAgent |
| | import json_repair |
| |
|
class evaluator:
    """Score an AI-generated explanation against five quality principles.

    Calling an instance with a question and an explanation builds an
    evaluation prompt, sends it to the wrapped ``GPTAgent`` and returns a dict
    mapping each principle name to a numeric score. When the reply cannot be
    decoded as JSON or does not pass validation, every principle is reported
    as ``-1``.
    """

    # Principles the model is asked to score; also the keys of the failure dict.
    _REQUIRED_KEYS = (
        "Factually Correct",
        "Useful",
        "Context Specific",
        "User Specific",
        "Provides Pluralism",
    )

    def __init__(self, model_name='GPT4-turbo'):
        """Create the evaluator.

        Args:
            model_name: Identifier handed unchanged to ``GPTAgent``.
        """
        self.model = GPTAgent(model_name)

    def _failed_scores(self):
        """Return a new dict that marks every principle as failed (-1)."""
        # Built on each call so callers can mutate the result safely.
        return {key: -1 for key in self._REQUIRED_KEYS}

    def validate_scores(self, scores):
        """Check that ``scores`` holds a valid number for every principle.

        Args:
            scores: Object decoded from the model's JSON reply.

        Returns:
            ``scores`` itself when it is a dict in which every required
            principle maps to a real number within [-1, 1]; otherwise a dict
            with every principle set to ``-1``.
        """
        # The decoded JSON may be a list, number, string or null; only a dict
        # can carry per-principle scores, so anything else is a failure rather
        # than a TypeError on the membership test below.
        if not isinstance(scores, dict):
            return self._failed_scores()

        for key in self._REQUIRED_KEYS:
            value = scores.get(key)
            # bool is a subclass of int, so JSON true/false must be excluded
            # explicitly or it would be accepted as a score of 1/0.
            is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
            # NOTE(review): the prompt asks for scores in 0-1, yet values down
            # to -1 are accepted here (-1 doubles as the failure sentinel).
            # Confirm whether negative values should be rejected.
            if not is_number or not (-1 <= value <= 1):
                return self._failed_scores()

        return scores

    def __call__(self, question, explanation):
        """Ask the model to score ``explanation`` as an answer to ``question``.

        Args:
            question: The user's question; interpolated into the prompt.
            explanation: The explanation to evaluate; interpolated into the
                prompt.

        Returns:
            The validated score dict (see ``validate_scores``), or a dict with
            every principle set to ``-1`` when the reply cannot be decoded
            even after a repair attempt.
        """
        evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
        an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
        should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
        and 1 indicates that the principle is fully satisfied.

        Question:
        {question}

        Provided Explanation:
        {explanation}

        Evaluation Criteria:

        Factually Correct:
        Definition: The explanation must be accurate and relevant to the question and the subject matter.
        Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.

        Useful:
        Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
        Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?

        Context Specific:
        Definition: The explanation should be relevant to the specific context or scenario implied by the question.
        Score: (0-1) How well does the explanation address the specific context or scenario of the question?

        User Specific:
        Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
        Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?

        Provides Pluralism:
        Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
        Score: (0-1) How well does the explanation provide or support multiple perspectives?

        After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary.

        Example JSON format:

        {{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}

        Directly provide me with the json without any additional text.

        Answer:
        """

        response = self.model.invoke(evaluation_prompt, temperature=0.8, max_tokens=150).strip()

        # Echo the raw reply so malformed output can be inspected.
        print(response)
        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            # The reply was not strict JSON; let json_repair rewrite it into a
            # JSON string and try decoding once more.
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
                return self._failed_scores()

        return self.validate_scores(scores)
| |
|
# Commentary per principle, ordered as (score >= 0.8, score >= 0.5, lower).
_COMMENTARY_BY_PRINCIPLE = {
    "Factually Correct": (
        "Excellent accuracy! The information is precise and directly relevant to the question.",
        "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant.",
        "The explanation contains significant inaccuracies or irrelevant information.",
    ),
    "Useful": (
        "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making.",
        "Somewhat useful, though it could be more insightful or practical in aiding understanding.",
        "The explanation does little to help understand or apply the information provided.",
    ),
    "Context Specific": (
        "Perfectly tailored to the context of the question, addressing the specific scenario effectively.",
        "Generally addresses the context, but may miss specific details or nuances relevant to the question.",
        "Fails to address the context of the question, lacking relevance or specificity.",
    ),
    "User Specific": (
        "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness.",
        "Moderately considerate of the user's knowledge level, but could be more tailored.",
        "Does not consider the user's background or interests, potentially leading to confusion or disinterest.",
    ),
    "Provides Pluralism": (
        "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding.",
        "Offers some alternative perspectives, but more could be provided to enrich understanding.",
        "Lacks diversity in viewpoints, limiting the depth of exploration into the topic.",
    ),
}


def write_evaluation_commentary(scores):
    """Turn a principle-to-score mapping into human-readable commentary rows.

    Args:
        scores: Dict mapping principle names to numeric scores. A score of
            ``-1`` marks a principle whose evaluation failed.

    Returns:
        A list with one dict per entry of ``scores``, in iteration order,
        each holding the keys ``'Principle'``, ``'Score'`` and
        ``'Commentary'``. Scores of at least 0.8 get the strongest comment,
        scores of at least 0.5 the moderate one, and anything lower the
        weakest. Principles that are not among the five known ones receive a
        generic comment.
    """
    evaluation_details = []
    for principle, score in scores.items():
        if score == -1:
            comment = 'Failed to evaluate the explanation.'
        elif principle not in _COMMENTARY_BY_PRINCIPLE:
            # Without this branch an unknown principle would either raise
            # UnboundLocalError or silently reuse the previous row's comment.
            comment = 'No commentary is available for this principle.'
        else:
            high, medium, low = _COMMENTARY_BY_PRINCIPLE[principle]
            if score >= 0.8:
                comment = high
            elif score >= 0.5:
                comment = medium
            else:
                comment = low

        evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
    return evaluation_details
| |
|
if __name__ == '__main__':
    # Smoke run: score one trivial question/explanation pair and print the result.
    # The instance is called `scorer` so the builtin eval() is not shadowed.
    scorer = evaluator()
    question = "What is the capital of France?"
    explanation = "The capital of France is Paris."
    print(scorer(question, explanation))