"""LLM-based evaluator that scores explanations against five explanation principles."""

import json

import json_repair

from util.assistants import GPTAgent


class evaluator:
    """Uses an LLM judge to score explanations for factual correctness, usefulness,
    context specificity, user specificity, and pluralism."""

    def __init__(self, model_name='GPT4-turbo'):
        self.model = GPTAgent(model_name)

    def validate_scores(self, scores):
        """Ensure every required principle has a numeric 0-10 score and a non-empty justification."""
        required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]

        for key in required_keys:
            if key not in scores:
                return {k: {"Score": -1, "Justification": "Invalid input"} for k in required_keys}

            score_data = scores[key]

            if not isinstance(score_data, dict):
                return {k: {"Score": -1, "Justification": "Invalid input format"} for k in required_keys}

            if "Score" not in score_data or not isinstance(score_data["Score"], (int, float)) or not (
                    0 <= score_data["Score"] <= 10):
                return {k: {"Score": -1, "Justification": "Invalid score value"} for k in required_keys}

            if "Justification" not in score_data or not isinstance(score_data["Justification"], str) \
                    or not score_data["Justification"].strip():
                return {k: {"Score": -1, "Justification": "Invalid or missing justification"} for k in required_keys}

        return scores

    def evaluate_single(self, question, explanation):
        """Score a single (question, explanation) pair against the five principles."""
        evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
should be scored on a scale from 0 to 10, where 0 indicates that the principle is not met at all,
and 10 indicates that the principle is fully satisfied. Additionally, provide a brief ten-word justification for each score.

Question:
{question}

Provided Explanation:
{explanation}

Evaluation Criteria:

Factually Correct:
Definition: The explanation must be accurate and relevant to the question and the subject matter.
Score: (0-10) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.

Useful:
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
Score: (0-10) How useful is the explanation in helping the user understand the answer and make informed decisions?

Context Specific:
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
Score: (0-10) How well does the explanation address the specific context or scenario of the question?

User Specific:
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
Score: (0-10) How well does the explanation cater to the needs and knowledge level of the intended user?

Provides Pluralism:
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
Score: (0-10) How well does the explanation provide or support multiple perspectives?

After evaluating the provided question and explanation based on the five principles, format your scores and justifications as a JSON dictionary. Return only the JSON, with no additional text.

Example JSON format:
{{
    "Factually Correct": {{
        "Justification": "xxx",
        "Score": 9
    }},
    "Useful": {{
        "Justification": "xxx",
        "Score": 8.5
    }},
    "Context Specific": {{
        "Justification": "xxx",
        "Score": 8
    }},
    "User Specific": {{
        "Justification": "xxx",
        "Score": 7.5
    }},
    "Provides Pluralism": {{
        "Justification": "xxx",
        "Score": 7
    }}
}}

Answer:
"""

        response = self.model.invoke(evaluation_prompt, temperature=0.8, max_tokens=500).strip()
        print(response)

        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            # Try to salvage malformed model output before giving up.
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after the repair attempt. Skipping this evaluation.")
                # Return the same nested structure that validate_scores produces so callers
                # such as write_evaluation_commentary can handle the failure uniformly.
                return {key: {"Score": -1, "Justification": "Failed to parse model response"}
                        for key in ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]}

        return self.validate_scores(scores)

    def format_conversation(self, conversation):
        formatted_conversation = "\n".join(
            f"{exchange['role'].capitalize()}: {exchange['content']}" for exchange in conversation
        )
        return formatted_conversation

    def evaluate_conversation(self, conversation, context):
        """Score a full user-chatbot conversation, given context about the user, against the five principles."""
        formatted_conversation = self.format_conversation(conversation)
        evaluation_prompt = f"""
You are provided with a conversation between a user and a chatbot, along with context about the user. Your task is to evaluate the chatbot's explanations in the conversation based on the following five principles. Each principle
should be scored on a scale from 0 to 10, where 0 indicates that the principle is not met at all,
and 10 indicates that the principle is fully satisfied. Additionally, provide a brief ten-word justification for each score.

Conversation:
{formatted_conversation}

Context:
{context}

Evaluation Criteria:

Factually Correct:
Definition: The explanation must be accurate and relevant to the question and the subject matter.
Score: (0-10) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.

Useful:
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
Score: (0-10) How useful is the explanation in helping the user understand the answer and make informed decisions?

Context Specific:
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
Score: (0-10) How well does the explanation address the specific context or scenario of the question?

User Specific:
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
Score: (0-10) How well does the explanation cater to the needs and knowledge level of the intended user?

Provides Pluralism:
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
Score: (0-10) How well does the explanation provide or support multiple perspectives?

After evaluating the provided conversation based on the five principles, format your scores and justifications as a JSON dictionary. Return only the JSON, with no additional text.

Example JSON format:
{{
    "Factually Correct": {{
        "Justification": "xxx",
        "Score": 9
    }},
    "Useful": {{
        "Justification": "xxx",
        "Score": 8.5
    }},
    "Context Specific": {{
        "Justification": "xxx",
        "Score": 8
    }},
    "User Specific": {{
        "Justification": "xxx",
        "Score": 7.5
    }},
    "Provides Pluralism": {{
        "Justification": "xxx",
        "Score": 7
    }}
}}

Answer:
"""

        print(evaluation_prompt)

        response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=1000).strip()
        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            # Try to salvage malformed model output before giving up.
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after the repair attempt. Skipping this evaluation.")
                # Keep the failure payload in the same nested format that validate_scores produces.
                return {key: {"Score": -1, "Justification": "Failed to parse model response"}
                        for key in ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]}

        return self.validate_scores(scores)


def write_evaluation_commentary(scores):
    """Turn a validated score dictionary into per-principle commentary entries."""
    evaluation_details = []

    for principle, details in scores.items():
        print(details)
        score = details.get('Score', -1)
        justification = details.get('Justification', '')

        if score == -1:
            evaluation_details.append(
                {'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
                 'Justification': justification})
            continue

        # Scores are on a 0-10 scale, so the commentary bands use thresholds of 8 and 5.
        if principle == "Factually Correct":
            if score >= 8:
                comment = "Excellent accuracy! The information is precise and directly relevant to the question."
            elif score >= 5:
                comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
            else:
                comment = "The explanation contains significant inaccuracies or irrelevant information."
        elif principle == "Useful":
            if score >= 8:
                comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
            elif score >= 5:
                comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
            else:
                comment = "The explanation does little to help understand or apply the information provided."
        elif principle == "Context Specific":
            if score >= 8:
                comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
            elif score >= 5:
                comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
            else:
                comment = "Fails to address the context of the question, lacking relevance or specificity."
        elif principle == "User Specific":
            if score >= 8:
                comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
            elif score >= 5:
                comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
            else:
                comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
        elif principle == "Provides Pluralism":
            if score >= 8:
                comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
            elif score >= 5:
                comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
            else:
                comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
        else:
            # Unknown principle names get a neutral comment instead of raising a NameError.
            comment = "No commentary template is defined for this principle."

        evaluation_details.append(
            {'Principle': principle, 'Score': score, 'Justification': justification, 'Commentary': comment})

    return evaluation_details


if __name__ == '__main__':
    judge = evaluator()
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": "Where was it played?"}
    ]
    context = "general user, user_background is sports enthusiast"
    results = judge.evaluate_conversation(conversation, context)
    print(results)
    print(write_evaluation_commentary(results))
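    # Hedged usage sketch (not part of the original demo): evaluate_single scores a single
    # question/explanation pair the same way. The strings below are illustrative placeholders;
    # uncomment to try it, assuming GPTAgent credentials are configured.
    # single_scores = judge.evaluate_single(
    #     "Why is the sky blue?",
    #     "Sunlight scatters off air molecules, and shorter blue wavelengths scatter the most."
    # )
    # print(write_evaluation_commentary(single_scores))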