Basitha committed on
Commit
e56d353
·
verified ·
1 Parent(s): 1c35fdf

Update common/validation_utils.py

Browse files
Files changed (1) hide show
  1. common/validation_utils.py +62 -16
common/validation_utils.py CHANGED
@@ -6,12 +6,12 @@ import re
6
  from RespondentAgent import *
7
  from langchain_groq import ChatGroq
8
 
 
9
  def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm):
10
  """
11
  Validates a response (answer) to a question using the appropriate evaluation method (exploratory or fact-based).
12
  Uses the LLM to determine if the question is exploratory or fact-based, then applies the correct rating logic.
13
  Returns True if the answer is valid (all relevant metrics >= 8/10), otherwise False.
14
-
15
  ## Evaluation Criteria (0-10 Scores & Guidelines):
16
  Ratings must be balanced and discriminative—do not default to high scores. Decimal ratings (e.g., 6.5, 8.2, 9.5) are allowed.
17
  Always take into account what the question specifically asked for and any constraints it imposed.
@@ -21,7 +21,6 @@ def validate_response(question, answer, user_profile_str, fast_facts_str, interv
21
  - Did the person make the most of what the prompt allowed, even in just a few words?
22
  - Is the tone, reasoning, or phrasing consistent with what a real person might say in that situation?
23
  Even short responses can show plausibility through subtle cues like word choice, minor hedging, or a relatable reaction. Only deduct for lack of nuance if the prompt clearly left room for more depth and the answer failed to take advantage of that—especially when the limitations are a direct result of the questioning style.
24
-
25
  Plausibility (Behavioral & Contextual Realism):
26
  - Assesses how realistic and in-character the response feels given all information, even if it's imagined.
27
  - Focus on tone, lifestyle consistency, and alignment with demographic/cultural context.
@@ -38,7 +37,6 @@ def validate_response(question, answer, user_profile_str, fast_facts_str, interv
38
  - 1–3 = Implausible. The response feels unnatural, exaggerated, or stereotyped. Tone or behavior contradicts the expected norms for the respondent’s profile. May reflect poor understanding of context.
39
  - 0 = Entirely unrealistic or fabricated. The response is clearly out of character, culturally misaligned, or extremely artificial. Strong evidence that it was not written in the respondent’s voice or context.
40
  - If a score is 8 or higher, justify why it's not a 10.
41
-
42
  Relevance:
43
  - Focus on how directly and completely the response answers the specific question asked.
44
  - A relevant answer should stay on-topic, address the core of the question, and avoid vague or generic filler.
@@ -55,7 +53,6 @@ def validate_response(question, answer, user_profile_str, fast_facts_str, interv
55
  - 1–3 = Barely relevant. The response is mostly off-topic, highly generic, or misinterprets the question. Gives the sense that the question was not understood or considered.
56
  - 0 = Irrelevant. The answer does not address the question at all or responds to a completely different topic.
57
  - If a score is 8 or higher, justify why it's not a 10.
58
-
59
  Accuracy (Faithfulness to Profile & Transcript):
60
  - Only include details that are clearly present in the user profile or transcript.
61
  - Paraphrasing is allowed and responses do not need to match the source text word-for-word. Paraphrasing should not be penalised. However, all relevant details must be included; omitting any information present in the source is not allowed and should be penalised.
@@ -96,20 +93,69 @@ def validate_response(question, answer, user_profile_str, fast_facts_str, interv
96
  else:
97
  evaluation_mode = "exploratory"
98
  logging.info(f"LLM determined evaluation mode: {evaluation_mode}")
99
- # Now do the correct evaluation
100
- from .InteractiveInterviewChatbot import evaluate_response_exploratory, evaluate_response_factbased
101
  if evaluation_mode == "exploratory":
102
- evaluation_result = evaluate_response_exploratory(user_profile_str, fast_facts_str, interview_transcript_text, question, answer, ai_evaluator_agent, respondent_type)
103
- plausibility = evaluation_result.get("plausibility_rating")
104
- relevance = evaluation_result.get("relevance_rating")
105
- logging.info(f"Exploratory evaluation result: {evaluation_result}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  if plausibility is not None and relevance is not None:
107
- return float(plausibility) >= 8.0 and float(relevance) >= 8.0
108
  return False
109
  else:
110
- evaluation_result = evaluate_response_factbased(user_profile_str, fast_facts_str, interview_transcript_text, question, answer, ai_evaluator_agent, respondent_type)
111
- accuracy = evaluation_result.get("accuracy_rating")
112
- logging.info(f"Fact-based evaluation result: {evaluation_result}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  if accuracy is not None:
114
- return float(accuracy) >= 8.0
115
- return False
 
6
  from RespondentAgent import *
7
  from langchain_groq import ChatGroq
8
 
9
+
10
  def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm):
11
  """
12
  Validates a response (answer) to a question using the appropriate evaluation method (exploratory or fact-based).
13
  Uses the LLM to determine if the question is exploratory or fact-based, then applies the correct rating logic.
14
  Returns True if the answer is valid (all relevant metrics >= 8/10), otherwise False.
 
15
  ## Evaluation Criteria (0-10 Scores & Guidelines):
16
  Ratings must be balanced and discriminative—do not default to high scores. Decimal ratings (e.g., 6.5, 8.2, 9.5) are allowed.
17
  Always take into account what the question specifically asked for and any constraints it imposed.
 
21
  - Did the person make the most of what the prompt allowed, even in just a few words?
22
  - Is the tone, reasoning, or phrasing consistent with what a real person might say in that situation?
23
  Even short responses can show plausibility through subtle cues like word choice, minor hedging, or a relatable reaction. Only deduct for lack of nuance if the prompt clearly left room for more depth and the answer failed to take advantage of that—especially when the limitations are a direct result of the questioning style.
 
24
  Plausibility (Behavioral & Contextual Realism):
25
  - Assesses how realistic and in-character the response feels given all information, even if it's imagined.
26
  - Focus on tone, lifestyle consistency, and alignment with demographic/cultural context.
 
37
  - 1–3 = Implausible. The response feels unnatural, exaggerated, or stereotyped. Tone or behavior contradicts the expected norms for the respondent’s profile. May reflect poor understanding of context.
38
  - 0 = Entirely unrealistic or fabricated. The response is clearly out of character, culturally misaligned, or extremely artificial. Strong evidence that it was not written in the respondent’s voice or context.
39
  - If a score is 8 or higher, justify why it's not a 10.
 
40
  Relevance:
41
  - Focus on how directly and completely the response answers the specific question asked.
42
  - A relevant answer should stay on-topic, address the core of the question, and avoid vague or generic filler.
 
53
  - 1–3 = Barely relevant. The response is mostly off-topic, highly generic, or misinterprets the question. Gives the sense that the question was not understood or considered.
54
  - 0 = Irrelevant. The answer does not address the question at all or responds to a completely different topic.
55
  - If a score is 8 or higher, justify why it's not a 10.
 
56
  Accuracy (Faithfulness to Profile & Transcript):
57
  - Only include details that are clearly present in the user profile or transcript.
58
  - Paraphrasing is allowed and responses do not need to match the source text word-for-word. Paraphrasing should not be penalised. However, all relevant details must be included; omitting any information present in the source is not allowed and should be penalised.
 
93
  else:
94
  evaluation_mode = "exploratory"
95
  logging.info(f"LLM determined evaluation mode: {evaluation_mode}")
96
+
97
+ # Now do the correct evaluation using LLM
98
  if evaluation_mode == "exploratory":
99
+ # Ask LLM for plausibility and relevance ratings
100
+ eval_prompt = f"""
101
+ You are an expert market research evaluator. Given the following:
102
+ - User Profile: {user_profile_str}
103
+ - Fast Facts: {fast_facts_str}
104
+ - Interview Transcript: {interview_transcript_text}
105
+ - Respondent Type: {respondent_type}
106
+ - Question: {question}
107
+ - Answer: {answer}
108
+ Please rate the answer on a scale of 0-10 for:
109
+ 1. Plausibility (how realistic, authentic, and in-character the response is, given the profile and context)
110
+ 2. Relevance (how directly and completely the answer addresses the question)
111
+ Output strictly in this format:
112
+ Plausibility Rating: <0-10>
113
+ Relevance Rating: <0-10>
114
+ """
115
+ eval_response = processor_llm.invoke(eval_prompt)
116
+ eval_text = eval_response.content.strip()
117
+ plausibility = None
118
+ relevance = None
119
+ for line in eval_text.split("\n"):
120
+ if line.lower().startswith("plausibility rating:"):
121
+ try:
122
+ plausibility = float(line.split(":",1)[1].strip())
123
+ except Exception:
124
+ plausibility = None
125
+ if line.lower().startswith("relevance rating:"):
126
+ try:
127
+ relevance = float(line.split(":",1)[1].strip())
128
+ except Exception:
129
+ relevance = None
130
+ logging.info(f"Exploratory evaluation: plausibility={plausibility}, relevance={relevance}")
131
  if plausibility is not None and relevance is not None:
132
+ return plausibility >= 8.0 and relevance >= 8.0
133
  return False
134
  else:
135
+ # Fact-based: Ask LLM for accuracy rating
136
+ eval_prompt = f"""
137
+ You are an expert market research evaluator. Given the following:
138
+ - User Profile: {user_profile_str}
139
+ - Fast Facts: {fast_facts_str}
140
+ - Interview Transcript: {interview_transcript_text}
141
+ - Respondent Type: {respondent_type}
142
+ - Question: {question}
143
+ - Answer: {answer}
144
+ Please rate the answer on a scale of 0-10 for:
145
+ 1. Accuracy (how well the answer matches the facts in the profile, transcript, or fast facts; penalise any unsupported or fabricated content)
146
+ Output strictly in this format:
147
+ Accuracy Rating: <0-10>
148
+ """
149
+ eval_response = processor_llm.invoke(eval_prompt)
150
+ eval_text = eval_response.content.strip()
151
+ accuracy = None
152
+ for line in eval_text.split("\n"):
153
+ if line.lower().startswith("accuracy rating:"):
154
+ try:
155
+ accuracy = float(line.split(":",1)[1].strip())
156
+ except Exception:
157
+ accuracy = None
158
+ logging.info(f"Fact-based evaluation: accuracy={accuracy}")
159
  if accuracy is not None:
160
+ return accuracy >= 8.0
161
+ return False