Spaces: Build error
Update common/validation_utils.py
common/validation_utils.py  +62 -16  CHANGED

@@ -6,12 +6,12 @@ import re
 from RespondentAgent import *
 from langchain_groq import ChatGroq
 
+
 def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm):
     """
     Validates a response (answer) to a question using the appropriate evaluation method (exploratory or fact-based).
     Uses the LLM to determine if the question is exploratory or fact-based, then applies the correct rating logic.
     Returns True if the answer is valid (all relevant metrics >= 8/10), otherwise False.
-
     ## Evaluation Criteria (0-10 Scores & Guidelines):
     Ratings must be balanced and discriminative—do not default to high scores. Decimal ratings (e.g., 6.5, 8.2, 9.5) are allowed.
     Always take into account what the question specifically asked for and any constraints it imposed.
@@ -21,7 +21,6 @@ def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm):
     - Did the person make the most of what the prompt allowed, even in just a few words?
     - Is the tone, reasoning, or phrasing consistent with what a real person might say in that situation?
     Even short responses can show plausibility through subtle cues like word choice, minor hedging, or a relatable reaction. Only deduct for lack of nuance if the prompt clearly left room for more depth and the answer failed to take advantage of that—especially when the limitations are a direct result of the questioning style.
-
     Plausibility (Behavioral & Contextual Realism):
     - Assesses how realistic and in-character the response feels given all information, even if it's imagined.
     - Focus on tone, lifestyle consistency, and alignment with demographic/cultural context.
@@ -38,7 +37,6 @@ def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm):
     - 1–3 = Implausible. The response feels unnatural, exaggerated, or stereotyped. Tone or behavior contradicts the expected norms for the respondent’s profile. May reflect poor understanding of context.
     - 0 = Entirely unrealistic or fabricated. The response is clearly out of character, culturally misaligned, or extremely artificial. Strong evidence that it was not written in the respondent’s voice or context.
     - If a score is 8 or higher, justify why it's not a 10.
-
     Relevance:
     - Focus on how directly and completely the response answers the specific question asked.
     - A relevant answer should stay on-topic, address the core of the question, and avoid vague or generic filler.
@@ -55,7 +53,6 @@ def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm):
     - 1–3 = Barely relevant. The response is mostly off-topic, highly generic, or misinterprets the question. Gives the sense that the question was not understood or considered.
     - 0 = Irrelevant. The answer does not address the question at all or responds to a completely different topic.
     - If a score is 8 or higher, justify why it's not a 10.
-
     Accuracy (Faithfulness to Profile & Transcript):
     - Only include details that are clearly present in the user profile or transcript.
     - Paraphrasing is allowed and responses do not need to match the source text word-for-word. Paraphrasing should not be penalised. However, all relevant details must be included; omitting any information present in the source is not allowed and should be penalised.
@@ -96,20 +93,69 @@ def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm):
     else:
         evaluation_mode = "exploratory"
     logging.info(f"LLM determined evaluation mode: {evaluation_mode}")
-
-
+
+    # Now do the correct evaluation using LLM
     if evaluation_mode == "exploratory":
-
-
-
-
+        # Ask LLM for plausibility and relevance ratings
+        eval_prompt = f"""
+        You are an expert market research evaluator. Given the following:
+        - User Profile: {user_profile_str}
+        - Fast Facts: {fast_facts_str}
+        - Interview Transcript: {interview_transcript_text}
+        - Respondent Type: {respondent_type}
+        - Question: {question}
+        - Answer: {answer}
+        Please rate the answer on a scale of 0-10 for:
+        1. Plausibility (how realistic, authentic, and in-character the response is, given the profile and context)
+        2. Relevance (how directly and completely the answer addresses the question)
+        Output strictly in this format:
+        Plausibility Rating: <0-10>
+        Relevance Rating: <0-10>
+        """
+        eval_response = processor_llm.invoke(eval_prompt)
+        eval_text = eval_response.content.strip()
+        plausibility = None
+        relevance = None
+        for line in eval_text.split("\n"):
+            if line.lower().startswith("plausibility rating:"):
+                try:
+                    plausibility = float(line.split(":",1)[1].strip())
+                except Exception:
+                    plausibility = None
+            if line.lower().startswith("relevance rating:"):
+                try:
+                    relevance = float(line.split(":",1)[1].strip())
+                except Exception:
+                    relevance = None
+        logging.info(f"Exploratory evaluation: plausibility={plausibility}, relevance={relevance}")
         if plausibility is not None and relevance is not None:
-            return
+            return plausibility >= 8.0 and relevance >= 8.0
         return False
     else:
-
-
-
+        # Fact-based: Ask LLM for accuracy rating
+        eval_prompt = f"""
+        You are an expert market research evaluator. Given the following:
+        - User Profile: {user_profile_str}
+        - Fast Facts: {fast_facts_str}
+        - Interview Transcript: {interview_transcript_text}
+        - Respondent Type: {respondent_type}
+        - Question: {question}
+        - Answer: {answer}
+        Please rate the answer on a scale of 0-10 for:
+        1. Accuracy (how well the answer matches the facts in the profile, transcript, or fast facts; penalise any unsupported or fabricated content)
+        Output strictly in this format:
+        Accuracy Rating: <0-10>
+        """
+        eval_response = processor_llm.invoke(eval_prompt)
+        eval_text = eval_response.content.strip()
+        accuracy = None
+        for line in eval_text.split("\n"):
+            if line.lower().startswith("accuracy rating:"):
+                try:
+                    accuracy = float(line.split(":",1)[1].strip())
+                except Exception:
+                    accuracy = None
+        logging.info(f"Fact-based evaluation: accuracy={accuracy}")
         if accuracy is not None:
-            return
-        return False
+            return accuracy >= 8.0
+        return False
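
A note on the added parsing: both branches match ratings with `line.lower().startswith("... rating:")` plus `line.split(":",1)`, so any deviation from the exact requested format (markdown bold around the label, a space before the colon, the score on its own line) silently leaves the value as None and the whole validation returns False. A regex fallback is one way to harden this; below is a minimal sketch using only the stdlib, where the helper name `parse_rating` is mine and not part of this commit:

import re
from typing import Optional

def parse_rating(text: str, label: str) -> Optional[float]:
    """Extract a 0-10 '<label> Rating: <score>' value from free-form LLM output.

    Tolerates markdown bold, flexible spacing, and decimal scores, e.g.
    '**Plausibility Rating:** 8.5' or 'relevance rating : 7'.
    """
    pattern = rf"{re.escape(label)}\s+rating\s*:\**\s*([0-9]+(?:\.[0-9]+)?)"
    match = re.search(pattern, text, flags=re.IGNORECASE)
    if match is None:
        return None
    # Clamp to the 0-10 scale the prompt asks for, in case the model overshoots.
    return max(0.0, min(10.0, float(match.group(1))))

With this in place, each extraction loop collapses to calls like `parse_rating(eval_text, "Plausibility")`, and decimal scores such as 8.2 still parse as the docstring promises.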
|
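
For anyone reproducing the behaviour locally, a minimal driver might look like the sketch below. The Groq model id and all sample strings are invented for illustration, `ai_evaluator_agent` is passed as None because neither branch in this diff uses it, and `ChatGroq` expects a GROQ_API_KEY in the environment:

import logging
from langchain_groq import ChatGroq
from common.validation_utils import validate_response

logging.basicConfig(level=logging.INFO)

# Hypothetical model id; substitute whatever the Space actually configures.
llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

is_valid = validate_response(
    question="How often do you cook at home?",
    answer="Most weeknights; on weekends I usually eat out with friends.",
    user_profile_str="28-year-old urban renter, budget-conscious, enjoys cooking",
    fast_facts_str="Cooks 4-5 evenings per week",
    interview_transcript_text="(transcript omitted)",
    respondent_type="synthetic consumer",
    ai_evaluator_agent=None,  # unused by the two evaluation paths shown here
    processor_llm=llm,
)
print("valid" if is_valid else "regenerate")

Note that a parse failure and a genuinely low score both come back as False, so a caller cannot distinguish "bad answer" from "unparseable evaluator output" without reading the logs.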