Spaces:
Build error
Build error
Update common/ResponseValidation.py
Browse files- common/ResponseValidation.py +78 -80
common/ResponseValidation.py
CHANGED
|
@@ -5,13 +5,7 @@ import re
|
|
| 5 |
from RespondentAgent import *
|
| 6 |
from langchain_groq import ChatGroq
|
| 7 |
|
| 8 |
-
|
| 9 |
-
def matches_user_speaking_style(answer, processor_llm, user_profile, agent_question, return_explanation=False):
|
| 10 |
-
"""
|
| 11 |
-
Uses the LLM to determine if the answer matches the expected tone and style
|
| 12 |
-
based on the user's communication profile.
|
| 13 |
-
Returns (True, None) if it is first-person and stylistically aligned, (False, explanation) otherwise if return_explanation=True.
|
| 14 |
-
"""
|
| 15 |
logging.info("[Style Match Check] Entry")
|
| 16 |
|
| 17 |
try:
|
|
@@ -23,46 +17,71 @@ def matches_user_speaking_style(answer, processor_llm, user_profile, agent_quest
|
|
| 23 |
lower_q = agent_question.strip().lower()
|
| 24 |
is_factual = any(kw in lower_q for kw in factual_keywords)
|
| 25 |
if is_factual:
|
| 26 |
-
logging.info("[Style Match Check] Question is factual — skipping strict
|
| 27 |
if return_explanation:
|
| 28 |
return True, None
|
| 29 |
return True
|
| 30 |
|
| 31 |
-
# --- Step 2:
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
### Question:
|
| 40 |
{agent_question}
|
| 41 |
### Response:
|
| 42 |
{answer}
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
First Person: Yes
|
| 45 |
or
|
| 46 |
First Person: No
|
| 47 |
Reason: <short explanation>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
"""
|
| 49 |
-
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
return False, explanation
|
| 57 |
-
return False
|
| 58 |
|
| 59 |
-
# --- Step 3:
|
| 60 |
style = user_profile.get_field("Communication", "Style")
|
| 61 |
tone = user_profile.get_field("Communication", "Tone")
|
| 62 |
length = user_profile.get_field("Communication", "Length")
|
| 63 |
topics = user_profile.get_field("Communication", "Topics")
|
| 64 |
|
| 65 |
-
# --- Step 4: Style validation prompt ---
|
| 66 |
style_check_prompt = f"""
|
| 67 |
You are a communication coach and writing style analyst.
|
| 68 |
Evaluate how well the following response aligns with the given communication profile.
|
|
@@ -76,66 +95,41 @@ Evaluate how well the following response aligns with the given communication pro
|
|
| 76 |
- Common Topics: {topics}
|
| 77 |
---
|
| 78 |
### Instructions:
|
| 79 |
-
Assess
|
| 80 |
-
|
| 81 |
-
- If the tone and structure mostly match, even if not perfect, that’s acceptable.
|
| 82 |
-
- Only return “Style Match: No” if the response clearly *conflicts* with the profile (e.g., too formal, too short, too robotic).
|
| 83 |
-
Respond only with one of:
|
| 84 |
- Style Match: Yes
|
| 85 |
- Style Match: Mostly
|
| 86 |
- Style Match: No
|
| 87 |
"""
|
| 88 |
-
logging.info("[Style Match Check] Invoking LLM for profile-based style check")
|
| 89 |
style_response = processor_llm.invoke(style_check_prompt)
|
| 90 |
style_result = style_response.content.strip().lower()
|
| 91 |
|
| 92 |
if "style match: yes" in style_result or "style match: mostly" in style_result:
|
| 93 |
-
|
| 94 |
-
if return_explanation:
|
| 95 |
-
return True, None
|
| 96 |
-
return True
|
| 97 |
|
| 98 |
-
|
| 99 |
-
if "first person: yes" in fp_result:
|
| 100 |
-
logging.info("[Style Match Check] Potential false negative: First-person check passed but style rejected")
|
| 101 |
-
|
| 102 |
-
# --- Ask LLM for explanation on mismatch ---
|
| 103 |
explanation_prompt = f"""
|
| 104 |
-
You are a communication coach
|
| 105 |
-
The following response was
|
| 106 |
-
Please provide a concise reason why the style does not match.
|
| 107 |
---
|
| 108 |
-
|
| 109 |
-
{
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
- Preferred Length: {length}
|
| 114 |
-
- Common Topics: {topics}
|
| 115 |
-
---
|
| 116 |
-
### Please provide a short reason for style mismatch:
|
| 117 |
"""
|
| 118 |
explanation_response = processor_llm.invoke(explanation_prompt)
|
| 119 |
explanation = explanation_response.content.strip()
|
| 120 |
-
logging.
|
| 121 |
-
if return_explanation
|
| 122 |
-
return False, explanation
|
| 123 |
-
return False
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
return False, f"Unexpected output format: {style_result}"
|
| 129 |
-
return False
|
| 130 |
|
| 131 |
except Exception as e:
|
| 132 |
-
logging.error(f"[Style Match Check] Exception
|
| 133 |
-
if return_explanation
|
| 134 |
-
return False, str(e)
|
| 135 |
-
return False
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
|
| 140 |
def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm, return_explanation=False):
|
| 141 |
llm_mode_prompt = f"""
|
|
@@ -168,13 +162,20 @@ You are a market research evaluator. Given the following:
|
|
| 168 |
- Respondent Type: {respondent_type}
|
| 169 |
- Question: {question}
|
| 170 |
- Answer: {answer}
|
| 171 |
-
|
| 172 |
Rate the answer on a scale of 0–10 for:
|
| 173 |
-
1. **Plausibility** –
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
Output strictly in this format:
|
| 179 |
Plausibility Rating: <0-10>
|
| 180 |
Relevance Rating: <0-10>
|
|
@@ -228,12 +229,9 @@ You are a market research evaluator. Given the following:
|
|
| 228 |
- Respondent Type: {respondent_type}
|
| 229 |
- Question: {question}
|
| 230 |
- Answer: {answer}
|
| 231 |
-
|
| 232 |
Rate the answer on a scale of 0–10 for:
|
| 233 |
1. **Accuracy** – Does the content align with the user’s facts or transcript, without fabrications?
|
| 234 |
-
|
| 235 |
Ignore tone, phrasing, or style. Focus only on factual correctness.
|
| 236 |
-
|
| 237 |
Output strictly in this format:
|
| 238 |
Accuracy Rating: <0-10>
|
| 239 |
If the rating is less than 8, provide a short reason below:
|
|
@@ -261,4 +259,4 @@ Accuracy Reason: <reason>
|
|
| 261 |
return valid
|
| 262 |
if return_explanation:
|
| 263 |
return False, "Could not parse accuracy rating."
|
| 264 |
-
return False
|
|
|
|
| 5 |
from RespondentAgent import *
|
| 6 |
from langchain_groq import ChatGroq
|
| 7 |
|
| 8 |
+
def matches_user_speaking_style(answer, processor_llm, user_profile, agent_question, respondent_type="INDIVIDUAL", return_explanation=False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
logging.info("[Style Match Check] Entry")
|
| 10 |
|
| 11 |
try:
|
|
|
|
| 17 |
lower_q = agent_question.strip().lower()
|
| 18 |
is_factual = any(kw in lower_q for kw in factual_keywords)
|
| 19 |
if is_factual:
|
| 20 |
+
logging.info("[Style Match Check] Question is factual — skipping strict style enforcement")
|
| 21 |
if return_explanation:
|
| 22 |
return True, None
|
| 23 |
return True
|
| 24 |
|
| 25 |
+
# --- Step 2: First-person or collective pronoun check ---
|
| 26 |
+
logging.info(f"[Style Match Check] Performing {'collective' if respondent_type == 'FOCUS GROUP' else 'first-person'} pronoun check")
|
| 27 |
+
|
| 28 |
+
if respondent_type == "FOCUS GROUP":
|
| 29 |
+
pronoun_prompt = f"""
|
| 30 |
+
You are an expert in writing style analysis.
|
| 31 |
+
Determine whether the following response is appropriate for a **focus group**, which must:
|
| 32 |
+
- Use collective language ("we", "our", "us", "some of us", "most participants")
|
| 33 |
+
- Avoid any first-person singular language ("I", "me", "my", etc.)
|
| 34 |
+
- Speak as a group, not as an individual
|
| 35 |
+
Check the response below and answer in the following format:
|
| 36 |
+
Focus Group Style: Yes
|
| 37 |
+
or
|
| 38 |
+
Focus Group Style: No
|
| 39 |
+
Reason: <short reason>
|
| 40 |
+
---
|
| 41 |
### Question:
|
| 42 |
{agent_question}
|
| 43 |
### Response:
|
| 44 |
{answer}
|
| 45 |
+
"""
|
| 46 |
+
response = processor_llm.invoke(pronoun_prompt)
|
| 47 |
+
result = response.content.strip().lower()
|
| 48 |
+
|
| 49 |
+
if "focus group style: no" in result:
|
| 50 |
+
explanation = result.split("reason:", 1)[-1].strip().capitalize() if "reason:" in result else "The response does not follow focus group voice."
|
| 51 |
+
logging.warning(f"[Style Match Check] Failed group tone: {explanation}")
|
| 52 |
+
return (False, explanation) if return_explanation else False
|
| 53 |
+
else:
|
| 54 |
+
# INDIVIDUAL — use first-person pronoun validation
|
| 55 |
+
fp_prompt = f"""
|
| 56 |
+
You are an expert in writing style analysis.
|
| 57 |
+
Determine whether the following response uses a personal **first-person** tone, appropriate for an individual.
|
| 58 |
+
- Look for use of "I", "me", "my", "mine", or implied personal ownership.
|
| 59 |
+
- Skip judgment on content quality or grammar — just the perspective.
|
| 60 |
+
Respond using this format:
|
| 61 |
First Person: Yes
|
| 62 |
or
|
| 63 |
First Person: No
|
| 64 |
Reason: <short explanation>
|
| 65 |
+
---
|
| 66 |
+
### Question:
|
| 67 |
+
{agent_question}
|
| 68 |
+
### Response:
|
| 69 |
+
{answer}
|
| 70 |
"""
|
| 71 |
+
fp_response = processor_llm.invoke(fp_prompt)
|
| 72 |
+
fp_result = fp_response.content.strip().lower()
|
| 73 |
|
| 74 |
+
if "first person: no" in fp_result:
|
| 75 |
+
explanation = fp_result.split("reason:", 1)[-1].strip().capitalize() if "reason:" in fp_result else "The answer is not in first person."
|
| 76 |
+
logging.warning(f"[Style Match Check] Failed first-person test: {explanation}")
|
| 77 |
+
return (False, explanation) if return_explanation else False
|
|
|
|
|
|
|
| 78 |
|
| 79 |
+
# --- Step 3: Communication style match ---
|
| 80 |
style = user_profile.get_field("Communication", "Style")
|
| 81 |
tone = user_profile.get_field("Communication", "Tone")
|
| 82 |
length = user_profile.get_field("Communication", "Length")
|
| 83 |
topics = user_profile.get_field("Communication", "Topics")
|
| 84 |
|
|
|
|
| 85 |
style_check_prompt = f"""
|
| 86 |
You are a communication coach and writing style analyst.
|
| 87 |
Evaluate how well the following response aligns with the given communication profile.
|
|
|
|
| 95 |
- Common Topics: {topics}
|
| 96 |
---
|
| 97 |
### Instructions:
|
| 98 |
+
Assess whether the response reflects the user's typical communication style.
|
| 99 |
+
Respond with only one of:
|
|
|
|
|
|
|
|
|
|
| 100 |
- Style Match: Yes
|
| 101 |
- Style Match: Mostly
|
| 102 |
- Style Match: No
|
| 103 |
"""
|
|
|
|
| 104 |
style_response = processor_llm.invoke(style_check_prompt)
|
| 105 |
style_result = style_response.content.strip().lower()
|
| 106 |
|
| 107 |
if "style match: yes" in style_result or "style match: mostly" in style_result:
|
| 108 |
+
return (True, None) if return_explanation else True
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
+
if "style match: no" in style_result:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
explanation_prompt = f"""
|
| 112 |
+
You are a communication coach.
|
| 113 |
+
The following response was judged as **not matching** the profile. Briefly explain why.
|
|
|
|
| 114 |
---
|
| 115 |
+
Response: {answer}
|
| 116 |
+
Style: {style}
|
| 117 |
+
Tone: {tone}
|
| 118 |
+
Length: {length}
|
| 119 |
+
Topics: {topics}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
"""
|
| 121 |
explanation_response = processor_llm.invoke(explanation_prompt)
|
| 122 |
explanation = explanation_response.content.strip()
|
| 123 |
+
logging.warning(f"[Style Match Check] Style mismatch explanation: {explanation}")
|
| 124 |
+
return (False, explanation) if return_explanation else False
|
|
|
|
|
|
|
| 125 |
|
| 126 |
+
# Fallback
|
| 127 |
+
logging.warning(f"[Style Match Check] Unclear result format: {style_result}")
|
| 128 |
+
return (False, f"Unexpected format: {style_result}") if return_explanation else False
|
|
|
|
|
|
|
| 129 |
|
| 130 |
except Exception as e:
|
| 131 |
+
logging.error(f"[Style Match Check] Exception: {e}")
|
| 132 |
+
return (False, str(e)) if return_explanation else False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
def validate_response(question, answer, user_profile_str, fast_facts_str, interview_transcript_text, respondent_type, ai_evaluator_agent, processor_llm, return_explanation=False):
|
| 135 |
llm_mode_prompt = f"""
|
|
|
|
| 162 |
- Respondent Type: {respondent_type}
|
| 163 |
- Question: {question}
|
| 164 |
- Answer: {answer}
|
|
|
|
| 165 |
Rate the answer on a scale of 0–10 for:
|
| 166 |
+
1. **Plausibility** – Does the response make sense given what is known about the respondent?
|
| 167 |
+
- Consider the respondent’s background, demographics, stated preferences, life stage, interests, and prior responses.
|
| 168 |
+
- Is the answer **internally consistent** and **realistic** for someone like this respondent?
|
| 169 |
+
- Does it feel like something a person in their position would genuinely say or experience?
|
| 170 |
+
- Avoid penalising for style — focus purely on whether the answer is believable and fits the persona.
|
| 171 |
+
- A low plausibility score indicates the answer seems fabricated, out of character, contradictory, or implausible for this individual or group.
|
| 172 |
+
2. **Relevance** – Does the answer directly and fully address the specific question asked?
|
| 173 |
+
- Check whether the response clearly **answers the intent of the question** without deflection or vagueness.
|
| 174 |
+
- Consider whether it provides a complete and meaningful response — not just a surface-level or partial reply.
|
| 175 |
+
- Does the answer stay **on-topic** and reflect the subject matter or framing of the original prompt?
|
| 176 |
+
- A low relevance score means the answer is off-topic, evasive, only loosely related, or ignores key elements of the question.
|
| 177 |
+
Ignore tone, emotional expression, writing style, grammar, or British/American English differences.
|
| 178 |
+
Focus **strictly** on the **content quality**, **truthfulness**, and **alignment with the question and user profile**.
|
| 179 |
Output strictly in this format:
|
| 180 |
Plausibility Rating: <0-10>
|
| 181 |
Relevance Rating: <0-10>
|
|
|
|
| 229 |
- Respondent Type: {respondent_type}
|
| 230 |
- Question: {question}
|
| 231 |
- Answer: {answer}
|
|
|
|
| 232 |
Rate the answer on a scale of 0–10 for:
|
| 233 |
1. **Accuracy** – Does the content align with the user’s facts or transcript, without fabrications?
|
|
|
|
| 234 |
Ignore tone, phrasing, or style. Focus only on factual correctness.
|
|
|
|
| 235 |
Output strictly in this format:
|
| 236 |
Accuracy Rating: <0-10>
|
| 237 |
If the rating is less than 8, provide a short reason below:
|
|
|
|
| 259 |
return valid
|
| 260 |
if return_explanation:
|
| 261 |
return False, "Could not parse accuracy rating."
|
| 262 |
+
return False
|