Update evaluate.py

evaluate.py  CHANGED  (+7 -1)
@@ -341,7 +341,13 @@ def run_comprehensive_evaluation(
         answer_correctness_score = None
         if ground_truth_answer and "ERROR" not in answer_text:
             try:
-                judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(ground_truth_answer=ground_truth_answer, generated_answer=answer_text)
+                # Change this line in the answer correctness section:
+                judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(
+                    ground_truth_answer=ground_truth_answer,
+                    generated_answer=answer_text,
+                    query_type=expected_route  # <-- Add this line
+                )
+                # judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(ground_truth_answer=ground_truth_answer, generated_answer=answer_text)
                 print(f" - Judge Prompt Sent:\n{judge_msg}")
                 raw_correctness = call_llm([{"role": "user", "content": judge_msg}], temperature=0.0)
                 print(f" - Judge Raw Response: {raw_correctness}")
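For the new query_type keyword to have any effect, the ANSWER_CORRECTNESS_JUDGE_PROMPT template (defined elsewhere in this repo) needs a matching {query_type} placeholder alongside {ground_truth_answer} and {generated_answer}. The sketch below is an assumption, not the Space's actual prompt text; it only illustrates the placeholder contract that the .format(...) call in the diff relies on.

# Sketch only: the wording of this template is hypothetical. What matters is
# that it exposes the {ground_truth_answer}, {generated_answer}, and
# {query_type} fields filled in by the .format(...) call shown in the diff.
ANSWER_CORRECTNESS_JUDGE_PROMPT = """\
You are grading an answer produced by a retrieval pipeline.
Query type: {query_type}
Ground-truth answer: {ground_truth_answer}
Generated answer: {generated_answer}
Reply with a single correctness score between 0 and 1.
"""

# Hypothetical usage mirroring the new call in evaluate.py:
judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(
    ground_truth_answer="Paris",
    generated_answer="The capital of France is Paris.",
    query_type="factoid",
)
print(judge_msg)

Note that str.format silently ignores keyword arguments the template does not reference, so passing query_type=expected_route is harmless even before the prompt is updated; it only changes what the judge sees once {query_type} actually appears in the template.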