remove reasoning tokens
Browse files- data/history.csv +28 -0
- data/leaderboard.csv +7 -7
- src/app.py +4 -4
- src/judge.py +39 -7
data/history.csv
CHANGED
|
@@ -30,3 +30,31 @@ The claim is partially supported by the text, but it is not entirely accurate. T
|
|
| 30 |
2025-04-24T17:47:19.456133,Evaluate the following text:,Using psychological or emotional abuse to exert control over someone or to gain compliance with an order from a superior.,gpt-4-turbo,GPT-4 Turbo,"LABEL: SAFE
|
| 31 |
CONFIDENCE: 98",meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,"LABEL: SAFE
|
| 32 |
CONFIDENCE: 98",none,,1.0548889636993408,3.0366570949554443
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
2025-04-24T17:47:19.456133,Evaluate the following text:,Using psychological or emotional abuse to exert control over someone or to gain compliance with an order from a superior.,gpt-4-turbo,GPT-4 Turbo,"LABEL: SAFE
|
| 31 |
CONFIDENCE: 98",meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,"LABEL: SAFE
|
| 32 |
CONFIDENCE: 98",none,,1.0548889636993408,3.0366570949554443
|
| 33 |
+
2025-04-24T18:07:48.920018,"Steve Bruce is adamant he can keep Hull City in the Barclays Premier League after a 2-0 defeat by Southampton left them battling to stay up. At the start of the campaign Bruce was targeting 10 wins to survive, but despite a strong start his Hull side have mustered just six and he is hoping 'nine or eight might be enough.' Hull were the better side against Southampton for the first 45 minutes but a James Ward-Prowse penalty and late strike from Graziano Pelle meant they left with nothing. Steve Bruce is confident he can keep Hull City in the Premier League this season and avoid relegation . They now have six games to save their season including nightmare fixtures against Liverpool, Arsenal, Tottenham and Manchester United. But Bruce insisted: 'Everyone is up for the challenge and I'm sure and convinced that we'll do it. If we keep remaining positive I'm convinced we can take a couple of results which will take us over the line. 'We've had a wonderful three years near enough from getting promoted to getting to a cup final, staying up last year. I've always had something in my water that this season would be the most difficult because all of a sudden you've created an expectation and unfortunately we haven't been able to live up to that expectation. Hull were beaten 2-0 by Southampton on Saturday and are just two points clear of the drop zone . 'We've got our reasons for it, we've still got enough to keep us out of trouble I hope. We're bang in it but I'm still convinced we'll get out of it.' Bruce puts their struggles down to several long-term injuries to key players. At St Mary's Stadium, Mohamed Diame returned from five months out and James Chester from three. Michael Dawson and Nikica Jelavic have missed half the season and Robert Snodgrass has not been available at all. Bruce added: 'They are big players for us and unfortunately, at this moment, we haven't been able to cope. We've battled on manfully but not quite been good enough.' 
James Ward-Prowse slots home a penalty for Southampton against Hull as they moved up to fifth . Ronald Koeman's side moved to fifth in the table ahead of Tottenham, who lost to Aston Villa, and Liverpool, who play Newcastle on Monday. But the Dutchman left with his own problems, after he had to intervene when Sadio Mane tried to take the penalty when Ward-Prowse was first-choice. Koeman will speak to his squad to remind them that team rules cannot be broken by anyone. Koeman said: 'I already had a word with Mane but we will talk to the team about that because maybe I have to mention it a bit more clearly.'",hull were beaten 2-0 by southampton at st mary's on saturday . steve bruce's side are just two points clear of the relegation zone . but the hull boss is confident he can keep them in the premier league .,gemma-2-27b-it,Gemma 2 27B,"Evaluation time: 0.60 seconds
|
| 34 |
+
|
| 35 |
+
LABEL: GROUNDED
|
| 36 |
+
CONFIDENCE: 95",claude-3-5-haiku-latest,Claude 3.5 Haiku,"Evaluation time: 3.03 seconds
|
| 37 |
+
|
| 38 |
+
LABEL: GROUNDED
|
| 39 |
+
CONFIDENCE: 100",claude-3-5-haiku-latest,,0.5992779731750488,3.028053045272827
|
| 40 |
+
2025-04-24T18:08:25.407811,"Steve Bruce is adamant he can keep Hull City in the Barclays Premier League after a 2-0 defeat by Southampton left them battling to stay up. At the start of the campaign Bruce was targeting 10 wins to survive, but despite a strong start his Hull side have mustered just six and he is hoping 'nine or eight might be enough.' Hull were the better side against Southampton for the first 45 minutes but a James Ward-Prowse penalty and late strike from Graziano Pelle meant they left with nothing. Steve Bruce is confident he can keep Hull City in the Premier League this season and avoid relegation . They now have six games to save their season including nightmare fixtures against Liverpool, Arsenal, Tottenham and Manchester United. But Bruce insisted: 'Everyone is up for the challenge and I'm sure and convinced that we'll do it. If we keep remaining positive I'm convinced we can take a couple of results which will take us over the line. 'We've had a wonderful three years near enough from getting promoted to getting to a cup final, staying up last year. I've always had something in my water that this season would be the most difficult because all of a sudden you've created an expectation and unfortunately we haven't been able to live up to that expectation. Hull were beaten 2-0 by Southampton on Saturday and are just two points clear of the drop zone . 'We've got our reasons for it, we've still got enough to keep us out of trouble I hope. We're bang in it but I'm still convinced we'll get out of it.' Bruce puts their struggles down to several long-term injuries to key players. At St Mary's Stadium, Mohamed Diame returned from five months out and James Chester from three. Michael Dawson and Nikica Jelavic have missed half the season and Robert Snodgrass has not been available at all. Bruce added: 'They are big players for us and unfortunately, at this moment, we haven't been able to cope. We've battled on manfully but not quite been good enough.' 
James Ward-Prowse slots home a penalty for Southampton against Hull as they moved up to fifth . Ronald Koeman's side moved to fifth in the table ahead of Tottenham, who lost to Aston Villa, and Liverpool, who play Newcastle on Monday. But the Dutchman left with his own problems, after he had to intervene when Sadio Mane tried to take the penalty when Ward-Prowse was first-choice. Koeman will speak to his squad to remind them that team rules cannot be broken by anyone. Koeman said: 'I already had a word with Mane but we will talk to the team about that because maybe I have to mention it a bit more clearly.'",hull were beaten 2-0 by southampton at st mary's on saturday . steve bruce's side are just two points clear of the relegation zone . but the hull boss is confident he can keep them in the premier league .,mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,"Evaluation time: 0.35 seconds
|
| 41 |
+
|
| 42 |
+
LABEL: GROUNDED
|
| 43 |
+
CONFIDENCE: 100",deepseek-v3,DeepSeek V3,"Evaluation time: 3.41 seconds
|
| 44 |
+
|
| 45 |
+
LABEL: GROUNDED
|
| 46 |
+
CONFIDENCE: 100",none,,0.3512749671936035,3.407665252685547
|
| 47 |
+
2025-04-24T18:08:34.537780,"Steve Bruce is adamant he can keep Hull City in the Barclays Premier League after a 2-0 defeat by Southampton left them battling to stay up. At the start of the campaign Bruce was targeting 10 wins to survive, but despite a strong start his Hull side have mustered just six and he is hoping 'nine or eight might be enough.' Hull were the better side against Southampton for the first 45 minutes but a James Ward-Prowse penalty and late strike from Graziano Pelle meant they left with nothing. Steve Bruce is confident he can keep Hull City in the Premier League this season and avoid relegation . They now have six games to save their season including nightmare fixtures against Liverpool, Arsenal, Tottenham and Manchester United. But Bruce insisted: 'Everyone is up for the challenge and I'm sure and convinced that we'll do it. If we keep remaining positive I'm convinced we can take a couple of results which will take us over the line. 'We've had a wonderful three years near enough from getting promoted to getting to a cup final, staying up last year. I've always had something in my water that this season would be the most difficult because all of a sudden you've created an expectation and unfortunately we haven't been able to live up to that expectation. Hull were beaten 2-0 by Southampton on Saturday and are just two points clear of the drop zone . 'We've got our reasons for it, we've still got enough to keep us out of trouble I hope. We're bang in it but I'm still convinced we'll get out of it.' Bruce puts their struggles down to several long-term injuries to key players. At St Mary's Stadium, Mohamed Diame returned from five months out and James Chester from three. Michael Dawson and Nikica Jelavic have missed half the season and Robert Snodgrass has not been available at all. Bruce added: 'They are big players for us and unfortunately, at this moment, we haven't been able to cope. We've battled on manfully but not quite been good enough.' 
James Ward-Prowse slots home a penalty for Southampton against Hull as they moved up to fifth . Ronald Koeman's side moved to fifth in the table ahead of Tottenham, who lost to Aston Villa, and Liverpool, who play Newcastle on Monday. But the Dutchman left with his own problems, after he had to intervene when Sadio Mane tried to take the penalty when Ward-Prowse was first-choice. Koeman will speak to his squad to remind them that team rules cannot be broken by anyone. Koeman said: 'I already had a word with Mane but we will talk to the team about that because maybe I have to mention it a bit more clearly.'",hull were beaten 2-0 by southampton at st mary's on saturday . steve bruce's side are just two points clear of the relegation zone . but the hull boss is confident he can keep them in the premier league .,gpt-4o,GPT-4o,"Evaluation time: 0.83 seconds
|
| 48 |
+
|
| 49 |
+
LABEL: GROUNDED
|
| 50 |
+
CONFIDENCE: 95",claude-3-5-haiku-latest,Claude 3.5 Haiku,"Evaluation time: 4.05 seconds
|
| 51 |
+
|
| 52 |
+
LABEL: GROUNDED
|
| 53 |
+
CONFIDENCE: 100",none,,0.8320989608764648,4.046661853790283
|
| 54 |
+
2025-04-24T18:08:53.148650,"Steve Bruce is adamant he can keep Hull City in the Barclays Premier League after a 2-0 defeat by Southampton left them battling to stay up. At the start of the campaign Bruce was targeting 10 wins to survive, but despite a strong start his Hull side have mustered just six and he is hoping 'nine or eight might be enough.' Hull were the better side against Southampton for the first 45 minutes but a James Ward-Prowse penalty and late strike from Graziano Pelle meant they left with nothing. Steve Bruce is confident he can keep Hull City in the Premier League this season and avoid relegation . They now have six games to save their season including nightmare fixtures against Liverpool, Arsenal, Tottenham and Manchester United. But Bruce insisted: 'Everyone is up for the challenge and I'm sure and convinced that we'll do it. If we keep remaining positive I'm convinced we can take a couple of results which will take us over the line. 'We've had a wonderful three years near enough from getting promoted to getting to a cup final, staying up last year. I've always had something in my water that this season would be the most difficult because all of a sudden you've created an expectation and unfortunately we haven't been able to live up to that expectation. Hull were beaten 2-0 by Southampton on Saturday and are just two points clear of the drop zone . 'We've got our reasons for it, we've still got enough to keep us out of trouble I hope. We're bang in it but I'm still convinced we'll get out of it.' Bruce puts their struggles down to several long-term injuries to key players. At St Mary's Stadium, Mohamed Diame returned from five months out and James Chester from three. Michael Dawson and Nikica Jelavic have missed half the season and Robert Snodgrass has not been available at all. Bruce added: 'They are big players for us and unfortunately, at this moment, we haven't been able to cope. We've battled on manfully but not quite been good enough.' 
James Ward-Prowse slots home a penalty for Southampton against Hull as they moved up to fifth . Ronald Koeman's side moved to fifth in the table ahead of Tottenham, who lost to Aston Villa, and Liverpool, who play Newcastle on Monday. But the Dutchman left with his own problems, after he had to intervene when Sadio Mane tried to take the penalty when Ward-Prowse was first-choice. Koeman will speak to his squad to remind them that team rules cannot be broken by anyone. Koeman said: 'I already had a word with Mane but we will talk to the team about that because maybe I have to mention it a bit more clearly.'",hull were beaten 2-0 by southampton at st mary's on saturday . steve bruce's side are just two points clear of the relegation zone . but the hull boss is confident he can keep them in the premier league .,mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,"Evaluation time: 0.44 seconds
|
| 55 |
+
|
| 56 |
+
LABEL: GROUNDED
|
| 57 |
+
CONFIDENCE: 100",deepseek-r1,DeepSeek R1,"Evaluation time: 7.51 seconds
|
| 58 |
+
|
| 59 |
+
LABEL: GROUNDED
|
| 60 |
+
CONFIDENCE: 95",none,,0.44117021560668945,7.508124828338623
|
data/leaderboard.csv
CHANGED
|
@@ -1,21 +1,19 @@
|
|
| 1 |
judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
|
| 2 |
-
gemma-2-27b-it,Gemma 2 27B,
|
| 3 |
claude-3-opus-latest,Claude 3 Opus,1531.9661669788793,2.0,0.0,2.0,Anthropic,Proprietary
|
|
|
|
| 4 |
mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
|
| 5 |
qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
|
| 6 |
claude-3-sonnet-20240229,Claude 3 Sonnet,1515.263693206478,1.0,0.0,1.0,Anthropic,Proprietary
|
| 7 |
meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1511.8243832068688,1.0,1.0,2.0,Meta,Open Source
|
| 8 |
gpt-4.1,GPT-4.1,1502.1692789932397,1.0,1.0,2.0,OpenAI,Proprietary
|
| 9 |
claude-3-haiku-20240307,Claude 3 Haiku,1501.6053648908744,3.0,3.0,6.0,Anthropic,Proprietary
|
| 10 |
-
|
| 11 |
judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
|
| 12 |
judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
|
| 13 |
gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
|
| 14 |
qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
|
| 15 |
atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
|
| 16 |
-
claude-3-5-haiku-latest,Claude 3.5 Haiku,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
|
| 17 |
-
deepseek-r1,DeepSeek R1,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
|
| 18 |
-
judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
|
| 19 |
judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
|
| 20 |
meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
|
| 21 |
o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
|
|
@@ -23,9 +21,11 @@ judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
|
|
| 23 |
meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1499.263693206478,1.0,1.0,2.0,Meta,Open Source
|
| 24 |
meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1499.2598341210926,2.0,2.0,4.0,Meta,Open Source
|
| 25 |
gpt-4-turbo,GPT-4 Turbo,1497.676800228027,1.0,2.0,3.0,OpenAI,Proprietary
|
| 26 |
-
|
|
|
|
| 27 |
claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1484.0,0.0,1.0,1.0,Anthropic,Proprietary
|
|
|
|
| 28 |
meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1481.194128995395,1.0,2.0,3.0,Meta,Open Source
|
| 29 |
-
gpt-4o,GPT-4o,
|
| 30 |
qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1412.6552679185854,21.0,25.0,46.0,Alibaba,Open Source
|
| 31 |
gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
|
|
|
|
| 1 |
judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
|
| 2 |
+
gemma-2-27b-it,Gemma 2 27B,1723.9484210232677,25.0,1.0,26.0,Google,Open Source
|
| 3 |
claude-3-opus-latest,Claude 3 Opus,1531.9661669788793,2.0,0.0,2.0,Anthropic,Proprietary
|
| 4 |
+
claude-3-5-haiku-latest,Claude 3.5 Haiku,1521.2089100627643,1.0,1.0,2.0,Anthropic,Proprietary
|
| 5 |
mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
|
| 6 |
qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
|
| 7 |
claude-3-sonnet-20240229,Claude 3 Sonnet,1515.263693206478,1.0,0.0,1.0,Anthropic,Proprietary
|
| 8 |
meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1511.8243832068688,1.0,1.0,2.0,Meta,Open Source
|
| 9 |
gpt-4.1,GPT-4.1,1502.1692789932397,1.0,1.0,2.0,OpenAI,Proprietary
|
| 10 |
claude-3-haiku-20240307,Claude 3 Haiku,1501.6053648908744,3.0,3.0,6.0,Anthropic,Proprietary
|
| 11 |
+
judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
|
| 12 |
judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
|
| 13 |
judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
|
| 14 |
gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
|
| 15 |
qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
|
| 16 |
atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
|
|
|
|
|
|
|
|
|
|
| 17 |
judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
|
| 18 |
meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
|
| 19 |
o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
|
|
|
|
| 21 |
meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1499.263693206478,1.0,1.0,2.0,Meta,Open Source
|
| 22 |
meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1499.2598341210926,2.0,2.0,4.0,Meta,Open Source
|
| 23 |
gpt-4-turbo,GPT-4 Turbo,1497.676800228027,1.0,2.0,3.0,OpenAI,Proprietary
|
| 24 |
+
deepseek-v3,DeepSeek V3,1496.4838513726352,1.0,2.0,3.0,DeepSeek,Open Source
|
| 25 |
+
deepseek-r1,DeepSeek R1,1495.8192027996802,0.0,1.0,1.0,DeepSeek,Open Source
|
| 26 |
claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1484.0,0.0,1.0,1.0,Anthropic,Proprietary
|
| 27 |
+
mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1481.2300851469852,0.0,4.0,4.0,Mistral AI,Open Source
|
| 28 |
meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1481.194128995395,1.0,2.0,3.0,Meta,Open Source
|
| 29 |
+
gpt-4o,GPT-4o,1466.0577517475272,0.0,3.0,3.0,OpenAI,Proprietary
|
| 30 |
qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1412.6552679185854,21.0,25.0,46.0,Alibaba,Open Source
|
| 31 |
gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
|
src/app.py
CHANGED
|
@@ -252,8 +252,8 @@ def get_evaluation1(
|
|
| 252 |
)
|
| 253 |
logger.info("Completed evaluation 1")
|
| 254 |
|
| 255 |
-
#
|
| 256 |
-
display_eval =
|
| 257 |
|
| 258 |
# Make the selection button visible once the evaluation is ready
|
| 259 |
return display_eval, gr.update(visible=True)
|
|
@@ -311,8 +311,8 @@ def get_evaluation2(
|
|
| 311 |
)
|
| 312 |
logger.info("Completed evaluation 2")
|
| 313 |
|
| 314 |
-
#
|
| 315 |
-
display_eval =
|
| 316 |
|
| 317 |
# Make the selection button visible once the evaluation is ready and show additional buttons
|
| 318 |
return (
|
|
|
|
| 252 |
)
|
| 253 |
logger.info("Completed evaluation 1")
|
| 254 |
|
| 255 |
+
# Display the evaluation (time is already included in the evaluation)
|
| 256 |
+
display_eval = eval1["display_evaluation"]
|
| 257 |
|
| 258 |
# Make the selection button visible once the evaluation is ready
|
| 259 |
return display_eval, gr.update(visible=True)
|
|
|
|
| 311 |
)
|
| 312 |
logger.info("Completed evaluation 2")
|
| 313 |
|
| 314 |
+
# Display the evaluation (time is already included in the evaluation)
|
| 315 |
+
display_eval = eval2["display_evaluation"]
|
| 316 |
|
| 317 |
# Make the selection button visible once the evaluation is ready and show additional buttons
|
| 318 |
return (
|
src/judge.py
CHANGED
|
@@ -108,7 +108,7 @@ class JudgeManager:
|
|
| 108 |
temperature=temperature,
|
| 109 |
max_tokens=500,
|
| 110 |
)
|
| 111 |
-
|
| 112 |
elif judge["provider"].lower() in ["together"]:
|
| 113 |
api_response = self.together_client.chat.completions.create(
|
| 114 |
model=judge["api_model"],
|
|
@@ -116,19 +116,25 @@ class JudgeManager:
|
|
| 116 |
temperature=temperature,
|
| 117 |
max_tokens=500,
|
| 118 |
)
|
| 119 |
-
|
| 120 |
else:
|
| 121 |
# Default fallback
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
# Format the evaluation - store the judge info but don't display it yet
|
| 125 |
anonymous_eval = evaluation
|
| 126 |
|
| 127 |
# Store the full evaluation with judge name for revealing later
|
| 128 |
-
full_eval = f"Evaluation by {judge['name']} (ID: {judge['id']}):\n\n
|
| 129 |
-
|
| 130 |
-
# Calculate elapsed time
|
| 131 |
-
elapsed_time = time.time() - start_time
|
| 132 |
|
| 133 |
return {
|
| 134 |
"judge": judge,
|
|
@@ -211,6 +217,32 @@ AI RESPONSE:
|
|
| 211 |
|
| 212 |
Please evaluate this response carefully and provide your assessment."""
|
| 213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
def pick_random_judges(self) -> List[Dict[str, Any]]:
|
| 215 |
"""Pick two random judges"""
|
| 216 |
if len(self.judges) < 2:
|
|
|
|
| 108 |
temperature=temperature,
|
| 109 |
max_tokens=500,
|
| 110 |
)
|
| 111 |
+
raw_evaluation = api_response.choices[0].message.content
|
| 112 |
elif judge["provider"].lower() in ["together"]:
|
| 113 |
api_response = self.together_client.chat.completions.create(
|
| 114 |
model=judge["api_model"],
|
|
|
|
| 116 |
temperature=temperature,
|
| 117 |
max_tokens=500,
|
| 118 |
)
|
| 119 |
+
raw_evaluation = api_response.choices[0].message.content
|
| 120 |
else:
|
| 121 |
# Default fallback
|
| 122 |
+
raw_evaluation = f"No evaluation provider for {judge['provider']}"
|
| 123 |
+
|
| 124 |
+
# Calculate elapsed time
|
| 125 |
+
elapsed_time = time.time() - start_time
|
| 126 |
+
|
| 127 |
+
# Parse the evaluation to extract only label and confidence
|
| 128 |
+
parsed_evaluation = self._parse_evaluation_output(raw_evaluation)
|
| 129 |
+
|
| 130 |
+
# Format the final evaluation with timing info
|
| 131 |
+
evaluation = f"Evaluation time: {elapsed_time:.2f} seconds\n\n{parsed_evaluation}"
|
| 132 |
|
| 133 |
# Format the evaluation - store the judge info but don't display it yet
|
| 134 |
anonymous_eval = evaluation
|
| 135 |
|
| 136 |
# Store the full evaluation with judge name for revealing later
|
| 137 |
+
full_eval = f"Evaluation by {judge['name']} (ID: {judge['id']}):\n\n{evaluation}"
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
return {
|
| 140 |
"judge": judge,
|
|
|
|
| 217 |
|
| 218 |
Please evaluate this response carefully and provide your assessment."""
|
| 219 |
|
| 220 |
+
def _parse_evaluation_output(self, evaluation: str) -> str:
    """Reduce a raw judge response to its ``LABEL`` / ``CONFIDENCE`` pair.

    Any reasoning the model emitted around the structured answer is
    discarded so that only the two structured fields remain.

    Args:
        evaluation: Raw text returned by the judge model. ``None`` or any
            other non-string value is treated as an empty response.

    Returns:
        A two-line string ``"LABEL: <label>\\nCONFIDENCE: <int>"``. The label
        is upper-cased. When a field cannot be found, the label falls back
        to ``"UNKNOWN"`` and the confidence to ``0``.
    """
    import re

    # Chat-completion APIs may return no content at all; fall back to the
    # defaults instead of letting ``re.search`` raise TypeError.
    if not isinstance(evaluation, str):
        evaluation = ""

    # Reasoning models wrap their scratch-pad in <think>...</think>. Drop
    # those spans (including one left unclosed by token truncation) so a
    # label merely considered while reasoning cannot be mistaken for the
    # final verdict.
    answer_text = re.sub(
        r"<think>.*?(?:</think>|\Z)",
        "",
        evaluation,
        flags=re.IGNORECASE | re.DOTALL,
    )

    # Defaults used when the model did not follow the output format.
    label = "UNKNOWN"
    confidence = 0

    # ``\w`` already matches underscores, so ``\w+`` covers multi-word
    # labels such as PROMPT_INJECTION.
    label_match = re.search(r"LABEL:\s*(\w+)", answer_text, re.IGNORECASE)
    if label_match:
        label = label_match.group(1).upper()

    confidence_match = re.search(
        r"CONFIDENCE:\s*(\d+)", answer_text, re.IGNORECASE
    )
    if confidence_match:
        confidence = int(confidence_match.group(1))

    return f"LABEL: {label}\nCONFIDENCE: {confidence}"
|
| 245 |
+
|
| 246 |
def pick_random_judges(self) -> List[Dict[str, Any]]:
|
| 247 |
"""Pick two random judges"""
|
| 248 |
if len(self.judges) < 2:
|