Spaces:
Runtime error
Runtime error
Zhejian
committed on
Commit
·
e23f952
1
Parent(s):
5002e45
bugfix
Browse files- app.py +19 -6
- evaluator.py +31 -11
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import asyncio
|
| 2 |
import datetime
|
|
|
|
| 3 |
import time
|
| 4 |
from huggingface_hub import HfApi, list_repo_files
|
| 5 |
from env import (
|
|
@@ -43,9 +44,9 @@ def format_score_result(score_results: list[EnsembleEvaluateScore]) -> tuple[flo
|
|
| 43 |
l3.append(result.total_score)
|
| 44 |
|
| 45 |
|
| 46 |
-
l1_total_score = sum(l1) / len(l1) if len(l1) > 0 else 0
|
| 47 |
-
l2_total_score = sum(l2) / len(l2) if len(l2) > 0 else 0
|
| 48 |
-
l3_total_score = sum(l3) / len(l3) if len(l3) > 0 else 0
|
| 49 |
|
| 50 |
total_score = round((sum(l1) + sum(l2) + sum(l3)) / (len(l1) + len(l2) + len(l3)), 2)
|
| 51 |
return total_score, l1_total_score, l2_total_score, l3_total_score
|
|
@@ -56,8 +57,8 @@ def on_new_files(new_files):
|
|
| 56 |
logger.info(f"New Files Found {new_files}")
|
| 57 |
for file in new_files:
|
| 58 |
file_name = file.split('/')[-1]
|
| 59 |
-
names = file_name.split('
|
| 60 |
-
model, organization = names[0], names[1]
|
| 61 |
|
| 62 |
json_data = read_json_file(file)
|
| 63 |
if not json_data:
|
|
@@ -171,8 +172,20 @@ def monitor_hf_dataset(dataset_name, interval=60):
|
|
| 171 |
on_new_files(new_files)
|
| 172 |
last_files = current_files
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
if __name__ == "__main__":
|
| 175 |
-
|
| 176 |
|
| 177 |
|
| 178 |
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import datetime
|
| 3 |
+
import threading
|
| 4 |
import time
|
| 5 |
from huggingface_hub import HfApi, list_repo_files
|
| 6 |
from env import (
|
|
|
|
| 44 |
l3.append(result.total_score)
|
| 45 |
|
| 46 |
|
| 47 |
+
l1_total_score = round(sum(l1) / len(l1),2) if len(l1) > 0 else 0
|
| 48 |
+
l2_total_score = round(sum(l2) / len(l2),2) if len(l2) > 0 else 0
|
| 49 |
+
l3_total_score = round(sum(l3) / len(l3),2) if len(l3) > 0 else 0
|
| 50 |
|
| 51 |
total_score = round((sum(l1) + sum(l2) + sum(l3)) / (len(l1) + len(l2) + len(l3)), 2)
|
| 52 |
return total_score, l1_total_score, l2_total_score, l3_total_score
|
|
|
|
| 57 |
logger.info(f"New Files Found {new_files}")
|
| 58 |
for file in new_files:
|
| 59 |
file_name = file.split('/')[-1]
|
| 60 |
+
names = file_name.split('<')
|
| 61 |
+
model, organization = names[0].split('>')[0], names[1].split('>')[0]
|
| 62 |
|
| 63 |
json_data = read_json_file(file)
|
| 64 |
if not json_data:
|
|
|
|
| 172 |
on_new_files(new_files)
|
| 173 |
last_files = current_files
|
| 174 |
|
| 175 |
+
|
| 176 |
+
def start_monitoring_delayed(delay_seconds=30):
|
| 177 |
+
"""延迟启动监控任务,确保 Space 先完成启动"""
|
| 178 |
+
def start_monitor():
|
| 179 |
+
logger.info("开始监控 HuggingFace 数据集变化...")
|
| 180 |
+
monitor_hf_dataset(SUBMISSION_DATASET, interval=60)
|
| 181 |
+
|
| 182 |
+
# 使用线程启动监控任务
|
| 183 |
+
monitor_thread = threading.Thread(target=start_monitor, daemon=True)
|
| 184 |
+
threading.Timer(delay_seconds, monitor_thread.start).start()
|
| 185 |
+
logger.info(f"监控任务将在 {delay_seconds} 秒后启动")
|
| 186 |
+
|
| 187 |
if __name__ == "__main__":
|
| 188 |
+
start_monitoring_delayed(30)
|
| 189 |
|
| 190 |
|
| 191 |
|
evaluator.py
CHANGED
|
@@ -2,6 +2,7 @@ import json
|
|
| 2 |
import asyncio
|
| 3 |
import os
|
| 4 |
from statistics import mean
|
|
|
|
| 5 |
from typing import List, Optional, Type, TypeVar
|
| 6 |
from tenacity import (
|
| 7 |
retry,
|
|
@@ -143,9 +144,14 @@ NOTE:
|
|
| 143 |
|
| 144 |
|
| 145 |
async def evaluate_reasoning(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ReasoningEvaluateResult]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
reasoning_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.REASONING]
|
| 147 |
-
reasoning_step_prompt = "\n".join([step.to_prompt() for step in output_answer.reasoning_steps])
|
| 148 |
-
function_call_prompt = "\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])
|
| 149 |
if not reasoning_items:
|
| 150 |
return 0.0, None
|
| 151 |
prompt = f"""
|
|
@@ -153,25 +159,27 @@ Task ID: {benchmark_item.task_id}
|
|
| 153 |
Question: {benchmark_item.question}
|
| 154 |
To be evaluated Reasoning Steps:
|
| 155 |
```
|
| 156 |
-
{reasoning_step_prompt}
|
| 157 |
```
|
| 158 |
|
| 159 |
In addition, the following function calls are also part of the reasoning steps. The choose of the tool use and the arguments should be taken into account:
|
| 160 |
```
|
| 161 |
-
{function_call_prompt}
|
| 162 |
```
|
| 163 |
|
| 164 |
-
Evaluation Rules:"""
|
|
|
|
| 165 |
for item in reasoning_items:
|
| 166 |
prompt += f"{item.to_prompt()}\n"
|
| 167 |
prompt += f"Now evaluate the reasoning steps based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
|
|
|
|
| 168 |
max_retries = 3
|
| 169 |
retry_count = 0
|
| 170 |
while retry_count < max_retries:
|
| 171 |
try:
|
| 172 |
response = await self.client.chat.completions.create(
|
| 173 |
model=self.model_name,
|
| 174 |
-
messages=[{"role": "user", "content": prompt}],
|
| 175 |
**self.model_params
|
| 176 |
)
|
| 177 |
content = response.choices[0].message.content
|
|
@@ -192,16 +200,21 @@ Evaluation Rules:"""
|
|
| 192 |
return 0.0, None
|
| 193 |
|
| 194 |
async def evaluate_tool_use(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ToolUseEvaluateResult]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
tool_use_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.TOOL_USE]
|
| 196 |
if not tool_use_items:
|
|
|
|
| 197 |
return 0.0, None
|
| 198 |
-
function_call_prompt = "\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])
|
| 199 |
prompt = f"""
|
| 200 |
Task ID: {benchmark_item.task_id}
|
| 201 |
Question: {benchmark_item.question}
|
| 202 |
To be evaluated tool use:
|
| 203 |
```
|
| 204 |
-
{function_call_prompt}
|
| 205 |
```
|
| 206 |
|
| 207 |
Evaluation Rules:
|
|
@@ -215,7 +228,7 @@ Evaluation Rules:
|
|
| 215 |
try:
|
| 216 |
response = await self.client.chat.completions.create(
|
| 217 |
model=self.model_name,
|
| 218 |
-
messages=[{"role": "user", "content": prompt}],
|
| 219 |
**self.model_params
|
| 220 |
)
|
| 221 |
content = response.choices[0].message.content
|
|
@@ -228,7 +241,7 @@ Evaluation Rules:
|
|
| 228 |
continue
|
| 229 |
return sum([item.score for item in result.items]), result
|
| 230 |
except Exception as e:
|
| 231 |
-
print(f"Error evaluating tool use (attempt {retry_count + 1}/{max_retries}): {e}")
|
| 232 |
retry_count += 1
|
| 233 |
if retry_count == max_retries:
|
| 234 |
return 0.0, None
|
|
@@ -237,6 +250,12 @@ Evaluation Rules:
|
|
| 237 |
|
| 238 |
|
| 239 |
async def evaluate_answer(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[AnswerEvaluateResult]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
evaluate_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.ANSWER]
|
| 241 |
if not evaluate_items:
|
| 242 |
return 0.0, None
|
|
@@ -254,12 +273,13 @@ Evaluation Rules:
|
|
| 254 |
for item in evaluate_items:
|
| 255 |
prompt += f"{item.to_prompt()}\n"
|
| 256 |
prompt += f"Now evaluate the output answer based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
|
|
|
|
| 257 |
max_retry = 3
|
| 258 |
for _ in range(max_retry):
|
| 259 |
try:
|
| 260 |
response = await self.client.chat.completions.create(
|
| 261 |
model=self.model_name,
|
| 262 |
-
messages=[{"role": "user", "content": prompt}],
|
| 263 |
**self.model_params
|
| 264 |
)
|
| 265 |
|
|
|
|
| 2 |
import asyncio
|
| 3 |
import os
|
| 4 |
from statistics import mean
|
| 5 |
+
import traceback
|
| 6 |
from typing import List, Optional, Type, TypeVar
|
| 7 |
from tenacity import (
|
| 8 |
retry,
|
|
|
|
| 144 |
|
| 145 |
|
| 146 |
async def evaluate_reasoning(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ReasoningEvaluateResult]]:
|
| 147 |
+
system_prompt = """You are a professional evaluator for AI assistants in the crypto domain. You need to score the assistant's reasoning ability based on the given evaluation criteria and reasoning process. Please follow these steps during evaluation:
|
| 148 |
+
1. Review the reasoning steps and understand whether each step's logic is relevant to the task and helps solve the problem.
|
| 149 |
+
2. If there are no explicit reasoning steps, treat tool calls as an alternative form of reasoning steps and consider the reasoning process represented by the tool usage.
|
| 150 |
+
3. Assess the completeness and rigor of the reasoning chain, judging whether each step is reasonable and accurate, and whether there are logical flaws or missing steps.
|
| 151 |
+
4. Consider the information references and tool calls in the reasoning process, judge whether the information sources are sufficient, whether the tool usage is appropriate, and analyze the connections and dependencies between each step in the reasoning chain.
|
| 152 |
+
5. According to the evaluation criteria, give a score for each criterion, with the score ranging from 0 to the maximum points for that criterion.
|
| 153 |
+
"""
|
| 154 |
reasoning_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.REASONING]
|
|
|
|
|
|
|
| 155 |
if not reasoning_items:
|
| 156 |
return 0.0, None
|
| 157 |
prompt = f"""
|
|
|
|
| 159 |
Question: {benchmark_item.question}
|
| 160 |
To be evaluated Reasoning Steps:
|
| 161 |
```
|
| 162 |
+
{"\n".join([step.to_prompt() for step in output_answer.reasoning_steps])}
|
| 163 |
```
|
| 164 |
|
| 165 |
In addition, the following function calls are also part of the reasoning steps. The choose of the tool use and the arguments should be taken into account:
|
| 166 |
```
|
| 167 |
+
{"\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])}
|
| 168 |
```
|
| 169 |
|
| 170 |
+
Evaluation Rules:
|
| 171 |
+
"""
|
| 172 |
for item in reasoning_items:
|
| 173 |
prompt += f"{item.to_prompt()}\n"
|
| 174 |
prompt += f"Now evaluate the reasoning steps based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
|
| 175 |
+
# print(prompt)
|
| 176 |
max_retries = 3
|
| 177 |
retry_count = 0
|
| 178 |
while retry_count < max_retries:
|
| 179 |
try:
|
| 180 |
response = await self.client.chat.completions.create(
|
| 181 |
model=self.model_name,
|
| 182 |
+
messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
|
| 183 |
**self.model_params
|
| 184 |
)
|
| 185 |
content = response.choices[0].message.content
|
|
|
|
| 200 |
return 0.0, None
|
| 201 |
|
| 202 |
async def evaluate_tool_use(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ToolUseEvaluateResult]]:
|
| 203 |
+
system_prompt = """You are a professional crypto AI assistant evaluator. You need to score the assistant's tools using ability according to the given criterias and the tool use output. When evaluating, you should follow the following steps:
|
| 204 |
+
1. Take a brief look at the tool using, descriptions and input args, to make sure the tool using is correct/related to solving the task.
|
| 205 |
+
2. Evaluate each step of the tool use to estimate the efficiency and accuracy of the tool use.
|
| 206 |
+
3. Consider the continuity of tool calls: The return result of the previous tool call may affect the input arguments of the next tool call.
|
| 207 |
+
"""
|
| 208 |
tool_use_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.TOOL_USE]
|
| 209 |
if not tool_use_items:
|
| 210 |
+
print(f"No tool use items for task {benchmark_item.task_id}")
|
| 211 |
return 0.0, None
|
|
|
|
| 212 |
prompt = f"""
|
| 213 |
Task ID: {benchmark_item.task_id}
|
| 214 |
Question: {benchmark_item.question}
|
| 215 |
To be evaluated tool use:
|
| 216 |
```
|
| 217 |
+
{"\n".join([step.to_prompt() for step in output_answer.function_calls])}
|
| 218 |
```
|
| 219 |
|
| 220 |
Evaluation Rules:
|
|
|
|
| 228 |
try:
|
| 229 |
response = await self.client.chat.completions.create(
|
| 230 |
model=self.model_name,
|
| 231 |
+
messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
|
| 232 |
**self.model_params
|
| 233 |
)
|
| 234 |
content = response.choices[0].message.content
|
|
|
|
| 241 |
continue
|
| 242 |
return sum([item.score for item in result.items]), result
|
| 243 |
except Exception as e:
|
| 244 |
+
print(f"Error evaluating tool use (attempt {retry_count + 1}/{max_retries}): {traceback.format_exc()}")
|
| 245 |
retry_count += 1
|
| 246 |
if retry_count == max_retries:
|
| 247 |
return 0.0, None
|
|
|
|
| 250 |
|
| 251 |
|
| 252 |
async def evaluate_answer(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[AnswerEvaluateResult]]:
|
| 253 |
+
system_prompt = """You are a professional evaluator for crypto AI assistant answers. You need to score the AI assistant's final answer according to the given evaluation criteria. Please follow these steps during evaluation:
|
| 254 |
+
1. Carefully read the task question and the AI assistant's final output, and determine whether the answer accurately and completely solves the task requirements and conforms to basic common sense.
|
| 255 |
+
2. Check whether the facts, data, and reasoning process in the answer are correct, and whether there are logical errors, numerical errors, or fabricated facts.
|
| 256 |
+
3. For specific numerical values, allow a certain range of error. If the criteria do not specify the error range, use a ±5% margin.
|
| 257 |
+
4. For each evaluation criterion, give a score for each item, with the score ranging from 0 to the full score for that criterion.
|
| 258 |
+
Please strictly follow the evaluation criteria to provide objective and fair scoring, and briefly explain your reasoning for the scores."""
|
| 259 |
evaluate_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.ANSWER]
|
| 260 |
if not evaluate_items:
|
| 261 |
return 0.0, None
|
|
|
|
| 273 |
for item in evaluate_items:
|
| 274 |
prompt += f"{item.to_prompt()}\n"
|
| 275 |
prompt += f"Now evaluate the output answer based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
|
| 276 |
+
# print(prompt)
|
| 277 |
max_retry = 3
|
| 278 |
for _ in range(max_retry):
|
| 279 |
try:
|
| 280 |
response = await self.client.chat.completions.create(
|
| 281 |
model=self.model_name,
|
| 282 |
+
messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
|
| 283 |
**self.model_params
|
| 284 |
)
|
| 285 |
|