Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- evaluate.py +103 -0
- evaluate.sh +31 -0
- upload.py +60 -0
evaluate.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Evaluate generated questions by majority-vote self-consistency.

Loads a shard of generated questions from STORAGE_PATH, samples
``num_samples`` completions per question with vLLM, groups equivalent
boxed answers via mathruler's grader, and stores each question's
majority answer together with its agreement ratio as the score.
"""
import argparse
import json
import os
import re

import vllm
from transformers import AutoTokenizer

from evaluation.datasets_loader import get_dataset_handler
from mathruler.grader import extract_boxed_content, grade_answer

# math_verify = get_dataset_handler("math")

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B-Base")
parser.add_argument("--num_samples", type=int, default=10)
parser.add_argument("--suffix", type=str, default="77")
parser.add_argument("--save_name", type=str, default="")
args = parser.parse_args()

STORAGE_PATH = os.getenv("STORAGE_PATH")

print('start load')
input_path = f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}.json"
with open(input_path, "r") as f:
    data = json.load(f)
# The shard file is consumed exactly once; remove it so reruns don't reuse it.
os.remove(input_path)


def extract_answer(response):
    """Return the first ``\\boxed{...}`` payload of *response*, or None.

    NOTE(review): the non-greedy pattern stops at the first ``}``, so
    nested braces are truncated; the main loop below uses mathruler's
    ``extract_boxed_content`` instead, which handles nesting.
    """
    match = re.search(r"\\boxed{(.*?)}", response)
    return match.group(1) if match else None


tokenizer = AutoTokenizer.from_pretrained(args.model)
model = vllm.LLM(
    model=args.model,
    tokenizer=args.model,
    gpu_memory_utilization=0.85,
    # Seed per shard so parallel shards do not sample identical outputs.
    seed=int(args.suffix),
)
sample_params = vllm.SamplingParams(
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
    top_k=40,
    stop_token_ids=[tokenizer.eos_token_id],
    n=args.num_samples,
)

# score == -1 marks questions previously judged wrong; score == 0 marks
# questions still pending evaluation.
wrong_data = [item for item in data if item['score'] == -1]
correct_data = [item for item in data if item['score'] == 0]
questions = [item["question"] for item in correct_data]
answers = [item["answer"] for item in correct_data]

chats = [
    [
        {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
        {"role": "user", "content": question},
    ]
    for question in questions
]
if tokenizer.chat_template:
    prompts = [
        tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True, add_special_tokens=True)
        for chat in chats
    ]
else:
    # Plain-text fallback for base models that ship without a chat template.
    prompts = ["system: " + chat[0]["content"] + '\n' + "user: " + chat[1]["content"] for chat in chats]

responses = model.generate(prompts, sampling_params=sample_params, use_tqdm=True)
print(len(data))

results_all = []
for response, answer, question in zip(responses, answers, questions):
    try:
        error_flag = False
        results = [extract_boxed_content(output.text) for output in response.outputs]
        # Group equivalent answers; each dict key is a group representative.
        answer_counts = {}
        for result in results:
            found_match = False
            try:
                for existing_answer in answer_counts:
                    # grade_answer is not symmetric, so check both directions;
                    # also merge all "no ..." style answers into one group.
                    if (grade_answer(result, existing_answer)
                            or grade_answer(existing_answer, result)
                            or result == existing_answer
                            or ('no ' in result.lower() and 'no ' in existing_answer.lower())):
                        answer_counts[existing_answer] += 1
                        found_match = True
                        break
            except Exception:
                # Grading blew up on this sample set; skip the whole question.
                error_flag = True
                break
            # If no equivalent group was found, start a new one.
            if not found_match:
                answer_counts[result] = 1
        if error_flag:
            continue
        if answer_counts:
            # Single pass over the groups gives both the majority answer
            # and its count (the original computed max twice).
            majority_answer, max_count = max(answer_counts.items(), key=lambda x: x[1])
            score = max_count / len(results)
            # Drop proof questions, questions that leak the boxed format,
            # and textual (non-numeric) majority answers.
            if "证明" in question or 'box' in question.lower() or 'text' in majority_answer.lower():
                continue
            results_all.append({"question": question, "answer": majority_answer, "score": score, 'results': results})
    except Exception as e:
        print("Error:", e)
        continue

print(len(results_all))

with open(f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}_results.json", "w") as f:
    json.dump(results_all, f, indent=4)
evaluate.sh
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Fan out evaluate.py across 8 GPUs, then enforce a wall-clock timeout on
# shards 1-7 once shard 0 has finished.
#
# Usage: evaluate.sh <model_name> <save_name>

model_name=$1
save_name=$2

pids=()

# One worker per GPU; each worker reads/writes its own --suffix shard.
for i in {0..7}; do
    CUDA_VISIBLE_DEVICES=$i python question_evaluate/evaluate.py --model "$model_name" --suffix $i --save_name "$save_name" &
    pids[$i]=$!
done

# Shard 0 gates the timeout clock: once it completes, the remaining
# shards get at most $timeout_duration more seconds.
wait ${pids[0]}
echo "Task 0 finished."

timeout_duration=3600

# Watchdog: after the timeout, force-kill any shard still running.
(
    sleep $timeout_duration
    echo "Timeout reached. Killing remaining tasks..."
    for i in {1..7}; do
        if kill -0 ${pids[$i]} 2>/dev/null; then
            kill -9 ${pids[$i]} 2>/dev/null
            echo "Killed task $i"
        fi
    done
) &
watchdog_pid=$!

for i in {1..7}; do
    wait ${pids[$i]} 2>/dev/null
done

# All shards done: stop the watchdog so the script does not linger for
# up to an hour waiting on its sleep.
kill $watchdog_pid 2>/dev/null
upload.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Merge per-shard evaluation results, plot the score distribution, and
optionally push the filtered questions to the Hugging Face Hub."""
import argparse
import json
import os

import huggingface_hub
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from huggingface_hub import login

STORAGE_PATH = os.getenv("STORAGE_PATH")
HUGGINGFACENAME = os.getenv("HUGGINGFACENAME")
print(STORAGE_PATH)

# Authenticate with the Hub using the locally stored token.
with open('tokens.json', 'r') as f:
    token = json.load(f)['huggingface']
login(token=token)

parser = argparse.ArgumentParser()
parser.add_argument("--repo_name", type=str, default="")
parser.add_argument("--max_score", type=float, default=0.7)
parser.add_argument("--min_score", type=float, default=0.3)
parser.add_argument("--experiment_name", type=str, default="Qwen_Qwen3-4B-Base_all")
args = parser.parse_args()

# Collect the results written by the 8 evaluate.py shards. A shard may be
# missing (worker crashed or was killed by the timeout) — skip it.
datas = []
for i in range(8):
    shard_path = f'{STORAGE_PATH}/generated_question/{args.experiment_name}_{i}_results.json'
    try:
        with open(shard_path, 'r') as f:
            datas.extend(json.load(f))
    except FileNotFoundError:
        print(f"File {args.experiment_name}_{i}_results.json not found")
        continue

# Shard files are consumed once merged.
for i in range(8):
    try:
        os.remove(f'{STORAGE_PATH}/generated_question/{args.experiment_name}_{i}_results.json')
    except FileNotFoundError:
        print(f"File {args.experiment_name}_{i}_results.json not found")
        continue

scores = [data['score'] for data in datas]
# Plot the distribution of majority-agreement scores for manual inspection.
plt.hist(scores, bins=11)
plt.savefig('scores_distribution.png')

# Keep questions of intermediate difficulty (min_score <= score <= max_score)
# with a non-empty, non-'None' answer, then push them as a private dataset.
if not args.repo_name == "":
    filtered_datas = [
        {'problem': data['question'], 'answer': data['answer'], 'score': data['score']}
        for data in datas
        if args.min_score <= data['score'] <= args.max_score
        and data['answer'] != ''
        and data['answer'] != 'None'
    ]
    print(len(filtered_datas))
    train_dataset = Dataset.from_list(filtered_datas)
    dataset = DatasetDict({"train": train_dataset})
    config_name = f"{args.experiment_name}"
    dataset.push_to_hub(f"{HUGGINGFACENAME}/{args.repo_name}", private=True, config_name=config_name)