| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ |
| Evaluate offline the quality of a generated file using a reward model and a ground-truth verifier. |
| The input is a parquet file that contains N generated sequences per row and (optionally) the ground truth. |
| |
| """ |
|
|
| from collections import defaultdict |
|
|
| import hydra |
| import numpy as np |
| import pandas as pd |
| import ray |
| from omegaconf import OmegaConf |
| from tqdm import tqdm |
|
|
| from verl.trainer.ppo.reward import get_custom_reward_fn |
| from verl.utils.fs import copy_to_local |
|
|
|
|
@ray.remote
def process_item(config, data_source, response_lst, reward_data):
    """Score all responses of one dataset row and return their mean score.

    Args:
        config: Configuration handed to ``get_custom_reward_fn`` to obtain the
            scoring callable.
        data_source: Identifier of the row's data source; passed through to the
            scoring callable and returned unchanged.
        response_lst: Iterable of generated responses for this row.
        reward_data: Mapping that must contain a ``"ground_truth"`` entry.

    Returns:
        A ``(data_source, mean_score)`` tuple, where ``mean_score`` is
        ``np.mean`` over the per-response scores.
    """
    # NOTE(review): presumably this can be None when no custom reward function
    # is configured, which would fail at the first call below - confirm.
    scorer = get_custom_reward_fn(config)
    target = reward_data["ground_truth"]

    scores = []
    for response in response_lst:
        scores.append(scorer(data_source, response, target))

    return data_source, np.mean(scores)
|
|
|
|
@hydra.main(config_path="config", config_name="evaluation", version_base=None)
def main(config):
    """Score a parquet file of generated responses and print mean scores per data source.

    The parquet file at ``config.data.path`` is copied locally and read with
    pandas. The columns named by ``config.data.response_key``,
    ``config.data.data_source_key`` and ``config.data.reward_model_key`` are
    fed row by row to ``process_item`` as Ray remote tasks. Results are
    grouped by data source, averaged, and printed as a dict keyed by
    ``test_score/<data_source>``.

    Args:
        config: Hydra/OmegaConf configuration. Reads ``data.path``,
            ``data.use_shm`` (optional, defaults to ``False``),
            ``data.response_key``, ``data.data_source_key``,
            ``data.reward_model_key`` and ``ray_kwargs.ray_init`` (optional).
    """
    local_path = copy_to_local(config.data.path, use_shm=config.data.get("use_shm", False))
    dataset = pd.read_parquet(local_path)
    responses = dataset[config.data.response_key]
    data_sources = dataset[config.data.data_source_key]
    reward_model_data = dataset[config.data.reward_model_key]

    total = len(dataset)

    # Start Ray only if the caller has not already done so.
    if not ray.is_initialized():
        # OmegaConf.to_container only accepts OmegaConf config objects, so a
        # missing (or null) ``ray_init`` section must not be routed through it
        # with a plain-dict default; fall back to empty kwargs instead.
        ray_init_cfg = config.ray_kwargs.get("ray_init")
        if OmegaConf.is_config(ray_init_cfg):
            ray_init_kwargs = OmegaConf.to_container(ray_init_cfg)
        else:
            ray_init_kwargs = dict(ray_init_cfg or {})
        ray.init(**ray_init_kwargs)

    # Mean score of every row, grouped by the row's data source.
    data_source_reward = defaultdict(list)

    # Walk the three columns in lockstep by position. ``series[i]`` would be a
    # label lookup and is only correct when the frame has a default 0..N-1 index.
    remote_tasks = [
        process_item.remote(config, data_source, response_lst, reward_data)
        for data_source, response_lst, reward_data in zip(data_sources, responses, reward_model_data)
    ]

    # Collect results as they finish so the progress bar reflects real progress.
    with tqdm(total=total) as pbar:
        while remote_tasks:
            # ray.wait hands back the finished tasks and the still-pending remainder.
            done_ids, remote_tasks = ray.wait(remote_tasks)
            for result_id in done_ids:
                data_source, score = ray.get(result_id)
                data_source_reward[data_source].append(score)
                pbar.update(1)

    metric_dict = {
        f"test_score/{data_source}": np.mean(rewards) for data_source, rewards in data_source_reward.items()
    }

    print(metric_dict)
|
|
|
|
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|