Update dataset_construction/evaluate_on_eurus_2.py
Browse files
dataset_construction/evaluate_on_eurus_2.py
CHANGED
|
@@ -12,12 +12,12 @@ def read_args():
|
|
| 12 |
parser = argparse.ArgumentParser()
|
| 13 |
parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-7B", help="The model to use for evaluation")
|
| 14 |
parser.add_argument("--batch_size", type=int, default=4, help="Batch size for evaluation")
|
| 15 |
-
parser.add_argument("--k", type=int, default=
|
| 16 |
-
parser.add_argument("--tensor_parallel_size", type=int, default=
|
| 17 |
-
parser.add_argument("--temperature", type=float, default=
|
| 18 |
-
parser.add_argument("--top_p", type=float, default=
|
| 19 |
-
parser.add_argument("--num_samples", type=int, default=
|
| 20 |
-
parser.add_argument("--
|
| 21 |
# NOTE: token consuming
|
| 22 |
parser.add_argument("--max_tokens", type=int, default=4096, help="Maximum number of tokens to generate.")
|
| 23 |
parser.add_argument("--log_per_step", type=int, default=1000, help="Log results every N samples")
|
|
@@ -83,7 +83,9 @@ def main():
|
|
| 83 |
num_samples = len(split_ds)
|
| 84 |
|
| 85 |
result_file = f"evaluation_results_{split}_shard{args.shard_index}_of_{args.num_shards}.json"
|
|
|
|
| 86 |
complete_result_file = f"complete_evaluation_results_{split}_shard{args.shard_index}_of_{args.num_shards}.json"
|
|
|
|
| 87 |
|
| 88 |
if args.resume and os.path.exists(result_file):
|
| 89 |
with open(result_file, "r", encoding="utf-8") as f:
|
|
|
|
| 12 |
parser = argparse.ArgumentParser()
|
| 13 |
parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-7B", help="The model to use for evaluation")
|
| 14 |
parser.add_argument("--batch_size", type=int, default=4, help="Batch size for evaluation")
|
| 15 |
+
parser.add_argument("--k", type=int, default=8, help="Number of completions to generate for each prompt")
|
| 16 |
+
parser.add_argument("--tensor_parallel_size", type=int, default=1, help="Number of parallel tensors to use for generation")
|
| 17 |
+
parser.add_argument("--temperature", type=float, default=1.0, help="Temperature for sampling")
|
| 18 |
+
parser.add_argument("--top_p", type=float, default=1.0, help="Top-p for sampling")
|
| 19 |
+
parser.add_argument("--num_samples", type=int, default=-1, help="Number of samples to evaluate; if -1, use the entire shard")
|
| 20 |
+
parser.add_argument("--output_folder", type=str, default=".", help="Base folder to save evaluation results")
|
| 21 |
# NOTE: token consuming
|
| 22 |
parser.add_argument("--max_tokens", type=int, default=4096, help="Maximum number of tokens to generate.")
|
| 23 |
parser.add_argument("--log_per_step", type=int, default=1000, help="Log results every N samples")
|
|
|
|
| 83 |
num_samples = len(split_ds)
|
| 84 |
|
| 85 |
result_file = f"evaluation_results_{split}_shard{args.shard_index}_of_{args.num_shards}.json"
|
| 86 |
+
result_file = os.path.join(args.output_folder, result_file)
|
| 87 |
complete_result_file = f"complete_evaluation_results_{split}_shard{args.shard_index}_of_{args.num_shards}.json"
|
| 88 |
+
complete_result_file = os.path.join(args.output_folder, complete_result_file)
|
| 89 |
|
| 90 |
if args.resume and os.path.exists(result_file):
|
| 91 |
with open(result_file, "r", encoding="utf-8") as f:
|