'''
for i, r in enumerate(results):
sample_id = r.get("sample_id", f"sample-{i}")
gts = r.get("gts", "")
saved_images = r.get("saved_images", [])
# Keys to skip (large or not useful for display)
skip_keys = {"sample_id", "dataset", "input", "output", "gts", "saved_images"}
# Display all keys equally in meta-row
meta_items = []
meta_items.append(f'
Ground Truth: {html_lib.escape(str(gts))}
')
for key, value in r.items():
if key in skip_keys:
continue
# Format values
if value is None:
formatted = "None"
elif isinstance(value, float):
formatted = f"{value:.4f}"
elif isinstance(value, (int, bool)):
formatted = str(value)
elif isinstance(value, str) and len(value) < 200:
formatted = html_lib.escape(value)
else:
continue # Skip complex or long values
meta_items.append(f'
{html_lib.escape(key)}: {formatted}
')
# Process input and output with image replacements
input_html = highlight_tags_with_images(r.get("input", ""), saved_images)
output_html = highlight_tags_with_images(r.get("output", ""), saved_images)
html_content += f'''
'''
html_content += '''
'''
with open(output_path, "w") as f:
f.write(html_content)
def save_results(dataset_results: dict, output_dir: str, run_config: Optional[dict] = None):
    """Write ``summary.json`` into *output_dir* and print where the run's files are.

    Only the summary is written here; per the original note, results.jsonl is
    saved incrementally during evaluation, so its path is merely reported.

    Args:
        dataset_results: Stored under the ``"datasets"`` key of the summary.
        output_dir: Directory to write into; created if it does not exist.
        run_config: Optional run settings; stored under ``"run_config"`` only
            when truthy.
    """

    def _announce(line: str) -> None:
        # Every status line is flushed immediately.
        print(line, flush=True)

    os.makedirs(output_dir, exist_ok=True)

    payload = {"datasets": dataset_results}
    if run_config:
        payload["run_config"] = run_config

    # Web fetch statistics are included only when at least one fetch was counted.
    fetch_stats = _web_fetch_stats.get_stats()
    if fetch_stats["total"] > 0:
        payload["web_fetch_stats"] = fetch_stats

    summary_path = os.path.join(output_dir, "summary.json")
    with open(summary_path, "w") as handle:
        json.dump(payload, handle, indent=2)

    _announce(f"\nResults saved to: {output_dir}")
    _announce(f" Results: {output_dir}/results.jsonl")
    # The HTML report is mentioned only if it already exists on disk.
    report_path = os.path.join(output_dir, "results.html")
    if os.path.exists(report_path):
        _announce(f" HTML: {report_path}")
    _announce(f" Summary: {output_dir}/summary.json")
# =============================================================================
# CLI
# =============================================================================
# Accepted values for --eval-compat-profile (also readable from the
# EVAL_COMPAT_PROFILE environment variable; "current" is used when unset).
# "step14_plus_tavily432" forces both retry limits to 0 and disables the
# forced final-answer turn and no-tool answer recovery; the other profiles
# keep the user/env-provided retry limits.
EVAL_COMPAT_PROFILES = ("current", "qwen235b_repair", "step14_plus_tavily432")
def _env_flag(name: str, default: bool = False) -> bool:
    """Interpret the environment variable *name* as a boolean switch.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is not set at all.

    Returns:
        ``default`` if the variable is unset; otherwise ``True`` exactly when
        the value, ignoring case and surrounding whitespace, is one of
        ``1``, ``true``, ``yes`` or ``on``. A variable that is set but empty
        therefore yields ``False`` regardless of ``default``.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    # Values coming from shell exports or .env files often carry stray
    # whitespace or a trailing newline; strip so "true\n" still enables the flag.
    return raw.strip().lower() in {"1", "true", "yes", "on"}
def _argv_has_option(option: str) -> bool:
    """Report whether *option* was spelled out on the process command line.

    Both the bare form (``--opt``) and the assignment form (``--opt=value``)
    count. The program name in ``sys.argv[0]`` is never inspected.
    """
    with_value = option + "="
    for token in sys.argv[1:]:
        if token == option:
            return True
        if token.startswith(with_value):
            return True
    return False
def apply_eval_compat_profile(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
    """Normalize evaluation behavior for reproducible cross-run comparisons.

    Mutates *args* in place so that the retry/recovery settings are concrete
    values consistent with the selected profile.

    Args:
        args: Parsed arguments. Reads ``eval_compat_profile``,
            ``format_retry_limit`` and ``final_answer_retry_limit``; may
            overwrite those limits and the two ``disable_*`` switches.
        parser: Used only to report invalid combinations via ``parser.error``
            (which prints usage and exits).

    Returns:
        The name of the profile that was applied.
    """
    profile = args.eval_compat_profile
    if profile not in EVAL_COMPAT_PROFILES:
        parser.error(f"Unknown --eval-compat-profile: {args.eval_compat_profile}")

    if profile == "step14_plus_tavily432":
        # The retry limits are declared with default=None by this script's
        # parser, so a non-None value means the caller supplied it explicitly.
        # Checking the parsed value (rather than scanning sys.argv for the
        # exact option spelling) also catches argparse prefix abbreviations
        # such as "--format-retry 3" and argument lists passed directly to
        # parse_args(), which would otherwise be silently overridden with 0.
        if args.format_retry_limit not in (None, 0):
            parser.error("--eval-compat-profile step14_plus_tavily432 requires --format-retry-limit 0")
        if args.final_answer_retry_limit not in (None, 0):
            parser.error("--eval-compat-profile step14_plus_tavily432 requires --final-answer-retry-limit 0")
        # This profile pins every retry/recovery mechanism off.
        args.format_retry_limit = 0
        args.disable_force_final_answer_turn = True
        args.final_answer_retry_limit = 0
        args.disable_no_tool_answer_recovery = True
        return "step14_plus_tavily432"

    # Other profiles: fill unset limits from the environment, falling back to
    # the documented defaults (2 format retries, 1 final-answer retry).
    if args.format_retry_limit is None:
        args.format_retry_limit = int(os.environ.get("FORMAT_RETRY_LIMIT", "2"))
    if args.final_answer_retry_limit is None:
        args.final_answer_retry_limit = int(os.environ.get("FINAL_ANSWER_RETRY_LIMIT", "1"))
    return profile
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the evaluation CLI.

    Several defaults are read from environment variables while the parser is
    being built, so the environment must be in place before this is called.
    """
    parser = argparse.ArgumentParser(description="VLM Evaluation Script v2")
    parser.add_argument("--model", type=str, required=True, help="Model name")
    parser.add_argument("--mode", type=str, required=True, choices=["direct", "tool"])
    parser.add_argument("--datasets", type=str, default="",
                        help="Path to datasets JSON config. If omitted, use --benchmarks with --eval-root.")
    parser.add_argument("--eval-root", type=str, default="",
                        help=(
                            "Root directory whose subfolders are image benchmarks with data.jsonl. "
                            f"Default used with --benchmarks: {DEFAULT_EVAL_ROOT}"
                        ))
    parser.add_argument("--benchmarks", type=str, nargs="+", default=None,
                        help="Benchmark subfolder names under --eval-root, e.g. mmsearch_end2end_only_image hr_mmsearch.")
    parser.add_argument("--save-resolved-datasets-config", type=str, default="",
                        help="Optional path to save the auto-generated datasets JSON config.")
    parser.add_argument("--model-client", type=str, choices=["gemini", "openai", "azure", "gateway", "vertex"], required=True,
                        help="Model API client: gemini, openai (Qwen/MARS), azure (GPT), gateway (company GPT gateway), or vertex (Vertex Gemini service-account pool)")
    parser.add_argument("--judge-client", type=str, choices=["openai", "azure"], default=None,
                        help="Legacy judge client flag; llm_score now follows video_dr_gen and ignores this option")
    parser.add_argument("--judge-temperature", type=float, default=0.0,
                        help="Legacy judge temperature flag; llm_score now follows video_dr_gen and ignores this option")
    parser.add_argument("--data-root", type=str, default="", help="Root for relative paths")
    parser.add_argument("--output-dir", type=str, default=None)
    parser.add_argument("--target-ids", type=str, nargs="+", default=None,
                        help="Only evaluate the specified sample ids or source ids")
    parser.add_argument("--vertex-account-pool-file", type=str,
                        default=os.environ.get("VERTEX_ACCOUNT_POOL_FILE", ""),
                        help="Vertex Gemini account pool JSON; defaults to env VERTEX_ACCOUNT_POOL_FILE")
    parser.add_argument("--vertex-location", type=str,
                        default=os.environ.get("VERTEX_LOCATION", "global"),
                        help="Default Vertex location for account-pool entries without location (default: global)")
    parser.add_argument("--vertex-rate-limit-cooldown", type=float,
                        default=float(os.environ.get("VERTEX_RATE_LIMIT_COOLDOWN_SECONDS", "60")),
                        help="Cooldown seconds for a Vertex account after 429/RESOURCE_EXHAUSTED before rotating back")
    parser.add_argument("--max-concurrent", type=int, default=4)
    parser.add_argument("--max-tokens", type=int, default=4096)
    parser.add_argument("--max-turns", type=int, default=50,
                        help="Max assistant turns (default: 50)")
    parser.add_argument("--eval-compat-profile", type=str, choices=EVAL_COMPAT_PROFILES,
                        default=os.environ.get("EVAL_COMPAT_PROFILE", "current"),
                        help=(
                            "Evaluation behavior profile. Use step14_plus_tavily432 to keep "
                            "step14 prompt/control flow while retaining later Tavily key-pool fixes."
                        ))
    # Both retry limits default to None so apply_eval_compat_profile() can tell
    # "not supplied" apart from an explicit value.
    parser.add_argument("--format-retry-limit", type=int, default=None,
                        help="Extra protocol-repair retries when a tool-mode response is empty, malformed, or only in reasoning_content (default: 2; forced to 0 by step14_plus_tavily432)")
    parser.add_argument("--disable-force-final-answer-turn", action="store_true",
                        default=_env_flag("DISABLE_FORCE_FINAL_ANSWER_TURN"),
                        help="Disable the final no-tool answer turn used to prevent VideoDR tool loops near max_turns")
    parser.add_argument("--final-answer-retry-limit", type=int, default=None,
                        help="Extra retries after the forced final-answer instruction if the model still fails to answer (default: 1; forced to 0 by step14_plus_tavily432)")
    parser.add_argument("--disable-no-tool-answer-recovery", action="store_true",
                        default=_env_flag("DISABLE_NO_TOOL_ANSWER_RECOVERY"),
                        help="Disable recovery of plain no-tool outputs as final answers.")
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top-p", type=float, default=0.8)
    parser.add_argument("--top-k", type=int, default=20)
    parser.add_argument("--presence-penalty", type=float, default=1.5)
    parser.add_argument("--repetition-penalty", type=float, default=1.0,
                        help="Repetition penalty for model generation (default: 1.0)")
    parser.add_argument("--seed", type=int, default=3407,
                        help="Random seed for deterministic generation (default: 3407)")
    parser.add_argument("--min-pixels", type=int, default=65536, help="Min pixels for image processing")
    parser.add_argument("--max-pixels", type=int, default=8294400, help="Max pixels for image processing")
    parser.add_argument("--factor", type=int, default=32, help="Alignment factor (32 for Qwen3-VL, 28 for Qwen2-VL)")
    # Only the literal string "true" (any case) enables this; anything else disables it.
    parser.add_argument("--qwen-vl-processing", type=lambda x: x.lower() == 'true', default=True,
                        help="Use Qwen-VL style image processing (default: True, set False for Gemini/GPT)")
    parser.add_argument("--tool-config", type=str, default="", help="Path to tool config YAML (optional, overrides dataset system_prompt)")
    parser.add_argument("--tool-ablation-profile", type=str, choices=TOOL_ABLATION_PROFILES,
                        default=os.environ.get("TOOL_ABLATION_PROFILE", "none"),
                        help=(
                            "Tool ablation profile for tool-mode evaluation: none, nosearch "
                            "(remove image_search/web_search), or nolocation (remove choose_frames/zoom_in)."
                        ))
    parser.add_argument("--serper-concurrency", type=int, default=5, help="Max concurrent Serper API requests (default: 5)")
    parser.add_argument("--search-cache-dir", type=str, default="", help="Directory for search result cache (optional, no cache if not set)")
    parser.add_argument("--seed-search-cache-from", type=str, nargs="*", default=None,
                        help="Seed this run's search cache from historical search_cache.db files under these files/directories. Defaults to inference/runs when --search-cache-dir is set.")
    parser.add_argument("--no-auto-seed-search-cache", action="store_true",
                        default=_env_flag("NO_AUTO_SEED_SEARCH_CACHE"),
                        help="Disable automatic historical search cache seeding.")
    parser.add_argument("--seed-image-search-cache-from", type=str, nargs="*", default=None,
                        help="Seed this run's image_search_cache.json from historical image_search_cache.json files under these files/directories. Defaults to inference/runs in tool mode.")
    parser.add_argument("--no-auto-seed-image-search-cache", action="store_true",
                        default=_env_flag("NO_AUTO_SEED_IMAGE_SEARCH_CACHE"),
                        help="Disable automatic historical image_search_cache.json seeding.")
    parser.add_argument("--web-search-backend", type=str,
                        choices=["serper_gateway", "gateway", "gateway_serper", "internal_serper", "company_serper", "mars", "tavily", "auto"],
                        default=_default_web_search_backend(),
                        help="Backend for web_search tool: serper_gateway, mars, tavily, or auto (default: env WEB_SEARCH_BACKEND/VIDEO_DR_WEB_SEARCH_BACKEND or serper_gateway)")
    parser.add_argument("--serper-gateway-max-results", type=int,
                        default=int(os.environ.get("SERPER_GATEWAY_MAX_RESULTS", "8")),
                        help="Serper gateway max results, clamped to 1-20 (default: 8)")
    parser.add_argument("--serper-gateway-timeout", type=int,
                        default=int(os.environ.get("SERPER_GATEWAY_TIMEOUT", "60")),
                        help="Serper gateway request timeout in seconds (default: 60)")
    parser.add_argument("--serper-gateway-summary-max-tokens", type=int,
                        default=int(os.environ.get("SERPER_GATEWAY_SUMMARY_MAX_TOKENS", "1024")),
                        help="Deprecated compatibility flag for the old metadata-only Serper gateway summarizer.")
    parser.add_argument("--tavily-search-depth", type=str, choices=["basic", "advanced"],
                        default=os.environ.get("TAVILY_SEARCH_DEPTH", "advanced"),
                        help="Tavily search_depth when --web-search-backend=tavily")
    parser.add_argument("--tavily-max-results", type=int,
                        default=int(os.environ.get("TAVILY_MAX_RESULTS", "8")),
                        help="Tavily max_results, clamped to 1-20 (default: 8)")
    parser.add_argument("--tavily-include-answer", type=str,
                        default=os.environ.get("TAVILY_INCLUDE_ANSWER", "advanced"),
                        help="Tavily include_answer: false, true/basic, or advanced (default: advanced)")
    parser.add_argument("--tavily-include-raw-content", type=str,
                        default=os.environ.get("TAVILY_INCLUDE_RAW_CONTENT", "false"),
                        help="Tavily include_raw_content: false, true/markdown, or text (default: false)")
    parser.add_argument("--tavily-topic", type=str, choices=["general", "news", "finance"],
                        default=os.environ.get("TAVILY_TOPIC", "general"),
                        help="Tavily topic (default: general)")
    parser.add_argument("--tavily-auto-parameters", action="store_true",
                        default=_env_flag("TAVILY_AUTO_PARAMETERS"),
                        help="Enable Tavily auto_parameters; uses extra Tavily credits")
    parser.add_argument("--tavily-timeout", type=int,
                        default=int(os.environ.get("TAVILY_TIMEOUT", "60")),
                        help="Tavily request timeout in seconds (default: 60)")
    parser.add_argument("--tavily-key-cooldown-seconds", type=int,
                        default=int(os.environ.get("TAVILY_KEY_COOLDOWN_SECONDS", "60")),
                        help="Cooldown for a Tavily key after 429/5xx before rotating back (default: 60)")
    parser.add_argument("--video-initial-frames", type=int, default=DEFAULT_VIDEO_INITIAL_FRAMES,
                        help="Number of initial 1fps frames shown to the model for VideoDR")
    parser.add_argument("--video-interval-samples", type=int, default=DEFAULT_VIDEO_INTERVAL_SAMPLES,
                        help="Number of uniformly sampled frames returned by choose_frames")
    parser.add_argument("--video-max-resolution", type=int, default=DEFAULT_VIDEO_MAX_RESOLUTION,
                        help="Max side length for 1fps extracted video frames")
    parser.add_argument("--video-jpeg-quality", type=int, default=DEFAULT_VIDEO_JPEG_QUALITY,
                        help="JPEG quality for cached video frames")
    return parser


def main():
    """Command-line entry point: configure and run one evaluation.

    Steps, in order:
      1. Parse arguments and apply the evaluation compatibility profile.
      2. Resolve the model API key / base URL for the chosen ``--model-client``.
      3. Load datasets (an explicit ``--datasets`` config, or one generated
         from ``--eval-root`` / ``--benchmarks``) and optionally filter the
         samples by ``--target-ids``.
      4. Resolve the web-search backend, tool configuration and tool ablation.
      5. Prepare the output directory and the search caches.
      6. Run health checks, run the evaluation, and write the summary.

    Invalid argument combinations are reported through ``parser.error``; an
    unreachable server during the health checks raises ``RuntimeError``.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()
    resolved_eval_profile = apply_eval_compat_profile(args, parser)
    video_dr_system_prompt = get_video_dr_system_prompt(resolved_eval_profile)
    print(
        "Evaluation compatibility profile: "
        f"{resolved_eval_profile} "
        f"(format_retry_limit={args.format_retry_limit}, "
        f"force_final_answer_turn={not args.disable_force_final_answer_turn}, "
        f"final_answer_retry_limit={args.final_answer_retry_limit}, "
        f"no_tool_answer_recovery={not args.disable_no_tool_answer_recovery})",
        flush=True,
    )
    model_client = args.model_client
    # Get API credentials based on model_client
    if model_client == "gemini":
        if "GEMINI_API_KEY" not in os.environ:
            parser.error("GEMINI_API_KEY required")
        if "GEMINI_BASE_URL" not in os.environ:
            parser.error("GEMINI_BASE_URL required")
        api_key = os.environ["GEMINI_API_KEY"]
        base_url = os.environ["GEMINI_BASE_URL"]
    elif model_client == "azure":
        if "AZURE_OPENAI_API_KEY" not in os.environ:
            parser.error("AZURE_OPENAI_API_KEY required")
        if "AZURE_OPENAI_BASE_URL" not in os.environ:
            parser.error("AZURE_OPENAI_BASE_URL required")
        api_key = os.environ["AZURE_OPENAI_API_KEY"]
        base_url = os.environ["AZURE_OPENAI_BASE_URL"]
    elif model_client == "gateway":
        _, _, api_key = _get_model_gateway_credentials(args.model)
        if not api_key:
            # The required token variable depends on whether the model is a
            # Gemini gateway model.
            token_hint = (
                "MODEL_GATEWAY_GEMINI_TOKEN or GEMINI_GATEWAY_TOKEN required"
                if _is_gemini_gateway_model(args.model)
                else "MODEL_GATEWAY_TOKEN or GATEWAY_TOKEN required"
            )
            parser.error(token_hint)
        base_url = (
            os.environ.get("MODEL_GATEWAY_URL")
            or os.environ.get("GATEWAY_URL")
            or "http://112.65.194.90:8000/trpc.youtu.llm_interface_service.Greeter/DescribeLlmResult"
        )
    elif model_client == "vertex":
        if not args.vertex_account_pool_file:
            parser.error("--vertex-account-pool-file or VERTEX_ACCOUNT_POOL_FILE required for --model-client vertex")
        # Vertex authenticates through the account pool, so there is no single
        # API key; the base URL is a placeholder.
        api_key = ""
        base_url = "vertex://account-pool"
    else:
        # OpenAI-compatible (Qwen, MARS, company OpenAI proxy, etc.)
        base_url = os.environ.get("MODEL_BASE_URL", DEFAULT_COMPANY_OPENAI_BASE_URL)
        api_key = (
            os.environ.get("MODEL_API_KEY")
            or os.environ.get("MODEL_OPENAI_API_KEY")
            or os.environ.get("OPENAI_API_KEY", "")
            or _read_secret_file(os.environ.get("MODEL_API_KEY_FILE", DEFAULT_COMPANY_OPENAI_API_KEY_FILE))
        )
    # NOTE(review): a key-less, non-gateway/vertex endpoint is presumably a
    # local service that must bypass the proxy - confirm against
    # configure_local_service_no_proxy.
    if model_client not in {"gateway", "vertex"} and not api_key:
        configure_local_service_no_proxy(base_url)
    vertex_account_pool = None
    if model_client == "vertex":
        vertex_account_pool = VertexAccountPool(
            args.vertex_account_pool_file,
            default_location=args.vertex_location,
            cooldown_seconds=args.vertex_rate_limit_cooldown,
        )
    # Load datasets
    auto_dataset_config = None
    if args.benchmarks or args.eval_root:
        if args.datasets:
            parser.error("Use either --datasets or --eval-root/--benchmarks, not both.")
        eval_root = args.eval_root or DEFAULT_EVAL_ROOT
        auto_dataset_config = build_eval_root_dataset_config(eval_root, args.benchmarks)
        print(
            "Loading benchmarks from eval root "
            f"{eval_root}: {', '.join(auto_dataset_config.keys())}",
            flush=True,
        )
        if args.save_resolved_datasets_config:
            save_dir = os.path.dirname(os.path.abspath(args.save_resolved_datasets_config))
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            # ensure_ascii=False emits raw non-ASCII characters, so the file
            # encoding must be pinned instead of depending on the locale.
            with open(args.save_resolved_datasets_config, "w", encoding="utf-8") as f:
                json.dump(auto_dataset_config, f, ensure_ascii=False, indent=2)
            print(
                f"Saved resolved datasets config to {args.save_resolved_datasets_config}",
                flush=True,
            )
        samples, dataset_configs = load_datasets(
            auto_dataset_config,
            args.data_root,
            video_dr_system_prompt=video_dr_system_prompt,
        )
    else:
        if not args.datasets:
            parser.error("--datasets is required unless --benchmarks or --eval-root is provided.")
        print(f"Loading datasets from {args.datasets}...")
        samples, dataset_configs = load_datasets(
            args.datasets,
            args.data_root,
            video_dr_system_prompt=video_dr_system_prompt,
        )
    if args.target_ids:
        # A sample is kept when either its id or its source_id was requested.
        target_ids = set(args.target_ids)
        total_before_filter = len(samples)
        samples = [
            sample for sample in samples
            if sample["id"] in target_ids or sample.get("source_id") in target_ids
        ]
        print(
            f"Filtered samples with --target-ids: {len(samples)}/{total_before_filter}",
            flush=True,
        )
        if not samples:
            parser.error("--target-ids did not match any sample id/source id")
    print(f"Loaded {len(samples)} samples total\n")
    # Only datasets that still have samples after filtering drive the checks below.
    selected_dataset_names = {sample.get("dataset", "") for sample in samples}
    selected_dataset_configs = [
        dataset_configs[name] for name in selected_dataset_names if name in dataset_configs
    ]
    has_video_dr_dataset = any(cfg.get("task_kind") == "video_dr" for cfg in selected_dataset_configs)
    has_image_dataset = any(cfg.get("task_kind") == "image" for cfg in selected_dataset_configs)
    if args.tool_ablation_profile != "none" and args.mode != "tool":
        parser.error("--tool-ablation-profile 仅支持 --mode tool")
    if args.tool_ablation_profile != "none" and has_video_dr_dataset and has_image_dataset:
        parser.error("同一次工具消融评测暂不支持混合 VideoDR 与 image benchmark")
    # Check LLM judge requirement
    needs_llm = any(
        "llm_score" in cfg.get("score_methods", [])
        for cfg in selected_dataset_configs
    )
    # The judge_* values are still forwarded to run_evaluation; the URL and key
    # are always empty here.
    judge_client = args.judge_client or ""
    judge_base_url = ""
    judge_api_key = ""
    if needs_llm and (not MARS_SUMMARIZER_ADDRESS or not MARS_SUMMARIZER_MODEL):
        print(
            "[WARN] llm_score 已切换到 video_dr_gen 的 MARS summarizer judge,"
            "但 MARS_SUMMARIZER_ADDRESS 或 MARS_SUMMARIZER_MODEL 未配置;"
            "此时 llm_score 只会走快速路径,无法调用 LLM judge。",
            flush=True,
        )
    serper_api_key = os.environ.get("SERPER_API_KEY", "")
    tavily_api_keys = load_tavily_api_keys()
    tavily_key_pool = TavilyApiKeyPool(tavily_api_keys, cooldown_seconds=args.tavily_key_cooldown_seconds)
    summarizer_base_url = os.environ.get("SUMMARIZER_BASE_URL", "")
    summarizer_model = os.environ.get("SUMMARIZER_MODEL", "")
    resolved_web_search_backend = resolve_web_search_backend(args.web_search_backend, tavily_api_keys)
    if args.mode == "tool":
        print(
            f"web_search backend: {resolved_web_search_backend}"
            + (" (from auto)" if args.web_search_backend == "auto" else ""),
            flush=True,
        )
        if resolved_web_search_backend == "tavily" and not tavily_api_keys:
            parser.error("TAVILY_API_KEY, TAVILY_API_KEYS, or TAVILY_API_KEY_FILE required when --web-search-backend=tavily")
        if resolved_web_search_backend == "tavily":
            print(f"Tavily API key pool: {len(tavily_api_keys)} key(s)", flush=True)
    tools_section = ""
    allowed_tool_names = None
    if args.mode == "tool":
        # When both dataset kinds are present, the VideoDR tool set wins.
        if has_video_dr_dataset:
            allowed_tool_names = get_allowed_tool_names(args.tool_ablation_profile, "video_dr")
        elif has_image_dataset:
            allowed_tool_names = get_allowed_tool_names(args.tool_ablation_profile, "image")
        if has_image_dataset and not args.tool_config:
            parser.error("图像 benchmark 的 tool 模式需要提供 --tool-config")
        if args.tool_config:
            print(f"Loading tool config from {args.tool_config}...")
            tools_section = load_tool_config(
                args.tool_config,
                allowed_tool_names=allowed_tool_names,
                normalize_image_schema=has_image_dataset,
                normalize_video_schema=has_video_dr_dataset,
            )
        if args.tool_ablation_profile != "none":
            print(
                "Tool ablation profile: "
                f"{args.tool_ablation_profile}; allowed tools: {format_tool_names(allowed_tool_names or set())}",
                flush=True,
            )
        if has_video_dr_dataset and args.tool_ablation_profile != "none":
            if not tools_section:
                parser.error("VideoDR 工具消融需要提供 --tool-config 以生成消融后的工具定义 prompt")
            # Rebuild the VideoDR prompt from the ablated tool definitions and
            # push it into every selected VideoDR dataset config.
            video_dr_system_prompt = build_video_tool_system_prompt(
                tools_section=tools_section,
                allowed_tool_names=allowed_tool_names or set(),
                max_turns=args.max_turns,
            )
            for cfg in selected_dataset_configs:
                if cfg.get("task_kind") == "video_dr":
                    cfg["system_prompt"] = video_dr_system_prompt
    configure_local_service_no_proxy(summarizer_base_url)
    configure_local_service_no_proxy(MARS_RETRIEVAL_ADDRESS)
    configure_local_service_no_proxy(MARS_SUMMARIZER_ADDRESS)
    if args.mode == "tool" and VIDEO_DR_IMAGE_SEARCH_MODE == "gateway":
        configure_local_service_no_proxy(VIDEO_DR_GATEWAY_URL)
    # Output dir
    if args.output_dir:
        output_dir = args.output_dir
    else:
        timestamp = time.strftime("%y%m%d%H%M%S")
        output_dir = f"eval_{args.model.replace('/', '_')}_{args.mode}_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    if args.mode == "tool" and not args.search_cache_dir:
        # Tool mode always gets a search cache; default it into the output dir.
        args.search_cache_dir = os.path.join(output_dir, "search_cache")
        print(
            f"Search cache dir not provided; using {args.search_cache_dir}",
            flush=True,
        )
    # Create search cache if directory provided
    search_cache = None
    search_cache_seed_paths = resolve_cache_seed_paths(
        args.seed_search_cache_from,
        args.no_auto_seed_search_cache,
    )
    if args.search_cache_dir:
        search_cache = SearchCache(args.search_cache_dir)
        if search_cache_seed_paths:
            search_cache.seed_from_paths(search_cache_seed_paths)
    image_search_cache_seed_paths = resolve_cache_seed_paths(
        args.seed_image_search_cache_from,
        args.no_auto_seed_image_search_cache,
    )
    # Run evaluation
    kwargs = {
        "eval_compat_profile": resolved_eval_profile,
        "video_dr_system_prompt": video_dr_system_prompt,
        "general_video_direct_system_prompt": GENERAL_VIDEO_DIRECT_SYSTEM_PROMPT,
        "max_tokens": args.max_tokens,
        "max_turns": args.max_turns,
        "format_retry_limit": args.format_retry_limit,
        "force_final_answer_turn": not args.disable_force_final_answer_turn,
        "final_answer_retry_limit": args.final_answer_retry_limit,
        "recover_no_tool_answer": not args.disable_no_tool_answer_recovery,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "top_k": args.top_k,
        "presence_penalty": args.presence_penalty,
        "repetition_penalty": args.repetition_penalty,
        "seed": args.seed,
        "min_pixels": args.min_pixels,
        "max_pixels": args.max_pixels,
        "factor": args.factor,
        "qwen_vl_processing": args.qwen_vl_processing,
        "serper_api_key": serper_api_key,
        # NOTE(review): this forwards the raw CLI value (possibly "auto"), not
        # resolved_web_search_backend - confirm run_evaluation resolves it itself.
        "web_search_backend": args.web_search_backend,
        "serper_gateway_max_results": args.serper_gateway_max_results,
        "serper_gateway_timeout": args.serper_gateway_timeout,
        "serper_gateway_summary_max_tokens": args.serper_gateway_summary_max_tokens,
        "tavily_api_key": tavily_api_keys[0] if tavily_api_keys else "",
        "tavily_api_key_pool": tavily_key_pool,
        "tavily_search_depth": args.tavily_search_depth,
        "tavily_max_results": args.tavily_max_results,
        "tavily_include_answer": args.tavily_include_answer,
        "tavily_include_raw_content": args.tavily_include_raw_content,
        "tavily_topic": args.tavily_topic,
        "tavily_auto_parameters": args.tavily_auto_parameters,
        "tavily_timeout": args.tavily_timeout,
        "tavily_include_domains": _split_csv_env(os.environ.get("TAVILY_INCLUDE_DOMAINS", "")),
        "tavily_exclude_domains": _split_csv_env(os.environ.get("TAVILY_EXCLUDE_DOMAINS", "")),
        "summarizer_base_url": summarizer_base_url,
        "summarizer_model": summarizer_model,
        "serper_concurrency": args.serper_concurrency,
        "search_cache": search_cache,
        "image_search_cache_seed_paths": image_search_cache_seed_paths,
        "tools_section": tools_section,
        "tool_ablation_profile": args.tool_ablation_profile,
        "allowed_tool_names": allowed_tool_names,
        "judge_client": judge_client,
        "judge_base_url": judge_base_url,
        "judge_api_key": judge_api_key,
        "judge_temperature": args.judge_temperature,
        "video_initial_frames": args.video_initial_frames,
        "video_interval_samples": args.video_interval_samples,
        "video_max_resolution": args.video_max_resolution,
        "video_jpeg_quality": args.video_jpeg_quality,
        "vertex_account_pool": vertex_account_pool,
    }

    # Health check - crash early if servers are down
    async def health_check():
        import aiohttp
        print("Running health checks...", flush=True)
        # Shared by every HTTP probe below, so it is created before the first
        # try block rather than inside it.
        timeout = aiohttp.ClientTimeout(total=10)
        # Check model server
        print(f" Checking model server: {base_url}", flush=True)
        try:
            if model_client == "gateway":
                result = await call_gateway_api(
                    [{"role": "user", "content": "Reply with exactly: OK"}],
                    args.model,
                    base_url,
                    api_key,
                    max_tokens=128,
                    temperature=0.0,
                    model_request_timeout=10,
                )
                if result.get("error"):
                    raise Exception(result["error"])
                print(" Model gateway OK", flush=True)
            elif model_client == "vertex":
                result = await call_vertex_gemini_api(
                    [{"role": "user", "content": "Hello, please test the connection and reply with OK."}],
                    args.model,
                    base_url,
                    api_key,
                    vertex_account_pool=vertex_account_pool,
                    max_tokens=32,
                    temperature=0.0,
                    model_request_timeout=10,
                )
                if result.get("error"):
                    raise Exception(result["error"])
                project = result.get("vertex_project_id", "")
                print(f" Vertex Gemini OK project={project}", flush=True)
            else:
                async with create_http_session(timeout) as session:
                    # Try /v1/models endpoint (OpenAI-compatible)
                    url = _openai_models_url(base_url)
                    headers = {"Content-Type": "application/json"}
                    if api_key:
                        headers["Authorization"] = f"Bearer {api_key}"
                    async with session.get(url, headers=headers) as resp:
                        if resp.status != 200:
                            raise Exception(f"Model server returned HTTP {resp.status}")
                        print(" Model server OK", flush=True)
        except Exception as e:
            raise RuntimeError(f"Model server not reachable at {base_url}: {e}") from e
        # Check summarizer server (if in tool mode)
        if args.mode == "tool" and summarizer_base_url:
            print(f" Checking summarizer server: {summarizer_base_url}", flush=True)
            try:
                async with create_http_session(timeout) as session:
                    url = f"{summarizer_base_url.rstrip('/')}/v1/models"
                    async with session.get(url) as resp:
                        if resp.status != 200:
                            raise Exception(f"Summarizer server returned HTTP {resp.status}")
                        print(" Summarizer server OK", flush=True)
            except Exception as e:
                raise RuntimeError(f"Summarizer server not reachable at {summarizer_base_url}: {e}") from e
        # Check the MARS summarizer used as LLM judge, when llm_score needs it
        if needs_llm and MARS_SUMMARIZER_ADDRESS:
            print(f" Checking LLM judge summarizer: http://{MARS_SUMMARIZER_ADDRESS}", flush=True)
            try:
                async with create_http_session(timeout) as session:
                    url = f"http://{MARS_SUMMARIZER_ADDRESS.rstrip('/')}/v1/models"
                    async with session.get(url, proxy="") as resp:
                        if resp.status != 200:
                            raise Exception(f"Judge summarizer returned HTTP {resp.status}")
                        print(" LLM judge summarizer OK", flush=True)
            except Exception as e:
                raise RuntimeError(f"LLM judge summarizer not reachable at http://{MARS_SUMMARIZER_ADDRESS}: {e}") from e
        print("Health checks passed!", flush=True)

    asyncio.run(health_check())
    try:
        dataset_results = asyncio.run(run_evaluation(
            samples, dataset_configs, model_client, args.model, base_url, api_key,
            args.mode, args.max_concurrent, output_dir=output_dir, **kwargs
        ))
        save_results(
            dataset_results,
            output_dir,
            run_config={
                "eval_compat_profile": resolved_eval_profile,
                "format_retry_limit": args.format_retry_limit,
                "force_final_answer_turn": not args.disable_force_final_answer_turn,
                "final_answer_retry_limit": args.final_answer_retry_limit,
                "recover_no_tool_answer": not args.disable_no_tool_answer_recovery,
                "web_search_backend": resolved_web_search_backend,
                "tavily_search_depth": args.tavily_search_depth,
                "tavily_max_results": args.tavily_max_results,
                "tavily_include_answer": args.tavily_include_answer,
                "tavily_include_raw_content": args.tavily_include_raw_content,
                "tavily_topic": args.tavily_topic,
                "eval_root": args.eval_root or (DEFAULT_EVAL_ROOT if auto_dataset_config else ""),
                "benchmarks": list(auto_dataset_config.keys()) if auto_dataset_config else [],
                "resolved_datasets_config": args.save_resolved_datasets_config,
            },
        )
    finally:
        # Skip browser cleanup - Playwright can hang indefinitely and OS will clean up on exit
        # Always print cache stats and close, even on error
        if search_cache:
            stats = search_cache.get_stats()
            print(f"Search cache: {stats['hits']}/{stats['total']} hits ({stats['hit_rate']:.1f}%), {stats['misses']} new searches cached", flush=True)
            search_cache.close()
    print("Done.", flush=True)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()