# opd_zt/scripts/zero_reward.py
# Added with the upload-large-folder tool (commit bf46e5d, verified); 697 bytes.
"""Stub reward function for OPD: always returns 0.

verl's AgentLoop still calls compute_score() after every rollout, even when
`distillation.distillation_loss.use_task_rewards=False`. We don't have a task
reward for video captioning / open-ended QA, so we return 0.0 and let the
distillation loss drive learning.

The signature matches verl's `verl.utils.reward_score.default_compute_score`.
"""
from __future__ import annotations
from typing import Any
def compute_score(
    data_source: str,
    solution_str: str,
    ground_truth: str,
    extra_info: dict[str, Any] | None = None,
    **kwargs: Any,
) -> float:
    """Return a constant zero reward, ignoring every argument.

    In OPD the distillation loss carries the learning signal, so this stub
    exists only to satisfy callers that expect a reward function.

    Args:
        data_source: Ignored.
        solution_str: Ignored.
        ground_truth: Ignored.
        extra_info: Ignored; may be ``None``.
        **kwargs: Ignored; accepted so extra keyword arguments do not raise.

    Returns:
        Always ``0.0``.
    """
    # Deliberately input-independent: no task reward is computed here.
    return 0.0