# meta-hackathon/src/grader.py
"""Reward / grading module.
Implements the hybrid reward used by the ESC environment:
step_reward = clip(immediate + future_oriented - penalties, 0, 1)
- immediate : stage-appropriate empathy/validation/open-question signal
- future_oriented : RLFF-ESC style lookahead — projects the oracle policy
k steps forward from the *post-action* state and
compares the projected resolution_score against the
pre-action ceiling. Rewards actions that *preserve or
advance* the attainable resolution, not just ones
that look good this turn.
- penalties : dismissive language, premature advice, repetitive
bare replies, interrogation.
This shaping gives the agent dense, varying signal across the trajectory
(required by the rubric: "signal over the full trajectory, not just
binary end-of-episode").
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict
from .seeker import (
Features,
SeekerState,
Stage,
resolution_score,
simulate_oracle_rollout,
stage_progress,
)
# Hyper-parameters — tuned to keep step reward in [0, 1] under normal play.
LOOKAHEAD_K = 3
W_IMMEDIATE = 0.45
W_FUTURE = 0.55
DISMISSIVE_PENALTY = 0.6
PREMATURE_ADVICE_PENALTY = 0.25
BARE_PENALTY = 0.15
INTERROGATION_PENALTY = 0.15
REPETITION_PENALTY = 0.18
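# Worked example of the step reward under these weights (illustrative numbers only):
# immediate = 0.70, future_oriented = 0.60, one bare-reply penalty (0.15) gives
#   0.45 * 0.70 + 0.55 * 0.60 - 0.15 = 0.315 + 0.330 - 0.150 = 0.495
# which already lies in [0, 1], so the final clip leaves it unchanged.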
@dataclass
class GradeBreakdown:
    """Per-step reward breakdown for a single turn, as returned by grade_step."""

    value: float
immediate: float
future_oriented: float
penalties: float
components: Dict[str, float]
def _stage_fit_score(stage: Stage, f: Features) -> float:
"""How appropriate are the agent's features for the current stage?"""
if stage in (Stage.OPENING, Stage.EXPLORING):
# Reward empathy + open questions; punish early advice strongly.
fit = 0.5 * min(1.0, f.empathy) + 0.3 * min(1.0, f.open_question) + 0.2 * min(1.0, f.validation)
fit -= 0.4 * min(1.0, f.advice)
elif stage == Stage.REFLECTING:
fit = 0.5 * min(1.0, f.validation) + 0.4 * min(1.0, f.empathy) + 0.1 * min(1.0, f.open_question)
fit -= 0.2 * min(1.0, f.advice)
elif stage == Stage.PLANNING:
# Advice is finally okay here.
fit = 0.4 * min(1.0, f.open_question) + 0.3 * min(1.0, f.advice) + 0.3 * min(1.0, f.empathy)
else: # CLOSING
fit = 0.5 * min(1.0, f.empathy) + 0.3 * min(1.0, f.safety) + 0.2 * min(1.0, f.validation)
return max(0.0, min(1.0, fit))
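# Worked example for _stage_fit_score (hypothetical feature values): in EXPLORING,
# with empathy=0.8, open_question=0.5, validation=0.0, advice=0.3:
#   fit = 0.5*0.8 + 0.3*0.5 + 0.2*0.0 - 0.4*0.3 = 0.40 + 0.15 + 0.00 - 0.12 = 0.43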
def _immediate_reward(pre_state: SeekerState, post_state: SeekerState, f: Features) -> float:
    """Turn-level reward: stage fit, trust/distress deltas, stage advance, reveal bonus."""
stage_fit = _stage_fit_score(pre_state.stage, f)
trust_delta = max(0.0, post_state.trust - pre_state.trust)
distress_relief = max(0.0, pre_state.distress - post_state.distress)
stage_advance = max(
0.0, stage_progress(post_state.stage) - stage_progress(pre_state.stage)
)
reveal_bonus = 0.2 if (post_state.revealed and not pre_state.revealed) else 0.0
return max(
0.0,
min(
1.0,
0.45 * stage_fit
+ 0.20 * trust_delta * 2.0 # scale small deltas
+ 0.20 * distress_relief * 2.0
+ 0.10 * stage_advance
+ 0.05 * 1.0 # small baseline for any non-destructive turn
+ reveal_bonus,
),
)
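# Worked example for _immediate_reward (hypothetical deltas): stage_fit=0.60,
# trust +0.10, distress -0.10, no stage advance, no first reveal:
#   0.45*0.60 + 0.20*(0.10*2.0) + 0.20*(0.10*2.0) + 0.10*0.0 + 0.05 = 0.27 + 0.04 + 0.04 + 0.05 = 0.40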
def _future_oriented_reward(pre_state: SeekerState, post_state: SeekerState) -> float:
"""RLFF-ESC style: does this action *preserve / advance* future resolution?
We roll the oracle policy k steps from both the pre- and post-action states
and take the (clipped) delta. Positive delta = the action moved the
attainable future forward; negative = the agent damaged trajectory
potential and must recover.
"""
pre_ceiling = simulate_oracle_rollout(pre_state.snapshot(), LOOKAHEAD_K)
post_ceiling = simulate_oracle_rollout(post_state.snapshot(), LOOKAHEAD_K)
delta = post_ceiling - pre_ceiling
    # Map delta in roughly [-0.4, +0.4] to [0, 1], with 0.5 at delta = 0.
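    # e.g. (illustrative ceilings) pre_ceiling=0.55, post_ceiling=0.63 -> delta=+0.08
    # -> 0.5 + 1.25*0.08 = 0.60; deltas beyond +/-0.4 saturate at 1.0 / 0.0.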
return max(0.0, min(1.0, 0.5 + 1.25 * delta))
def _penalties(flags: Dict[str, bool]) -> float:
p = 0.0
if flags.get("dismissed"):
p += DISMISSIVE_PENALTY
if flags.get("advice_too_early"):
p += PREMATURE_ADVICE_PENALTY
if flags.get("bare_reply"):
p += BARE_PENALTY
if flags.get("interrogated"):
p += INTERROGATION_PENALTY
if flags.get("repetitive"):
p += REPETITION_PENALTY
return p
def grade_step(
pre_state: SeekerState,
post_state: SeekerState,
features: Features,
flags: Dict[str, bool],
) -> GradeBreakdown:
imm = _immediate_reward(pre_state, post_state, features)
fut = _future_oriented_reward(pre_state, post_state)
    pen = _penalties(flags)
combined = W_IMMEDIATE * imm + W_FUTURE * fut - pen
value = max(0.0, min(1.0, combined))
components = {
"stage_fit": _stage_fit_score(pre_state.stage, features),
"trust_delta": post_state.trust - pre_state.trust,
"distress_delta": pre_state.distress - post_state.distress,
"resolution_score_post": resolution_score(post_state),
"pre_oracle_ceiling": simulate_oracle_rollout(pre_state.snapshot(), LOOKAHEAD_K),
"post_oracle_ceiling": simulate_oracle_rollout(post_state.snapshot(), LOOKAHEAD_K),
}
return GradeBreakdown(
value=value,
immediate=imm,
future_oriented=fut,
penalties=pen,
components=components,
)
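# Typical call site (sketch): `pre`, `post`, `features` and `flags` are assumed to come
# from the ESC environment's step transition; they are not constructed in this module.
#   breakdown = grade_step(pre, post, features, flags)
#   reward = breakdown.value            # clipped scalar in [0, 1] handed to the learner
#   diagnostics = breakdown.components  # per-term values for logging / debugging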
def final_task_score(
cumulative_reward: float,
steps_taken: int,
max_turns: int,
final_state: SeekerState,
success_threshold: float,
completed: bool,
) -> Dict[str, float]:
"""Compute the final [0,1] task score used by the grader."""
# Component 1: average shaped reward over the trajectory (already in [0,1]).
avg_reward = cumulative_reward / max(1, steps_taken)
# Component 2: final resolution_score.
final_res = resolution_score(final_state)
    # Component 3: efficiency. Finishing sooner is slightly better, but the term
    # never drops below 0.7: flat 1.0 if <= 60% of the turn budget is used, then a
    # linear decay down to 0.7 at the full budget.
usage = steps_taken / max_turns
efficiency = 1.0 if usage <= 0.6 else max(0.7, 1.0 - 0.75 * (usage - 0.6))
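    # e.g. usage = 0.6 -> efficiency 1.00; usage = 0.8 -> 1.0 - 0.75*0.2 = 0.85;
    # usage = 1.0 -> 0.70 (the floor).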
completion = 1.0 if completed else 0.0
score = (
0.30 * avg_reward
+ 0.45 * final_res
+ 0.10 * efficiency
+ 0.15 * completion
)
score = max(0.0, min(1.0, score))
return {
"score": score,
"avg_reward": avg_reward,
"final_resolution": final_res,
"efficiency": efficiency,
"completion": completion,
"success": 1.0 if (completed and score >= success_threshold) else 0.0,
}
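# Worked final-score example (illustrative component values): avg_reward=0.60,
# final_resolution=0.70, efficiency=1.00, completion=1.0 (episode completed):
#   score = 0.30*0.60 + 0.45*0.70 + 0.10*1.00 + 0.15*1.0
#         = 0.180 + 0.315 + 0.100 + 0.150 = 0.745
# so with a hypothetical success_threshold of 0.7 this episode would count as a success.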