# Hugging Face Space: coffeine16/fitscript-env (status: Sleeping; file size: 25,863 bytes)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
FitScript Environment Implementation.

Simulates a real-world fitness prescription task: generating, evaluating,
and refining personalized workout plans. Supports three tasks of increasing
difficulty with deterministic graders.
"""

import json
from uuid import uuid4
from typing import Dict, Any, Tuple

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

try:
    from ..models import FitscriptAction, FitscriptObservation
except ImportError:
    from models import FitscriptAction, FitscriptObservation


# ---------------------------------------------------------------------------
# Grader base class
# ---------------------------------------------------------------------------

class BaseTask:
    """Common contract implemented by every FitScript task.

    A task bundles the client description handed to the agent, the step
    budget for one episode, and a ``grade`` routine that scores submissions.
    """

    # Client description; each concrete task replaces this with its own dict.
    client_profile: dict = {}
    # Step budget; concrete tasks override it with their own limit.
    max_steps: int = 5

    def grade(self, action: FitscriptAction, step: int) -> Tuple[float, Dict[str, float], str]:
        """Score one submitted action.

        Args:
            action: The agent's submission to evaluate.
            step: Step counter supplied by the caller.

        Returns:
            A ``(score, breakdown, feedback)`` triple: ``score`` is a float
            in [0, 1], ``breakdown`` maps criterion names to partial scores,
            and ``feedback`` is a human-readable summary.

        Raises:
            NotImplementedError: Always; concrete tasks must override this.
        """
        raise NotImplementedError


# ---------------------------------------------------------------------------
# Task 1 - EASY: Basic Plan Generation
# ---------------------------------------------------------------------------

class BasicPlanTask(BaseTask):
    """
    Easy task: generate a basic plan for a beginner.

    Scenario: 35-year-old beginner, no injuries, 3 days/week, home, no equipment.
    Grader: 4 criteria worth 0.25 each.
    The step budget is 3 (``max_steps``).

    The grader never raises on malformed submissions: anything that is not a
    JSON object with a list of day objects simply fails the structural
    criteria.
    """

    client_profile = {
        "age": 35,
        "fitness_level": "beginner",
        "goal": "general fitness",
        "equipment": [],
        "injuries": [],
        "days_per_week": 3,
    }
    max_steps = 3

    # Exercises that require equipment --- flag any appearance
    EQUIPMENT_EXERCISES = {
        "barbell", "dumbbell", "kettlebell", "cable", "machine",
        "bench press", "squat rack", "pull-up bar", "resistance band",
        "treadmill", "stationary bike",
    }

    # Advanced movements not appropriate for beginners
    ADVANCED_MOVEMENTS = {
        "muscle-up", "muscle up", "handstand push-up", "handstand pushup",
        "pistol squat", "one-arm push-up", "planche", "front lever",
        "back lever", "dragon flag",
    }

    @staticmethod
    def _day_exercises(day: Any):
        """Return the day's exercise list, or None when the day is malformed."""
        if not isinstance(day, dict):
            return None
        exercises = day.get("exercises", [])
        return exercises if isinstance(exercises, list) else None

    @staticmethod
    def _day_is_well_formed(exercises: Any) -> bool:
        """True when a day has 4-8 exercise objects, each with sets and reps."""
        if exercises is None or not 4 <= len(exercises) <= 8:
            return False
        return all(
            isinstance(e, dict) and e.get("sets") and e.get("reps")
            for e in exercises
        )

    def grade(self, action: FitscriptAction, step: int) -> Tuple[float, Dict[str, float], str]:
        """Score a submitted plan against the four beginner-plan criteria.

        Args:
            action: Submission whose ``plan`` attribute holds the plan as a
                JSON string. A missing, unparseable, or non-object plan is
                graded as an empty plan.
            step: Step counter supplied by the caller (unused by this grader).

        Returns:
            ``(score, breakdown, feedback)`` where ``score`` is the sum of the
            four 0.25-point criteria in ``breakdown``.
        """
        scores: Dict[str, float] = {}
        feedback_parts = []

        plan_text = action.plan or ""
        plan_text_lower = plan_text.lower()
        try:
            plan = json.loads(plan_text) if plan_text else {}
        except json.JSONDecodeError:
            plan = {}
        # Valid JSON may still be a list, number, or string; treat as empty.
        if not isinstance(plan, dict):
            plan = {}

        days = plan.get("days", plan.get("workout_days", []))
        day_list = days if isinstance(days, list) else []
        # One entry per day: its exercise list, or None for a malformed day.
        per_day = [self._day_exercises(day) for day in day_list]
        has_exercises = any(exs for exs in per_day if exs is not None)

        # Criterion 1: Plan contains exactly 3 workout days
        if isinstance(days, list) and len(days) == 3:
            scores["three_days"] = 0.25
            feedback_parts.append("✓ Plan has exactly 3 workout days.")
        else:
            scores["three_days"] = 0.0
            found = len(days) if isinstance(days, list) else "unknown"
            feedback_parts.append(f"✗ Expected 3 workout days, found {found}.")

        # Criterion 2: All exercises are bodyweight-only.
        # sorted() keeps the feedback independent of set iteration order.
        equipment_found = sorted(
            e for e in self.EQUIPMENT_EXERCISES if e in plan_text_lower
        )
        if not has_exercises:
            # Without exercises the check would pass vacuously.
            scores["bodyweight_only"] = 0.0
            feedback_parts.append("✗ Cannot verify bodyweight-only requirement: no exercises found.")
        elif not equipment_found:
            scores["bodyweight_only"] = 0.25
            feedback_parts.append("✓ No equipment required --- all bodyweight exercises.")
        else:
            scores["bodyweight_only"] = 0.0
            feedback_parts.append(f"✗ Equipment-dependent exercises found: {equipment_found[:3]}.")

        # Criterion 3: Each day has 4-8 exercises with sets and reps defined
        if day_list:
            days_ok = sum(1 for exs in per_day if self._day_is_well_formed(exs))
            if days_ok == len(day_list):
                scores["exercise_structure"] = 0.25
                feedback_parts.append("✓ Each day has 4-8 exercises with sets and reps defined.")
            else:
                scores["exercise_structure"] = 0.0
                feedback_parts.append(
                    f"✗ {days_ok}/{len(day_list)} days have 4-8 exercises with sets+reps. "
                    "Ensure every exercise has 'sets' and 'reps' fields."
                )
        else:
            scores["exercise_structure"] = 0.0
            feedback_parts.append("✗ Cannot evaluate exercise structure: no days found.")

        # Criterion 4: Beginner-appropriate (reps <= 15, no advanced movements)
        advanced_found = sorted(
            m for m in self.ADVANCED_MOVEMENTS if m in plan_text_lower
        )
        # Hand the shared helper only well-formed exercise objects so that
        # malformed entries cannot make it raise.
        sanitized_plan = {
            "days": [
                {"exercises": [e for e in exs if isinstance(e, dict)]}
                for exs in per_day
                if exs is not None
            ]
        }
        reps_too_high = _check_reps_exceed(sanitized_plan, max_reps=15)
        if not has_exercises:
            scores["beginner_appropriate"] = 0.0
            feedback_parts.append("✗ Cannot verify beginner suitability: no exercises found.")
        elif not advanced_found and not reps_too_high:
            scores["beginner_appropriate"] = 0.25
            feedback_parts.append("✓ Plan is beginner-appropriate (no advanced movements, reps ≤ 15).")
        else:
            scores["beginner_appropriate"] = 0.0
            if advanced_found:
                feedback_parts.append(f"✗ Advanced movements not suitable for beginners: {advanced_found}.")
            if reps_too_high:
                feedback_parts.append("✗ Some exercises have reps > 15 --- too high for a beginner.")

        score = sum(scores.values())
        feedback = " ".join(feedback_parts)
        return score, scores, feedback


# ---------------------------------------------------------------------------
# Task 2 - MEDIUM: Injury-Safe Plan Modification
# ---------------------------------------------------------------------------

class InjurySafeTask(BaseTask):
    """
    Medium task: make an existing plan safe for a lower-back injury.

    Scenario: Intermediate client with lower-back injury. Pre-generated plan
    contains back squats, deadlifts, and bent-over rows. Agent must modify safely.
    Grader: 4 criteria worth 0.25 each, evaluated on the lower-cased plan text.
    The step budget is 5 (``max_steps``).
    """

    client_profile = {
        "age": 30,
        "fitness_level": "intermediate",
        "goal": "strength maintenance",
        "equipment": ["barbell", "dumbbells", "cables", "machines"],
        "injuries": ["lower back"],
        "days_per_week": 4,
        "initial_plan": {
            "days": [
                {
                    "name": "Day 1 - Lower Body",
                    "exercises": [
                        {"name": "Back Squat", "sets": 4, "reps": 8},
                        {"name": "Deadlift", "sets": 3, "reps": 5},
                        {"name": "Leg Press", "sets": 3, "reps": 10},
                        {"name": "Calf Raises", "sets": 4, "reps": 15},
                    ],
                },
                {
                    "name": "Day 2 - Upper Body",
                    "exercises": [
                        {"name": "Bench Press", "sets": 4, "reps": 8},
                        {"name": "Bent-Over Row", "sets": 4, "reps": 8},
                        {"name": "Overhead Press", "sets": 3, "reps": 10},
                        {"name": "Pull-Up", "sets": 3, "reps": "max"},
                    ],
                },
            ]
        },
    }
    max_steps = 5

    DEADLIFT_REPLACEMENTS = {
        "romanian deadlift", "rdl", "leg press", "leg curl",
        "hip thrust", "glute bridge", "trap bar deadlift",
    }
    SQUAT_REPLACEMENTS = {
        "goblet squat", "wall sit", "wall squat", "leg press",
        "box squat", "safety bar squat", "hack squat",
    }
    ROW_REPLACEMENTS = {
        "seated cable row", "seated row", "machine row",
        "chest-supported row", "chest supported row",
        "t-bar row", "seal row",
    }
    ORIGINAL_MUSCLE_GROUPS = {"quads", "hamstrings", "glutes", "back", "chest", "shoulders"}

    # Substrings used as a proxy for "back work is still programmed".
    BACK_KEYWORDS = ("row", "pull", "lat", "back", "rhomboid")
    # Substrings used as a proxy for "leg work is still programmed". A bare
    # "press" is deliberately absent: it would let bench/overhead press count
    # as leg work ("leg press" is already covered by "leg").
    LEG_KEYWORDS = ("squat", "lunge", "hip", "glute", "quad", "hamstring", "leg")

    def grade(self, action: FitscriptAction, step: int) -> Tuple[float, Dict[str, float], str]:
        """Score a modified plan for lower-back safety.

        All checks are substring tests on the lower-cased plan text, so the
        plan does not need to be valid JSON.

        Args:
            action: Submission whose ``plan`` attribute holds the plan text;
                a missing plan is graded as empty text.
            step: Step counter supplied by the caller (unused by this grader).

        Returns:
            ``(score, breakdown, feedback)`` where ``score`` is the sum of the
            four 0.25-point criteria in ``breakdown``.
        """
        scores: Dict[str, float] = {}
        feedback_parts = []
        plan_text_lower = (action.plan or "").lower()
        # Hyphen/underscore-insensitive view so "bent-over row", "bent over
        # row" and "trap-bar deadlift" are all recognised.
        normalized = plan_text_lower.replace("-", " ").replace("_", " ")

        # Criterion 1: Deadlifts removed or replaced with safe alternatives.
        # Blank out the accepted deadlift variants first; any "deadlift" that
        # remains is a conventional (or otherwise unapproved) deadlift. This
        # way a plan listing both "Deadlift" and "Romanian Deadlift" is still
        # flagged, and an unrelated word containing "rdl" excuses nothing.
        remaining = normalized
        for safe_variant in self.DEADLIFT_REPLACEMENTS:
            if "deadlift" in safe_variant:
                remaining = remaining.replace(safe_variant, " ")
        raw_deadlift = "deadlift" in remaining
        if not raw_deadlift:
            scores["deadlift_removed"] = 0.25
            feedback_parts.append("✓ Conventional deadlift removed or replaced safely.")
        else:
            scores["deadlift_removed"] = 0.0
            feedback_parts.append(
                "✗ Conventional deadlift still present. Replace with Romanian deadlift, leg press, or hip thrust."
            )

        # Criterion 2: Back squats replaced with safe alternatives
        has_back_squat = "back squat" in normalized
        if not has_back_squat:
            scores["squat_replaced"] = 0.25
            feedback_parts.append("✓ Back squat removed or replaced safely.")
        else:
            scores["squat_replaced"] = 0.0
            feedback_parts.append(
                "✗ Back squat still present. Replace with goblet squat, wall sit, or leg press."
            )

        # Criterion 3: Bent-over rows replaced with seated/machine variants
        has_bent_over_row = "bent over row" in normalized
        if not has_bent_over_row:
            scores["rows_replaced"] = 0.25
            feedback_parts.append("✓ Bent-over rows removed or replaced with spine-neutral variant.")
        else:
            scores["rows_replaced"] = 0.0
            feedback_parts.append(
                "✗ Bent-over rows still present. Replace with seated cable rows or machine rows."
            )

        # Criterion 4: Plan retains same muscle group targets
        # Proxy: check that back/leg work still appears in the plan
        back_work = any(t in plan_text_lower for t in self.BACK_KEYWORDS)
        leg_work = any(t in plan_text_lower for t in self.LEG_KEYWORDS)
        if back_work and leg_work:
            scores["muscle_targets_retained"] = 0.25
            feedback_parts.append("✓ Original muscle groups (back, legs) still targeted despite modifications.")
        else:
            scores["muscle_targets_retained"] = 0.0
            missing = []
            if not back_work:
                missing.append("back")
            if not leg_work:
                missing.append("legs")
            feedback_parts.append(
                f"✗ Missing muscle group coverage: {missing}. Ensure modifications keep the same target areas."
            )

        score = sum(scores.values())
        feedback = " ".join(feedback_parts)
        return score, scores, feedback


# ---------------------------------------------------------------------------
# Task 3 - HARD: Periodized 4-Week Program
# ---------------------------------------------------------------------------

class PeriodizedProgramTask(BaseTask):
    """
    Hard task: design a periodized 4-week powerlifting block.

    Scenario: Advanced powerlifter, 5 days/week, full gym, competition in 5 weeks.
    Needs 4-week block with deload in week 4.
    Grader: 5 criteria worth 0.2 each (three of them allow 0.1 partial credit).
    The step budget is 8 (``max_steps``).

    The grader never raises on malformed submissions: non-object JSON,
    non-object weeks, and non-numeric intensities are scored as missing data.
    """

    client_profile = {
        "age": 27,
        "fitness_level": "advanced",
        "goal": "powerlifting competition prep",
        "equipment": ["full gym", "barbell", "squat rack", "bench", "deadlift platform"],
        "injuries": [],
        "days_per_week": 5,
        "competition_weeks_out": 5,
        "weak_points": ["upper back", "lockout strength"],
        "current_maxes": {"squat": 180, "bench": 120, "deadlift": 220},
    }
    max_steps = 8

    COMPETITION_LIFTS = {"squat", "bench", "bench press", "deadlift"}

    @staticmethod
    def _as_number(value: Any):
        """Coerce an intensity-like value to float, or None if not numeric.

        Accepts ints, floats, and numeric strings with an optional trailing
        percent sign (e.g. ``"80%"``); booleans are rejected.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return float(value.strip().rstrip("%").strip())
            except ValueError:
                return None
        return None

    @staticmethod
    def _week_days(week: Any) -> list:
        """Return the week's list of training days, or [] when malformed."""
        if not isinstance(week, dict):
            return []
        days = week.get("days", week.get("training_days", []))
        return days if isinstance(days, list) else []

    def _week_intensity(self, week: Any):
        """Return a comparable intensity for a week, or None if unknown.

        Explicit ``intensity`` / ``avg_rpe`` / ``percentage`` fields win (the
        first one holding a non-zero number); otherwise the value is inferred
        from "heavy"/"high" (85) or "moderate"/"medium" (75) appearing
        anywhere in the week's text.
        """
        if isinstance(week, dict):
            for key in ("intensity", "avg_rpe", "percentage"):
                value = self._as_number(week.get(key))
                if value:
                    return value
        desc = str(week).lower()
        if "heavy" in desc or "high" in desc:
            return 85.0
        if "moderate" in desc or "medium" in desc:
            return 75.0
        return None

    @staticmethod
    def _week_volume(week: Any) -> float:
        """Estimate a week's volume, returning 0.0 for unusable data."""
        if not isinstance(week, dict):
            return 0.0
        sanitized = dict(week)
        total_sets = sanitized.get("total_sets", 0)
        # A non-numeric total_sets would be string-repeated or raise inside
        # the shared estimator, so neutralise it up front.
        if isinstance(total_sets, bool) or not isinstance(total_sets, (int, float)):
            sanitized["total_sets"] = 0
        try:
            return _estimate_volume(sanitized)
        except (AttributeError, TypeError):
            # Malformed day/exercise entries: treat as "no volume data".
            return 0.0

    def grade(self, action: FitscriptAction, step: int) -> Tuple[float, Dict[str, float], str]:
        """Score a submitted 4-week program against the five criteria.

        Args:
            action: Submission whose ``plan`` attribute holds the program as
                a JSON string. A missing, unparseable, or non-object plan is
                graded as an empty program (text-based criteria still apply).
            step: Step counter supplied by the caller (unused by this grader).

        Returns:
            ``(score, breakdown, feedback)`` where ``score`` is the sum of
            ``breakdown`` capped at 1.0.
        """
        scores: Dict[str, float] = {}
        feedback_parts = []

        plan_text = action.plan or ""
        try:
            plan = json.loads(plan_text) if plan_text else {}
        except json.JSONDecodeError:
            plan = {}
        # Valid JSON may still be a list, number, or string; treat as empty.
        if not isinstance(plan, dict):
            plan = {}

        weeks = plan.get("weeks", [])
        week_list = weeks if isinstance(weeks, list) else []

        # Criterion 1: 4 distinct weeks, each with 5 training days
        if len(week_list) == 4:
            all_five_days = all(len(self._week_days(w)) == 5 for w in week_list)
            if all_five_days:
                scores["week_structure"] = 0.2
                feedback_parts.append("✓ 4 weeks present, each with 5 training days.")
            else:
                scores["week_structure"] = 0.1
                feedback_parts.append(
                    "~ 4 weeks present but not all weeks have exactly 5 training days."
                )
        else:
            scores["week_structure"] = 0.0
            found_weeks = len(weeks) if isinstance(weeks, list) else "unknown"
            feedback_parts.append(
                f"✗ Expected 4 weeks with 5 days each. Found {found_weeks} weeks."
            )

        # Criterion 2: Weeks 1-3 show progressive overload
        if len(week_list) >= 3:
            # Intensities are normalised to floats so they always compare
            # numerically, never lexicographically or across types.
            intensities = [self._week_intensity(w) for w in week_list[:3]]
            all_known = all(i is not None for i in intensities)

            if all_known and intensities[0] < intensities[1] < intensities[2]:
                scores["progressive_overload"] = 0.2
                feedback_parts.append("✓ Weeks 1-3 show clear progressive overload (increasing intensity).")
            elif all_known:
                scores["progressive_overload"] = 0.1
                feedback_parts.append(
                    "~ Intensity values present but progressive overload pattern not clearly ascending across weeks 1-3."
                )
            else:
                scores["progressive_overload"] = 0.0
                feedback_parts.append(
                    "✗ Cannot verify progressive overload. Add 'intensity', 'avg_rpe', or 'percentage' fields to each week."
                )
        else:
            scores["progressive_overload"] = 0.0
            feedback_parts.append("✗ Fewer than 3 weeks present; cannot verify progressive overload.")

        # Criterion 3: Week 4 is a deload (volume reduced >= 40% vs week 3)
        if len(week_list) == 4:
            w3 = week_list[2]
            w4 = week_list[3]
            w3_vol = self._week_volume(w3)
            w4_vol = self._week_volume(w4)
            is_deload_label = "deload" in str(w4).lower()
            if w3_vol > 0 and w4_vol > 0:
                reduction = (w3_vol - w4_vol) / w3_vol
                if reduction >= 0.40:
                    scores["deload_week"] = 0.2
                    feedback_parts.append(
                        f"✓ Week 4 deload: volume reduced by {reduction*100:.0f}% vs week 3."
                    )
                elif is_deload_label:
                    scores["deload_week"] = 0.1
                    feedback_parts.append(
                        "~ Week 4 labeled as deload but volume reduction < 40%. Reduce total sets/volume further."
                    )
                else:
                    scores["deload_week"] = 0.0
                    feedback_parts.append(
                        f"✗ Week 4 volume only reduced by {reduction*100:.0f}%. Deload requires >= 40% reduction."
                    )
            elif is_deload_label:
                scores["deload_week"] = 0.1
                feedback_parts.append(
                    "~ Week 4 labeled as deload but no volume data to verify the 40% reduction threshold."
                )
            else:
                scores["deload_week"] = 0.0
                feedback_parts.append(
                    "✗ Week 4 not identified as a deload and volume data insufficient to verify."
                )
        else:
            scores["deload_week"] = 0.0
            feedback_parts.append("✗ Fewer than 4 weeks present; cannot evaluate deload week.")

        # Criterion 4: All three competition lifts are mentioned in the plan
        # (substring check on the raw text; placement is not verified).
        plan_text_lower = plan_text.lower()
        squat_present = "squat" in plan_text_lower
        bench_present = "bench" in plan_text_lower
        deadlift_present = "deadlift" in plan_text_lower
        if squat_present and bench_present and deadlift_present:
            scores["competition_lifts"] = 0.2
            feedback_parts.append("✓ All three competition lifts (squat, bench, deadlift) present as primary movements.")
        else:
            missing = []
            if not squat_present:
                missing.append("squat")
            if not bench_present:
                missing.append("bench press")
            if not deadlift_present:
                missing.append("deadlift")
            scores["competition_lifts"] = 0.0
            feedback_parts.append(f"✗ Missing competition lifts: {missing}.")

        # Criterion 5: Accessory work targets weak points (upper back, lockout).
        # Counts how many of the keywords appear anywhere in the plan text.
        weak_point_keywords = ["face pull", "upper back", "row", "rdl", "pause", "lockout", "band pull apart", "rear delt"]
        accessory_hits = sum(1 for kw in weak_point_keywords if kw in plan_text_lower)
        if accessory_hits >= 3:
            scores["accessory_weak_points"] = 0.2
            feedback_parts.append("✓ Accessory work targets weak points (upper back, lockout strength).")
        elif accessory_hits >= 1:
            scores["accessory_weak_points"] = 0.1
            feedback_parts.append("~ Some accessory work present but weak points (upper back, lockout) not fully addressed.")
        else:
            scores["accessory_weak_points"] = 0.0
            feedback_parts.append("✗ No accessory work targeting weak points (upper back, lockout strength).")

        # Cap guards against floating-point drift above 1.0.
        score = min(1.0, sum(scores.values()))
        feedback = " ".join(feedback_parts)
        return score, scores, feedback


# ---------------------------------------------------------------------------
# Helper utilities
# ---------------------------------------------------------------------------

def _extract_exercises(plan: dict) -> list:
    """Flatten all exercises from all days in a plan.

    Days are read from the plan's ``"days"`` key, falling back to
    ``"workout_days"``. Malformed input never raises: a non-dict plan, a
    non-list days value, non-dict day entries, and non-list ``"exercises"``
    values are all skipped.

    Args:
        plan: Parsed plan object (normally a dict decoded from JSON).

    Returns:
        A new list with every exercise entry, in day order.
    """
    if not isinstance(plan, dict):
        return []
    days = plan.get("days", plan.get("workout_days", []))
    if not isinstance(days, (list, tuple)):
        return []

    exercises = []
    for day in days:
        if not isinstance(day, dict):
            continue
        day_exercises = day.get("exercises", [])
        # e.g. {"exercises": null} in the submitted JSON
        if isinstance(day_exercises, (list, tuple)):
            exercises.extend(day_exercises)
    return exercises


def _check_reps_exceed(plan: dict, max_reps: int) -> bool:
    """Return True if any exercise in the plan has reps > max_reps.

    Numeric strings such as ``"20"`` are compared by value so that quoting a
    number does not bypass the cap. Non-numeric reps (e.g. ``"max"``),
    booleans, and entries that are not exercise objects are ignored.

    Args:
        plan: Parsed plan object, as accepted by ``_extract_exercises``.
        max_reps: Largest rep count that is still acceptable.
    """
    for ex in _extract_exercises(plan):
        if not isinstance(ex, dict):
            continue
        reps = ex.get("reps")
        if isinstance(reps, bool):
            # bool is an int subclass; it is never a real rep count.
            continue
        if isinstance(reps, str):
            try:
                reps = float(reps.strip())
            except ValueError:
                continue
        if isinstance(reps, (int, float)) and reps > max_reps:
            return True
    return False


def _estimate_volume(week: dict) -> float:
    """Estimate total volume (sets × reps) across all days in a week.

    Days are read from ``"days"``, falling back to ``"training_days"``. Only
    exercise objects whose ``sets`` and ``reps`` are both real numbers
    contribute. When that yields nothing, a numeric ``"total_sets"`` on the
    week is used instead, assuming roughly 8 reps per set.

    Malformed input never raises: a non-dict week, non-list days or
    exercises, non-dict entries, and non-numeric values are skipped.

    Args:
        week: Parsed week object (normally a dict decoded from JSON).

    Returns:
        The estimated volume as a float; 0.0 when nothing usable is found.
    """
    def is_number(value) -> bool:
        # bool is an int subclass; exclude it so True/False never count.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    if not isinstance(week, dict):
        return 0.0

    total = 0
    days = week.get("days", week.get("training_days", []))
    if isinstance(days, (list, tuple)):
        for day in days:
            if not isinstance(day, dict):
                continue
            exercises = day.get("exercises", [])
            if not isinstance(exercises, (list, tuple)):
                continue
            for ex in exercises:
                if not isinstance(ex, dict):
                    continue
                sets = ex.get("sets", 0)
                reps = ex.get("reps", 0)
                if is_number(sets) and is_number(reps):
                    total += sets * reps

    # Also accept a flat 'total_sets' key on the week
    if total == 0:
        total_sets = week.get("total_sets", 0)
        if is_number(total_sets):
            total = total_sets * 8  # assume ~8 reps avg if only sets given
    return float(total)


# ---------------------------------------------------------------------------
# Task registry
# ---------------------------------------------------------------------------

# Maps each task_id to a task instance. The instances are created once, when
# this module is imported, and FitscriptEnvironment looks them up by task_id
# (its constructor rejects any id that is not a key here).
TASKS: Dict[str, BaseTask] = {
    "basic_plan": BasicPlanTask(),
    "injury_safe_modification": InjurySafeTask(),
    "periodized_program": PeriodizedProgramTask(),
}


# ---------------------------------------------------------------------------
# Main environment class
# ---------------------------------------------------------------------------

class FitscriptEnvironment(Environment):
    """
    FitScript fitness prescription environment.

    Three tasks of increasing difficulty:
      - basic_plan (easy): generate a 3-day bodyweight beginner plan
      - injury_safe_modification (medium): modify a plan for a lower-back-injured client
      - periodized_program (hard): design a 4-week periodized powerlifting block

    Rewards are always in [0.0, 1.0]. Episodes terminate on task completion
    (score >= 0.99) or when max_steps is reached.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    # Exercise-name fragments that trigger the safety penalty, per injury.
    # Terms are written without hyphens because plan text is normalised
    # (hyphens/underscores -> spaces) before matching.
    CONTRAINDICATED = {
        "lower back": ("deadlift", "back squat", "good morning", "bent over row"),
        "knee": ("lunge", "leg press", "deep squat", "box jump"),
        "shoulder": ("overhead press", "upright row", "behind neck"),
    }
    # Variants the task grader itself accepts as safe replacements. They are
    # blanked out before matching so that, e.g., "Romanian deadlift" (which
    # the grader's feedback recommends) is not penalised as a "deadlift".
    SAFE_VARIANTS = {
        "lower back": ("romanian deadlift", "trap bar deadlift"),
    }
    # Amount subtracted from the score when a contraindicated exercise is found.
    SAFETY_PENALTY = 0.3

    def __init__(self, task_id: str = "basic_plan"):
        """
        Initialize the FitScript environment.

        Args:
            task_id: One of 'basic_plan', 'injury_safe_modification', 'periodized_program'.

        Raises:
            ValueError: If ``task_id`` is not a key of ``TASKS``.
        """
        if task_id not in TASKS:
            raise ValueError(
                f"Unknown task_id '{task_id}'. Valid options: {list(TASKS.keys())}"
            )
        self._task_id = task_id
        self._state = State(episode_id=str(uuid4()), step_count=0)
        # Last graded plan text, used to reject verbatim resubmissions.
        self._last_plan: str = ""

    def _observation(
        self,
        task: BaseTask,
        *,
        feedback: str,
        breakdown: Dict[str, float],
        reward: float,
        done: bool,
    ) -> FitscriptObservation:
        """Build an observation for the current task and step count."""
        return FitscriptObservation(
            client_profile=task.client_profile,
            feedback=feedback,
            score_breakdown=breakdown,
            task_id=self._task_id,
            step_count=self._state.step_count,
            done=done,
            reward=reward,
        )

    def _has_contraindicated_exercise(self, task: BaseTask, plan_text: str) -> bool:
        """Return True if the plan names an exercise unsafe for any client injury."""
        normalized = plan_text.lower().replace("-", " ").replace("_", " ")
        for injury in task.client_profile.get("injuries", []):
            key = str(injury).lower()
            remaining = normalized
            for safe_variant in self.SAFE_VARIANTS.get(key, ()):
                remaining = remaining.replace(safe_variant, " ")
            if any(term in remaining for term in self.CONTRAINDICATED.get(key, ())):
                return True
        return False

    def reset(self) -> FitscriptObservation:
        """
        Reset the environment for the current task.

        Starts a new episode (fresh episode id, step count 0) and forgets the
        previously submitted plan.

        Returns:
            FitscriptObservation with the client profile and welcome message.
        """
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._last_plan = ""

        return self._observation(
            TASKS[self._task_id],
            feedback="Welcome! Review the client profile and generate a plan.",
            breakdown={},
            reward=0.0,
            done=False,
        )

    def step(self, action: FitscriptAction) -> FitscriptObservation:  # type: ignore[override]
        """
        Execute a step: grade the submitted plan and return feedback.

        Empty plans and verbatim resubmissions of the previous plan consume a
        step and earn a reward of 0.0 without being graded.

        Args:
            action: FitscriptAction with action_type, plan JSON string, and optional reasoning.

        Returns:
            FitscriptObservation with score breakdown and feedback.
        """
        self._state.step_count += 1
        task = TASKS[self._task_id]
        out_of_steps = self._state.step_count >= task.max_steps

        # Penalty: empty or null plan
        if not action.plan or action.plan.strip() in ("", "null", "{}"):
            return self._observation(
                task,
                feedback="✗ Empty or null plan submitted. Please provide a structured workout plan.",
                breakdown={},
                reward=0.0,
                done=out_of_steps,
            )

        # Penalty: identical plan submitted twice in a row
        if action.plan == self._last_plan:
            return self._observation(
                task,
                feedback="✗ Identical plan submitted twice. Please revise based on the previous feedback.",
                breakdown={},
                reward=0.0,
                done=out_of_steps,
            )

        self._last_plan = action.plan

        # Grade the plan
        score, breakdown, feedback = task.grade(action, self._state.step_count)

        # Safety penalty: contraindicated exercises for injured clients.
        # Applied at most once per step, however many exercises match.
        if self._has_contraindicated_exercise(task, action.plan):
            score = max(0.0, score - self.SAFETY_PENALTY)
            feedback += " ⚠️ Safety penalty applied: plan contains exercises contraindicated for the client's injury."

        # Clamp to [0.0, 1.0]
        score = max(0.0, min(1.0, score))

        return self._observation(
            task,
            feedback=feedback,
            breakdown=breakdown,
            reward=score,
            done=score >= 0.99 or out_of_steps,
        )

    @property
    def state(self) -> State:
        """Get the current environment state."""
        return self._state