fitscript-env / server /FitScript_environment.py
coffeine16's picture
clean initial commit
0446283
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
FitScript Environment Implementation.
Simulates a real-world fitness prescription task: generating, evaluating,
and refining personalized workout plans. Supports three tasks of increasing
difficulty with deterministic graders.
"""
import json
from uuid import uuid4
from typing import Dict, Any, Tuple
from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State
try:
from ..models import FitscriptAction, FitscriptObservation
except ImportError:
from models import FitscriptAction, FitscriptObservation
# ---------------------------------------------------------------------------
# Grader base class
# ---------------------------------------------------------------------------
class BaseTask:
    """Common interface implemented by every FitScript grading task.

    Concrete tasks override the two class attributes below and provide
    their own :meth:`grade` implementation.
    """

    # Client description exposed to the agent; overridden per task.
    client_profile: dict = {}
    # Step budget for the task; overridden per task.
    max_steps: int = 5

    def grade(
        self,
        action: FitscriptAction,
        step: int,
    ) -> Tuple[float, Dict[str, float], str]:
        """Grade a submitted action.

        Every concrete task must override this method and return a
        ``(score, breakdown, feedback)`` triple, where ``score`` is a float
        in [0, 1], ``breakdown`` maps criterion names to partial scores and
        ``feedback`` is a human-readable string.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
# ---------------------------------------------------------------------------
# Task 1 - EASY: Basic Plan Generation
# ---------------------------------------------------------------------------
class BasicPlanTask(BaseTask):
    """
    Task 1 (easy): basic plan generation for a beginner.

    Scenario: 35-year-old beginner, no injuries, 3 days/week, home, no equipment.
    Grader: 4 criteria worth 0.25 each (day count, bodyweight-only,
    per-day exercise structure, beginner-appropriateness).
    ``max_steps`` caps the task at 3 steps.

    The grader tolerates malformed submissions (invalid JSON, JSON that is
    not an object, days/exercises of the wrong type): the affected criteria
    simply score 0 instead of raising.
    """

    client_profile = {
        "age": 35,
        "fitness_level": "beginner",
        "goal": "general fitness",
        "equipment": [],
        "injuries": [],
        "days_per_week": 3,
    }
    max_steps = 3

    # Exercises that require equipment --- flag any appearance
    EQUIPMENT_EXERCISES = {
        "barbell", "dumbbell", "kettlebell", "cable", "machine",
        "bench press", "squat rack", "pull-up bar", "resistance band",
        "treadmill", "stationary bike",
    }

    # Advanced movements not appropriate for beginners
    ADVANCED_MOVEMENTS = {
        "muscle-up", "muscle up", "handstand push-up", "handstand pushup",
        "pistol squat", "one-arm push-up", "planche", "front lever",
        "back lever", "dragon flag",
    }

    @staticmethod
    def _parse_plan(plan_text: str) -> dict:
        """Parse the plan JSON, returning ``{}`` unless it is a JSON object."""
        if not plan_text:
            return {}
        try:
            parsed = json.loads(plan_text)
        except json.JSONDecodeError:
            return {}
        # Valid JSON can still be a list/number/string, which has no .get().
        return parsed if isinstance(parsed, dict) else {}

    @staticmethod
    def _day_is_well_structured(day: Any) -> bool:
        """Return True if a day holds 4-8 exercises, each with sets and reps."""
        if not isinstance(day, dict):
            return False
        exercises = day.get("exercises", [])
        if not isinstance(exercises, list) or not 4 <= len(exercises) <= 8:
            return False
        return all(
            isinstance(ex, dict) and bool(ex.get("sets")) and bool(ex.get("reps"))
            for ex in exercises
        )

    @staticmethod
    def _sanitized_plan(days: Any) -> dict:
        """Rebuild a plan containing only dict days and dict exercises.

        The module-level rep helper assumes well-formed entries, so it is
        handed this cleaned copy rather than the raw submission.
        """
        clean_days = []
        if isinstance(days, list):
            for day in days:
                if not isinstance(day, dict):
                    continue
                exercises = day.get("exercises", [])
                if isinstance(exercises, list):
                    clean_days.append(
                        {"exercises": [ex for ex in exercises if isinstance(ex, dict)]}
                    )
        return {"days": clean_days}

    def grade(self, action: FitscriptAction, step: int) -> Tuple[float, Dict[str, float], str]:
        """Score a submitted beginner plan against the four criteria.

        Args:
            action: Submission whose ``plan`` attribute is a JSON string.
            step: Current step number (not used by this grader).

        Returns:
            ``(score, breakdown, feedback)`` where ``score`` is the sum of
            the four 0.25-weighted criteria in ``breakdown``.
        """
        scores: Dict[str, float] = {}
        feedback_parts = []

        plan_text = action.plan or ""
        plan_text_lower = plan_text.lower()
        plan = self._parse_plan(plan_text)
        days = plan.get("days", plan.get("workout_days", []))
        days_is_list = isinstance(days, list)

        # Criterion 1: Plan contains exactly 3 workout days
        if days_is_list and len(days) == 3:
            scores["three_days"] = 0.25
            feedback_parts.append("✓ Plan has exactly 3 workout days.")
        else:
            scores["three_days"] = 0.0
            found = len(days) if days_is_list else "unknown"
            feedback_parts.append(f"✗ Expected 3 workout days, found {found}.")

        # Criterion 2: All exercises are bodyweight-only.
        # Sorted so the feedback text does not depend on set iteration order.
        equipment_found = sorted(
            e for e in self.EQUIPMENT_EXERCISES if e in plan_text_lower
        )
        if not equipment_found:
            scores["bodyweight_only"] = 0.25
            feedback_parts.append("✓ No equipment required --- all bodyweight exercises.")
        else:
            scores["bodyweight_only"] = 0.0
            feedback_parts.append(f"✗ Equipment-dependent exercises found: {equipment_found[:3]}.")

        # Criterion 3: Each day has 4-8 exercises with sets and reps defined
        if days_is_list and len(days) > 0:
            days_ok = sum(1 for day in days if self._day_is_well_structured(day))
            if days_ok == len(days):
                scores["exercise_structure"] = 0.25
                feedback_parts.append("✓ Each day has 4-8 exercises with sets and reps defined.")
            else:
                scores["exercise_structure"] = 0.0
                feedback_parts.append(
                    f"✗ {days_ok}/{len(days)} days have 4-8 exercises with sets+reps. "
                    "Ensure every exercise has 'sets' and 'reps' fields."
                )
        else:
            scores["exercise_structure"] = 0.0
            feedback_parts.append("✗ Cannot evaluate exercise structure: no days found.")

        # Criterion 4: Beginner-appropriate (reps <= 15, no advanced movements)
        advanced_found = sorted(
            m for m in self.ADVANCED_MOVEMENTS if m in plan_text_lower
        )
        reps_too_high = _check_reps_exceed(self._sanitized_plan(days), max_reps=15)
        if not advanced_found and not reps_too_high:
            scores["beginner_appropriate"] = 0.25
            feedback_parts.append("✓ Plan is beginner-appropriate (no advanced movements, reps ≤ 15).")
        else:
            scores["beginner_appropriate"] = 0.0
            if advanced_found:
                feedback_parts.append(f"✗ Advanced movements not suitable for beginners: {advanced_found}.")
            if reps_too_high:
                feedback_parts.append("✗ Some exercises have reps > 15 --- too high for a beginner.")

        score = sum(scores.values())
        feedback = " ".join(feedback_parts)
        return score, scores, feedback
# ---------------------------------------------------------------------------
# Task 2 - MEDIUM: Injury-Safe Plan Modification
# ---------------------------------------------------------------------------
class InjurySafeTask(BaseTask):
    """
    Task 2 (medium): injury-safe plan modification.

    Scenario: Intermediate client with lower-back injury. Pre-generated plan
    contains back squats, deadlifts, and bent-over rows. Agent must modify safely.
    ``max_steps`` caps the task at 5 steps.

    Grading is keyword-based over the lower-cased plan text: four criteria
    worth 0.25 each (deadlift, back squat, bent-over row removed; back and
    leg work still present).
    """

    client_profile = {
        "age": 30,
        "fitness_level": "intermediate",
        "goal": "strength maintenance",
        "equipment": ["barbell", "dumbbells", "cables", "machines"],
        "injuries": ["lower back"],
        "days_per_week": 4,
        "initial_plan": {
            "days": [
                {
                    "name": "Day 1 - Lower Body",
                    "exercises": [
                        {"name": "Back Squat", "sets": 4, "reps": 8},
                        {"name": "Deadlift", "sets": 3, "reps": 5},
                        {"name": "Leg Press", "sets": 3, "reps": 10},
                        {"name": "Calf Raises", "sets": 4, "reps": 15},
                    ],
                },
                {
                    "name": "Day 2 - Upper Body",
                    "exercises": [
                        {"name": "Bench Press", "sets": 4, "reps": 8},
                        {"name": "Bent-Over Row", "sets": 4, "reps": 8},
                        {"name": "Overhead Press", "sets": 3, "reps": 10},
                        {"name": "Pull-Up", "sets": 3, "reps": "max"},
                    ],
                },
            ]
        },
    }
    max_steps = 5

    DEADLIFT_REPLACEMENTS = {
        "romanian deadlift", "rdl", "leg press", "leg curl",
        "hip thrust", "glute bridge", "trap bar deadlift",
    }
    SQUAT_REPLACEMENTS = {
        "goblet squat", "wall sit", "wall squat", "leg press",
        "box squat", "safety bar squat", "hack squat",
    }
    ROW_REPLACEMENTS = {
        "seated cable row", "seated row", "machine row",
        "chest-supported row", "chest supported row",
        "t-bar row", "seal row",
    }
    ORIGINAL_MUSCLE_GROUPS = {"quads", "hamstrings", "glutes", "back", "chest", "shoulders"}

    # Keyword proxies for criterion 4. A bare "press" is deliberately absent
    # from the leg list: it would match "Bench Press"/"Overhead Press" and make
    # the leg check pass for any plan that keeps upper-body work ("leg press"
    # is still covered by "leg").
    _BACK_KEYWORDS = ("row", "pull", "lat", "back", "rhomboid")
    _LEG_KEYWORDS = ("squat", "lunge", "hip", "glute", "quad", "hamstring", "leg")

    def _has_conventional_deadlift(self, plan_text_lower: str) -> bool:
        """Return True if a deadlift other than an approved variant remains.

        Approved variants whose names contain "deadlift" (Romanian, trap bar)
        are blanked out first, so they neither count as a conventional
        deadlift nor excuse one that is still programmed next to them.
        """
        remaining = plan_text_lower
        for variant in self.DEADLIFT_REPLACEMENTS:
            if "deadlift" in variant:
                remaining = remaining.replace(variant, " ")
        return "deadlift" in remaining

    def grade(self, action: FitscriptAction, step: int) -> Tuple[float, Dict[str, float], str]:
        """Score a modified plan for the lower-back-injured client.

        Args:
            action: Submission whose ``plan`` attribute holds the plan text.
            step: Current step number (not used by this grader).

        Returns:
            ``(score, breakdown, feedback)`` where ``score`` is the sum of
            the four 0.25-weighted criteria in ``breakdown``.
        """
        scores: Dict[str, float] = {}
        feedback_parts = []
        plan_text_lower = (action.plan or "").lower()

        # Criterion 1: Deadlifts removed or replaced with safe alternatives
        if not self._has_conventional_deadlift(plan_text_lower):
            scores["deadlift_removed"] = 0.25
            feedback_parts.append("✓ Conventional deadlift removed or replaced safely.")
        else:
            scores["deadlift_removed"] = 0.0
            feedback_parts.append(
                "✗ Conventional deadlift still present. Replace with Romanian deadlift, leg press, or hip thrust."
            )

        # Criterion 2: Back squats replaced with safe alternatives
        if "back squat" not in plan_text_lower:
            scores["squat_replaced"] = 0.25
            feedback_parts.append("✓ Back squat removed or replaced safely.")
        else:
            scores["squat_replaced"] = 0.0
            feedback_parts.append(
                "✗ Back squat still present. Replace with goblet squat, wall sit, or leg press."
            )

        # Criterion 3: Bent-over rows replaced with seated/machine variants
        has_bent_over_row = (
            "bent-over row" in plan_text_lower or "bent over row" in plan_text_lower
        )
        if not has_bent_over_row:
            scores["rows_replaced"] = 0.25
            feedback_parts.append("✓ Bent-over rows removed or replaced with spine-neutral variant.")
        else:
            scores["rows_replaced"] = 0.0
            feedback_parts.append(
                "✗ Bent-over rows still present. Replace with seated cable rows or machine rows."
            )

        # Criterion 4: Plan retains same muscle group targets
        # Proxy: check that back/leg work still appears in the plan
        back_work = any(t in plan_text_lower for t in self._BACK_KEYWORDS)
        leg_work = any(t in plan_text_lower for t in self._LEG_KEYWORDS)
        if back_work and leg_work:
            scores["muscle_targets_retained"] = 0.25
            feedback_parts.append("✓ Original muscle groups (back, legs) still targeted despite modifications.")
        else:
            scores["muscle_targets_retained"] = 0.0
            missing = []
            if not back_work:
                missing.append("back")
            if not leg_work:
                missing.append("legs")
            feedback_parts.append(
                f"✗ Missing muscle group coverage: {missing}. Ensure modifications keep the same target areas."
            )

        score = sum(scores.values())
        feedback = " ".join(feedback_parts)
        return score, scores, feedback
# ---------------------------------------------------------------------------
# Task 3 - HARD: Periodized 4-Week Program
# ---------------------------------------------------------------------------
class PeriodizedProgramTask(BaseTask):
    """
    Task 3 (hard): periodized 4-week program.

    Scenario: Advanced powerlifter, 5 days/week, full gym, competition in 5 weeks.
    Needs 4-week block with deload in week 4.
    ``max_steps`` caps the task at 8 steps.

    Five criteria worth up to 0.2 each: week structure, progressive overload
    across weeks 1-3, week-4 deload, presence of the competition lifts, and
    accessory work for the client's weak points. Malformed submissions
    (invalid JSON, non-object JSON, weeks/days of the wrong type) score 0 on
    the affected criteria instead of raising.
    """

    client_profile = {
        "age": 27,
        "fitness_level": "advanced",
        "goal": "powerlifting competition prep",
        "equipment": ["full gym", "barbell", "squat rack", "bench", "deadlift platform"],
        "injuries": [],
        "days_per_week": 5,
        "competition_weeks_out": 5,
        "weak_points": ["upper back", "lockout strength"],
        "current_maxes": {"squat": 180, "bench": 120, "deadlift": 220},
    }
    max_steps = 8
    COMPETITION_LIFTS = {"squat", "bench", "bench press", "deadlift"}

    @staticmethod
    def _parse_plan(plan_text: str) -> dict:
        """Parse the plan JSON, returning ``{}`` unless it is a JSON object."""
        if not plan_text:
            return {}
        try:
            parsed = json.loads(plan_text)
        except json.JSONDecodeError:
            return {}
        return parsed if isinstance(parsed, dict) else {}

    @staticmethod
    def _week_days(week: Any) -> list:
        """Return the week's day list, or ``[]`` when it is missing or malformed."""
        if not isinstance(week, dict):
            return []
        days = week.get("days", week.get("training_days", []))
        return days if isinstance(days, list) else []

    @staticmethod
    def _to_number(value: Any):
        """Coerce an intensity value to float; return None if not numeric.

        Accepts ints/floats and numeric strings with an optional trailing
        percent sign (e.g. ``"80%"``).
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return float(value.strip().rstrip("%").strip())
            except ValueError:
                return None
        return None

    @classmethod
    def _week_intensity(cls, week: Any):
        """Return a numeric intensity for a week, or None if undeterminable.

        Explicit ``intensity`` / ``avg_rpe`` / ``percentage`` fields win (the
        first non-zero numeric one); otherwise the value is inferred from
        keywords in the week's textual representation.
        """
        if isinstance(week, dict):
            for key in ("intensity", "avg_rpe", "percentage"):
                number = cls._to_number(week.get(key))
                if number:  # zero is treated as "not provided"
                    return number
        # Try to infer from week label/description
        desc = str(week).lower()
        if "heavy" in desc or "high" in desc:
            return 85
        if "moderate" in desc or "medium" in desc:
            return 75
        return None

    @classmethod
    def _sanitize_week(cls, week: Any) -> dict:
        """Build a well-formed copy of a week for the volume helper.

        Keeps only dict exercises inside list-typed ``exercises`` and drops a
        non-numeric ``total_sets`` so the helper never sees malformed data.
        """
        clean_days = []
        for day in cls._week_days(week):
            exercises = day.get("exercises", []) if isinstance(day, dict) else []
            if not isinstance(exercises, list):
                exercises = []
            clean_days.append(
                {"exercises": [ex for ex in exercises if isinstance(ex, dict)]}
            )
        total_sets = week.get("total_sets", 0) if isinstance(week, dict) else 0
        if not isinstance(total_sets, (int, float)):
            total_sets = 0
        return {"days": clean_days, "total_sets": total_sets}

    def grade(self, action: FitscriptAction, step: int) -> Tuple[float, Dict[str, float], str]:
        """Score a submitted 4-week program against the five criteria.

        Args:
            action: Submission whose ``plan`` attribute is a JSON string.
            step: Current step number (not used by this grader).

        Returns:
            ``(score, breakdown, feedback)`` where ``score`` is the sum of
            the criteria in ``breakdown``, capped at 1.0.
        """
        scores: Dict[str, float] = {}
        feedback_parts = []

        plan_text = action.plan or ""
        plan = self._parse_plan(plan_text)
        weeks = plan.get("weeks", [])
        weeks_is_list = isinstance(weeks, list)

        # Criterion 1: 4 distinct weeks, each with 5 training days
        if weeks_is_list and len(weeks) == 4:
            all_five_days = all(len(self._week_days(w)) == 5 for w in weeks)
            if all_five_days:
                scores["week_structure"] = 0.2
                feedback_parts.append("✓ 4 weeks present, each with 5 training days.")
            else:
                scores["week_structure"] = 0.1
                feedback_parts.append(
                    "~ 4 weeks present but not all weeks have exactly 5 training days."
                )
        else:
            scores["week_structure"] = 0.0
            found_weeks = len(weeks) if weeks_is_list else "unknown"
            feedback_parts.append(
                f"✗ Expected 4 weeks with 5 days each. Found {found_weeks} weeks."
            )

        # Criterion 2: Weeks 1-3 show progressive overload
        if weeks_is_list and len(weeks) >= 3:
            intensities = [self._week_intensity(w) for w in weeks[:3]]
            all_known = all(i is not None for i in intensities)
            if all_known and intensities[0] < intensities[1] < intensities[2]:
                scores["progressive_overload"] = 0.2
                feedback_parts.append("✓ Weeks 1-3 show clear progressive overload (increasing intensity).")
            elif all_known:
                scores["progressive_overload"] = 0.1
                feedback_parts.append(
                    "~ Intensity values present but progressive overload pattern not clearly ascending across weeks 1-3."
                )
            else:
                scores["progressive_overload"] = 0.0
                feedback_parts.append(
                    "✗ Cannot verify progressive overload. Add 'intensity', 'avg_rpe', or 'percentage' fields to each week."
                )
        else:
            scores["progressive_overload"] = 0.0
            feedback_parts.append("✗ Fewer than 3 weeks present; cannot verify progressive overload.")

        # Criterion 3: Week 4 is a deload (volume reduced >= 40% vs week 3)
        if weeks_is_list and len(weeks) == 4:
            w3_vol = _estimate_volume(self._sanitize_week(weeks[2]))
            w4_vol = _estimate_volume(self._sanitize_week(weeks[3]))
            # The label is searched in the raw week so free-text entries count.
            is_deload_label = "deload" in str(weeks[3]).lower()
            if w3_vol > 0 and w4_vol > 0:
                reduction = (w3_vol - w4_vol) / w3_vol
                if reduction >= 0.40:
                    scores["deload_week"] = 0.2
                    feedback_parts.append(
                        f"✓ Week 4 deload: volume reduced by {reduction*100:.0f}% vs week 3."
                    )
                elif is_deload_label:
                    scores["deload_week"] = 0.1
                    feedback_parts.append(
                        "~ Week 4 labeled as deload but volume reduction < 40%. Reduce total sets/volume further."
                    )
                else:
                    scores["deload_week"] = 0.0
                    feedback_parts.append(
                        f"✗ Week 4 volume only reduced by {reduction*100:.0f}%. Deload requires >= 40% reduction."
                    )
            elif is_deload_label:
                scores["deload_week"] = 0.1
                feedback_parts.append(
                    "~ Week 4 labeled as deload but no volume data to verify the 40% reduction threshold."
                )
            else:
                scores["deload_week"] = 0.0
                feedback_parts.append(
                    "✗ Week 4 not identified as a deload and volume data insufficient to verify."
                )
        else:
            scores["deload_week"] = 0.0
            feedback_parts.append("✗ Fewer than 4 weeks present; cannot evaluate deload week.")

        # Criterion 4: Competition lifts appear in the plan.
        # NOTE: this is a presence-only text proxy; it does not check that
        # the lifts are primary movements or fall on separate days.
        plan_text_lower = plan_text.lower()
        squat_present = "squat" in plan_text_lower
        bench_present = "bench" in plan_text_lower
        deadlift_present = "deadlift" in plan_text_lower
        if squat_present and bench_present and deadlift_present:
            scores["competition_lifts"] = 0.2
            feedback_parts.append("✓ All three competition lifts (squat, bench, deadlift) present as primary movements.")
        else:
            missing = []
            if not squat_present:
                missing.append("squat")
            if not bench_present:
                missing.append("bench press")
            if not deadlift_present:
                missing.append("deadlift")
            scores["competition_lifts"] = 0.0
            feedback_parts.append(f"✗ Missing competition lifts: {missing}.")

        # Criterion 5: Accessory work targets weak points (upper back, lockout)
        weak_point_keywords = ["face pull", "upper back", "row", "rdl", "pause", "lockout", "band pull apart", "rear delt"]
        accessory_bonus = sum(1 for kw in weak_point_keywords if kw in plan_text_lower)
        if accessory_bonus >= 3:
            scores["accessory_weak_points"] = 0.2
            feedback_parts.append("✓ Accessory work targets weak points (upper back, lockout strength).")
        elif accessory_bonus >= 1:
            scores["accessory_weak_points"] = 0.1
            feedback_parts.append("~ Some accessory work present but weak points (upper back, lockout) not fully addressed.")
        else:
            scores["accessory_weak_points"] = 0.0
            feedback_parts.append("✗ No accessory work targeting weak points (upper back, lockout strength).")

        score = min(1.0, sum(scores.values()))
        feedback = " ".join(feedback_parts)
        return score, scores, feedback
# ---------------------------------------------------------------------------
# Helper utilities
# ---------------------------------------------------------------------------
def _extract_exercises(plan: dict) -> list:
"""Flatten all exercises from all days in a plan."""
exercises = []
for day in plan.get("days", plan.get("workout_days", [])):
if isinstance(day, dict):
exercises.extend(day.get("exercises", []))
return exercises
def _check_reps_exceed(plan: dict, max_reps: int) -> bool:
    """Return True if any exercise in the plan has reps > max_reps.

    Exercise entries that are not dicts are ignored. Numeric ``reps`` values
    are compared directly; plain numeric strings such as ``"20"`` are
    converted first so they cannot slip past the limit. Non-numeric values
    (e.g. ``"max"`` or a range like ``"10-12"``) are not evaluated.

    Args:
        plan: Parsed plan object.
        max_reps: Highest rep count that is still acceptable.
    """
    for ex in _extract_exercises(plan):
        if not isinstance(ex, dict):
            continue
        reps = ex.get("reps")
        if isinstance(reps, str):
            try:
                reps = float(reps.strip())
            except ValueError:
                continue
        if isinstance(reps, (int, float)) and reps > max_reps:
            return True
    return False
def _estimate_volume(week: dict) -> float:
"""Estimate total volume (sets × reps) across all days in a week."""
total = 0
for day in week.get("days", week.get("training_days", [])):
if isinstance(day, dict):
for ex in day.get("exercises", []):
sets = ex.get("sets", 0)
reps = ex.get("reps", 0)
if isinstance(sets, (int, float)) and isinstance(reps, (int, float)):
total += sets * reps
# Also accept a flat 'total_sets' key on the week
if total == 0:
total = week.get("total_sets", 0) * 8 # assume ~8 reps avg if only sets given
return float(total)
# ---------------------------------------------------------------------------
# Task registry
# ---------------------------------------------------------------------------
# One shared grader instance per task id; the environment looks tasks up here.
TASKS: Dict[str, BaseTask] = dict(
    basic_plan=BasicPlanTask(),
    injury_safe_modification=InjurySafeTask(),
    periodized_program=PeriodizedProgramTask(),
)
# ---------------------------------------------------------------------------
# Main environment class
# ---------------------------------------------------------------------------
class FitscriptEnvironment(Environment):
    """
    FitScript fitness prescription environment.

    Three tasks of increasing difficulty:
    - basic_plan (easy): generate a 3-day bodyweight beginner plan
    - injury_safe_modification (medium): modify a plan for a lower-back-injured client
    - periodized_program (hard): design a 4-week periodized powerlifting block

    Rewards are always in [0.0, 1.0]. Episodes terminate on task completion
    (score >= 0.99) or when max_steps is reached.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    # Substrings treated as contraindicated, keyed by injury name.
    CONTRAINDICATED: Dict[str, Tuple[str, ...]] = {
        "lower back": ("deadlift", "back squat", "good morning", "bent-over row", "bent over row"),
        "knee": ("lunge", "leg press", "deep squat", "box jump"),
        "shoulder": ("overhead press", "upright row", "behind neck"),
    }

    # Variants whose names contain a banned substring but which the task
    # graders accept as safe replacements. They are blanked out before the
    # contraindication scan; otherwise following InjurySafeTask's own
    # feedback ("Replace with Romanian deadlift ...") would trigger the
    # safety penalty and make a full score unreachable.
    SAFE_VARIANTS: Dict[str, Tuple[str, ...]] = {
        "lower back": ("romanian deadlift", "trap bar deadlift"),
    }

    # Amount subtracted from the score when a contraindicated exercise is found.
    SAFETY_PENALTY: float = 0.3

    def __init__(self, task_id: str = "basic_plan"):
        """
        Initialize the FitScript environment.

        Args:
            task_id: One of 'basic_plan', 'injury_safe_modification', 'periodized_program'.

        Raises:
            ValueError: If ``task_id`` is not a key of ``TASKS``.
        """
        if task_id not in TASKS:
            raise ValueError(
                f"Unknown task_id '{task_id}'. Valid options: {list(TASKS.keys())}"
            )
        self._task_id = task_id
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._last_plan: str = ""

    def _observation(
        self,
        task: BaseTask,
        feedback: str,
        breakdown: Dict[str, float],
        reward: float,
        done: bool,
    ) -> FitscriptObservation:
        """Build an observation for the current task and step count."""
        return FitscriptObservation(
            client_profile=task.client_profile,
            feedback=feedback,
            score_breakdown=breakdown,
            task_id=self._task_id,
            step_count=self._state.step_count,
            done=done,
            reward=reward,
        )

    def _has_contraindicated_exercise(self, plan_text: str, injuries) -> bool:
        """Return True if the plan text names an exercise banned for any injury."""
        plan_lower = plan_text.lower()
        for injury in injuries:
            scrubbed = plan_lower
            for variant in self.SAFE_VARIANTS.get(injury, ()):
                scrubbed = scrubbed.replace(variant, " ")
            if any(banned in scrubbed for banned in self.CONTRAINDICATED.get(injury, ())):
                return True
        return False

    def reset(self) -> FitscriptObservation:
        """
        Reset the environment for the current task.

        Starts a new episode (fresh episode id, step count 0) and forgets the
        previously submitted plan.

        Returns:
            FitscriptObservation with the client profile and welcome message.
        """
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._last_plan = ""
        return self._observation(
            TASKS[self._task_id],
            feedback="Welcome! Review the client profile and generate a plan.",
            breakdown={},
            reward=0.0,
            done=False,
        )

    def step(self, action: FitscriptAction) -> FitscriptObservation:  # type: ignore[override]
        """
        Execute a step: grade the submitted plan and return feedback.

        Empty plans and plans identical to the previous graded submission
        earn a reward of 0.0 without being graded. Otherwise the task grader
        runs, a safety penalty is subtracted if the client has an injury and
        the plan names a contraindicated exercise, and the result is clamped
        to [0.0, 1.0].

        Args:
            action: FitscriptAction with action_type, plan JSON string, and optional reasoning.

        Returns:
            FitscriptObservation with score breakdown and feedback.
        """
        self._state.step_count += 1
        task = TASKS[self._task_id]
        out_of_steps = self._state.step_count >= task.max_steps

        # Penalty: empty or null plan
        if not action.plan or action.plan.strip() in ("", "null", "{}"):
            return self._observation(
                task,
                feedback="✗ Empty or null plan submitted. Please provide a structured workout plan.",
                breakdown={},
                reward=0.0,
                done=out_of_steps,
            )

        # Penalty: identical plan submitted twice in a row
        if action.plan == self._last_plan:
            return self._observation(
                task,
                feedback="✗ Identical plan submitted twice. Please revise based on the previous feedback.",
                breakdown={},
                reward=0.0,
                done=out_of_steps,
            )
        self._last_plan = action.plan

        # Grade the plan
        score, breakdown, feedback = task.grade(action, self._state.step_count)

        # Safety penalty: contraindicated exercises for injured clients
        injuries = task.client_profile.get("injuries", [])
        if injuries and self._has_contraindicated_exercise(action.plan, injuries):
            score = max(0.0, score - self.SAFETY_PENALTY)
            feedback += " ⚠️ Safety penalty applied: plan contains exercises contraindicated for the client's injury."

        # Clamp to [0.0, 1.0]
        score = max(0.0, min(1.0, score))
        return self._observation(
            task,
            feedback=feedback,
            breakdown=breakdown,
            reward=score,
            done=score >= 0.99 or out_of_steps,
        )

    @property
    def state(self) -> State:
        """Get the current environment state."""
        return self._state