# lifeops-openenv / scripts/colab_train_minimal.py
# avlukas's picture
# Update for lifeops-openenv Space
# 2eab9e8
"""
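Minimal LifeOps training script for Colab (OpenEnv 0.2.1 + Unsloth/TRL).

Uses OpenEnv (stable 0.2.1) deployed on HF Spaces. Each "# Cell N" section
below is a template: copy its contents into a Colab cell and run it.
Cells 1-2 are commented out (remove the leading "# "); Cells 3a-5 are wrapped
in triple-quoted strings (copy the text between the quotes).

Requirements:
- OpenEnv env deployed at: https://YOUR_ORG-lifeops-env.hf.space
  (Create a Docker Space, push Dockerfile.openenv, env/, openenv_lifeops/)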
"""
# =============================================================================
# Cell 1: Install dependencies (OpenEnv 0.2.1 + TRL)
# =============================================================================
# !pip install -q openenv-core==0.2.1 transformers trl torch datasets accelerate python-dotenv
# # Optional: Unsloth for 2x faster LoRA
# # !pip install -q unsloth
# =============================================================================
# Cell 2: Clone repo and setup path
# =============================================================================
# !git clone -q -b openenv-integration https://github.com/avlukas04/adaptive-planner-env.git
# import sys
# sys.path.insert(0, "adaptive-planner-env")
# =============================================================================
# Cell 3a: Use OpenEnv remote env (connects to HF Space)
# =============================================================================
"""
# Set your LifeOps OpenEnv Space URL (after deploying)
# Format: https://ORG-SPACE-NAME.hf.space (replace / with -)
LIFEOPS_ENV_URL = "https://avlukas-lifeops-openenv.hf.space"

from openenv_lifeops.env_adapter import LifeOpsEnvAdapter
from env.lifeops_env import _choose_simple_action
from training.train_rl import collect_trajectory

# Connect to OpenEnv env on HF Spaces
env = LifeOpsEnvAdapter(base_url=LIFEOPS_ENV_URL)

# Collect trajectories (uses remote env); keep only positive-reward episodes.
trajectories = []
try:
    for ep in range(10):
        traj, reward, _, _ = collect_trajectory(env, policy="heuristic")
        print(f"Episode {ep+1}: reward={reward:.2f}")
        if reward > 0:
            trajectories.append((traj, reward))
finally:
    # Always release the remote connection, even if a rollout raises.
    env.close()

print(f"Collected {len(trajectories)} positive-reward trajectories")
"""
# =============================================================================
# Cell 3b: Use local env (no deployment; for quick testing)
# =============================================================================
"""
from env.lifeops_env import LifeOpsEnv, _choose_simple_action
from training.train_rl import collect_trajectory

# Local, in-process environment created with a fixed seed.
env = LifeOpsEnv(seed=42)

# Roll out ten heuristic-policy episodes; keep only positive-reward ones.
trajectories = []
for ep in range(10):
    traj, reward, _, _ = collect_trajectory(env, policy="heuristic")
    print(f"Episode {ep+1}: reward={reward:.2f}")
    if reward > 0:
        trajectories.append((traj, reward))

print(f"Collected {len(trajectories)} positive-reward trajectories")
"""
# =============================================================================
# Cell 4: Train with HF TRL SFTTrainer (OpenEnv env + reward signal)
# =============================================================================
"""
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from env.lifeops_env import LifeOpsEnv
from env.actions import generate_valid_actions
from agent.llm_agent import _state_to_prompt
from training.train_rl import collect_trajectory

# Collect trajectories (local or use LifeOpsEnvAdapter for remote)
env = LifeOpsEnv(seed=42)
rows = []
for _ in range(20):
    traj, reward, _, _ = collect_trajectory(env, policy="heuristic")
    # Only positive-reward episodes become training examples.
    if reward <= 0:
        continue
    # Use at most the first four steps of each kept episode.
    for step in traj[:4]:
        obs = step["obs"]
        action_dict = step["action"]
        valid = generate_valid_actions(obs)
        prompt = _state_to_prompt(obs, valid)
        at = action_dict.get("action_type", "?")
        # 1-based position of the first valid action whose action_type matches
        # the recorded action.
        # NOTE(review): when nothing matches this silently falls back to 1, so
        # the example is labelled with an option that was not taken - confirm.
        idx = next(
            (i + 1 for i, a in enumerate(valid) if a.to_dict().get("action_type") == at),
            1,
        )
        completion = f"Reasoning: Aligned with persona preferences. CHOICE: {idx}"
        rows.append({"text": prompt + "\n" + completion})

# Fall back to one hand-written example so the dataset is never empty.
if not rows:
    rows = [{"text": "Persona: Early-bird\nCalendar: empty\nOptions: 1. accept 2. reject\nCHOICE: 1"}]

dataset = Dataset.from_list(rows)
print(f"Dataset size: {len(dataset)}")

model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# dataset_text_field is set on SFTConfig rather than passed to SFTTrainer.
# NOTE(review): Cell 1 installs trl unpinned; newer TRL releases are believed
# to rename max_seq_length -> max_length and tokenizer -> processing_class.
# Adjust these two names to the installed TRL version.
config = SFTConfig(
    output_dir="./lifeops_sft",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    max_seq_length=256,
    dataset_text_field="text",
)
trainer = SFTTrainer(
    model=model,
    args=config,
    train_dataset=dataset,
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model("./lifeops_sft")
print("Done. Model saved to ./lifeops_sft")
"""
# =============================================================================
# Cell 5: Run train_rl loop (local or OpenEnv)
# =============================================================================
"""
from training.train_rl import train

# Run ten heuristic-policy episodes through the project's train() loop and
# report the "avg_reward" entry of whatever it returns.
result = train(policy="heuristic", num_episodes=10)
avg_reward = result["avg_reward"]
print(f"Avg reward: {avg_reward:.2f}")
"""