# [page-header residue, not part of the script] Spaces: Sleeping / Sleeping
"""
Minimal LifeOps training script for Colab (OpenEnv 0.2.1 + Unsloth/TRL).

Uses OpenEnv (stable 0.2.1) deployed on HF Spaces. Each "# Cell N" section
below is one Colab cell: copy it into a cell, then either uncomment the
commented-out lines (Cells 1-2) or paste the contents of the triple-quoted
block without the surrounding quotes (Cells 3a-5), and run.

Requirements:
- OpenEnv env deployed at: https://YOUR_ORG-lifeops-env.hf.space
  (Create a Docker Space, push Dockerfile.openenv, env/, openenv_lifeops/)
"""
# =============================================================================
# Cell 1: Install dependencies (OpenEnv 0.2.1 + TRL)
# =============================================================================
# !pip install -q openenv-core==0.2.1 transformers trl torch datasets accelerate python-dotenv
# # Optional: Unsloth for 2x faster LoRA
# # !pip install -q unsloth
# =============================================================================
# Cell 2: Clone repo and setup path
# =============================================================================
# !git clone -q -b openenv-integration https://github.com/avlukas04/adaptive-planner-env.git
# import sys
# sys.path.insert(0, "adaptive-planner-env")
# =============================================================================
# Cell 3a: Use OpenEnv remote env (connects to HF Space)
# =============================================================================
"""
# Set your LifeOps OpenEnv Space URL (after deploying)
# Format: https://ORG-SPACE-NAME.hf.space (replace / with -)
LIFEOPS_ENV_URL = "https://avlukas-lifeops-openenv.hf.space"

from openenv_lifeops.env_adapter import LifeOpsEnvAdapter
from training.train_rl import collect_trajectory

# Connect to OpenEnv env on HF Spaces
env = LifeOpsEnvAdapter(base_url=LIFEOPS_ENV_URL)

# Collect trajectories (uses remote env); only positive-reward episodes are kept
trajectories = []
try:
    for ep in range(10):
        traj, reward, _, _ = collect_trajectory(env, policy="heuristic")
        print(f"Episode {ep+1}: reward={reward:.2f}")
        if reward > 0:
            trajectories.append((traj, reward))
finally:
    # Always close the env connection, even if an episode raises mid-collection
    env.close()

print(f"Collected {len(trajectories)} positive-reward trajectories")
"""
# =============================================================================
# Cell 3b: Use local env (no deployment; for quick testing)
# =============================================================================
"""
from env.lifeops_env import LifeOpsEnv
from training.train_rl import collect_trajectory

# Local, seeded environment: no HF Space required
env = LifeOpsEnv(seed=42)

# Run 10 heuristic episodes and keep only those with positive reward
trajectories = []
for ep in range(10):
    traj, reward, _, _ = collect_trajectory(env, policy="heuristic")
    print(f"Episode {ep+1}: reward={reward:.2f}")
    if reward > 0:
        trajectories.append((traj, reward))

print(f"Collected {len(trajectories)} positive-reward trajectories")
"""
# =============================================================================
# Cell 4: Train with HF TRL SFTTrainer (OpenEnv env + reward signal)
# =============================================================================
"""
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from env.lifeops_env import LifeOpsEnv
from env.actions import generate_valid_actions
from agent.llm_agent import _state_to_prompt
from training.train_rl import collect_trajectory

# Collect trajectories (local or use LifeOpsEnvAdapter for remote)
env = LifeOpsEnv(seed=42)
rows = []
for _ in range(20):
    traj, reward, _, _ = collect_trajectory(env, policy="heuristic")
    # Reward is used as a filter: only positive-reward episodes become SFT data
    if reward <= 0:
        continue
    # Only the first 4 steps of each kept episode are turned into samples
    for step in traj[:4]:
        obs = step["obs"]
        action_dict = step["action"]
        valid = generate_valid_actions(obs)
        valid_dicts = [a.to_dict() for a in valid]
        prompt = _state_to_prompt(obs, valid)
        at = action_dict.get("action_type", "?")

        # Prefer the option that is exactly the action taken; several valid
        # options may share one action_type, so a type-only match can pick
        # the wrong option number.
        idx = next((i for i, d in enumerate(valid_dicts, start=1) if d == action_dict), None)
        if idx is None:
            # Fall back to the first option with the same action_type
            idx = next(
                (i for i, d in enumerate(valid_dicts, start=1) if d.get("action_type") == at),
                None,
            )
        if idx is None:
            # No option corresponds to the taken action: skip the step rather
            # than emit a sample labelled with an arbitrary choice.
            continue

        completion = f"Reasoning: Aligned with persona preferences. CHOICE: {idx}"
        rows.append({"text": prompt + "\n" + completion})

# Guarantee a non-empty dataset so the trainer can still be smoke-tested
if not rows:
    rows = [{"text": "Persona: Early-bird\nCalendar: empty\nOptions: 1. accept 2. reject\nCHOICE: 1"}]

dataset = Dataset.from_list(rows)
print(f"Dataset size: {len(dataset)}")

model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): SFTTrainer is documented for causal LMs; confirm that a
# seq2seq model (flan-t5) trains as intended with this TRL version.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Cell 1 installs trl unpinned, so use the current keyword names:
# dataset_text_field and max_length belong on SFTConfig, and the tokenizer is
# passed to SFTTrainer as processing_class.
config = SFTConfig(
    output_dir="./lifeops_sft",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    max_length=256,
    dataset_text_field="text",
)
trainer = SFTTrainer(
    model=model,
    args=config,
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
trainer.save_model("./lifeops_sft")
print("Done. Model saved to ./lifeops_sft")
"""
# =============================================================================
# Cell 5: Run train_rl loop (local or OpenEnv)
# =============================================================================
"""
from training.train_rl import train

# Run 10 episodes with the heuristic policy and report the "avg_reward" entry
summary = train(policy="heuristic", num_episodes=10)
avg_reward = summary["avg_reward"]
print(f"Avg reward: {avg_reward:.2f}")
"""