Spaces:

mpnikhil
/

skill-invocation-env

Running

App Files Community

mpnikhil committed on 29 days ago

Commit

f678c99

verified ·

1 Parent(s): ac627d5

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

models.py +1 -1
train_demo.py +147 -112

models.py CHANGED Viewed

@@ -40,7 +40,7 @@ class SkillInvocationAction(Action):
         default=None, description='Skill ID (required for load/unload)'
     )
     answer: Optional[str] = Field(
-        default=None, description='Solution text (required for submit)'
     )

         default=None, description='Skill ID (required for load/unload)'
     )
     answer: Optional[str] = Field(
+        default=None, description='Solution text (required for submit)', max_length=100000
     )

train_demo.py CHANGED Viewed

@@ -1,123 +1,158 @@
-#!/usr/bin/env python3
-"""
-Minimal TRL + OpenEnv integration demo for the Skill Invocation Environment.
-This script demonstrates how to connect to the environment and run episodes.
-It can be run in Google Colab with Unsloth for actual RL training.
-Setup (Colab):
-    !pip install unsloth openenv-core trl
-    !pip install skill_invocation_env  # or install from local
-Usage:
-    # Against a local server:
-    python train_demo.py --base-url http://localhost:8000
-    # Against a HuggingFace Space:
-    python train_demo.py --base-url https://YOUR-SPACE.hf.space
-"""
-import sys
-import os
-# For local testing without server, use direct environment
-sys.path.insert(0, os.path.dirname(__file__))
-def demo_direct():
-    """Demo using the environment directly (no server needed)."""
-    from models import SkillInvocationAction
-    from server.skill_invocation_env_environment import SkillInvocationEnvironment
-    print("=== Direct Environment Demo ===\n")
-    env = SkillInvocationEnvironment()
-    # Run 3 episodes
-    for episode in range(3):
-        obs = env.reset(seed=episode)
-        print(f"--- Episode {episode + 1} ---")
-        print(f"Task: {obs.task_description[:100]}...")
-        print(f"Difficulty: {obs.difficulty}")
-        print(f"Skills available: {[s['name'] for s in obs.skill_catalog]}")
-        print(f"Context budget: {obs.context_budget_used}/{obs.context_budget_total}")
-        # Strategy: load the first skill in catalog
-        if obs.skill_catalog:
-            skill = obs.skill_catalog[0]
-            print(f"\nLoading skill: {skill['name']} ({skill['id']})")
-            obs = env.step(SkillInvocationAction(
-                action_type="load",
-                skill_id=skill["id"],
-            ))
-            if obs.skill_content:
-                print(f"Got skill content ({len(obs.skill_content)} chars)")
-                print(f"Preview: {obs.skill_content[:150]}...")
-                print(f"Context: {obs.context_budget_used}/{obs.context_budget_total}")
-        # Submit a dummy answer
-        print("\nSubmitting answer...")
-        obs = env.step(SkillInvocationAction(
-            action_type="submit",
-            answer="This is a placeholder answer for demonstration.",
-        ))
-        print(f"Done: {obs.done}")
-        print(f"Reward: {obs.reward}")
-        print(f"Verification: {obs.verification_result}")
-        print()
-    print("Demo complete!")
-def demo_client(base_url: str):
-    """Demo using the WebSocket client against a running server."""
-    from client import SkillInvocationEnv
-    from models import SkillInvocationAction
-    print(f"=== Client Demo (connecting to {base_url}) ===\n")
-    with SkillInvocationEnv(base_url=base_url) as client:
-        # Reset
-        result = client.reset()
-        obs = result.observation
-        print(f"Task: {obs.task_description[:100]}...")
-        print(f"Skills available: {[s['name'] for s in obs.skill_catalog]}")
-        # Load first skill
-        if obs.skill_catalog:
-            skill = obs.skill_catalog[0]
-            result = client.step(SkillInvocationAction(
-                action_type="load",
-                skill_id=skill["id"],
-            ))
-            print(f"\nLoaded '{skill['name']}'")
-            if result.observation.skill_content:
-                print(f"Content preview: {result.observation.skill_content[:200]}...")
-        # Submit
-        result = client.step(SkillInvocationAction(
-            action_type="submit",
-            answer="test answer",
-        ))
-        print(f"\nReward: {result.reward}")
-        print(f"Done: {result.done}")
-        print(f"Verification: {result.observation.verification_result}")
-    print("\nClient demo complete!")
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description="Skill Invocation Env Demo")
-    parser.add_argument(
-        "--base-url",
-        default=None,
-        help="Server URL (if not provided, runs directly without server)",
     )
-    args = parser.parse_args()
-    if args.base_url:
-        demo_client(args.base_url)
-    else:
-        demo_direct()

+import re
+import os
+import torch
+from datasets import Dataset
+from trl import GRPOConfig, GRPOTrainer
+from transformers import AutoTokenizer
+from skill_invocation_env.client import SkillInvocationEnv
+from skill_invocation_env.models import SkillInvocationAction
+# Configuration
+# Use 3B or 7B Qwen2.5 Coder. 3B fits very comfortably with batching on an H100.
+MODEL_ID = "Qwen/Qwen2.5-Coder-3B-Instruct"
+ENV_URL = "https://mpnikhil-skill-invocation-env.hf.space"
+HF_TOKEN = os.getenv("HF_TOKEN")
+SYSTEM_PROMPT = """You are an expert AI software engineer. You will be given a task and a catalog of available skills (procedural knowledge).
+You must decide which skills to load to help you solve the task, and then submit your final answer.
+You must interact by outputting EXACTLY ONE of the following XML actions per turn:
+1. To load a skill to read its contents (costs context budget):
+<action type="load" skill_id="skill_01"/>
+2. To unload a skill if it is not useful (frees context budget):
+<action type="unload" skill_id="skill_01"/>
+3. To submit your final solution:
+<action type="submit">
+def your_code_here():
+    pass
+</action>
+Always think step-by-step before outputting an action.
+"""
+def parse_action(text: str) -> SkillInvocationAction:
+    """Parses the LLM's text output into a Pydantic Action object."""
+    load_match = re.search(r'<action\s+type="load"\s+skill_id="([^\"]+)"\s*/>', text)
+    if load_match:
+        return SkillInvocationAction(action_type="load", skill_id=load_match.group(1))
+    unload_match = re.search(r'<action\s+type="unload"\s+skill_id="([^\"]+)"\s*/>', text)
+    if unload_match:
+        return SkillInvocationAction(action_type="unload", skill_id=unload_match.group(1))
+    submit_match = re.search(r'<action\s+type="submit">(.*?)</action>', text, re.DOTALL)
+    if submit_match:
+        return SkillInvocationAction(action_type="submit", answer=submit_match.group(1).strip())
+    # Fallback if the model fails to follow format
+    return SkillInvocationAction(action_type="submit", answer=text)
+def format_observation(obs) -> str:
+    """Formats the Pydantic observation into a string for the LLM."""
+    prompt = f"TASK: {obs.task_description}\n\nSKILL CATALOG:\n"
+    for s in obs.skill_catalog:
+        prompt += f"- [{s['id']}] {s['name']}: {s['description']}\n"
+    if obs.skill_content:
+        prompt += f"\nJUST LOADED SKILL CONTENT:\n{obs.skill_content}\n"
+    prompt += f"\nBUDGET USED: {obs.context_budget_used} / {obs.context_budget_total}"
+    return prompt
+def rollout_func(prompts: list[str], trainer: GRPOTrainer):
+    """
+    Custom rollout function that handles multi-step interaction with the OpenEnv Space.
+    """
+    # 1. Setup clients for this batch
+    clients = [SkillInvocationEnv(base_url=ENV_URL) for _ in range(len(prompts))]
+    active_episodes = [True] * len(prompts)
+    # Initialize histories
+    histories = []
+    for _ in prompts:
+        histories.append([{"role": "system", "content": SYSTEM_PROMPT}])
+    # Start environments
+    for i, client in enumerate(clients):
+        res = client.reset()
+        histories[i].append({"role": "user", "content": format_observation(res.observation)})
+    # Multi-step generation loop (Max 4 turns: e.g., load, load, submit)
+    MAX_TURNS = 4
+    tokenizer = trainer.processing_class
+    all_rewards = [0.0] * len(prompts)
+    for turn in range(MAX_TURNS):
+        active_indices = [i for i, active in enumerate(active_episodes) if active]
+        if not active_indices:
+            break
+        # Format active prompts for vLLM
+        active_prompts = [tokenizer.apply_chat_template(histories[i], tokenize=False, add_generation_prompt=True) for i in active_indices]
+        # Generate completions
+        outputs = trainer.generate(active_prompts, max_new_tokens=512)
+        completions = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
+        # Step environments
+        for idx, completion in zip(active_indices, completions):
+            histories[idx].append({"role": "assistant", "content": completion})
+            action = parse_action(completion)
+            try:
+                res = clients[idx].step(action)
+                if res.done:
+                    active_episodes[idx] = False
+                    all_rewards[idx] = res.reward
+                else:
+                    histories[idx].append({"role": "user", "content": format_observation(res.observation)})
+            except Exception as e:
+                # Penalty for formatting errors or invalid actions
+                active_episodes[idx] = False
+                all_rewards[idx] = -1.0
+    return {
+        "env_reward": all_rewards,
+    }
+def reward_from_env(completions, **kwargs):
+    """Callback for TRL to fetch the rewards computed during the rollout."""
+    return kwargs.get("env_reward", [0.0] * len(completions))
+if __name__ == "__main__":
+    print(f"Starting GRPO Training on H100 with {MODEL_ID}...")
+    # Create dummy dataset (the rollout_func overrides the prompt anyway by calling env.reset())
+    dummy_dataset = Dataset.from_dict({"prompt": ["Start"] * 64})
+    training_args = GRPOConfig(
+        use_vllm=True,
+        vllm_mode="colocate", # Runs vLLM and PyTorch on the same H100 GPU!
+        num_train_epochs=1,
+        num_generations=8,    # How many rollout trajectories to try per prompt
+        max_completion_length=1024,
+        per_device_train_batch_size=8,
+        logging_steps=1,
+        output_dir="./outputs/qwen-skill-env",
     )
+    trainer = GRPOTrainer(
+        model=MODEL_ID,
+        reward_funcs=[reward_from_env],
+        train_dataset=dummy_dataset,
+        rollout_func=rollout_func,
+        args=training_args,
+    )
+    trainer.train()
+    print("Training complete! Pushing to hub...")
+    trainer.push_to_hub("mpnikhil/Qwen2.5-3B-Skill-Invocation", token=HF_TOKEN)