Spaces:

ARKAISW
/

QuantHive

Sleeping

App Files Files Community

ARKAISW committed on Apr 25

Commit

30a586b

1 Parent(s): a0d8bc5

fix(notebook): correct clone step order, extract prompt utils, fix github url

Browse files

Files changed (2) hide show

training/prompt_utils.py +152 -0
training/train_grpo_multiagent.py +1 -132

training/prompt_utils.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import sys
+import json
+import random
+from pathlib import Path
+from typing import Dict, List
+import numpy as np
+# ROOT is the directory one level above the folder that holds this file
+# (presumably the repository root, since this file lives in training/).
+# It is put at the front of sys.path, once, so that the `env.*` and
+# `training.*` imports that follow can be resolved from that directory.
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+from env.multi_agent_env import (
+    MultiAgentTradingEnv,
+    RISK_MANAGER,
+    PORTFOLIO_MGR,
+    TRADER,
+)
+from training.train_multi_agent import (
+    RuleRiskManagerPolicy,
+    RulePortfolioManagerPolicy,
+)
+# Instruction header that build_prompt_multiagent places at the top of every
+# Trader prompt. It fixes the <thought>/<action> reply layout and shows the
+# JSON keys (direction, size, sl, tp) that the action block must contain.
+SYSTEM_PROMPT = """You are a trading agent in a multi-agent governance system.
+The Risk Manager has set governance constraints, and the Portfolio Manager has allocated capital.
+Your job: propose a trade that maximizes profit while respecting these constraints.
+Respond exactly in this format:
+<thought>
+Your reasoning about the market state, risk constraints, and trade decision.
+</thought>
+<action>
+{"direction": 0, "size": 0.0, "sl": 0, "tp": 0}
+</action>
+"""
+def generate_pz_scenarios(
+    n: int = 500,
+    difficulty: str = "easy",
+    max_env_steps: int = 100,
+) -> List[Dict]:
+    """Run the PZ env with rule policies to generate realistic scenarios.
+    Each scenario captures:
+      - The Trader's full observation (29 dims)
+      - The RM constraints decoded from the message
+      - The PM allocation decoded from the message
+    """
+    env = MultiAgentTradingEnv(difficulty=difficulty, max_steps=max_env_steps)
+    rm_policy = RuleRiskManagerPolicy()
+    pm_policy = RulePortfolioManagerPolicy()
+    scenarios: List[Dict] = []
+    attempts = 0
+    max_attempts = n * 3
+    while len(scenarios) < n and attempts < max_attempts:
+        env.reset()
+        attempts += 1
+        step_count = 0
+        while env.agents and step_count < max_env_steps:
+            agent = env.agent_selection
+            if agent == RISK_MANAGER:
+                obs = env.observe(agent)
+                action = rm_policy.act(obs)
+                env.step(action)
+            elif agent == PORTFOLIO_MGR:
+                obs = env.observe(agent)
+                action = pm_policy.act(obs)
+                env.step(action)
+            elif agent == TRADER:
+                obs = env.observe(agent)
+                # Extract RM and PM messages from the observation
+                # obs layout: base(24) + rm_msg(3) + pm_msg(2) = 29
+                base_obs = obs[:24].tolist()
+                rm_msg = obs[24:27].tolist()  # [size_limit, allow_new, force_reduce]
+                pm_msg = obs[27:29].tolist()  # [cap_alloc, override_strength]
+                rm_size_limit = float(rm_msg[0])
+                rm_allow_new = bool(rm_msg[1] > 0.5)
+                rm_force_reduce = bool(rm_msg[2] > 0.5)
+                pm_cap_alloc = float(pm_msg[0])
+                pm_override = float(pm_msg[1])
+                scenarios.append({
+                    "state": [round(float(x), 4) for x in base_obs[:5]],
+                    "full_obs": [round(float(x), 4) for x in base_obs],
+                    "rm_size_limit": round(rm_size_limit, 3),
+                    "rm_allow_new": rm_allow_new,
+                    "rm_force_reduce": rm_force_reduce,
+                    "pm_cap_alloc": round(pm_cap_alloc, 3),
+                    "pm_override": round(pm_override, 3),
+                    "signals": {
+                        "ta": round(float(obs[5] * 2 - 1), 3),  # RSI mapped to [-1,1]
+                        "fa": round(float(obs[8]), 3),  # MACD as FA proxy
+                        "position_limit": round(rm_size_limit, 3),
+                        "rm_size_limit": round(rm_size_limit, 3),
+                    },
+                })
+                if len(scenarios) >= n:
+                    break
+                # Take a random trader action so the env advances
+                trader_action = {
+                    "direction": random.choice([0, 1, 2]),
+                    "size": np.array([random.uniform(0.05, 0.3)], dtype=np.float32),
+                    "sl": np.array([0.0], dtype=np.float32),
+                    "tp": np.array([0.0], dtype=np.float32),
+                }
+                env.step(trader_action)
+            step_count += 1
+    random.shuffle(scenarios)
+    return scenarios[:n]
+def build_prompt_multiagent(scenario: Dict) -> str:
+    """Build the prompt for the Trader, including RM and PM constraints."""
+    rm_limit = scenario["rm_size_limit"]
+    rm_allow_str = "allowed" if scenario.get("rm_allow_new", True) else "BLOCKED"
+    rm_force_str = "yes" if scenario.get("rm_force_reduce", False) else "no"
+    pm_cap = scenario["pm_cap_alloc"]
+    pm_override_str = "none" if scenario.get("pm_override", 0.0) < 0.5 else "ACTIVE"
+    state = scenario.get("state", [1.0, 1.0, 1.0, 1.0, 1.0])
+    signals = scenario.get("signals", {})
+    body = json.dumps({
+        "state": state,
+        "signals": signals,
+        "governance": {
+            "rm_size_limit": rm_limit,
+            "rm_allow_new": rm_allow_str,
+            "rm_force_reduce": rm_force_str,
+            "pm_cap_alloc": pm_cap,
+            "pm_override": pm_override_str,
+        },
+    }, separators=(",", ":"))
+    prompt = (
+        f"{SYSTEM_PROMPT}\n"
+        f"The Risk Manager has set the following constraints: "
+        f"size_limit={rm_limit:.2f}, new_positions={rm_allow_str}, force_reduce={rm_force_str}.\n"
+        f"The Portfolio Manager allocated: capital_cap={pm_cap:.2f}, override={pm_override_str}.\n\n"
+        f"Scenario:\n{body}\n"
+    )
+    return prompt

training/train_grpo_multiagent.py CHANGED Viewed

@@ -52,139 +52,8 @@ from training.train_multi_agent import (
 DEFAULT_MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit"
 DEFAULT_OUTPUT_DIR = "models/local_policy_grpo_multiagent"
-SYSTEM_PROMPT = """You are a trading agent in a multi-agent governance system.
-The Risk Manager has set governance constraints, and the Portfolio Manager has allocated capital.
-Your job: propose a trade that maximizes profit while respecting these constraints.
-Respond exactly in this format:
-<thought>
-Your reasoning about the market state, risk constraints, and trade decision.
-</thought>
-<action>
-{"direction": 0, "size": 0.0, "sl": 0, "tp": 0}
-</action>
-"""
-# ─── Scenario Generation from PettingZoo Env ──────────────────────────────────
-def generate_pz_scenarios(
-    n: int = 500,
-    difficulty: str = "easy",
-    max_env_steps: int = 100,
-) -> List[Dict]:
-    """Run the PZ env with rule policies to generate realistic scenarios.
-    Each scenario captures:
-      - The Trader's full observation (29 dims)
-      - The RM constraints decoded from the message
-      - The PM allocation decoded from the message
-    """
-    env = MultiAgentTradingEnv(difficulty=difficulty, max_steps=max_env_steps)
-    rm_policy = RuleRiskManagerPolicy()
-    pm_policy = RulePortfolioManagerPolicy()
-    scenarios: List[Dict] = []
-    attempts = 0
-    max_attempts = n * 3
-    while len(scenarios) < n and attempts < max_attempts:
-        env.reset()
-        attempts += 1
-        step_count = 0
-        while env.agents and step_count < max_env_steps:
-            agent = env.agent_selection
-            if agent == RISK_MANAGER:
-                obs = env.observe(agent)
-                action = rm_policy.act(obs)
-                env.step(action)
-            elif agent == PORTFOLIO_MGR:
-                obs = env.observe(agent)
-                action = pm_policy.act(obs)
-                env.step(action)
-            elif agent == TRADER:
-                obs = env.observe(agent)
-                # Extract RM and PM messages from the observation
-                # obs layout: base(24) + rm_msg(3) + pm_msg(2) = 29
-                base_obs = obs[:24].tolist()
-                rm_msg = obs[24:27].tolist()  # [size_limit, allow_new, force_reduce]
-                pm_msg = obs[27:29].tolist()  # [cap_alloc, override_strength]
-                rm_size_limit = float(rm_msg[0])
-                rm_allow_new = bool(rm_msg[1] > 0.5)
-                rm_force_reduce = bool(rm_msg[2] > 0.5)
-                pm_cap_alloc = float(pm_msg[0])
-                pm_override = float(pm_msg[1])
-                scenarios.append({
-                    "state": [round(float(x), 4) for x in base_obs[:5]],
-                    "full_obs": [round(float(x), 4) for x in base_obs],
-                    "rm_size_limit": round(rm_size_limit, 3),
-                    "rm_allow_new": rm_allow_new,
-                    "rm_force_reduce": rm_force_reduce,
-                    "pm_cap_alloc": round(pm_cap_alloc, 3),
-                    "pm_override": round(pm_override, 3),
-                    "signals": {
-                        "ta": round(float(obs[5] * 2 - 1), 3),  # RSI mapped to [-1,1]
-                        "fa": round(float(obs[8]), 3),  # MACD as FA proxy
-                        "position_limit": round(rm_size_limit, 3),
-                        "rm_size_limit": round(rm_size_limit, 3),
-                    },
-                })
-                if len(scenarios) >= n:
-                    break
-                # Take a random trader action so the env advances
-                trader_action = {
-                    "direction": random.choice([0, 1, 2]),
-                    "size": np.array([random.uniform(0.05, 0.3)], dtype=np.float32),
-                    "sl": np.array([0.0], dtype=np.float32),
-                    "tp": np.array([0.0], dtype=np.float32),
-                }
-                env.step(trader_action)
-            step_count += 1
-    random.shuffle(scenarios)
-    return scenarios[:n]
-def build_prompt_multiagent(scenario: Dict) -> str:
-    """Build the prompt for the Trader, including RM and PM constraints."""
-    rm_limit = scenario["rm_size_limit"]
-    rm_allow_str = "allowed" if scenario.get("rm_allow_new", True) else "BLOCKED"
-    rm_force_str = "yes" if scenario.get("rm_force_reduce", False) else "no"
-    pm_cap = scenario["pm_cap_alloc"]
-    pm_override_str = "none" if scenario.get("pm_override", 0.0) < 0.5 else "ACTIVE"
-    state = scenario.get("state", [1.0, 1.0, 1.0, 1.0, 1.0])
-    signals = scenario.get("signals", {})
-    body = json.dumps({
-        "state": state,
-        "signals": signals,
-        "governance": {
-            "rm_size_limit": rm_limit,
-            "rm_allow_new": rm_allow_str,
-            "rm_force_reduce": rm_force_str,
-            "pm_cap_alloc": pm_cap,
-            "pm_override": pm_override_str,
-        },
-    }, separators=(",", ":"))
-    prompt = (
-        f"{SYSTEM_PROMPT}\n"
-        f"The Risk Manager has set the following constraints: "
-        f"size_limit={rm_limit:.2f}, new_positions={rm_allow_str}, force_reduce={rm_force_str}.\n"
-        f"The Portfolio Manager allocated: capital_cap={pm_cap:.2f}, override={pm_override_str}.\n\n"
-        f"Scenario:\n{body}\n"
-    )
-    return prompt
 # ─── Updated GRPO Verifiers ───────────────────────────────────────────────────

 DEFAULT_MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit"
 DEFAULT_OUTPUT_DIR = "models/local_policy_grpo_multiagent"
+from training.prompt_utils import SYSTEM_PROMPT, generate_pz_scenarios, build_prompt_multiagent
 # ─── Updated GRPO Verifiers ───────────────────────────────────────────────────