ModuleMind

Running on Zero

File size: 4,773 Bytes

45e7dfb

"""
online.py -- persistent finetuning of the boss from real player fights.

Flow: the browser logs every boss decision (state, action, HP) during a fight and,
on fight end, POSTs the trajectory + who-died to /learn. We buffer trajectories and,
every MM_UPDATE_EVERY fights, run one REINFORCE step (mm_grad.OnlineLearner) that
nudges the HARD brain toward what actually worked against humans. The adapted
weights feed straight back into the live boss.

ONLY HARD-tier fights are used, so the data stays on-policy (Easy/Normal are
deliberately handicapped checkpoints; learning from them would be off-policy).

Persistence (optional, set as Space secrets):
  HF_TOKEN          - a write token
  MM_DATASET_REPO   - e.g. "your-name/boss-fight-online"
If set, adapted weights are pushed to / pulled from that dataset so learning
survives Space restarts. Without them it still adapts live, just in-memory.

Safety: a frozen copy of the sim-trained weights is kept as an anchor (the learner
pulls gently back toward it), so a weird run can't brick the boss.
"""
from __future__ import annotations

import os
import threading

import numpy as np

from mm_grad import OnlineLearner

HERE = os.path.dirname(os.path.abspath(__file__))
BASE_WEIGHTS = os.path.join(HERE, "mm_weights.npz")        # sim-trained HARD brain
LIVE_WEIGHTS = os.path.join(HERE, "mm_weights_live.npz")   # player-adapted snapshot

ENABLED = os.environ.get("MM_ONLINE", "1") == "1"
UPDATE_EVERY = int(os.environ.get("MM_UPDATE_EVERY", "3"))  # fights per update
ADAPT_TIER = "hard"
DATASET_REPO = os.environ.get("MM_DATASET_REPO")
HF_TOKEN = os.environ.get("HF_TOKEN")
WEIGHTS_IN_REPO = "mm_weights_live.npz"

_LOCK = threading.Lock()
_LEARNER = None
_BUFFER = []
_FIGHTS = 0


def _pull_from_dataset():
    """Try to download the latest adapted weights from the HF dataset."""
    if not (DATASET_REPO and HF_TOKEN):
        return None
    try:
        from huggingface_hub import hf_hub_download
        p = hf_hub_download(repo_id=DATASET_REPO, filename=WEIGHTS_IN_REPO,
                            repo_type="dataset", token=HF_TOKEN)
        return {k: v for k, v in np.load(p).items()}
    except Exception as e:
        print(f"[online] no persisted weights pulled ({e}); starting from sim weights")
        return None


def _push_to_dataset():
    """Upload the current adapted weights to the HF dataset (best-effort)."""
    if not (DATASET_REPO and HF_TOKEN):
        return
    try:
        from huggingface_hub import upload_file
        upload_file(path_or_fileobj=LIVE_WEIGHTS, path_in_repo=WEIGHTS_IN_REPO,
                    repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
    except Exception as e:
        print(f"[online] dataset push failed ({e})")


def get_learner():
    global _LEARNER
    if _LEARNER is None:
        base = _pull_from_dataset()
        if base is None and os.path.exists(LIVE_WEIGHTS):
            base = {k: v for k, v in np.load(LIVE_WEIGHTS).items()}
        if base is None:
            base = {k: v for k, v in np.load(BASE_WEIGHTS).items()}
        # anchor is ALWAYS the pristine sim-trained weights
        _LEARNER = OnlineLearner(base)
        _LEARNER.anchor = {k: v.astype(np.float64).copy()
                           for k, v in np.load(BASE_WEIGHTS).items()}
    return _LEARNER


def live_weights():
    """The HARD brain's current (possibly player-adapted) weights, shared live."""
    return get_learner().W


def _save_live():
    np.savez(LIVE_WEIGHTS, **{k: v.astype(np.float32) for k, v in get_learner().W.items()})
    _push_to_dataset()


def record_fight(trajectory: dict) -> dict:
    """Called by /learn. trajectory = {difficulty, steps:[{state,action,bossHP,playerHP}],
    result:{bossDied,playerDied}}. Buffers it and updates every UPDATE_EVERY fights."""
    global _FIGHTS
    if not ENABLED:
        return {"enabled": False}
    if trajectory.get("difficulty") != ADAPT_TIER:
        return {"enabled": True, "skipped": "only HARD-tier fights train the brain"}
    if len(trajectory.get("steps", [])) < 2:
        return {"enabled": True, "skipped": "too short"}
    with _LOCK:
        _BUFFER.append(trajectory)
        _FIGHTS += 1
        if len(_BUFFER) >= UPDATE_EVERY:
            stats = get_learner().update(list(_BUFFER))
            _BUFFER.clear()
            if stats.get("updated"):
                _save_live()
            return {"enabled": True, "fights": _FIGHTS, **stats}
        return {"enabled": True, "fights": _FIGHTS, "buffered": len(_BUFFER),
                "update_in": UPDATE_EVERY - len(_BUFFER)}


def status() -> dict:
    return {"enabled": ENABLED, "fights_seen": _FIGHTS, "buffered": len(_BUFFER),
            "persistent": bool(DATASET_REPO and HF_TOKEN), "adapt_tier": ADAPT_TIER}