Chris4K commited on
Commit
cd43a29
·
verified ·
1 Parent(s): 5245034

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +7 -0
  2. README.md +89 -7
  3. main.py +1138 -0
  4. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Minimal runtime image for the agent-learn service.
FROM python:3.11-slim
WORKDIR /app
# Copy and install dependencies first so Docker layer caching
# survives code-only changes to main.py.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY main.py .
# 7860 is the port HF Spaces routes traffic to (matches PORT default in main.py).
EXPOSE 7860
CMD ["python", "main.py"]
README.md CHANGED
@@ -1,11 +1,93 @@
1
  ---
2
- title: Agent Learn
3
- emoji: 📉
4
- colorFrom: pink
5
- colorTo: blue
6
  sdk: docker
7
- pinned: false
8
- short_description: q-learning, rlhf
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: agent-learn — FORGE Learning Layer
3
+ emoji: 🧠
4
+ colorFrom: red
5
+ colorTo: purple
6
  sdk: docker
7
+ pinned: true
8
+ license: mit
9
+ short_description: Persistent Q-table, reward scoring, and RLHF store for FORGE
10
  ---
11
 
12
+ # 🧠 agent-learn
13
+ ### FORGE Persistent Learning Layer
14
+
15
+ Owns: Q-table (persistent), reward scoring pipeline, RLHF data store, skill candidate review.
16
+ Replaces the critical NEXUS /tmp Q-table that resets on every restart.
17
+
18
+ ## What it does
19
+
20
+ 1. **Q-table** — agents ask "what's the best action for my current state?" → epsilon-greedy response
21
+ 2. **Reward pipeline** — pulls unscored traces from agent-trace, scores them, writes rewards back
22
+ 3. **RLHF store** — labeled approve/reject completions for future fine-tuning
23
+ 4. **Skill candidates** — patterns detected by agents that recur enough to become FORGE skills
24
+
25
+ ## REST API
26
+
27
+ ```
28
+ GET /api/q?agent=&state={} Get all Q-values for agent+state
29
+ POST /api/q/best Best action (epsilon-greedy): {agent, state, actions[]}
30
+ POST /api/q/update Q-value update: {agent, state, action, reward, next_state?}
31
+ POST /api/q/hint Manual nudge: {agent, state, action, nudge}
32
+ GET /api/q/stats Q-table stats
33
+
34
+ POST /api/score Score a single trace event → reward
35
+ POST /api/sync Trigger trace pull + reward scoring now
36
+
37
+ GET /api/rlhf List RLHF entries
38
+ POST /api/rlhf Add labeled completion
39
+ PATCH /api/rlhf/{id} Update label/reward
40
+
41
+ GET /api/candidates List skill candidates (status=pending)
42
+ PATCH /api/candidates/{id} Update candidate (status: promoted|rejected)
43
+
44
+ GET /api/stats Full learning stats
45
+ GET /api/reward-trend Hourly avg reward trend
46
+ ```
47
+
48
+ ## MCP
49
+
50
+ ```
51
+ GET /mcp/sse SSE transport
52
+ POST /mcp JSON-RPC 2.0
53
+
54
+ Tools: learn_q_get, learn_q_best, learn_q_update, learn_q_hint,
55
+ learn_stats, learn_rlhf_add, learn_score_trace,
56
+ learn_candidate_add, learn_sync
57
+ ```
58
+
59
+ ## Secrets
60
+
61
+ | Key | Description |
62
+ |-----|-------------|
63
+ | `LEARN_KEY` | Optional write auth key |
64
+ | `TRACE_URL` | agent-trace URL (default: https://chris4k-agent-trace.hf.space) |
65
+ | `TRACE_KEY` | agent-trace auth key (if set) |
66
+ | `LEARN_RATE` | Q-learning α (default: 0.1) |
67
+ | `DISCOUNT` | Q-learning γ (default: 0.9) |
68
+ | `EPSILON` | Exploration rate (default: 0.15) |
69
+ | `SYNC_INTERVAL` | Trace pull interval seconds (default: 120) |
70
+
71
+ ## NEXUS integration (replacing /tmp Q-table)
72
+
73
+ ```python
74
+ LEARN_URL = "https://chris4k-agent-learn.hf.space"
75
+
76
+ # Before routing: ask LEARN for best model
77
+ resp = requests.post(f"{LEARN_URL}/api/q/best", json={
78
+ "agent": "nexus",
79
+ "state": {"agent": "nexus", "event": "model_selection"},
80
+ "actions": ["qwen/qwen3.5-35b-a3b", "claude-haiku-4-5", "hf_api", "local_cpu"]
81
+ }, timeout=3)
82
+ best_model = resp.json()["action"]
83
+
84
+ # After inference: update Q-value
85
+ requests.post(f"{LEARN_URL}/api/q/update", json={
86
+ "agent": "nexus",
87
+ "state": {"agent": "nexus", "event": "model_selection"},
88
+ "action": best_model,
89
+ "reward": 0.8
90
+ }, timeout=3)
91
+ ```
92
+
93
+ Built by [Chris4K](https://huggingface.co/Chris4K) — ki-fusion-labs.de
main.py ADDED
@@ -0,0 +1,1138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ agent-learn — FORGE Persistent Learning Layer
3
+ Owns: Q-table (persistent), reward scoring pipeline, RLHF data store.
4
+ Reads traces from agent-trace, writes rewards back, updates Q-values.
5
+ Agents query here for best actions; NEXUS replaces its /tmp Q-table with this.
6
+ """
7
+
8
+ import asyncio, hashlib, json, math, os, sqlite3, time, uuid
9
+ from contextlib import asynccontextmanager
10
+ from pathlib import Path
11
+
12
+ import uvicorn
13
+ from fastapi import FastAPI, HTTPException, Query, Request
14
+ from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Config
18
+ # ---------------------------------------------------------------------------
19
+ DB_PATH = Path(os.getenv("LEARN_DB", "/tmp/learn.db"))
20
+ PORT = int(os.getenv("PORT", "7860"))
21
+ LEARN_KEY = os.getenv("LEARN_KEY", "")
22
+ TRACE_URL = os.getenv("TRACE_URL", "https://chris4k-agent-trace.hf.space")
23
+ TRACE_KEY = os.getenv("TRACE_KEY", "")
24
+ LEARN_RATE = float(os.getenv("LEARN_RATE", "0.1")) # α
25
+ DISCOUNT = float(os.getenv("DISCOUNT", "0.9")) # γ
26
+ EPSILON = float(os.getenv("EPSILON", "0.15")) # exploration rate
27
+ SYNC_INTERVAL= int(os.getenv("SYNC_INTERVAL", "120")) # seconds between trace pulls
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Database
31
+ # ---------------------------------------------------------------------------
32
def get_db():
    """Open a SQLite connection to the learn DB with dict-style row access.

    WAL journaling plus NORMAL synchronous trades a little durability for
    better concurrent read/write behavior, which suits this workload.
    """
    connection = sqlite3.connect(str(DB_PATH), check_same_thread=False)
    connection.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA synchronous=NORMAL"):
        connection.execute(pragma)
    return connection
38
+
39
def init_db():
    """Create all tables and indexes if missing. Idempotent; called at startup."""
    conn = get_db()
    conn.executescript("""
    -- Q-table: one row per (agent, state_hash, action)
    CREATE TABLE IF NOT EXISTS qtable (
        id TEXT PRIMARY KEY,
        agent TEXT NOT NULL,
        state_hash TEXT NOT NULL,
        state_json TEXT NOT NULL DEFAULT '{}',
        action TEXT NOT NULL,
        q_value REAL NOT NULL DEFAULT 0.0,
        visits INTEGER NOT NULL DEFAULT 0,
        last_reward REAL,
        updated_at REAL NOT NULL
    );
    -- Unique (agent,state,action) is the ON CONFLICT target for the
    -- upserts in q_update/q_hint.
    CREATE UNIQUE INDEX IF NOT EXISTS idx_qt_key ON qtable(agent, state_hash, action);
    CREATE INDEX IF NOT EXISTS idx_qt_agent ON qtable(agent);
    CREATE INDEX IF NOT EXISTS idx_qt_action ON qtable(action);

    -- Reward log: every scored trace event
    CREATE TABLE IF NOT EXISTS rewards (
        id TEXT PRIMARY KEY,
        trace_id TEXT NOT NULL,
        agent TEXT NOT NULL,
        event_type TEXT NOT NULL,
        raw_score REAL NOT NULL,
        components TEXT NOT NULL DEFAULT '{}',
        ts REAL NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_rw_agent ON rewards(agent);
    CREATE INDEX IF NOT EXISTS idx_rw_ts ON rewards(ts DESC);

    -- RLHF store: labeled completions for future fine-tuning
    CREATE TABLE IF NOT EXISTS rlhf (
        id TEXT PRIMARY KEY,
        agent TEXT NOT NULL DEFAULT 'unknown',
        prompt TEXT NOT NULL,
        completion TEXT NOT NULL,
        label TEXT NOT NULL DEFAULT 'unlabeled',  -- approved|rejected|unlabeled
        reward REAL,
        source TEXT NOT NULL DEFAULT 'human',     -- human|auto|model
        meta TEXT NOT NULL DEFAULT '{}',
        created_at REAL NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_rlhf_agent ON rlhf(agent);
    CREATE INDEX IF NOT EXISTS idx_rlhf_label ON rlhf(label);

    -- Cursor: last ts pulled from agent-trace per agent
    CREATE TABLE IF NOT EXISTS sync_cursor (
        agent TEXT PRIMARY KEY,
        last_ts REAL NOT NULL DEFAULT 0.0
    );

    -- Skill candidates surfaced from traces
    CREATE TABLE IF NOT EXISTS skill_candidates (
        id TEXT PRIMARY KEY,
        description TEXT NOT NULL,
        agent TEXT NOT NULL,
        frequency INTEGER NOT NULL DEFAULT 1,
        status TEXT NOT NULL DEFAULT 'pending',   -- pending|promoted|rejected
        created_at REAL NOT NULL,
        updated_at REAL NOT NULL
    );
    """)
    conn.commit(); conn.close()
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Q-table operations
107
+ # ---------------------------------------------------------------------------
108
+ def _state_hash(state: dict) -> str:
109
+ canonical = json.dumps(state, sort_keys=True, separators=(',',':'))
110
+ return hashlib.sha256(canonical.encode()).hexdigest()[:16]
111
+
112
def q_get(agent: str, state: dict) -> list:
    """All (action, q_value, visits, last_reward) rows for agent+state, best first."""
    conn = get_db()
    try:
        cursor = conn.execute(
            "SELECT action, q_value, visits, last_reward FROM qtable WHERE agent=? AND state_hash=? ORDER BY q_value DESC",
            (agent, _state_hash(state)),
        )
        return [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()
121
+
122
def q_best_action(agent: str, state: dict, actions: list) -> dict:
    """
    Epsilon-greedy action selection.
    Returns {"action": str, "q_value": float, "strategy": "exploit"|"explore"|"init"}
    """
    import random

    conn = get_db()
    rows = conn.execute(
        "SELECT action, q_value, visits FROM qtable WHERE agent=? AND state_hash=? ORDER BY q_value DESC",
        (agent, _state_hash(state))).fetchall()
    conn.close()

    # Known Q-values keyed by action name.
    known = {row["action"]: (row["q_value"], row["visits"]) for row in rows}
    # Drop falsy entries (None / empty string) from the caller's action list.
    candidates = [act for act in actions if act]

    if not candidates:
        return {"action": None, "q_value": 0.0, "strategy": "no_actions"}

    # Exploration branch: pick uniformly at random with probability EPSILON.
    if random.random() < EPSILON:
        pick = random.choice(candidates)
        return {"action": pick, "q_value": known.get(pick, (0.0, 0))[0], "strategy": "explore"}

    # Exploitation: highest known Q wins; unseen actions default to 0.0
    # (optimistic-ish init), first candidate wins ties.
    best_action = max(candidates, key=lambda act: known.get(act, (0.0, 0))[0])
    best_value = known.get(best_action, (0.0, 0))[0]
    strategy = "exploit" if best_action in known else "init"
    return {"action": best_action, "q_value": best_value, "strategy": strategy}
157
+
158
def q_update(agent: str, state: dict, action: str, reward: float,
             next_state: dict | None = None) -> dict:
    """
    Q-learning update: Q(s,a) ← Q(s,a) + α[r + γ·max_Q(s') - Q(s,a)]

    Upserts the (agent, state, action) row; `visits` counts updates.
    When `next_state` is omitted the transition is treated as terminal
    (max_Q(s') = 0). Returns old/new Q and bookkeeping for the caller.
    """
    sh = _state_hash(state)
    now = time.time()
    conn = get_db()

    # Current Q(s,a) — missing row means unvisited pair, Q defaults to 0.
    row = conn.execute(
        "SELECT q_value, visits FROM qtable WHERE agent=? AND state_hash=? AND action=?",
        (agent, sh, action)).fetchone()
    q_old = row["q_value"] if row else 0.0
    visits = (row["visits"] if row else 0) + 1

    # max Q(s') if next_state provided
    max_q_next = 0.0
    if next_state:
        nsh = _state_hash(next_state)
        best_next = conn.execute(
            "SELECT MAX(q_value) FROM qtable WHERE agent=? AND state_hash=?",
            (agent, nsh)).fetchone()[0]
        # MAX() is NULL when s' has no rows yet.
        max_q_next = best_next or 0.0

    q_new = q_old + LEARN_RATE * (reward + DISCOUNT * max_q_next - q_old)

    # Upsert: the fresh id is only used on first insert; on conflict the
    # existing row keeps its id and just gets new values.
    row_id = str(uuid.uuid4())
    conn.execute("""
        INSERT INTO qtable (id,agent,state_hash,state_json,action,q_value,visits,last_reward,updated_at)
        VALUES (?,?,?,?,?,?,?,?,?)
        ON CONFLICT(agent,state_hash,action) DO UPDATE SET
          q_value=excluded.q_value, visits=excluded.visits,
          last_reward=excluded.last_reward, updated_at=excluded.updated_at
    """, (row_id, agent, sh, json.dumps(state), action, q_new, visits, reward, now))
    conn.commit(); conn.close()

    return {"agent": agent, "action": action, "q_old": round(q_old, 5),
            "q_new": round(q_new, 5), "reward": reward, "visits": visits}
197
+
198
def q_hint(agent: str, state: dict, action: str, nudge: float) -> dict:
    """Additively bias a Q-value (manual operator override); visits unchanged."""
    state_hash = _state_hash(state)
    stamp = time.time()
    conn = get_db()
    existing = conn.execute(
        "SELECT q_value, visits FROM qtable WHERE agent=? AND state_hash=? AND action=?",
        (agent, state_hash, action)).fetchone()
    if existing:
        previous, visit_count = existing["q_value"], existing["visits"]
    else:
        previous, visit_count = 0.0, 0
    adjusted = previous + nudge
    conn.execute("""
        INSERT INTO qtable (id,agent,state_hash,state_json,action,q_value,visits,last_reward,updated_at)
        VALUES (?,?,?,?,?,?,?,?,?)
        ON CONFLICT(agent,state_hash,action) DO UPDATE SET
          q_value=excluded.q_value, updated_at=excluded.updated_at
    """, (str(uuid.uuid4()), agent, state_hash, json.dumps(state), action,
          adjusted, visit_count, None, stamp))
    conn.commit()
    conn.close()
    return {"agent": agent, "action": action, "q_old": round(previous, 5),
            "q_new": round(adjusted, 5), "nudge": nudge}
218
+
219
def q_stats() -> dict:
    """Aggregate Q-table statistics: total rows, per-agent summary, extremes."""
    conn = get_db()
    stats = {"total_entries": conn.execute("SELECT COUNT(*) FROM qtable").fetchone()[0]}
    # Named queries keep key order identical to the response shape.
    sections = {
        "by_agent": ("SELECT agent, COUNT(*) as n, AVG(q_value) as avg_q, MAX(q_value) as max_q "
                     "FROM qtable GROUP BY agent ORDER BY n DESC"),
        "top_actions": ("SELECT agent, action, q_value, visits FROM qtable "
                        "ORDER BY q_value DESC LIMIT 10"),
        "worst_actions": ("SELECT agent, action, q_value, visits FROM qtable "
                          "ORDER BY q_value ASC LIMIT 10"),
    }
    for key, sql in sections.items():
        stats[key] = [dict(row) for row in conn.execute(sql).fetchall()]
    conn.close()
    return stats
235
+
236
+ # ---------------------------------------------------------------------------
237
+ # Reward scoring
238
+ # ---------------------------------------------------------------------------
239
def score_trace_event(ev: dict) -> tuple[float, dict]:
    """
    Score a trace event → reward in [-1.0, 1.0].
    Returns (score, components) where components itemizes each contribution.
    """
    parts: dict = {}
    total = 0.0
    kind = ev.get("event_type")

    def add(name: str, value: float) -> float:
        # Record the contribution and hand the value back for accumulation.
        parts[name] = value
        return value

    # Errors are always penalized.
    if ev.get("status") == "error":
        total += add("error_penalty", -0.4)

    # Latency tiers for LLM calls — faster is better.
    latency = ev.get("latency_ms")
    if latency is not None and kind == "llm_call":
        if latency < 500:
            total += add("latency_fast", 0.3)
        elif latency < 1500:
            total += add("latency_ok", 0.1)
        elif latency < 4000:
            total += add("latency_slow", -0.1)
        else:
            total += add("latency_very_slow", -0.3)

    # Token output/input ratio for LLM calls.
    tokens_in = ev.get("tokens_in") or 0
    tokens_out = ev.get("tokens_out") or 0
    if tokens_in > 0 and tokens_out > 0 and kind == "llm_call":
        ratio = tokens_out / max(tokens_in, 1)
        if ratio > 0.5:
            total += add("token_efficiency", 0.1)
        elif ratio < 0.05:
            total += add("token_low_output", -0.05)

    # Flat bonuses per event type.
    if kind == "react_step":
        total += add("react_progress", 0.1)
    if kind == "skill_load":
        total += add("skill_reuse", 0.15)
    if kind == "self_reflect":
        total += add("reflection_bonus", 0.2)

    # Clamp to [-1, 1] before rounding.
    total = max(-1.0, min(1.0, total))
    return round(total, 4), parts
293
+
294
+ # ---------------------------------------------------------------------------
295
+ # Trace sync pipeline
296
+ # ---------------------------------------------------------------------------
297
+ _http_client = None
298
+
299
+ def _get_http():
300
+ global _http_client
301
+ if _http_client is None:
302
+ try:
303
+ import httpx
304
+ _http_client = httpx.Client(timeout=10.0)
305
+ except ImportError:
306
+ import urllib.request as _ur
307
+ _http_client = "urllib"
308
+ return _http_client
309
+
310
def _http_get(url, params=None) -> dict:
    """GET `url` (optionally with query params) and return the parsed JSON body.

    Uses the shared httpx client when available; otherwise builds the query
    string by hand and goes through urllib with a 10 s timeout.
    """
    client = _get_http()
    if hasattr(client, "get"):
        r = client.get(url, params=params)
        return r.json()
    else:
        import urllib.request, urllib.parse
        if params:
            url = url + "?" + urllib.parse.urlencode(params)
        with urllib.request.urlopen(url, timeout=10) as resp:
            return json.loads(resp.read())
321
+
322
def _http_patch(url, data: dict) -> bool:
    """PATCH `data` as JSON to `url`; True on success (HTTP < 300).

    On the urllib fallback path any network/HTTP error is swallowed and
    reported as False — callers treat the write-back as best-effort.
    """
    client = _get_http()
    if hasattr(client, "patch"):
        r = client.patch(url, json=data)
        return r.status_code < 300
    else:
        import urllib.request
        req = urllib.request.Request(url, data=json.dumps(data).encode(),
                                     headers={"Content-Type":"application/json"}, method="PATCH")
        try:
            urllib.request.urlopen(req, timeout=5)
            return True
        except Exception:
            return False
336
+
337
def pull_and_score_traces() -> dict:
    """
    Pull unscored traces from agent-trace, score them, write rewards back,
    and fold each reward into the Q-table. Returns summary stats.

    The reward write-back PATCH is best-effort; local reward logging and
    Q-updates happen regardless. (A dead read of sync_cursor into an unused
    local was removed — cursors are only written, at the end of the pass.)
    """
    try:
        data = _http_get(f"{TRACE_URL}/api/traces",
                         {"has_reward": "false", "since_hours": 48, "limit": 200})
        events = data.get("events", [])
    except Exception as e:
        return {"ok": False, "error": str(e)}

    scored = 0
    skipped = 0
    reward_sum = 0.0
    new_cursors = {}  # agent -> max event ts seen this pass

    for ev in events:
        agent = ev.get("agent", "unknown")
        ts = ev.get("ts", 0)

        # Skip already-rewarded events (the server-side filter is advisory).
        if ev.get("reward") is not None:
            skipped += 1
            continue

        reward, components = score_trace_event(ev)

        # Write reward back to agent-trace (best-effort).
        try:
            _http_patch(f"{TRACE_URL}/api/trace/{ev['id']}/reward",
                        {"reward": reward, "source": "learn"})
        except Exception:
            pass

        # Log reward locally.
        conn = get_db()
        conn.execute("""
            INSERT OR IGNORE INTO rewards (id,trace_id,agent,event_type,raw_score,components,ts)
            VALUES (?,?,?,?,?,?,?)
        """, (str(uuid.uuid4()), ev["id"], agent,
              ev.get("event_type", "custom"), reward,
              json.dumps(components), time.time()))
        conn.commit(); conn.close()

        # Q-table update: map event → (state, action).
        _update_qtable_from_trace(ev, reward)

        scored += 1
        reward_sum += reward
        new_cursors[agent] = max(new_cursors.get(agent, 0), ts)

    # Advance per-agent sync cursors (monotonic — MAX guards against regress).
    if new_cursors:
        conn = get_db()
        for agent, ts in new_cursors.items():
            conn.execute("INSERT INTO sync_cursor (agent,last_ts) VALUES (?,?) "
                         "ON CONFLICT(agent) DO UPDATE SET last_ts=MAX(last_ts,excluded.last_ts)",
                         (agent, ts))
        conn.commit(); conn.close()

    return {
        "ok": True,
        "scored": scored,
        "skipped": skipped,
        "avg_reward": round(reward_sum / max(scored, 1), 4),
    }
409
+
410
def _update_qtable_from_trace(ev: dict, reward: float):
    """Map a trace event to a Q-table update.

    State is the context available when the decision was made; action is the
    choice that was made (model / tool / skill). Events that do not encode a
    decision are ignored. (Unused local `lat` from the original removed.)
    """
    agent = ev.get("agent", "unknown")
    event_type = ev.get("event_type", "custom")

    if event_type == "llm_call" and ev.get("model"):
        # Which model did this agent pick?
        q_update(agent, {"agent": agent, "event": "model_selection"}, ev["model"], reward)
    elif event_type == "tool_use" and ev.get("tool_name"):
        # Which tool did this agent reach for?
        q_update(agent, {"agent": agent, "event": "tool_selection"}, ev["tool_name"], reward)
    elif event_type == "skill_load" and ev.get("skill_id"):
        # Which skill did this agent reuse?
        q_update(agent, {"agent": agent, "event": "skill_selection"}, ev["skill_id"], reward)
435
+
436
+ # ---------------------------------------------------------------------------
437
+ # RLHF store
438
+ # ---------------------------------------------------------------------------
439
def rlhf_add(agent: str, prompt: str, completion: str,
             label: str = "unlabeled", reward: float | None = None,
             source: str = "human", meta: dict | None = None) -> str:
    """Insert a labeled completion into the RLHF store; returns the new row id.

    Labels outside the allowed set are coerced to 'unlabeled'.
    """
    now = time.time()
    rid = str(uuid.uuid4())
    # Whitelist the label; anything unexpected becomes 'unlabeled'.
    label = label if label in ("approved","rejected","unlabeled") else "unlabeled"
    conn = get_db()
    conn.execute("""
        INSERT INTO rlhf (id,agent,prompt,completion,label,reward,source,meta,created_at)
        VALUES (?,?,?,?,?,?,?,?,?)
    """, (rid, agent, prompt, completion, label, reward,
          source, json.dumps(meta or {}), now))
    conn.commit(); conn.close()
    return rid
453
+
454
def rlhf_label(entry_id: str, label: str, reward: float = None) -> bool:
    """Relabel an RLHF entry and set its reward; True if a row matched.

    Note: `reward` is written unconditionally, so omitting it clears any
    previously stored reward for the entry.
    """
    if label not in ("approved", "rejected", "unlabeled"):
        label = "unlabeled"
    conn = get_db()
    changed = conn.execute(
        "UPDATE rlhf SET label=?, reward=? WHERE id=?", (label, reward, entry_id)
    ).rowcount
    conn.commit()
    conn.close()
    return changed > 0
462
+
463
def rlhf_list(agent: str = "", label: str = "", limit: int = 50) -> list:
    """List RLHF entries, newest first, optionally filtered by agent and/or label."""
    filters, params = [], []
    if agent:
        filters.append("agent=?")
        params.append(agent)
    if label:
        filters.append("label=?")
        params.append(label)
    sql = "SELECT * FROM rlhf"
    if filters:
        sql += f" WHERE {' AND '.join(filters)}"
    sql += " ORDER BY created_at DESC LIMIT ?"
    conn = get_db()
    rows = conn.execute(sql, params + [limit]).fetchall()
    conn.close()
    entries = []
    for row in rows:
        entry = dict(row)
        try:
            entry["meta"] = json.loads(entry["meta"])
        except Exception:
            pass  # leave meta as the raw string if it is not valid JSON
        entries.append(entry)
    return entries
480
+
481
def rlhf_stats() -> dict:
    """Counts of RLHF entries, overall and broken down per label."""
    conn = get_db()
    rows = conn.execute("SELECT label, COUNT(*) as n FROM rlhf GROUP BY label").fetchall()
    conn.close()
    by_label = {row["label"]: row["n"] for row in rows}
    return {"total": sum(by_label.values()), "by_label": by_label}
487
+
488
+ # ---------------------------------------------------------------------------
489
+ # Skill candidates
490
+ # ---------------------------------------------------------------------------
491
def candidate_add(description: str, agent: str) -> str:
    """Add a skill candidate, or bump frequency on an exact-duplicate pending one.

    Returns the candidate id (the existing id on a dedup hit).
    """
    conn = get_db()
    # Dedup: exact description match against a still-pending candidate only;
    # promoted/rejected candidates can be re-proposed.
    existing = conn.execute(
        "SELECT id, frequency FROM skill_candidates WHERE description=? AND status='pending'",
        (description,)).fetchone()
    if existing:
        conn.execute("UPDATE skill_candidates SET frequency=frequency+1, updated_at=? WHERE id=?",
                     (time.time(), existing["id"]))
        conn.commit(); conn.close()
        return existing["id"]
    cid = str(uuid.uuid4())
    now = time.time()
    conn.execute("""
        INSERT INTO skill_candidates (id,description,agent,frequency,status,created_at,updated_at)
        VALUES (?,?,?,1,'pending',?,?)
    """, (cid, description, agent, now, now))
    conn.commit(); conn.close()
    return cid
510
+
511
def candidate_update(cid: str, status: str) -> bool:
    """Set a skill candidate's status (e.g. promoted/rejected); True if found."""
    conn = get_db()
    affected = conn.execute(
        "UPDATE skill_candidates SET status=?, updated_at=? WHERE id=?",
        (status, time.time(), cid),
    ).rowcount
    conn.commit()
    conn.close()
    return affected > 0
517
+
518
def candidates_list(status: str = "pending") -> list:
    """Skill candidates with the given status, most frequent (then newest) first."""
    conn = get_db()
    try:
        rows = conn.execute(
            "SELECT * FROM skill_candidates WHERE status=? ORDER BY frequency DESC, created_at DESC",
            (status,)).fetchall()
    finally:
        conn.close()
    return [dict(row) for row in rows]
525
+
526
+ # ---------------------------------------------------------------------------
527
+ # Learn stats
528
+ # ---------------------------------------------------------------------------
529
def learn_stats() -> dict:
    """Full learning-system snapshot: Q-table, rewards (all-time + last 24 h),
    RLHF label counts, and pending skill-candidate count."""
    conn = get_db()
    rw_count = conn.execute("SELECT COUNT(*) FROM rewards").fetchone()[0]
    rw_avg = conn.execute("SELECT AVG(raw_score) FROM rewards").fetchone()[0]
    rw_24h = conn.execute("SELECT COUNT(*), AVG(raw_score) FROM rewards WHERE ts>=?",
                          (time.time()-86400,)).fetchone()
    rlhf_s = rlhf_stats()
    cands = conn.execute("SELECT COUNT(*) FROM skill_candidates WHERE status='pending'").fetchone()[0]
    conn.close()
    qs = q_stats()
    return {
        "qtable": qs,
        "rewards": {
            "total": rw_count,
            # AVG() is NULL on an empty table — report 0 instead.
            "avg_all_time": round(rw_avg or 0, 4),
            "last_24h": {"count": rw_24h[0], "avg": round(rw_24h[1] or 0, 4)},
        },
        "rlhf": rlhf_s,
        "skill_candidates_pending": cands,
    }
549
+
550
def reward_trend(hours: int = 24, bucket_minutes: int = 60) -> list:
    """
    Average reward per time bucket over the last `hours`.

    `bucket_minutes` sets the bucket width; the previous implementation
    accepted it but always bucketed hourly, so the default (60) preserves
    the old behavior exactly. Returns a chronologically sorted list of
    {"ts": bucket_start_epoch, "count": n, "avg_reward": avg}.
    """
    conn = get_db()
    since = time.time() - hours * 3600
    rows = conn.execute(
        "SELECT ts, raw_score FROM rewards WHERE ts>=? ORDER BY ts",
        (since,)).fetchall()
    conn.close()
    if not rows:
        return []
    # Bucket width in seconds; guard against nonsensical 0/negative input.
    bucket_size = max(1, bucket_minutes * 60)
    buckets = {}
    for r in rows:
        b = int(r["ts"] // bucket_size) * bucket_size
        if b not in buckets:
            buckets[b] = {"ts": b, "count": 0, "total": 0.0}
        buckets[b]["count"] += 1
        buckets[b]["total"] += r["raw_score"]
    return [{"ts": v["ts"], "count": v["count"],
             "avg_reward": round(v["total"]/v["count"], 4)}
            for v in sorted(buckets.values(), key=lambda x: x["ts"])]
570
+
571
+ # ---------------------------------------------------------------------------
572
+ # Background sync loop
573
+ # ---------------------------------------------------------------------------
574
async def _sync_loop():
    """Background task: pull + score traces every SYNC_INTERVAL seconds.

    Deliberately swallows all exceptions — the loop must survive transient
    trace-service outages; per-pass errors are reported by /api/sync instead.
    """
    while True:
        await asyncio.sleep(SYNC_INTERVAL)
        try:
            pull_and_score_traces()
        except Exception:
            pass
581
+
582
+ # ---------------------------------------------------------------------------
583
+ # Seed
584
+ # ---------------------------------------------------------------------------
585
def seed_demo():
    """Seed demo data (Q-values, RLHF examples, one skill candidate) on first boot.

    No-op when the Q-table already has rows, so restarts never re-seed.
    """
    conn = get_db()
    n = conn.execute("SELECT COUNT(*) FROM qtable").fetchone()[0]
    conn.close()
    if n > 0:
        return

    # Seed NEXUS model selection Q-table from prior knowledge.
    now = time.time()
    entries = [
        # ki-fusion RTX5090 is best when available
        ("nexus", {"agent":"nexus","event":"model_selection"}, "qwen/qwen3.5-35b-a3b", 0.72),
        ("nexus", {"agent":"nexus","event":"model_selection"}, "claude-haiku-4-5", 0.55),
        ("nexus", {"agent":"nexus","event":"model_selection"}, "hf_api", 0.30),
        ("nexus", {"agent":"nexus","event":"model_selection"}, "local_cpu", 0.10),
        # Tool selection
        ("pulse", {"agent":"pulse","event":"tool_selection"}, "kanban_create", 0.65),
        ("pulse", {"agent":"pulse","event":"tool_selection"}, "slot_reserve", 0.60),
        ("pulse", {"agent":"pulse","event":"tool_selection"}, "trigger_agent", 0.50),
        # Skill reuse
        ("pulse", {"agent":"pulse","event":"skill_selection"}, "calculator", 0.40),
        ("pulse", {"agent":"pulse","event":"skill_selection"}, "forge_client", 0.55),
    ]
    # One connection and one commit for all seed rows (the original opened,
    # committed, and closed a connection per row).
    conn = get_db()
    for agent, state, action, q in entries:
        conn.execute("""
            INSERT OR IGNORE INTO qtable (id,agent,state_hash,state_json,action,q_value,visits,last_reward,updated_at)
            VALUES (?,?,?,?,?,?,0,NULL,?)
        """, (str(uuid.uuid4()), agent, _state_hash(state), json.dumps(state), action, q, now))
    conn.commit(); conn.close()

    # Seed RLHF examples
    examples = [
        ("nexus", "Route this query to the best available LLM.",
         "I will use ki-fusion RTX5090 (qwen3.5-35b) as it has the best quality/speed ratio.",
         "approved", 0.9),
        ("nexus", "Route this query to the best available LLM.",
         "I will use local_cpu for this complex multi-step reasoning task.",
         "rejected", -0.3),
        ("pulse", "Schedule this long-running background task.",
         "I will reserve an LLM slot before starting and release it on completion.",
         "approved", 0.8),
    ]
    for agent, prompt, completion, label, reward in examples:
        rlhf_add(agent, prompt, completion, label, reward, "seed")

    # Seed a skill candidate
    candidate_add("Pattern: agents repeatedly fetch the same URL multiple times per session → caching skill needed", "learn")
630
+
631
+ # ---------------------------------------------------------------------------
632
+ # MCP
633
+ # ---------------------------------------------------------------------------
634
# Tool manifest advertised to MCP clients (tools/list): one entry per tool
# with name, description, and a JSON-Schema inputSchema. Tool names are
# dispatched by handle_mcp (protocolVersion 2024-11-05).
MCP_TOOLS = [
    {"name":"learn_q_get","description":"Get all Q-values for an agent+state.",
     "inputSchema":{"type":"object","required":["agent","state"],
       "properties":{"agent":{"type":"string"},"state":{"type":"object"}}}},
    {"name":"learn_q_best","description":"Get best action (epsilon-greedy) for an agent+state.",
     "inputSchema":{"type":"object","required":["agent","state","actions"],
       "properties":{"agent":{"type":"string"},"state":{"type":"object"},
         "actions":{"type":"array","items":{"type":"string"}}}}},
    {"name":"learn_q_update","description":"Update Q-value after taking an action and observing reward.",
     "inputSchema":{"type":"object","required":["agent","state","action","reward"],
       "properties":{"agent":{"type":"string"},"state":{"type":"object"},
         "action":{"type":"string"},"reward":{"type":"number"},
         "next_state":{"type":"object"}}}},
    {"name":"learn_q_hint","description":"Manually nudge a Q-value (operator override).",
     "inputSchema":{"type":"object","required":["agent","state","action","nudge"],
       "properties":{"agent":{"type":"string"},"state":{"type":"object"},
         "action":{"type":"string"},"nudge":{"type":"number"}}}},
    {"name":"learn_stats","description":"Get learning system statistics.",
     "inputSchema":{"type":"object","properties":{}}},
    {"name":"learn_rlhf_add","description":"Add a labeled completion to the RLHF store.",
     "inputSchema":{"type":"object","required":["agent","prompt","completion"],
       "properties":{"agent":{"type":"string"},"prompt":{"type":"string"},
         "completion":{"type":"string"},"label":{"type":"string"},
         "reward":{"type":"number"},"source":{"type":"string"}}}},
    {"name":"learn_score_trace","description":"Score a single trace event and return reward.",
     "inputSchema":{"type":"object","required":["event"],
       "properties":{"event":{"type":"object","description":"Trace event dict"}}}},
    {"name":"learn_candidate_add","description":"Add a skill candidate for review.",
     "inputSchema":{"type":"object","required":["description","agent"],
       "properties":{"description":{"type":"string"},"agent":{"type":"string"}}}},
    {"name":"learn_sync","description":"Trigger immediate trace pull and reward scoring.",
     "inputSchema":{"type":"object","properties":{}}},
]
667
+
668
def handle_mcp(method, params, req_id):
    """Dispatch one MCP JSON-RPC message and build its response.

    Args:
        method: JSON-RPC method name (e.g. "initialize", "tools/call").
        params: params object of the request (dict).
        req_id: JSON-RPC request id, echoed back in the response.

    Returns:
        A JSON-RPC response dict, or None for notification methods
        (notifications must not be answered).
    """
    def ok(r): return {"jsonrpc": "2.0", "id": req_id, "result": r}
    # MCP tool results are wrapped as a single text content item.
    def txt(d): return ok({"content": [{"type": "text", "text": json.dumps(d)}]})
    def err(code, msg): return {"jsonrpc": "2.0", "id": req_id, "error": {"code": code, "message": msg}}

    if method == "initialize":
        return ok({"protocolVersion": "2024-11-05",
                   "serverInfo": {"name": "agent-learn", "version": "1.0.0"},
                   "capabilities": {"tools": {}}})
    if method == "tools/list":
        return ok({"tools": MCP_TOOLS})
    if method == "tools/call":
        n, a = params.get("name", ""), params.get("arguments", {})
        try:
            if n == "learn_q_get": return txt({"entries": q_get(a["agent"], a["state"])})
            if n == "learn_q_best": return txt(q_best_action(a["agent"], a["state"], a.get("actions", [])))
            if n == "learn_q_update": return txt(q_update(a["agent"], a["state"], a["action"], float(a["reward"]), a.get("next_state")))
            if n == "learn_q_hint": return txt(q_hint(a["agent"], a["state"], a["action"], float(a["nudge"])))
            if n == "learn_stats": return txt(learn_stats())
            if n == "learn_rlhf_add":
                rid = rlhf_add(a["agent"], a["prompt"], a["completion"],
                               a.get("label", "unlabeled"), a.get("reward"), a.get("source", "mcp"))
                return txt({"ok": True, "id": rid})
            if n == "learn_score_trace":
                score, comp = score_trace_event(a.get("event", {}))
                return txt({"reward": score, "components": comp})
            if n == "learn_candidate_add":
                cid = candidate_add(a["description"], a["agent"])
                return txt({"ok": True, "id": cid})
            if n == "learn_sync": return txt(pull_and_score_traces())
        except (KeyError, TypeError, ValueError) as e:
            # Missing or malformed tool arguments used to escape as an
            # unhandled exception (HTTP 500); JSON-RPC defines -32602 for this.
            return err(-32602, f"Invalid params: {e}")
        return err(-32601, f"Unknown tool: {n}")
    # Notifications never get a response.
    if method in ("notifications/initialized", "notifications/cancelled"):
        return None
    return err(-32601, f"Method not found: {method}")
697
+
698
# ---------------------------------------------------------------------------
# FastAPI app
# ---------------------------------------------------------------------------
@asynccontextmanager
async def lifespan(app):
    """App lifespan: initialize DB + demo data, run the trace-sync loop.

    Keeps a strong reference to the background task — a bare
    ``asyncio.create_task(...)`` whose result is discarded may be
    garbage-collected while still running — and cancels it cleanly
    on shutdown so the event loop exits without pending-task warnings.
    """
    init_db(); seed_demo()
    sync_task = asyncio.create_task(_sync_loop())
    try:
        yield
    finally:
        sync_task.cancel()
        try:
            await sync_task
        except asyncio.CancelledError:
            pass  # expected on shutdown
707
app = FastAPI(title="agent-learn", version="1.0.0", lifespan=lifespan)

def _auth(r):
    """Return True when the request is authorized.

    Auth is disabled entirely when LEARN_KEY is unset; otherwise the
    X-Learn-Key header must match. Uses a constant-time comparison so
    the key cannot be probed character-by-character via timing.
    """
    if not LEARN_KEY:
        return True
    import hmac  # stdlib; local import avoids touching the file's import block
    return hmac.compare_digest(r.headers.get("x-learn-key", ""), LEARN_KEY)
710
+
711
# --- Q-table REST ---
async def _json_body(request, *required):
    """Parse the request's JSON body as an object; 400 on bad JSON / missing keys."""
    try:
        body = await request.json()
    except Exception:
        raise HTTPException(400, "body must be valid JSON")
    if not isinstance(body, dict):
        raise HTTPException(400, "body must be a JSON object")
    missing = [k for k in required if k not in body]
    if missing:
        raise HTTPException(400, f"missing required field(s): {', '.join(missing)}")
    return body

def _as_float(value, name):
    """Coerce a JSON value to float; 400 when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        raise HTTPException(400, f"{name} must be a number")

@app.get("/api/q")
async def api_q_get(agent: str = Query(...), state: str = Query("{}")):
    """Return all Q-table entries for `agent` in the given state (JSON string)."""
    try:
        s = json.loads(state)
    except Exception:
        raise HTTPException(400, "state must be JSON")
    return JSONResponse({"entries": q_get(agent, s)})

@app.post("/api/q/best")
async def api_q_best(request: Request):
    """Epsilon-greedy best action for agent+state among candidate actions."""
    b = await _json_body(request, "agent")
    return JSONResponse(q_best_action(b["agent"], b.get("state", {}), b.get("actions", [])))

@app.post("/api/q/update")
async def api_q_update(request: Request):
    """Apply a Q-learning update after observing a reward (auth required).

    Previously a malformed body or missing field surfaced as a 500;
    now it is rejected with a 400 and an explanatory message.
    """
    if not _auth(request): raise HTTPException(403, "Invalid X-Learn-Key")
    b = await _json_body(request, "agent", "action", "reward")
    reward = _as_float(b["reward"], "reward")
    return JSONResponse(q_update(b["agent"], b.get("state", {}), b["action"], reward, b.get("next_state")))

@app.post("/api/q/hint")
async def api_q_hint(request: Request):
    """Operator override: nudge a Q-value by a signed amount (auth required)."""
    if not _auth(request): raise HTTPException(403, "Invalid X-Learn-Key")
    b = await _json_body(request, "agent", "action", "nudge")
    nudge = _as_float(b["nudge"], "nudge")
    return JSONResponse(q_hint(b["agent"], b.get("state", {}), b["action"], nudge))

@app.get("/api/q/stats")
async def api_q_stats():
    """Aggregate Q-table statistics."""
    return JSONResponse(q_stats())
737
+
738
# --- Scoring ---
@app.post("/api/score")
async def api_score(request: Request):
    """Score a single trace event; returns the reward and its components.

    Unauthenticated by design (read-only scoring). Malformed JSON now
    yields a 400 rather than an unhandled 500.
    """
    try:
        event = await request.json()
    except Exception:
        raise HTTPException(400, "body must be valid JSON")
    score, comp = score_trace_event(event)
    return JSONResponse({"reward": score, "components": comp})

@app.post("/api/sync")
async def api_sync(request: Request):
    """Immediately pull unscored traces from agent-trace and score them (auth required)."""
    if not _auth(request): raise HTTPException(403, "Invalid X-Learn-Key")
    result = pull_and_score_traces()
    return JSONResponse(result)
750
+
751
# --- RLHF ---
@app.get("/api/rlhf")
async def api_rlhf_list(agent: str = Query(""), label: str = Query(""), limit: int = Query(50)):
    """List RLHF entries, optionally filtered by agent and/or label."""
    return JSONResponse({"entries": rlhf_list(agent, label, limit)})

@app.post("/api/rlhf", status_code=201)
async def api_rlhf_add(request: Request):
    """Store a labeled completion in the RLHF store (auth required).

    Returns 400 (not 500) when the body is not JSON or lacks the
    required `prompt`/`completion` fields.
    """
    if not _auth(request): raise HTTPException(403, "Invalid X-Learn-Key")
    try:
        b = await request.json()
    except Exception:
        raise HTTPException(400, "body must be valid JSON")
    if not isinstance(b, dict) or "prompt" not in b or "completion" not in b:
        raise HTTPException(400, "prompt and completion are required")
    rid = rlhf_add(b.get("agent", "unknown"), b["prompt"], b["completion"],
                   b.get("label", "unlabeled"), b.get("reward"), b.get("source", "api"), b.get("meta"))
    return JSONResponse({"ok": True, "id": rid})

@app.patch("/api/rlhf/{entry_id}")
async def api_rlhf_label(entry_id: str, request: Request):
    """Re-label an existing RLHF entry and optionally set its reward (auth required)."""
    if not _auth(request): raise HTTPException(403, "Invalid X-Learn-Key")
    try:
        b = await request.json()
    except Exception:
        raise HTTPException(400, "body must be valid JSON")
    ok = rlhf_label(entry_id, b.get("label", "unlabeled"), b.get("reward"))
    return JSONResponse({"ok": ok})
770
+
771
# --- Skill candidates ---
@app.get("/api/candidates")
async def api_candidates(status: str = Query("pending")):
    """List skill candidates in the given review status."""
    result = candidates_list(status)
    return JSONResponse({"candidates": result})

@app.patch("/api/candidates/{cid}")
async def api_candidate_update(cid: str, request: Request):
    """Set a candidate's review status, e.g. promote or reject (auth required)."""
    if not _auth(request):
        raise HTTPException(403, "Invalid X-Learn-Key")
    body = await request.json()
    new_status = body.get("status", "pending")
    updated = candidate_update(cid, new_status)
    return JSONResponse({"ok": updated})
782
+
783
# --- Stats ---
@app.get("/api/stats")
async def api_stats():
    """Full learning-system statistics (Q-table, rewards, RLHF, candidates)."""
    return JSONResponse(learn_stats())

@app.get("/api/reward-trend")
async def api_trend(hours: int = Query(24)):
    """Hourly average-reward trend over the trailing `hours` window."""
    return JSONResponse({"trend": reward_trend(hours)})

@app.get("/api/health")
async def api_health():
    """Liveness probe; also reports the current Q-table size."""
    conn = get_db()
    try:
        n = conn.execute("SELECT COUNT(*) FROM qtable").fetchone()[0]
    finally:
        # Close even if the query raises, so the DB handle never leaks.
        conn.close()
    return JSONResponse({"ok": True, "qtable_entries": n, "version": "1.0.0"})
794
+
795
# --- MCP ---
@app.get("/mcp/sse")
async def mcp_sse(request: Request):
    """SSE transport: announce connection + tool list, then keep-alive pings."""
    async def gen():
        yield f"data: {json.dumps({'jsonrpc':'2.0','method':'connected','params':{}})}\n\n"
        yield f"data: {json.dumps({'jsonrpc':'2.0','method':'notifications/tools','params':{'tools':MCP_TOOLS}})}\n\n"
        # Comment-line ping every 15s keeps proxies from timing out the stream.
        while not await request.is_disconnected():
            yield ": ping\n\n"
            await asyncio.sleep(15)
    return StreamingResponse(gen(), media_type="text/event-stream",
                             headers={"Cache-Control": "no-cache", "Connection": "keep-alive",
                                      "X-Accel-Buffering": "no"})

@app.post("/mcp")
async def mcp_rpc(request: Request):
    """JSON-RPC 2.0 endpoint; supports single and batch requests.

    Per the spec: unparseable bodies -> -32700, non-object requests and
    non-object batch members -> -32600 (previously these crashed with 500).
    Notifications are dropped from the batch response.
    """
    def invalid(msg="Invalid Request", code=-32600):
        return {"jsonrpc": "2.0", "id": None, "error": {"code": code, "message": msg}}
    try:
        body = await request.json()
    except Exception:
        return JSONResponse(invalid("Parse error", -32700))
    if isinstance(body, list):
        out = []
        for item in body:
            if not isinstance(item, dict):
                out.append(invalid())
                continue
            r = handle_mcp(item.get("method", ""), item.get("params", {}), item.get("id"))
            if r:
                out.append(r)
        return JSONResponse(out)
    if not isinstance(body, dict):
        return JSONResponse(invalid())
    r = handle_mcp(body.get("method", ""), body.get("params", {}), body.get("id"))
    return JSONResponse(r or {"jsonrpc": "2.0", "id": body.get("id"), "result": {}})
815
+
816
# ---------------------------------------------------------------------------
# SPA Dashboard
# ---------------------------------------------------------------------------
# Self-contained single-page dashboard served at "/". The raw string below is
# emitted verbatim as text/html by `root()`; it polls the REST API defined
# above (/api/stats, /api/reward-trend, /api/rlhf, /api/candidates,
# /api/health, /api/sync, PATCH /api/candidates/{id}) every 15 seconds.
# NOTE: this literal is runtime behavior — do not edit it for formatting.
SPA = r"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>&#129504; LEARN &#8212; FORGE Learning Layer</title>
<style>
@import url('https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&family=Syne:wght@400;600;800&family=DM+Mono:wght@300;400;500&display=swap');
*{box-sizing:border-box;margin:0;padding:0}
:root{--bg:#06060d;--sf:#0d0d18;--sf2:#121222;--br:#1a1a2e;--ac:#ff6b00;--tx:#dde0f0;--mu:#50507a;--gr:#00ff88;--rd:#ff4455;--cy:#06b6d4;--pu:#8b5cf6;--ye:#f59e0b;--pk:#ec4899}
html,body{height:100%;background:var(--bg);color:var(--tx);font-family:'Syne',sans-serif}
::-webkit-scrollbar{width:5px;height:5px}::-webkit-scrollbar-track{background:var(--sf)}::-webkit-scrollbar-thumb{background:var(--br);border-radius:3px}
.app{display:grid;grid-template-rows:52px 1fr;height:100vh;overflow:hidden}
.hdr{display:flex;align-items:center;gap:1rem;padding:0 1.5rem;border-bottom:1px solid var(--br);background:var(--sf)}
.logo{font-family:'Space Mono',monospace;font-size:1.1rem;font-weight:700;color:var(--ac)}
.sub{font-family:'DM Mono',monospace;font-size:.6rem;color:var(--mu);letter-spacing:.2em;text-transform:uppercase}
.hstats{display:flex;gap:1.5rem;margin-left:auto}
.hs{text-align:center}.hs-n{font-family:'Space Mono',monospace;font-size:1rem;font-weight:700;color:var(--ac)}
.hs-l{font-family:'DM Mono',monospace;font-size:.58rem;color:var(--mu);text-transform:uppercase;letter-spacing:.1em}
.tabs{display:flex;border-bottom:1px solid var(--br);background:var(--sf)}
.tab{padding:.55rem 1.3rem;font-family:'DM Mono',monospace;font-size:.72rem;color:var(--mu);border-bottom:2px solid transparent;cursor:pointer;letter-spacing:.05em;transition:all .15s}
.tab.active{color:var(--ac);border-bottom-color:var(--ac)}
.tab:hover{color:var(--tx)}
.body{flex:1;overflow-y:auto;padding:1.25rem}

/* Cards */
.kpis{display:grid;grid-template-columns:repeat(4,1fr);gap:.75rem;margin-bottom:1.25rem}
.kpi{background:var(--sf);border:1px solid var(--br);border-radius:8px;padding:.9rem 1rem}
.kpi-n{font-family:'Space Mono',monospace;font-size:1.6rem;font-weight:700;color:var(--ac);line-height:1}
.kpi-l{font-family:'DM Mono',monospace;font-size:.6rem;color:var(--mu);text-transform:uppercase;letter-spacing:.1em;margin-top:4px}
.kpi-sub{font-family:'DM Mono',monospace;font-size:.65rem;color:var(--mu);margin-top:2px}

/* Q-table */
.qtable-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:.75rem}
.qt-agent{background:var(--sf);border:1px solid var(--br);border-radius:8px;overflow:hidden}
.qt-agent-hdr{padding:.6rem 1rem;border-bottom:1px solid var(--br);font-family:'Space Mono',monospace;font-size:.8rem;font-weight:700;color:var(--ac);display:flex;align-items:center;gap:.5rem}
.qt-row{display:flex;align-items:center;padding:.35rem 1rem;gap:.6rem;border-bottom:1px solid #0d0d18;font-family:'DM Mono',monospace;font-size:.72rem}
.qt-row:last-child{border-bottom:none}
.qt-action{flex:1;color:var(--tx);overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
.qt-bar{width:80px;height:6px;background:var(--br);border-radius:3px;overflow:hidden;flex-shrink:0}
.qt-bar-fill{height:100%;border-radius:3px;transition:width .3s}
.qt-val{font-weight:700;width:48px;text-align:right;flex-shrink:0}
.qt-vis{font-size:.6rem;color:var(--mu);width:30px;text-align:right;flex-shrink:0}

/* Reward trend */
.trend-container{background:var(--sf);border:1px solid var(--br);border-radius:8px;padding:1rem;margin-bottom:1rem}
.trend-title{font-family:'DM Mono',monospace;font-size:.65rem;color:var(--mu);text-transform:uppercase;letter-spacing:.15em;margin-bottom:.75rem}
.trend-chart{height:80px;display:flex;align-items:flex-end;gap:3px}
.t-bar-wrap{flex:1;display:flex;flex-direction:column;align-items:center;height:100%}
.t-bar{width:100%;border-radius:2px 2px 0 0;min-height:2px;transition:height .3s}
.t-lbl{font-family:'DM Mono',monospace;font-size:.5rem;color:var(--mu);margin-top:2px;text-align:center}

/* RLHF table */
.rlhf-table{width:100%;border-collapse:collapse;font-family:'DM Mono',monospace;font-size:.75rem}
.rlhf-table th{padding:.4rem .75rem;text-align:left;font-size:.62rem;color:var(--mu);text-transform:uppercase;letter-spacing:.1em;border-bottom:1px solid var(--br)}
.rlhf-table td{padding:.45rem .75rem;border-bottom:1px solid #0d0d18;max-width:200px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
.rlhf-table tr:hover td{background:var(--sf)}
.badge{display:inline-block;padding:1px 7px;border-radius:4px;font-size:.62rem}
.badge-approved{background:#001a08;color:var(--gr);border:1px solid #004422}
.badge-rejected{background:#1a0000;color:var(--rd);border:1px solid #440011}
.badge-unlabeled{background:var(--sf2);color:var(--mu);border:1px solid var(--br)}

/* Skill candidates */
.cand-card{background:var(--sf);border:1px solid var(--br);border-radius:8px;padding:.8rem 1rem;margin-bottom:.6rem;display:flex;align-items:flex-start;gap:1rem}
.cand-desc{flex:1;font-size:.82rem;line-height:1.6}
.cand-meta{font-family:'DM Mono',monospace;font-size:.62rem;color:var(--mu)}
.cand-freq{font-family:'Space Mono',monospace;font-size:1.2rem;font-weight:700;color:var(--ye);min-width:30px;text-align:center}
.btn{padding:.4rem .9rem;border:none;border-radius:5px;cursor:pointer;font-family:'DM Mono',monospace;font-size:.7rem;transition:all .15s}
.btn-approve{background:#001a08;color:var(--gr);border:1px solid #004422}
.btn-approve:hover{background:#003010}
.btn-reject{background:#1a0000;color:var(--rd);border:1px solid #440011}
.btn-reject:hover{background:#300010}
.btn-sync{background:var(--sf2);color:var(--ac);border:1px solid var(--br);margin-left:auto}
.btn-sync:hover{border-color:var(--ac)}

/* Config panel */
.config-row{display:flex;align-items:center;padding:.6rem 1rem;border-bottom:1px solid var(--br);font-family:'DM Mono',monospace;font-size:.78rem}
.config-key{color:var(--mu);width:160px;text-transform:uppercase;font-size:.65rem;letter-spacing:.1em}
.config-val{color:var(--cy);font-weight:700}
.config-desc{color:var(--mu);font-size:.65rem;margin-left:.75rem}

.section{font-family:'DM Mono',monospace;font-size:.65rem;color:var(--pu);text-transform:uppercase;letter-spacing:.15em;margin:.75rem 0 .4rem}
.empty{text-align:center;padding:2rem;color:var(--mu);font-family:'DM Mono',monospace;font-size:.8rem}
</style>
</head>
<body>
<div class="app">
<header class="hdr">
<div><div class="logo">&#129504; LEARN</div><div class="sub">FORGE Learning Layer</div></div>
<div class="hstats">
<div class="hs"><div class="hs-n" id="hQ">&#8212;</div><div class="hs-l">Q-entries</div></div>
<div class="hs"><div class="hs-n" id="hR" style="color:var(--gr)">&#8212;</div><div class="hs-l">Rewards</div></div>
<div class="hs"><div class="hs-n" id="hA">&#8212;</div><div class="hs-l">Avg reward</div></div>
<div class="hs"><div class="hs-n" id="hC" style="color:var(--ye)">&#8212;</div><div class="hs-l">Candidates</div></div>
</div>
</header>
<div style="display:flex;flex-direction:column;overflow:hidden;flex:1">
<div class="tabs">
<div class="tab active" onclick="showTab('qtable')">&#9881; Q-Table</div>
<div class="tab" onclick="showTab('rewards')">&#127942; Rewards</div>
<div class="tab" onclick="showTab('rlhf')">&#128101; RLHF</div>
<div class="tab" onclick="showTab('candidates')">&#128161; Skill Candidates</div>
<div class="tab" onclick="showTab('config')">&#9881;&#65038; Config</div>
<button class="btn btn-sync" onclick="triggerSync()" style="margin:auto 1rem auto auto;padding:.3rem .75rem">&#8635; Sync Traces</button>
</div>
<div class="body" id="tabBody"></div>
</div>
</div>
<script>
let stats=null, trend=[], rlhf=[], candidates=[], currentTab='qtable';

async function loadAll(){
[stats,trend] = await Promise.all([
fetch('/api/stats').then(r=>r.json()),
fetch('/api/reward-trend?hours=24').then(r=>r.json()).then(d=>d.trend||[])
]);
document.getElementById('hQ').textContent=stats.qtable?.total_entries||0;
document.getElementById('hR').textContent=stats.rewards?.total||0;
document.getElementById('hA').textContent=stats.rewards?.avg_all_time?.toFixed(3)||'—';
document.getElementById('hC').textContent=stats.skill_candidates_pending||0;
renderTab();
}

async function loadRLHF(){ rlhf = (await fetch('/api/rlhf?limit=50').then(r=>r.json())).entries||[]; }
async function loadCandidates(){ candidates = (await fetch('/api/candidates').then(r=>r.json())).candidates||[]; }

function showTab(t){
currentTab=t;
document.querySelectorAll('.tab').forEach((el,i)=>el.classList.toggle('active',['qtable','rewards','rlhf','candidates','config'][i]===t));
renderTab();
}

async function renderTab(){
if(currentTab==='qtable') renderQTable();
else if(currentTab==='rewards') renderRewards();
else if(currentTab==='rlhf') { await loadRLHF(); renderRLHF(); }
else if(currentTab==='candidates'){ await loadCandidates(); renderCandidates(); }
else if(currentTab==='config') renderConfig();
}

function renderQTable(){
const qt = stats?.qtable || {};
const byAgent = qt.by_agent || [];
const top = qt.top_actions || [];
// Group top by agent
const grouped = {};
top.forEach(r=>{ if(!grouped[r.agent]) grouped[r.agent]=[];grouped[r.agent].push(r) });
byAgent.forEach(a=>{ if(!grouped[a.agent]) grouped[a.agent]=[] });

const html = `
<div class="kpis">
<div class="kpi"><div class="kpi-n">${qt.total_entries||0}</div><div class="kpi-l">Total entries</div></div>
${byAgent.slice(0,3).map(a=>`<div class="kpi"><div class="kpi-n" style="font-size:1.2rem">${a.n}</div><div class="kpi-l">${a.agent}</div><div class="kpi-sub">avg Q: ${(a.avg_q||0).toFixed(3)}</div></div>`).join('')}
</div>
<div class="section">Best Q-values per agent</div>
<div class="qtable-grid">
${Object.entries(grouped).map(([agent, rows])=>{
const maxQ = Math.max(...rows.map(r=>r.q_value||0), 0.001);
return `<div class="qt-agent">
<div class="qt-agent-hdr">&#9881; ${agent}</div>
${rows.length ? rows.map(r=>{
const pct = Math.max(0,Math.min(100,(r.q_value/maxQ)*100));
const col = r.q_value>0.5?'var(--gr)':r.q_value>0?'var(--ye)':'var(--rd)';
return `<div class="qt-row">
<span class="qt-action">${r.action}</span>
<div class="qt-bar"><div class="qt-bar-fill" style="width:${pct}%;background:${col}"></div></div>
<span class="qt-val" style="color:${col}">${r.q_value.toFixed(3)}</span>
<span class="qt-vis">${r.visits}x</span>
</div>`;
}).join('') : '<div class="qt-row" style="color:var(--mu)">No entries yet</div>'}
</div>`;
}).join('')}
</div>
<div class="section" style="margin-top:1rem">Worst-performing actions</div>
<div class="qtable-grid">
${Object.values((qt.worst_actions||[]).reduce((g,r)=>{ if(!g[r.agent])g[r.agent]=[];g[r.agent].push(r);return g },{})).map(rows=>{
const agent=rows[0].agent;
return `<div class="qt-agent">
<div class="qt-agent-hdr" style="color:var(--rd)">&#9888; ${agent} — avoid</div>
${rows.map(r=>`<div class="qt-row"><span class="qt-action">${r.action}</span><span class="qt-val" style="color:var(--rd)">${r.q_value.toFixed(3)}</span></div>`).join('')}
</div>`;
}).join('')}
</div>`;
document.getElementById('tabBody').innerHTML=html;
}

function renderRewards(){
const rw = stats?.rewards||{};
const max = Math.max(...trend.map(t=>Math.abs(t.avg_reward||0)), 0.001);
const bars = trend.length ? trend.map(t=>{
const h=Math.max(3,Math.abs(t.avg_reward||0)/max*100);
const col=t.avg_reward>=0?'var(--gr)':'var(--rd)';
const hStr=new Date(t.ts*1000).getHours()+'h';
return `<div class="t-bar-wrap"><div style="flex:1;display:flex;align-items:flex-end;width:100%"><div class="t-bar" style="height:${h}%;background:${col}" title="avg=${t.avg_reward} n=${t.count}"></div></div><div class="t-lbl">${hStr}</div></div>`;
}).join('') : '<div style="color:var(--mu);font-family:DM Mono,monospace;font-size:.75rem;margin:auto">No reward data yet</div>';

document.getElementById('tabBody').innerHTML=`
<div class="kpis">
<div class="kpi"><div class="kpi-n">${rw.total||0}</div><div class="kpi-l">Total scored</div></div>
<div class="kpi"><div class="kpi-n" style="color:var(--gr)">${rw.avg_all_time?.toFixed(3)||'—'}</div><div class="kpi-l">All-time avg</div></div>
<div class="kpi"><div class="kpi-n" style="color:var(--cy)">${rw.last_24h?.count||0}</div><div class="kpi-l">Last 24h</div></div>
<div class="kpi"><div class="kpi-n" style="color:var(--cy)">${rw.last_24h?.avg?.toFixed(3)||'—'}</div><div class="kpi-l">24h avg</div></div>
</div>
<div class="trend-container">
<div class="trend-title">Avg reward per hour (24h)</div>
<div class="trend-chart">${bars}</div>
</div>
<div class="section">Scoring model</div>
<div style="background:var(--sf);border:1px solid var(--br);border-radius:8px;overflow:hidden">
${[['error_penalty','-0.40','Any event with status=error'],['latency_fast (LLM <500ms)','+0.30','LLM call completed quickly'],['latency_ok (500-1500ms)','+0.10','LLM call acceptable latency'],['latency_slow (1500-4000ms)','-0.10','LLM call slow'],['latency_very_slow (>4000ms)','-0.30','LLM call very slow'],['token_efficiency','+0.10','Output/input ratio > 0.5'],['react_progress','+0.10','Each ReAct step completed'],['skill_reuse','+0.15','Skill loaded from FORGE'],['reflection_bonus','+0.20','Agent performed self-reflection']].map(([k,v,d])=>`<div class="config-row"><span class="config-key">${k}</span><span class="config-val">${v}</span><span class="config-desc">${d}</span></div>`).join('')}
</div>`;
}

function renderRLHF(){
const s = stats?.rlhf||{};
document.getElementById('tabBody').innerHTML=`
<div class="kpis">
<div class="kpi"><div class="kpi-n">${s.total||0}</div><div class="kpi-l">Total entries</div></div>
<div class="kpi"><div class="kpi-n" style="color:var(--gr)">${s.by_label?.approved||0}</div><div class="kpi-l">Approved</div></div>
<div class="kpi"><div class="kpi-n" style="color:var(--rd)">${s.by_label?.rejected||0}</div><div class="kpi-l">Rejected</div></div>
<div class="kpi"><div class="kpi-n" style="color:var(--mu)">${s.by_label?.unlabeled||0}</div><div class="kpi-l">Unlabeled</div></div>
</div>
<table class="rlhf-table" style="background:var(--sf);border:1px solid var(--br);border-radius:8px;overflow:hidden">
<thead><tr><th>Agent</th><th>Prompt</th><th>Completion</th><th>Label</th><th>Reward</th><th>Source</th></tr></thead>
<tbody>
${rlhf.length ? rlhf.map(r=>`<tr>
<td>${r.agent}</td>
<td title="${esc(r.prompt)}">${esc(r.prompt.slice(0,40))}...</td>
<td title="${esc(r.completion)}">${esc(r.completion.slice(0,50))}...</td>
<td><span class="badge badge-${r.label}">${r.label}</span></td>
<td style="color:${(r.reward||0)>=0?'var(--gr)':'var(--rd)'}">${r.reward!=null?r.reward:'—'}</td>
<td style="color:var(--mu)">${r.source}</td>
</tr>`).join('') : '<tr><td colspan="6" class="empty">No RLHF entries yet</td></tr>'}
</tbody>
</table>`;
}

function renderCandidates(){
document.getElementById('tabBody').innerHTML=`
<p style="font-family:'DM Mono',monospace;font-size:.75rem;color:var(--mu);margin-bottom:1rem">
Patterns detected by agents that recur ${3}+ times. Promote to FORGE or reject.
</p>
${candidates.length ? candidates.map(c=>`
<div class="cand-card">
<div class="cand-freq">${c.frequency}x</div>
<div style="flex:1">
<div class="cand-desc">${esc(c.description)}</div>
<div class="cand-meta">from ${c.agent} &middot; ${new Date(c.created_at*1000).toLocaleDateString()}</div>
</div>
<div style="display:flex;flex-direction:column;gap:.35rem">
<button class="btn btn-approve" onclick="updateCand('${c.id}','promoted')">&#8679; Promote</button>
<button class="btn btn-reject" onclick="updateCand('${c.id}','rejected')">&#10005; Reject</button>
</div>
</div>`).join('') : '<div class="empty">No pending skill candidates</div>'}`;
}

function renderConfig(){
document.getElementById('tabBody').innerHTML=`
<div class="section">Hyperparameters</div>
<div style="background:var(--sf);border:1px solid var(--br);border-radius:8px;overflow:hidden">
<div class="config-row"><span class="config-key">Learning rate &alpha;</span><span class="config-val" id="cfgLR">loading...</span><span class="config-desc">Q-value update step size</span></div>
<div class="config-row"><span class="config-key">Discount &gamma;</span><span class="config-val" id="cfgDisc">loading...</span><span class="config-desc">Future reward weight</span></div>
<div class="config-row"><span class="config-key">Epsilon &epsilon;</span><span class="config-val" id="cfgEps">loading...</span><span class="config-desc">Exploration rate (random action probability)</span></div>
<div class="config-row"><span class="config-key">Sync interval</span><span class="config-val" id="cfgSync">loading...</span><span class="config-desc">Trace pull frequency (seconds)</span></div>
<div class="config-row"><span class="config-key">Trace URL</span><span class="config-val" id="cfgTrace">loading...</span><span class="config-desc">agent-trace endpoint</span></div>
</div>
<div class="section" style="margin-top:1rem">MCP connection</div>
<pre style="background:var(--sf);border:1px solid var(--br);border-radius:6px;padding:.75rem;font-family:'DM Mono',monospace;font-size:.72rem;color:var(--cy)">{"mcpServers":{"learn":{"command":"npx","args":["-y","mcp-remote","${window.location.origin}/mcp/sse"]}}}</pre>
<div class="section" style="margin-top:1rem">Quick integration (NEXUS / any agent)</div>
<pre style="background:var(--sf);border:1px solid var(--br);border-radius:6px;padding:.75rem;font-family:'DM Mono',monospace;font-size:.72rem;color:var(--gr)">LEARN_URL = "${window.location.origin}"

# Ask LEARN for best LLM to route to
import requests
resp = requests.post(f"{LEARN_URL}/api/q/best", json={
"agent": "nexus",
"state": {"agent": "nexus", "event": "model_selection"},
"actions": ["qwen/qwen3.5-35b-a3b", "claude-haiku-4-5", "hf_api", "local_cpu"]
})
best = resp.json() # {"action": "qwen/qwen3.5-35b-a3b", "q_value": 0.72, "strategy": "exploit"}

# After inference, update Q-value
requests.post(f"{LEARN_URL}/api/q/update", json={
"agent": "nexus",
"state": {"agent": "nexus", "event": "model_selection"},
"action": best["action"],
"reward": 0.8 # from trace scoring
})</pre>`;
fetch('/api/health').then(r=>r.json()).then(d=>{
document.getElementById('cfgLR').textContent='0.1 (env: LEARN_RATE)';
document.getElementById('cfgDisc').textContent='0.9 (env: DISCOUNT)';
document.getElementById('cfgEps').textContent='0.15 (env: EPSILON)';
document.getElementById('cfgSync').textContent='120s (env: SYNC_INTERVAL)';
document.getElementById('cfgTrace').textContent='env: TRACE_URL';
});
}

async function triggerSync(){
const btn=document.querySelector('.btn-sync');
btn.textContent='&#8635; Syncing...';btn.disabled=true;
const r=await fetch('/api/sync',{method:'POST'}).then(x=>x.json());
btn.textContent=`&#8635; Scored ${r.scored||0}`;
setTimeout(()=>{btn.textContent='&#8635; Sync Traces';btn.disabled=false;},3000);
await loadAll();
}

async function updateCand(id,status){
await fetch(`/api/candidates/${id}`,{method:'PATCH',headers:{'Content-Type':'application/json'},body:JSON.stringify({status})});
await loadCandidates();renderCandidates();
}

function esc(s){return String(s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')}

loadAll();setInterval(loadAll,15000);
</script>
</body></html>"""
1133
+
1134
@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve the embedded single-page dashboard."""
    return HTMLResponse(content=SPA, media_type="text/html; charset=utf-8")

if __name__ == "__main__":
    # Direct invocation (e.g. `python main.py` inside the Docker image).
    uvicorn.run(app, host="0.0.0.0", port=PORT, log_level="info")
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ fastapi>=0.111.0
2
+ uvicorn>=0.30.0
3
+ httpx>=0.27.0