// Source: ruvector-fixed/dist/core/learning-engine.js
// Last commit 40d7073 by Archie: "Fix dimension/dimensions bug and positional insert/search args"
"use strict";
/**
* Multi-Algorithm Learning Engine
* Supports 9 RL algorithms for intelligent hooks optimization
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.LearningEngine = void 0;
// Built-in learning configuration per task type: the algorithm to run plus the
// hyperparameters that algorithm reads. LearningEngine copies each entry into
// its own config map when it is constructed.
const TASK_ALGORITHM_MAP = {
    // Double Q-learning
    'agent-routing': { algorithm: 'double-q', learningRate: 0.1, discountFactor: 0.95, epsilon: 0.1 },
    // SARSA with a small exploration rate
    'error-avoidance': { algorithm: 'sarsa', learningRate: 0.05, discountFactor: 0.99, epsilon: 0.05 },
    // Actor-critic with an entropy coefficient
    'confidence-scoring': {
        algorithm: 'actor-critic',
        learningRate: 0.01,
        discountFactor: 0.95,
        epsilon: 0.1,
        entropyCoef: 0.01,
    },
    // Decision transformer over the last `sequenceLength` steps, no exploration
    'trajectory-learning': {
        algorithm: 'decision-transformer',
        learningRate: 1e-3,
        discountFactor: 0.99,
        epsilon: 0,
        sequenceLength: 20,
    },
    // PPO with clipping and an entropy coefficient
    'context-ranking': {
        algorithm: 'ppo',
        learningRate: 3e-4,
        discountFactor: 0.99,
        epsilon: 0.2,
        clipRange: 0.2,
        entropyCoef: 0.01,
    },
    // TD(lambda) with eligibility traces
    'memory-recall': { algorithm: 'td-lambda', learningRate: 0.1, discountFactor: 0.9, epsilon: 0.1, lambda: 0.8 },
};
/**
 * Tabular reinforcement-learning engine.
 *
 * Each task type is mapped to a config naming one of nine update rules
 * (q-learning, sarsa, double-q, actor-critic, ppo, td-lambda, monte-carlo,
 * decision-transformer, dqn). All rules read and write the same per-state
 * Q-tables (`Map<state, Map<action, number>>`), so `getBestAction` works
 * regardless of which rule produced the values.
 */
class LearningEngine {
    /**
     * Creates an engine holding a copy of every default task config and a
     * zeroed statistics record for every supported algorithm.
     */
    constructor() {
        this.configs = new Map();
        this.qTables = new Map();
        this.qTables2 = new Map(); // Second table, used only by Double-Q
        this.eligibilityTraces = new Map();
        this.actorWeights = new Map(); // Declared and cleared, not otherwise used here
        this.criticValues = new Map();
        this.trajectories = [];
        this.stats = new Map();
        this.rewardHistory = [];
        // Copy the defaults so configure() never mutates the shared map.
        for (const [task, config] of Object.entries(TASK_ALGORITHM_MAP)) {
            this.configs.set(task, { ...config });
        }
        const algorithms = [
            'q-learning', 'sarsa', 'double-q', 'actor-critic',
            'ppo', 'decision-transformer', 'monte-carlo', 'td-lambda', 'dqn'
        ];
        for (const alg of algorithms) {
            this.stats.set(alg, {
                algorithm: alg,
                updates: 0,
                avgReward: 0,
                convergenceScore: 0,
                lastUpdate: Date.now(),
            });
        }
    }
    /**
     * Configure algorithm for a specific task type.
     * The given fields are merged over the task's current (or default) config.
     * @param {string} task - Task type key.
     * @param {object} config - Partial config; its fields override existing ones.
     */
    configure(task, config) {
        const existing = this.configs.get(task) || TASK_ALGORITHM_MAP[task];
        this.configs.set(task, { ...existing, ...config });
    }
    /**
     * Get current configuration for a task.
     * @param {string} task - Task type key.
     * @returns {object|undefined} The config, or undefined for an unknown task.
     */
    getConfig(task) {
        return this.configs.get(task) || TASK_ALGORITHM_MAP[task];
    }
    /**
     * Like getConfig(), but fails with a descriptive error instead of letting
     * callers dereference `undefined`.
     * @param {string} task - Task type key.
     * @returns {object} The task's config.
     * @throws {TypeError} When no config exists for `task`.
     */
    requireConfig(task) {
        const config = this.getConfig(task);
        if (!config) {
            const known = Array.from(this.configs.keys()).join(', ');
            throw new TypeError(`Unknown task type "${String(task)}"; configure() it first or use one of: ${known}`);
        }
        return config;
    }
    /**
     * Update Q-values with the algorithm configured for `task`.
     * @param {string} task - Task type key.
     * @param {object} experience - `{ state, action, reward, nextState, done }`.
     * @returns {number} The algorithm's error term (0 when nothing was learned,
     *   e.g. mid-episode for the episodic algorithms or for an unrecognised
     *   algorithm name).
     * @throws {TypeError} When `task` has no config.
     */
    update(task, experience) {
        const config = this.requireConfig(task);
        let delta = 0;
        // False when the step was only buffered; such steps must not be
        // reported as "perfectly converged" (delta 0) in the statistics.
        let learned = true;
        switch (config.algorithm) {
            case 'q-learning':
                delta = this.qLearningUpdate(experience, config);
                break;
            case 'sarsa':
                delta = this.sarsaUpdate(experience, config);
                break;
            case 'double-q':
                delta = this.doubleQUpdate(experience, config);
                break;
            case 'actor-critic':
                delta = this.actorCriticUpdate(experience, config);
                break;
            case 'ppo':
                delta = this.ppoUpdate(experience, config);
                break;
            case 'td-lambda':
                delta = this.tdLambdaUpdate(experience, config);
                break;
            case 'monte-carlo':
                // Monte Carlo needs full episodes: buffer until `done`.
                this.addToCurrentTrajectory(experience);
                if (experience.done) {
                    delta = this.monteCarloUpdate(config);
                }
                else {
                    learned = false;
                }
                break;
            case 'decision-transformer':
                this.addToCurrentTrajectory(experience);
                if (experience.done) {
                    delta = this.decisionTransformerUpdate(config);
                }
                else {
                    learned = false;
                }
                break;
            case 'dqn':
                delta = this.dqnUpdate(experience, config);
                break;
        }
        this.updateStats(config.algorithm, experience.reward, learned ? Math.abs(delta) : undefined);
        return delta;
    }
    /**
     * Get best action for a state using epsilon-greedy selection.
     * @param {string} task - Task type key (supplies `epsilon`).
     * @param {*} state - State key.
     * @param {Array} actions - Candidate actions.
     * @returns {{action: *, confidence: number}} A random action with
     *   confidence 0.5 when exploring, otherwise the highest-valued action with
     *   its softmax probability among the candidates.
     * @throws {TypeError} When `task` has no config.
     */
    getBestAction(task, state, actions) {
        const config = this.requireConfig(task);
        if (Math.random() < config.epsilon) {
            const randomAction = actions[Math.floor(Math.random() * actions.length)];
            return { action: randomAction, confidence: 0.5 };
        }
        let bestAction = actions[0];
        let bestValue = -Infinity;
        const values = [];
        const qTable = this.getQTable(state);
        for (const action of actions) {
            const value = qTable.get(action) || 0;
            values.push(value);
            if (value > bestValue) {
                bestValue = value;
                bestAction = action;
            }
        }
        const confidence = this.softmaxConfidence(values, actions.indexOf(bestAction));
        return { action: bestAction, confidence };
    }
    /**
     * Get action probabilities (for Actor-Critic and PPO): a softmax over the
     * stored Q-values of `actions` in `state`, with unseen actions valued 0.
     * @param {*} state - State key.
     * @param {Array} actions - Candidate actions.
     * @returns {Map<*, number>} Probability per action.
     */
    getActionProbabilities(state, actions) {
        const probs = new Map();
        const qTable = this.getQTable(state);
        const values = actions.map(a => qTable.get(a) || 0);
        // Subtract the maximum before exponentiating to avoid overflow.
        const maxVal = Math.max(...values);
        const expValues = values.map(v => Math.exp(v - maxVal));
        const sumExp = expValues.reduce((a, b) => a + b, 0);
        for (let i = 0; i < actions.length; i++) {
            probs.set(actions[i], expValues[i] / sumExp);
        }
        return probs;
    }
    // ============ Algorithm Implementations ============
    /**
     * Standard Q-Learning: Q(s,a) += α * (r + γ * max_a' Q(s',a') - Q(s,a)).
     * The max over the next state is floored at 0.
     * @returns {number} The TD error before scaling by the learning rate.
     */
    qLearningUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: alpha, discountFactor: gamma } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        const maxNextQ = done ? 0 : Math.max(0, ...Array.from(nextQTable.values()));
        const target = reward + gamma * maxNextQ;
        const delta = target - currentQ;
        qTable.set(action, currentQ + alpha * delta);
        return delta;
    }
    /**
     * SARSA-style update: Q(s,a) += α * (r + γ * Q(s',·) - Q(s,a)), where the
     * next-state value is the expectation under ε-greedy over the stored
     * next-state Q-values: (1-ε)·max + ε·mean.
     * @returns {number} The TD error before scaling by the learning rate.
     */
    sarsaUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: alpha, discountFactor: gamma, epsilon } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        let nextQ = 0;
        if (!done) {
            const nextValues = Array.from(nextQTable.values());
            if (nextValues.length > 0) {
                const maxQ = Math.max(...nextValues);
                const avgQ = nextValues.reduce((a, b) => a + b, 0) / nextValues.length;
                nextQ = (1 - epsilon) * maxQ + epsilon * avgQ;
            }
        }
        const target = reward + gamma * nextQ;
        const delta = target - currentQ;
        qTable.set(action, currentQ + alpha * delta);
        return delta;
    }
    /**
     * Double Q-Learning: picks one of the two tables at random, selects the
     * greedy next action with that table and evaluates it with the other one.
     * @returns {number} The TD error before scaling by the learning rate.
     */
    doubleQUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: alpha, discountFactor: gamma } = config;
        const useFirst = Math.random() < 0.5;
        const qTable = useFirst ? this.getQTable(state) : this.getQTable2(state);
        const otherQTable = useFirst ? this.getQTable2(nextState) : this.getQTable(nextState);
        const nextQTable = useFirst ? this.getQTable(nextState) : this.getQTable2(nextState);
        const currentQ = qTable.get(action) || 0;
        let nextQ = 0;
        if (!done) {
            // Track "found" explicitly: an action key may itself be falsy
            // (0 or ''), so truthiness of the key cannot be the sentinel.
            let found = false;
            let bestAction;
            let bestValue = -Infinity;
            for (const [a, v] of nextQTable) {
                if (v > bestValue) {
                    bestValue = v;
                    bestAction = a;
                    found = true;
                }
            }
            if (found) {
                nextQ = otherQTable.get(bestAction) || 0;
            }
        }
        const target = reward + gamma * nextQ;
        const delta = target - currentQ;
        qTable.set(action, currentQ + alpha * delta);
        return delta;
    }
    /**
     * Actor-Critic: the critic keeps a state value V(s) updated by TD error;
     * the same TD error is applied to Q(s,a) as the advantage estimate.
     * @returns {number} The TD error.
     */
    actorCriticUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: alpha, discountFactor: gamma } = config;
        // Critic update (TD error)
        const V = this.criticValues.get(state) || 0;
        const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
        const tdError = reward + gamma * V_next - V;
        this.criticValues.set(state, V + alpha * tdError);
        // Actor update: TD error stands in for the advantage
        const qTable = this.getQTable(state);
        const currentQ = qTable.get(action) || 0;
        qTable.set(action, currentQ + alpha * tdError);
        return tdError;
    }
    /**
     * PPO (simplified): critic TD update, then a clipped step on Q(s,a) using
     * min(ratio·A, clip(ratio, 1±clipRange)·A) with ratio = exp(α·A).
     * @returns {number} The advantage (TD error of the critic).
     */
    ppoUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: alpha, discountFactor: gamma, clipRange = 0.2 } = config;
        // Critic update
        const V = this.criticValues.get(state) || 0;
        const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
        const advantage = reward + gamma * V_next - V;
        this.criticValues.set(state, V + alpha * advantage);
        // Actor update with clipping
        const qTable = this.getQTable(state);
        const oldQ = qTable.get(action) || 0;
        const ratio = Math.exp(alpha * advantage);
        const clippedRatio = Math.max(1 - clipRange, Math.min(1 + clipRange, ratio));
        const step = Math.min(ratio * advantage, clippedRatio * advantage);
        qTable.set(action, oldQ + alpha * step);
        return advantage;
    }
    /**
     * TD(λ): applies the current TD error to every state-action that still has
     * an eligibility trace, then decays each trace by γ·λ.
     * Traces are dropped once negligible and are reset when the episode ends,
     * so credit never leaks from one episode into the next.
     * @returns {number} The TD error.
     */
    tdLambdaUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: alpha, discountFactor: gamma, lambda = 0.8 } = config;
        const MIN_TRACE = 1e-8; // Below this a trace no longer changes Q meaningfully
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        const maxNextQ = done ? 0 : Math.max(0, ...Array.from(nextQTable.values()));
        const tdError = reward + gamma * maxNextQ - currentQ;
        // Accumulating trace for the pair just visited
        const traces = this.getEligibilityTraces(state);
        traces.set(action, (traces.get(action) || 0) + 1);
        // Deleting the entry currently being visited is safe for Map iteration.
        for (const [s, sTraces] of this.eligibilityTraces) {
            const sQTable = this.getQTable(s);
            for (const [a, trace] of sTraces) {
                const q = sQTable.get(a) || 0;
                sQTable.set(a, q + alpha * tdError * trace);
                const decayed = gamma * lambda * trace;
                if (Math.abs(decayed) < MIN_TRACE) {
                    sTraces.delete(a);
                }
                else {
                    sTraces.set(a, decayed);
                }
            }
            if (sTraces.size === 0) {
                this.eligibilityTraces.delete(s);
            }
        }
        if (done) {
            this.eligibilityTraces.clear();
        }
        return tdError;
    }
    /**
     * Monte Carlo: walks the latest trajectory backwards, moving each Q(s,a)
     * towards the discounted return from that step, then marks it completed.
     * @returns {number} Mean absolute error over the episode (0 if empty).
     */
    monteCarloUpdate(config) {
        const { learningRate: alpha, discountFactor: gamma } = config;
        const trajectory = this.trajectories[this.trajectories.length - 1];
        if (!trajectory || trajectory.experiences.length === 0)
            return 0;
        let G = 0; // Discounted return from step t onwards
        let totalDelta = 0;
        for (let t = trajectory.experiences.length - 1; t >= 0; t--) {
            const exp = trajectory.experiences[t];
            G = exp.reward + gamma * G;
            const qTable = this.getQTable(exp.state);
            const currentQ = qTable.get(exp.action) || 0;
            const delta = G - currentQ;
            qTable.set(exp.action, currentQ + alpha * delta);
            totalDelta += Math.abs(delta);
        }
        trajectory.completed = true;
        trajectory.totalReward = G;
        return totalDelta / trajectory.experiences.length;
    }
    /**
     * Decision Transformer (simplified): over the last `sequenceLength` steps
     * of the latest trajectory, nudges each Q(s,a) by
     * α · (return-to-go / (|first return-to-go| + 1)) · reward, then marks the
     * trajectory completed.
     * @returns {number} Mean absolute change over the processed steps.
     */
    decisionTransformerUpdate(config) {
        const { learningRate: alpha, sequenceLength = 20 } = config;
        const trajectory = this.trajectories[this.trajectories.length - 1];
        if (!trajectory || trajectory.experiences.length === 0)
            return 0;
        let totalDelta = 0;
        const experiences = trajectory.experiences.slice(-sequenceLength);
        // Undiscounted returns-to-go, built from the end of the window
        const returns = [];
        let R = 0;
        for (let i = experiences.length - 1; i >= 0; i--) {
            R += experiences[i].reward;
            returns.unshift(R);
        }
        for (let i = 0; i < experiences.length; i++) {
            const exp = experiences[i];
            const qTable = this.getQTable(exp.state);
            const currentQ = qTable.get(exp.action) || 0;
            const normalizedReturn = returns[i] / (Math.abs(returns[0]) + 1);
            const target = currentQ + alpha * normalizedReturn * exp.reward;
            const delta = target - currentQ;
            qTable.set(exp.action, target);
            totalDelta += Math.abs(delta);
        }
        trajectory.completed = true;
        trajectory.totalReward = returns[0];
        return totalDelta / experiences.length;
    }
    /**
     * DQN (simplified, no neural network): stores the experience in the
     * trajectory buffer, then applies a Q-learning update to one experience
     * sampled uniformly from that buffer.
     * NOTE(review): the buffer is unbounded and shared with the episodic
     * algorithms; consider capping it.
     * @returns {number} The TD error of the sampled update.
     */
    dqnUpdate(exp, config) {
        this.addToCurrentTrajectory(exp);
        const replayExp = this.sampleFromReplay();
        if (!replayExp)
            return this.qLearningUpdate(exp, config);
        return this.qLearningUpdate(replayExp, config);
    }
    // ============ Helper Methods ============
    /** Returns the primary Q-table for `state`, creating it when absent. */
    getQTable(state) {
        if (!this.qTables.has(state)) {
            this.qTables.set(state, new Map());
        }
        return this.qTables.get(state);
    }
    /** Returns the secondary (Double-Q) table for `state`, creating it when absent. */
    getQTable2(state) {
        if (!this.qTables2.has(state)) {
            this.qTables2.set(state, new Map());
        }
        return this.qTables2.get(state);
    }
    /** Returns the eligibility-trace map for `state`, creating it when absent. */
    getEligibilityTraces(state) {
        if (!this.eligibilityTraces.has(state)) {
            this.eligibilityTraces.set(state, new Map());
        }
        return this.eligibilityTraces.get(state);
    }
    /**
     * Softmax probability of `values[selectedIdx]` among `values`.
     * @returns {number} The probability, or 0.5 when `values` is empty.
     */
    softmaxConfidence(values, selectedIdx) {
        if (values.length === 0)
            return 0.5;
        const maxVal = Math.max(...values);
        const expValues = values.map(v => Math.exp(v - maxVal));
        const sumExp = expValues.reduce((a, b) => a + b, 0);
        return expValues[selectedIdx] / sumExp;
    }
    /**
     * Appends `exp` to the latest open trajectory, starting a new one when
     * none exists or the latest is already completed.
     */
    addToCurrentTrajectory(exp) {
        const last = this.trajectories[this.trajectories.length - 1];
        if (!last || last.completed) {
            this.trajectories.push({
                experiences: [],
                totalReward: 0,
                completed: false,
            });
        }
        this.trajectories[this.trajectories.length - 1].experiences.push(exp);
    }
    /**
     * Picks one stored experience uniformly at random across all trajectories.
     * Walks the trajectories by index instead of flattening them, so no copy
     * is made and arbitrarily large buffers cannot overflow the argument stack.
     * @returns {object|null} An experience, or null when nothing is stored.
     */
    sampleFromReplay() {
        let total = 0;
        for (const traj of this.trajectories) {
            total += traj.experiences.length;
        }
        if (total === 0)
            return null;
        let index = Math.floor(Math.random() * total);
        for (const traj of this.trajectories) {
            if (index < traj.experiences.length) {
                return traj.experiences[index];
            }
            index -= traj.experiences.length;
        }
        return null;
    }
    /**
     * Records one step in the statistics of `algorithm`.
     * @param {string} algorithm - Algorithm name; unknown names are ignored.
     * @param {number} reward - Reward of the step. It is appended to the global
     *   reward window and folded into this algorithm's own running average.
     * @param {number} [delta] - Absolute error of the update. When omitted the
     *   convergence score is left unchanged (nothing was learned this step).
     */
    updateStats(algorithm, reward, delta) {
        const stats = this.stats.get(algorithm);
        if (!stats)
            return;
        const REWARD_WINDOW = 1000;
        stats.updates++;
        stats.lastUpdate = Date.now();
        // Global sliding window, reported by getStatsSummary().avgReward
        this.rewardHistory.push(reward);
        if (this.rewardHistory.length > REWARD_WINDOW) {
            this.rewardHistory.shift();
        }
        // Per-algorithm running mean. Capping the divisor at the window size
        // turns it into a moving average once enough updates were seen.
        if (Number.isFinite(reward)) {
            const previous = Number.isFinite(stats.avgReward) ? stats.avgReward : 0;
            const count = stats.updates > 0 ? Math.min(stats.updates, REWARD_WINDOW) : 1;
            stats.avgReward = previous + (reward - previous) / count;
        }
        // Convergence score (inverse of recent delta magnitude)
        if (delta !== undefined && delta !== null) {
            stats.convergenceScore = 1 / (1 + delta);
        }
    }
    /**
     * Get statistics for all algorithms.
     * @returns {Map<string, object>} New map over the engine's stats records.
     */
    getStats() {
        return new Map(this.stats);
    }
    /**
     * Get statistics summary.
     * @returns {{bestAlgorithm: string, totalUpdates: number, avgReward: number, algorithms: object[]}}
     *   `bestAlgorithm` maximises avgReward × convergenceScore among algorithms
     *   with at least one update (defaults to 'q-learning'); `avgReward` is the
     *   mean of the global reward window; `algorithms` lists only those used.
     */
    getStatsSummary() {
        let bestAlgorithm = 'q-learning';
        let bestScore = -Infinity;
        let totalUpdates = 0;
        const algorithms = [];
        for (const [alg, stats] of this.stats) {
            algorithms.push(stats);
            totalUpdates += stats.updates;
            const score = stats.avgReward * stats.convergenceScore;
            if (score > bestScore && stats.updates > 0) {
                bestScore = score;
                bestAlgorithm = alg;
            }
        }
        return {
            bestAlgorithm,
            totalUpdates,
            avgReward: this.rewardHistory.length > 0
                ? this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length
                : 0,
            algorithms: algorithms.filter(a => a.updates > 0),
        };
    }
    /**
     * Shallow-copies a trajectory record together with its experience list so
     * the copy and the original can grow independently.
     */
    copyTrajectory(trajectory) {
        const source = trajectory || {};
        return {
            ...source,
            experiences: Array.isArray(source.experiences) ? source.experiences.slice() : [],
        };
    }
    /**
     * Export state for persistence.
     * The result is a snapshot: later updates to the engine do not change it.
     * Only the last 100 trajectories and last 1000 rewards are included;
     * eligibility traces are not exported.
     * @returns {object} Plain-object state accepted by import().
     */
    export() {
        const qTables = {};
        for (const [state, actions] of this.qTables) {
            qTables[state] = Object.fromEntries(actions);
        }
        const qTables2 = {};
        for (const [state, actions] of this.qTables2) {
            qTables2[state] = Object.fromEntries(actions);
        }
        const criticValues = Object.fromEntries(this.criticValues);
        const stats = {};
        for (const [alg, s] of this.stats) {
            stats[alg] = { ...s };
        }
        const configs = {};
        for (const [task, config] of this.configs) {
            configs[task] = { ...config };
        }
        return {
            qTables,
            qTables2,
            criticValues,
            trajectories: this.trajectories.slice(-100).map(t => this.copyTrajectory(t)),
            stats,
            configs,
            rewardHistory: this.rewardHistory.slice(-1000),
        };
    }
    /**
     * Import state from persistence.
     * Q-tables, critic values, trajectories and reward history are replaced;
     * stats and configs present in `data` overwrite the matching entries.
     * Eligibility traces are reset because they refer to the previous state.
     * The engine keeps its own copies, so `data` is never mutated afterwards.
     * @param {object} [data] - State produced by export(); missing parts are
     *   treated as empty.
     */
    import(data) {
        const source = data || {};
        this.qTables.clear();
        for (const [state, actions] of Object.entries(source.qTables || {})) {
            this.qTables.set(state, new Map(Object.entries(actions || {})));
        }
        this.qTables2.clear();
        for (const [state, actions] of Object.entries(source.qTables2 || {})) {
            this.qTables2.set(state, new Map(Object.entries(actions || {})));
        }
        this.criticValues = new Map(Object.entries(source.criticValues || {}));
        this.trajectories = Array.isArray(source.trajectories)
            ? source.trajectories.map(t => this.copyTrajectory(t))
            : [];
        for (const [alg, s] of Object.entries(source.stats || {})) {
            this.stats.set(alg, { ...s });
        }
        for (const [task, config] of Object.entries(source.configs || {})) {
            this.configs.set(task, { ...config });
        }
        this.rewardHistory = Array.isArray(source.rewardHistory)
            ? source.rewardHistory.slice()
            : [];
        this.eligibilityTraces.clear();
    }
    /**
     * Clear all learning data. Task configs are kept; stats counters are
     * zeroed in place.
     */
    clear() {
        this.qTables.clear();
        this.qTables2.clear();
        this.eligibilityTraces.clear();
        this.actorWeights.clear();
        this.criticValues.clear();
        this.trajectories = [];
        this.rewardHistory = [];
        for (const stats of this.stats.values()) {
            stats.updates = 0;
            stats.avgReward = 0;
            stats.convergenceScore = 0;
        }
    }
    /**
     * Get available algorithms.
     * @returns {Array<{algorithm: string, description: string, bestFor: string}>}
     */
    static getAlgorithms() {
        return [
            { algorithm: 'q-learning', description: 'Simple off-policy learning', bestFor: 'General routing' },
            { algorithm: 'sarsa', description: 'On-policy, conservative', bestFor: 'Error avoidance' },
            { algorithm: 'double-q', description: 'Reduces overestimation', bestFor: 'Precise routing' },
            { algorithm: 'actor-critic', description: 'Policy gradient + value', bestFor: 'Confidence scoring' },
            { algorithm: 'ppo', description: 'Stable policy updates', bestFor: 'Preference learning' },
            { algorithm: 'decision-transformer', description: 'Sequence modeling', bestFor: 'Trajectory patterns' },
            { algorithm: 'monte-carlo', description: 'Full episode learning', bestFor: 'Unbiased estimates' },
            { algorithm: 'td-lambda', description: 'Eligibility traces', bestFor: 'Credit assignment' },
            { algorithm: 'dqn', description: 'Experience replay', bestFor: 'High-dim states' },
        ];
    }
}
exports.LearningEngine = LearningEngine;
exports.default = LearningEngine;