"use strict"; /** * Multi-Algorithm Learning Engine * Supports 9 RL algorithms for intelligent hooks optimization */ Object.defineProperty(exports, "__esModule", { value: true }); exports.LearningEngine = void 0; // Default configs for each task type const TASK_ALGORITHM_MAP = { 'agent-routing': { algorithm: 'double-q', learningRate: 0.1, discountFactor: 0.95, epsilon: 0.1, }, 'error-avoidance': { algorithm: 'sarsa', learningRate: 0.05, discountFactor: 0.99, epsilon: 0.05, }, 'confidence-scoring': { algorithm: 'actor-critic', learningRate: 0.01, discountFactor: 0.95, epsilon: 0.1, entropyCoef: 0.01, }, 'trajectory-learning': { algorithm: 'decision-transformer', learningRate: 0.001, discountFactor: 0.99, epsilon: 0, sequenceLength: 20, }, 'context-ranking': { algorithm: 'ppo', learningRate: 0.0003, discountFactor: 0.99, epsilon: 0.2, clipRange: 0.2, entropyCoef: 0.01, }, 'memory-recall': { algorithm: 'td-lambda', learningRate: 0.1, discountFactor: 0.9, epsilon: 0.1, lambda: 0.8, }, }; class LearningEngine { constructor() { this.configs = new Map(); this.qTables = new Map(); this.qTables2 = new Map(); // For Double-Q this.eligibilityTraces = new Map(); this.actorWeights = new Map(); this.criticValues = new Map(); this.trajectories = []; this.stats = new Map(); this.rewardHistory = []; // Initialize with default configs for (const [task, config] of Object.entries(TASK_ALGORITHM_MAP)) { this.configs.set(task, { ...config }); } // Initialize stats for all algorithms const algorithms = [ 'q-learning', 'sarsa', 'double-q', 'actor-critic', 'ppo', 'decision-transformer', 'monte-carlo', 'td-lambda', 'dqn' ]; for (const alg of algorithms) { this.stats.set(alg, { algorithm: alg, updates: 0, avgReward: 0, convergenceScore: 0, lastUpdate: Date.now(), }); } } /** * Configure algorithm for a specific task type */ configure(task, config) { const existing = this.configs.get(task) || TASK_ALGORITHM_MAP[task]; this.configs.set(task, { ...existing, ...config }); } /** * Get current configuration for a task */ getConfig(task) { return this.configs.get(task) || TASK_ALGORITHM_MAP[task]; } /** * Update Q-value using the appropriate algorithm */ update(task, experience) { const config = this.getConfig(task); let delta = 0; switch (config.algorithm) { case 'q-learning': delta = this.qLearningUpdate(experience, config); break; case 'sarsa': delta = this.sarsaUpdate(experience, config); break; case 'double-q': delta = this.doubleQUpdate(experience, config); break; case 'actor-critic': delta = this.actorCriticUpdate(experience, config); break; case 'ppo': delta = this.ppoUpdate(experience, config); break; case 'td-lambda': delta = this.tdLambdaUpdate(experience, config); break; case 'monte-carlo': // Monte Carlo needs full episodes this.addToCurrentTrajectory(experience); if (experience.done) { delta = this.monteCarloUpdate(config); } break; case 'decision-transformer': this.addToCurrentTrajectory(experience); if (experience.done) { delta = this.decisionTransformerUpdate(config); } break; case 'dqn': delta = this.dqnUpdate(experience, config); break; } // Update stats this.updateStats(config.algorithm, experience.reward, Math.abs(delta)); return delta; } /** * Get best action for a state */ getBestAction(task, state, actions) { const config = this.getConfig(task); // Epsilon-greedy exploration if (Math.random() < config.epsilon) { const randomAction = actions[Math.floor(Math.random() * actions.length)]; return { action: randomAction, confidence: 0.5 }; } let bestAction = actions[0]; let bestValue = -Infinity; let values = []; const qTable = this.getQTable(state); for (const action of actions) { const value = qTable.get(action) || 0; values.push(value); if (value > bestValue) { bestValue = value; bestAction = action; } } // Calculate confidence using softmax const confidence = this.softmaxConfidence(values, actions.indexOf(bestAction)); return { action: bestAction, confidence }; } /** * Get action probabilities (for Actor-Critic and PPO) */ getActionProbabilities(state, actions) { const probs = new Map(); const qTable = this.getQTable(state); const values = actions.map(a => qTable.get(a) || 0); const maxVal = Math.max(...values); const expValues = values.map(v => Math.exp(v - maxVal)); const sumExp = expValues.reduce((a, b) => a + b, 0); for (let i = 0; i < actions.length; i++) { probs.set(actions[i], expValues[i] / sumExp); } return probs; } // ============ Algorithm Implementations ============ /** * Standard Q-Learning: Q(s,a) += α * (r + γ * max_a' Q(s',a') - Q(s,a)) */ qLearningUpdate(exp, config) { const { state, action, reward, nextState, done } = exp; const { learningRate: α, discountFactor: γ } = config; const qTable = this.getQTable(state); const nextQTable = this.getQTable(nextState); const currentQ = qTable.get(action) || 0; const maxNextQ = done ? 0 : Math.max(0, ...Array.from(nextQTable.values())); const target = reward + γ * maxNextQ; const delta = target - currentQ; const newQ = currentQ + α * delta; qTable.set(action, newQ); return delta; } /** * SARSA: On-policy, more conservative * Q(s,a) += α * (r + γ * Q(s',a') - Q(s,a)) */ sarsaUpdate(exp, config) { const { state, action, reward, nextState, done } = exp; const { learningRate: α, discountFactor: γ, epsilon } = config; const qTable = this.getQTable(state); const nextQTable = this.getQTable(nextState); const currentQ = qTable.get(action) || 0; // On-policy: use expected value under current policy (ε-greedy) let nextQ = 0; if (!done) { const nextActions = Array.from(nextQTable.keys()); if (nextActions.length > 0) { const maxQ = Math.max(...Array.from(nextQTable.values())); const avgQ = Array.from(nextQTable.values()).reduce((a, b) => a + b, 0) / nextActions.length; // Expected value under ε-greedy nextQ = (1 - epsilon) * maxQ + epsilon * avgQ; } } const target = reward + γ * nextQ; const delta = target - currentQ; const newQ = currentQ + α * delta; qTable.set(action, newQ); return delta; } /** * Double Q-Learning: Reduces overestimation bias * Uses two Q-tables, randomly updates one using the other for target */ doubleQUpdate(exp, config) { const { state, action, reward, nextState, done } = exp; const { learningRate: α, discountFactor: γ } = config; const useFirst = Math.random() < 0.5; const qTable = useFirst ? this.getQTable(state) : this.getQTable2(state); const otherQTable = useFirst ? this.getQTable2(nextState) : this.getQTable(nextState); const nextQTable = useFirst ? this.getQTable(nextState) : this.getQTable2(nextState); const currentQ = qTable.get(action) || 0; let nextQ = 0; if (!done) { // Find best action in next state using one table let bestAction = ''; let bestValue = -Infinity; for (const [a, v] of nextQTable) { if (v > bestValue) { bestValue = v; bestAction = a; } } // Evaluate using other table if (bestAction) { nextQ = otherQTable.get(bestAction) || 0; } } const target = reward + γ * nextQ; const delta = target - currentQ; const newQ = currentQ + α * delta; qTable.set(action, newQ); return delta; } /** * Actor-Critic: Policy gradient with value baseline */ actorCriticUpdate(exp, config) { const { state, action, reward, nextState, done } = exp; const { learningRate: α, discountFactor: γ } = config; // Critic update (TD error) const V = this.criticValues.get(state) || 0; const V_next = done ? 0 : (this.criticValues.get(nextState) || 0); const tdError = reward + γ * V_next - V; this.criticValues.set(state, V + α * tdError); // Actor update (policy gradient) const qTable = this.getQTable(state); const currentQ = qTable.get(action) || 0; // Use TD error as advantage estimate const newQ = currentQ + α * tdError; qTable.set(action, newQ); return tdError; } /** * PPO: Clipped policy gradient for stable training */ ppoUpdate(exp, config) { const { state, action, reward, nextState, done } = exp; const { learningRate: α, discountFactor: γ, clipRange = 0.2 } = config; // Critic update const V = this.criticValues.get(state) || 0; const V_next = done ? 0 : (this.criticValues.get(nextState) || 0); const advantage = reward + γ * V_next - V; this.criticValues.set(state, V + α * advantage); // Actor update with clipping const qTable = this.getQTable(state); const oldQ = qTable.get(action) || 0; // Compute probability ratio (simplified) const ratio = Math.exp(α * advantage); const clippedRatio = Math.max(1 - clipRange, Math.min(1 + clipRange, ratio)); // PPO objective: min(ratio * A, clip(ratio) * A) const update = Math.min(ratio * advantage, clippedRatio * advantage); const newQ = oldQ + α * update; qTable.set(action, newQ); return advantage; } /** * TD(λ): Temporal difference with eligibility traces */ tdLambdaUpdate(exp, config) { const { state, action, reward, nextState, done } = exp; const { learningRate: α, discountFactor: γ, lambda = 0.8 } = config; const qTable = this.getQTable(state); const nextQTable = this.getQTable(nextState); const currentQ = qTable.get(action) || 0; const maxNextQ = done ? 0 : Math.max(0, ...Array.from(nextQTable.values())); const tdError = reward + γ * maxNextQ - currentQ; // Update eligibility trace for current state-action const traces = this.getEligibilityTraces(state); traces.set(action, (traces.get(action) || 0) + 1); // Update all state-actions with eligibility traces for (const [s, sTraces] of this.eligibilityTraces) { const sQTable = this.getQTable(s); for (const [a, trace] of sTraces) { const q = sQTable.get(a) || 0; sQTable.set(a, q + α * tdError * trace); // Decay trace sTraces.set(a, γ * lambda * trace); } } return tdError; } /** * Monte Carlo: Full episode learning */ monteCarloUpdate(config) { const { learningRate: α, discountFactor: γ } = config; const trajectory = this.trajectories[this.trajectories.length - 1]; if (!trajectory || trajectory.experiences.length === 0) return 0; let G = 0; // Return let totalDelta = 0; // Work backwards through episode for (let t = trajectory.experiences.length - 1; t >= 0; t--) { const exp = trajectory.experiences[t]; G = exp.reward + γ * G; const qTable = this.getQTable(exp.state); const currentQ = qTable.get(exp.action) || 0; const delta = G - currentQ; qTable.set(exp.action, currentQ + α * delta); totalDelta += Math.abs(delta); } trajectory.completed = true; trajectory.totalReward = G; return totalDelta / trajectory.experiences.length; } /** * Decision Transformer: Sequence modeling for trajectories */ decisionTransformerUpdate(config) { const { learningRate: α, sequenceLength = 20 } = config; const trajectory = this.trajectories[this.trajectories.length - 1]; if (!trajectory || trajectory.experiences.length === 0) return 0; // Decision Transformer learns to predict actions given (return, state, action) sequences // Here we use a simplified version that learns state-action patterns let totalDelta = 0; const experiences = trajectory.experiences.slice(-sequenceLength); // Calculate returns-to-go const returns = []; let R = 0; for (let i = experiences.length - 1; i >= 0; i--) { R += experiences[i].reward; returns.unshift(R); } // Update Q-values weighted by return-to-go for (let i = 0; i < experiences.length; i++) { const exp = experiences[i]; const qTable = this.getQTable(exp.state); const currentQ = qTable.get(exp.action) || 0; // Weight by normalized return const normalizedReturn = returns[i] / (Math.abs(returns[0]) + 1); const target = currentQ + α * normalizedReturn * exp.reward; const delta = target - currentQ; qTable.set(exp.action, target); totalDelta += Math.abs(delta); } trajectory.completed = true; trajectory.totalReward = returns[0]; return totalDelta / experiences.length; } /** * DQN: Deep Q-Network (simplified without actual neural network) * Uses experience replay and target network concepts */ dqnUpdate(exp, config) { // Add to replay buffer (trajectory) this.addToCurrentTrajectory(exp); // Sample from replay buffer const replayExp = this.sampleFromReplay(); if (!replayExp) return this.qLearningUpdate(exp, config); // Use sampled experience for update (breaks correlation) return this.qLearningUpdate(replayExp, config); } // ============ Helper Methods ============ getQTable(state) { if (!this.qTables.has(state)) { this.qTables.set(state, new Map()); } return this.qTables.get(state); } getQTable2(state) { if (!this.qTables2.has(state)) { this.qTables2.set(state, new Map()); } return this.qTables2.get(state); } getEligibilityTraces(state) { if (!this.eligibilityTraces.has(state)) { this.eligibilityTraces.set(state, new Map()); } return this.eligibilityTraces.get(state); } softmaxConfidence(values, selectedIdx) { if (values.length === 0) return 0.5; const maxVal = Math.max(...values); const expValues = values.map(v => Math.exp(v - maxVal)); const sumExp = expValues.reduce((a, b) => a + b, 0); return expValues[selectedIdx] / sumExp; } addToCurrentTrajectory(exp) { if (this.trajectories.length === 0 || this.trajectories[this.trajectories.length - 1].completed) { this.trajectories.push({ experiences: [], totalReward: 0, completed: false, }); } this.trajectories[this.trajectories.length - 1].experiences.push(exp); } sampleFromReplay() { const allExperiences = []; for (const traj of this.trajectories) { allExperiences.push(...traj.experiences); } if (allExperiences.length === 0) return null; return allExperiences[Math.floor(Math.random() * allExperiences.length)]; } updateStats(algorithm, reward, delta) { const stats = this.stats.get(algorithm); if (!stats) return; stats.updates++; stats.lastUpdate = Date.now(); // Running average reward this.rewardHistory.push(reward); if (this.rewardHistory.length > 1000) { this.rewardHistory.shift(); } stats.avgReward = this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length; // Convergence score (inverse of recent delta magnitude) stats.convergenceScore = 1 / (1 + delta); } /** * Get statistics for all algorithms */ getStats() { return new Map(this.stats); } /** * Get statistics summary */ getStatsSummary() { let bestAlgorithm = 'q-learning'; let bestScore = -Infinity; let totalUpdates = 0; const algorithms = []; for (const [alg, stats] of this.stats) { algorithms.push(stats); totalUpdates += stats.updates; const score = stats.avgReward * stats.convergenceScore; if (score > bestScore && stats.updates > 0) { bestScore = score; bestAlgorithm = alg; } } return { bestAlgorithm, totalUpdates, avgReward: this.rewardHistory.length > 0 ? this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length : 0, algorithms: algorithms.filter(a => a.updates > 0), }; } /** * Export state for persistence */ export() { const qTables = {}; for (const [state, actions] of this.qTables) { qTables[state] = Object.fromEntries(actions); } const qTables2 = {}; for (const [state, actions] of this.qTables2) { qTables2[state] = Object.fromEntries(actions); } const criticValues = Object.fromEntries(this.criticValues); const stats = {}; for (const [alg, s] of this.stats) { stats[alg] = s; } const configs = {}; for (const [task, config] of this.configs) { configs[task] = config; } return { qTables, qTables2, criticValues, trajectories: this.trajectories.slice(-100), // Keep last 100 trajectories stats, configs, rewardHistory: this.rewardHistory.slice(-1000), }; } /** * Import state from persistence */ import(data) { // Q-tables this.qTables.clear(); for (const [state, actions] of Object.entries(data.qTables || {})) { this.qTables.set(state, new Map(Object.entries(actions))); } this.qTables2.clear(); for (const [state, actions] of Object.entries(data.qTables2 || {})) { this.qTables2.set(state, new Map(Object.entries(actions))); } // Critic values this.criticValues = new Map(Object.entries(data.criticValues || {})); // Trajectories this.trajectories = data.trajectories || []; // Stats for (const [alg, s] of Object.entries(data.stats || {})) { this.stats.set(alg, s); } // Configs for (const [task, config] of Object.entries(data.configs || {})) { this.configs.set(task, config); } // Reward history this.rewardHistory = data.rewardHistory || []; } /** * Clear all learning data */ clear() { this.qTables.clear(); this.qTables2.clear(); this.eligibilityTraces.clear(); this.actorWeights.clear(); this.criticValues.clear(); this.trajectories = []; this.rewardHistory = []; // Reset stats for (const stats of this.stats.values()) { stats.updates = 0; stats.avgReward = 0; stats.convergenceScore = 0; } } /** * Get available algorithms */ static getAlgorithms() { return [ { algorithm: 'q-learning', description: 'Simple off-policy learning', bestFor: 'General routing' }, { algorithm: 'sarsa', description: 'On-policy, conservative', bestFor: 'Error avoidance' }, { algorithm: 'double-q', description: 'Reduces overestimation', bestFor: 'Precise routing' }, { algorithm: 'actor-critic', description: 'Policy gradient + value', bestFor: 'Confidence scoring' }, { algorithm: 'ppo', description: 'Stable policy updates', bestFor: 'Preference learning' }, { algorithm: 'decision-transformer', description: 'Sequence modeling', bestFor: 'Trajectory patterns' }, { algorithm: 'monte-carlo', description: 'Full episode learning', bestFor: 'Unbiased estimates' }, { algorithm: 'td-lambda', description: 'Eligibility traces', bestFor: 'Credit assignment' }, { algorithm: 'dqn', description: 'Experience replay', bestFor: 'High-dim states' }, ]; } } exports.LearningEngine = LearningEngine; exports.default = LearningEngine;