// Compiled CommonJS module preamble (typical TypeScript-compiler output).
"use strict";




// Mark the module as an ES-module interop target and pre-declare the export
// binding so that circular `require`s observe the property before the class
// definition below assigns it.
Object.defineProperty(exports, "__esModule", { value: true });
exports.LearningEngine = void 0;

/**
 * Default algorithm/hyperparameter mapping per learning task.
 * `configure()` can override any field at runtime; the constructor copies
 * these into per-instance configs so the table itself is never mutated.
 */
const TASK_ALGORITHM_MAP = {
    'agent-routing': {
        algorithm: 'double-q',
        learningRate: 0.1,
        discountFactor: 0.95,
        epsilon: 0.1,
    },
    'error-avoidance': {
        algorithm: 'sarsa',
        learningRate: 0.05,
        discountFactor: 0.99,
        epsilon: 0.05,
    },
    'confidence-scoring': {
        algorithm: 'actor-critic',
        learningRate: 0.01,
        discountFactor: 0.95,
        epsilon: 0.1,
        entropyCoef: 0.01,
    },
    'trajectory-learning': {
        algorithm: 'decision-transformer',
        learningRate: 0.001,
        discountFactor: 0.99,
        epsilon: 0,
        sequenceLength: 20,
    },
    'context-ranking': {
        algorithm: 'ppo',
        learningRate: 0.0003,
        discountFactor: 0.99,
        epsilon: 0.2,
        clipRange: 0.2,
        entropyCoef: 0.01,
    },
    'memory-recall': {
        algorithm: 'td-lambda',
        learningRate: 0.1,
        discountFactor: 0.9,
        epsilon: 0.1,
        lambda: 0.8,
    },
};
// Safe fallback for tasks that have no registered config. Previously an
// unknown task produced `undefined` and `update()` crashed on
// `config.algorithm`; falling back keeps the engine usable.
const DEFAULT_CONFIG = {
    algorithm: 'q-learning',
    learningRate: 0.1,
    discountFactor: 0.95,
    epsilon: 0.1,
};
// Bound in-memory growth: the trajectory buffer previously grew without
// limit (only `export()` sliced it), a leak in long-running processes.
const MAX_TRAJECTORIES = 500;
// Reward history window used for the running average in `updateStats`.
const MAX_REWARD_HISTORY = 1000;
// Eligibility traces below this magnitude are dropped instead of being
// decayed forever, bounding TD(lambda) memory use.
const TRACE_PRUNE_THRESHOLD = 1e-8;
/**
 * Tabular reinforcement-learning engine supporting several algorithms
 * (Q-learning, SARSA, double Q, actor-critic, PPO-style clipped updates,
 * TD(lambda), Monte Carlo, decision-transformer-style return weighting,
 * and DQN-style replay). States and actions are opaque string keys.
 */
class LearningEngine {
    constructor() {
        this.configs = new Map(); // task -> algorithm config
        this.qTables = new Map(); // state -> (action -> Q)
        this.qTables2 = new Map(); // second table for double Q-learning
        this.eligibilityTraces = new Map(); // state -> (action -> trace), TD(lambda)
        this.actorWeights = new Map(); // reserved for actor-critic policy weights
        this.criticValues = new Map(); // state -> V(s) for actor-critic / PPO
        this.trajectories = []; // bounded episode buffer (see MAX_TRAJECTORIES)
        this.stats = new Map(); // algorithm -> running statistics
        this.rewardHistory = []; // shared sliding window of recent rewards
        // Seed per-task configs from the static defaults (copied, so the
        // static table is never mutated).
        for (const [task, config] of Object.entries(TASK_ALGORITHM_MAP)) {
            this.configs.set(task, { ...config });
        }
        // Pre-register a stats record for every supported algorithm.
        const algorithms = [
            'q-learning', 'sarsa', 'double-q', 'actor-critic',
            'ppo', 'decision-transformer', 'monte-carlo', 'td-lambda', 'dqn'
        ];
        for (const alg of algorithms) {
            this.stats.set(alg, {
                algorithm: alg,
                updates: 0,
                avgReward: 0,
                convergenceScore: 0,
                lastUpdate: Date.now(),
            });
        }
    }
    /**
     * Merge `config` over the task's existing (or default) configuration.
     * @param {string} task
     * @param {object} config - partial config; unspecified fields are kept.
     */
    configure(task, config) {
        const existing = this.configs.get(task) || TASK_ALGORITHM_MAP[task] || DEFAULT_CONFIG;
        this.configs.set(task, { ...existing, ...config });
    }
    /**
     * Resolve the effective config for a task, falling back to the static
     * defaults and finally to DEFAULT_CONFIG (never `undefined`).
     */
    getConfig(task) {
        return this.configs.get(task) || TASK_ALGORITHM_MAP[task] || DEFAULT_CONFIG;
    }
    /**
     * Apply one experience tuple to the task's configured algorithm.
     * Episodic algorithms (monte-carlo, decision-transformer) buffer
     * experiences and only learn when `experience.done` is true.
     * @returns {number} the TD error / update delta (0 when buffered).
     */
    update(task, experience) {
        const config = this.getConfig(task);
        let delta = 0;
        switch (config.algorithm) {
            case 'q-learning':
                delta = this.qLearningUpdate(experience, config);
                break;
            case 'sarsa':
                delta = this.sarsaUpdate(experience, config);
                break;
            case 'double-q':
                delta = this.doubleQUpdate(experience, config);
                break;
            case 'actor-critic':
                delta = this.actorCriticUpdate(experience, config);
                break;
            case 'ppo':
                delta = this.ppoUpdate(experience, config);
                break;
            case 'td-lambda':
                delta = this.tdLambdaUpdate(experience, config);
                break;
            case 'monte-carlo':
                // Episodic: accumulate, learn on episode end.
                this.addToCurrentTrajectory(experience);
                if (experience.done) {
                    delta = this.monteCarloUpdate(config);
                }
                break;
            case 'decision-transformer':
                this.addToCurrentTrajectory(experience);
                if (experience.done) {
                    delta = this.decisionTransformerUpdate(config);
                }
                break;
            case 'dqn':
                delta = this.dqnUpdate(experience, config);
                break;
        }
        this.updateStats(config.algorithm, experience.reward, Math.abs(delta));
        return delta;
    }
    /**
     * Epsilon-greedy action selection for `state` over candidate `actions`.
     * For double-Q tasks the selection value is Q1 + Q2 (the second table
     * was previously ignored, discarding half the learned signal).
     * @returns {{ action: string, confidence: number }}
     */
    getBestAction(task, state, actions) {
        const config = this.getConfig(task);
        // Exploration branch: uniform random action, neutral confidence.
        if (Math.random() < config.epsilon) {
            const randomAction = actions[Math.floor(Math.random() * actions.length)];
            return { action: randomAction, confidence: 0.5 };
        }
        const qTable = this.getQTable(state);
        const qTable2 = config.algorithm === 'double-q' ? this.getQTable2(state) : null;
        let bestAction = actions[0];
        let bestValue = -Infinity;
        const values = [];
        for (const action of actions) {
            let value = qTable.get(action) || 0;
            if (qTable2) {
                value += qTable2.get(action) || 0;
            }
            values.push(value);
            if (value > bestValue) {
                bestValue = value;
                bestAction = action;
            }
        }
        // Confidence = softmax probability mass on the greedy action.
        const confidence = this.softmaxConfidence(values, actions.indexOf(bestAction));
        return { action: bestAction, confidence };
    }
    /**
     * Softmax distribution over `actions` from the primary Q-table.
     * Numerically stabilised by subtracting the max before exponentiating.
     * @returns {Map<string, number>} probabilities summing to 1.
     */
    getActionProbabilities(state, actions) {
        const probs = new Map();
        const qTable = this.getQTable(state);
        const values = actions.map(a => qTable.get(a) || 0);
        const maxVal = Math.max(...values);
        const expValues = values.map(v => Math.exp(v - maxVal));
        const sumExp = expValues.reduce((a, b) => a + b, 0);
        for (let i = 0; i < actions.length; i++) {
            probs.set(actions[i], expValues[i] / sumExp);
        }
        return probs;
    }
    /**
     * Max Q-value over a state's known actions; 0 for an empty table.
     * (The old code wrote `Math.max(0, ...values)` to dodge `-Infinity`
     * on an empty spread, which incorrectly floored the bootstrap target
     * at 0 whenever every known Q-value was negative.)
     */
    maxOverActions(qTable) {
        if (qTable.size === 0)
            return 0;
        return Math.max(...qTable.values());
    }
    /**
     * Off-policy Q-learning: Q(s,a) += a * [r + g * max_a' Q(s',a') - Q(s,a)].
     * @returns {number} the TD error.
     */
    qLearningUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        const maxNextQ = done ? 0 : this.maxOverActions(nextQTable);
        const target = reward + γ * maxNextQ;
        const delta = target - currentQ;
        qTable.set(action, currentQ + α * delta);
        return delta;
    }
    /**
     * Expected-SARSA-style on-policy update: the bootstrap value blends the
     * greedy max with the mean of known next-state Q-values, weighted by
     * the epsilon-greedy policy's probabilities.
     * @returns {number} the TD error.
     */
    sarsaUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, epsilon } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        let nextQ = 0;
        if (!done) {
            const nextActions = Array.from(nextQTable.keys());
            if (nextActions.length > 0) {
                const maxQ = Math.max(...Array.from(nextQTable.values()));
                const avgQ = Array.from(nextQTable.values()).reduce((a, b) => a + b, 0) / nextActions.length;
                // Expected value under an epsilon-greedy behaviour policy.
                nextQ = (1 - epsilon) * maxQ + epsilon * avgQ;
            }
        }
        const target = reward + γ * nextQ;
        const delta = target - currentQ;
        qTable.set(action, currentQ + α * delta);
        return delta;
    }
    /**
     * Double Q-learning: randomly pick one table to update; select the
     * argmax action from the updated table but evaluate it with the other
     * table, which decouples selection from evaluation and reduces the
     * maximisation bias of plain Q-learning.
     * @returns {number} the TD error.
     */
    doubleQUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        const useFirst = Math.random() < 0.5;
        const qTable = useFirst ? this.getQTable(state) : this.getQTable2(state);
        const otherQTable = useFirst ? this.getQTable2(nextState) : this.getQTable(nextState);
        const nextQTable = useFirst ? this.getQTable(nextState) : this.getQTable2(nextState);
        const currentQ = qTable.get(action) || 0;
        let nextQ = 0;
        if (!done) {
            // argmax over the table being updated...
            let bestAction = '';
            let bestValue = -Infinity;
            for (const [a, v] of nextQTable) {
                if (v > bestValue) {
                    bestValue = v;
                    bestAction = a;
                }
            }
            // ...evaluated with the other table.
            if (bestAction) {
                nextQ = otherQTable.get(bestAction) || 0;
            }
        }
        const target = reward + γ * nextQ;
        const delta = target - currentQ;
        qTable.set(action, currentQ + α * delta);
        return delta;
    }
    /**
     * One-step actor-critic: the critic V(s) learns from the TD error,
     * and the same TD error nudges the actor's preference Q(s,a).
     * @returns {number} the TD error.
     */
    actorCriticUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        const V = this.criticValues.get(state) || 0;
        const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
        const tdError = reward + γ * V_next - V;
        this.criticValues.set(state, V + α * tdError);
        const qTable = this.getQTable(state);
        const currentQ = qTable.get(action) || 0;
        qTable.set(action, currentQ + α * tdError);
        return tdError;
    }
    /**
     * PPO-style clipped update on the tabular preference: the surrogate
     * ratio is clipped to [1-clipRange, 1+clipRange] so a single sample
     * cannot move the policy too far.
     * @returns {number} the advantage estimate.
     */
    ppoUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, clipRange = 0.2 } = config;
        const V = this.criticValues.get(state) || 0;
        const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
        const advantage = reward + γ * V_next - V;
        this.criticValues.set(state, V + α * advantage);
        const qTable = this.getQTable(state);
        const oldQ = qTable.get(action) || 0;
        // exp(alpha * A) stands in for the new/old policy probability ratio.
        const ratio = Math.exp(α * advantage);
        const clippedRatio = Math.max(1 - clipRange, Math.min(1 + clipRange, ratio));
        // Pessimistic (min) surrogate objective, as in PPO-clip.
        const update = Math.min(ratio * advantage, clippedRatio * advantage);
        qTable.set(action, oldQ + α * update);
        return advantage;
    }
    /**
     * TD(lambda) with accumulating eligibility traces. Every traced
     * state-action pair receives a share of the current TD error; traces
     * decay by gamma*lambda, tiny traces are pruned, and all traces are
     * cleared at episode end (previously they leaked across episodes).
     * @returns {number} the TD error.
     */
    tdLambdaUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, lambda = 0.8 } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        const maxNextQ = done ? 0 : this.maxOverActions(nextQTable);
        const tdError = reward + γ * maxNextQ - currentQ;
        // Accumulating trace for the visited pair.
        const traces = this.getEligibilityTraces(state);
        traces.set(action, (traces.get(action) || 0) + 1);
        // Propagate the TD error to every traced pair, then decay/prune.
        for (const [s, sTraces] of this.eligibilityTraces) {
            const sQTable = this.getQTable(s);
            for (const [a, trace] of sTraces) {
                const q = sQTable.get(a) || 0;
                sQTable.set(a, q + α * tdError * trace);
                const decayed = γ * lambda * trace;
                if (decayed < TRACE_PRUNE_THRESHOLD) {
                    sTraces.delete(a); // negligible: drop to bound memory
                }
                else {
                    sTraces.set(a, decayed);
                }
            }
            if (sTraces.size === 0) {
                this.eligibilityTraces.delete(s);
            }
        }
        // Traces are episode-scoped; never carry credit across episodes.
        if (done) {
            this.eligibilityTraces.clear();
        }
        return tdError;
    }
    /**
     * First-visit-style Monte Carlo: walk the finished episode backwards,
     * accumulating the discounted return G and regressing each Q(s,a)
     * toward it.
     * @returns {number} mean absolute delta over the episode.
     */
    monteCarloUpdate(config) {
        const { learningRate: α, discountFactor: γ } = config;
        const trajectory = this.trajectories[this.trajectories.length - 1];
        if (!trajectory || trajectory.experiences.length === 0)
            return 0;
        let G = 0;
        let totalDelta = 0;
        for (let t = trajectory.experiences.length - 1; t >= 0; t--) {
            const exp = trajectory.experiences[t];
            G = exp.reward + γ * G;
            const qTable = this.getQTable(exp.state);
            const currentQ = qTable.get(exp.action) || 0;
            const delta = G - currentQ;
            qTable.set(exp.action, currentQ + α * delta);
            totalDelta += Math.abs(delta);
        }
        trajectory.completed = true;
        trajectory.totalReward = G;
        return totalDelta / trajectory.experiences.length;
    }
    /**
     * Decision-transformer-inspired update: over the last `sequenceLength`
     * steps of the finished episode, weight each step's reward by its
     * normalized return-to-go and fold it into Q(s,a).
     * @returns {number} mean absolute delta over the window.
     */
    decisionTransformerUpdate(config) {
        const { learningRate: α, sequenceLength = 20 } = config;
        const trajectory = this.trajectories[this.trajectories.length - 1];
        if (!trajectory || trajectory.experiences.length === 0)
            return 0;
        let totalDelta = 0;
        const experiences = trajectory.experiences.slice(-sequenceLength);
        // Undiscounted returns-to-go, aligned with `experiences`.
        const returns = [];
        let R = 0;
        for (let i = experiences.length - 1; i >= 0; i--) {
            R += experiences[i].reward;
            returns.unshift(R);
        }
        for (let i = 0; i < experiences.length; i++) {
            const exp = experiences[i];
            const qTable = this.getQTable(exp.state);
            const currentQ = qTable.get(exp.action) || 0;
            // Normalize by the episode return (+1 avoids division by zero).
            const normalizedReturn = returns[i] / (Math.abs(returns[0]) + 1);
            const target = currentQ + α * normalizedReturn * exp.reward;
            const delta = target - currentQ;
            qTable.set(exp.action, target);
            totalDelta += Math.abs(delta);
        }
        trajectory.completed = true;
        trajectory.totalReward = returns[0];
        return totalDelta / experiences.length;
    }
    /**
     * DQN-style update: store the new experience in the replay buffer and
     * apply a Q-learning step on a uniformly sampled past experience
     * (falling back to the fresh one while the buffer is empty).
     * @returns {number} the TD error of the applied update.
     */
    dqnUpdate(exp, config) {
        this.addToCurrentTrajectory(exp);
        const replayExp = this.sampleFromReplay();
        if (!replayExp)
            return this.qLearningUpdate(exp, config);
        return this.qLearningUpdate(replayExp, config);
    }
    // Lazily create the action->Q map for a state (primary table).
    getQTable(state) {
        if (!this.qTables.has(state)) {
            this.qTables.set(state, new Map());
        }
        return this.qTables.get(state);
    }
    // Lazily create the action->Q map for a state (double-Q second table).
    getQTable2(state) {
        if (!this.qTables2.has(state)) {
            this.qTables2.set(state, new Map());
        }
        return this.qTables2.get(state);
    }
    // Lazily create the action->trace map for a state (TD(lambda)).
    getEligibilityTraces(state) {
        if (!this.eligibilityTraces.has(state)) {
            this.eligibilityTraces.set(state, new Map());
        }
        return this.eligibilityTraces.get(state);
    }
    // Softmax probability of the selected index among `values`.
    softmaxConfidence(values, selectedIdx) {
        if (values.length === 0)
            return 0.5;
        const maxVal = Math.max(...values);
        const expValues = values.map(v => Math.exp(v - maxVal));
        const sumExp = expValues.reduce((a, b) => a + b, 0);
        return expValues[selectedIdx] / sumExp;
    }
    // Append `exp` to the open trajectory, starting (and, if necessary,
    // evicting the oldest to honour MAX_TRAJECTORIES) a new one when the
    // last trajectory has completed.
    addToCurrentTrajectory(exp) {
        if (this.trajectories.length === 0 || this.trajectories[this.trajectories.length - 1].completed) {
            this.trajectories.push({
                experiences: [],
                totalReward: 0,
                completed: false,
            });
            if (this.trajectories.length > MAX_TRAJECTORIES) {
                this.trajectories.shift();
            }
        }
        this.trajectories[this.trajectories.length - 1].experiences.push(exp);
    }
    // Uniform sample over all buffered experiences; null when empty.
    sampleFromReplay() {
        const allExperiences = [];
        for (const traj of this.trajectories) {
            allExperiences.push(...traj.experiences);
        }
        if (allExperiences.length === 0)
            return null;
        return allExperiences[Math.floor(Math.random() * allExperiences.length)];
    }
    // Update an algorithm's running stats after one learning step.
    // NOTE(review): rewardHistory is shared across algorithms, so
    // `avgReward` reflects the engine-wide recent average, not a
    // per-algorithm one — kept as-is to preserve behavior.
    updateStats(algorithm, reward, delta) {
        const stats = this.stats.get(algorithm);
        if (!stats)
            return;
        stats.updates++;
        stats.lastUpdate = Date.now();
        this.rewardHistory.push(reward);
        if (this.rewardHistory.length > MAX_REWARD_HISTORY) {
            this.rewardHistory.shift();
        }
        stats.avgReward = this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length;
        // Small recent deltas => closer to convergence (score -> 1).
        stats.convergenceScore = 1 / (1 + delta);
    }
    /** @returns {Map<string, object>} shallow copy of per-algorithm stats. */
    getStats() {
        return new Map(this.stats);
    }
    /**
     * Aggregate view: total updates, overall average reward, the active
     * algorithms, and the best-scoring algorithm (avgReward x convergence,
     * among algorithms that have been updated at least once).
     */
    getStatsSummary() {
        let bestAlgorithm = 'q-learning';
        let bestScore = -Infinity;
        let totalUpdates = 0;
        const algorithms = [];
        for (const [alg, stats] of this.stats) {
            algorithms.push(stats);
            totalUpdates += stats.updates;
            const score = stats.avgReward * stats.convergenceScore;
            if (score > bestScore && stats.updates > 0) {
                bestScore = score;
                bestAlgorithm = alg;
            }
        }
        return {
            bestAlgorithm,
            totalUpdates,
            avgReward: this.rewardHistory.length > 0
                ? this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length
                : 0,
            algorithms: algorithms.filter(a => a.updates > 0),
        };
    }
    /**
     * Serialize learned state to a JSON-friendly plain object.
     * Trajectories and reward history are truncated to recent windows.
     */
    export() {
        const qTables = {};
        for (const [state, actions] of this.qTables) {
            qTables[state] = Object.fromEntries(actions);
        }
        const qTables2 = {};
        for (const [state, actions] of this.qTables2) {
            qTables2[state] = Object.fromEntries(actions);
        }
        const criticValues = Object.fromEntries(this.criticValues);
        const stats = {};
        for (const [alg, s] of this.stats) {
            stats[alg] = s;
        }
        const configs = {};
        for (const [task, config] of this.configs) {
            configs[task] = config;
        }
        return {
            qTables,
            qTables2,
            criticValues,
            trajectories: this.trajectories.slice(-100),
            stats,
            configs,
            rewardHistory: this.rewardHistory.slice(-1000),
        };
    }
    /**
     * Restore state previously produced by `export()`. Missing sections
     * are tolerated; stats/configs are merged over existing entries.
     * @param {object} data
     */
    import(data) {
        this.qTables.clear();
        for (const [state, actions] of Object.entries(data.qTables || {})) {
            this.qTables.set(state, new Map(Object.entries(actions)));
        }
        this.qTables2.clear();
        for (const [state, actions] of Object.entries(data.qTables2 || {})) {
            this.qTables2.set(state, new Map(Object.entries(actions)));
        }
        this.criticValues = new Map(Object.entries(data.criticValues || {}));
        this.trajectories = data.trajectories || [];
        for (const [alg, s] of Object.entries(data.stats || {})) {
            this.stats.set(alg, s);
        }
        for (const [task, config] of Object.entries(data.configs || {})) {
            this.configs.set(task, config);
        }
        this.rewardHistory = data.rewardHistory || [];
    }
    /** Reset all learned state; stat records are zeroed but kept. */
    clear() {
        this.qTables.clear();
        this.qTables2.clear();
        this.eligibilityTraces.clear();
        this.actorWeights.clear();
        this.criticValues.clear();
        this.trajectories = [];
        this.rewardHistory = [];
        for (const stats of this.stats.values()) {
            stats.updates = 0;
            stats.avgReward = 0;
            stats.convergenceScore = 0;
        }
    }
    /** Static catalogue of supported algorithms for display/selection. */
    static getAlgorithms() {
        return [
            { algorithm: 'q-learning', description: 'Simple off-policy learning', bestFor: 'General routing' },
            { algorithm: 'sarsa', description: 'On-policy, conservative', bestFor: 'Error avoidance' },
            { algorithm: 'double-q', description: 'Reduces overestimation', bestFor: 'Precise routing' },
            { algorithm: 'actor-critic', description: 'Policy gradient + value', bestFor: 'Confidence scoring' },
            { algorithm: 'ppo', description: 'Stable policy updates', bestFor: 'Preference learning' },
            { algorithm: 'decision-transformer', description: 'Sequence modeling', bestFor: 'Trajectory patterns' },
            { algorithm: 'monte-carlo', description: 'Full episode learning', bestFor: 'Unbiased estimates' },
            { algorithm: 'td-lambda', description: 'Eligibility traces', bestFor: 'Credit assignment' },
            { algorithm: 'dqn', description: 'Experience replay', bestFor: 'High-dim states' },
        ];
    }
}
// Expose the engine as both a named export and the default export so
// CommonJS consumers and ESM-interop `import LearningEngine from ...`
// callers both work.
exports.LearningEngine = LearningEngine;
exports.default = LearningEngine;
| |
|