<!DOCTYPE html>
<!-- PPO Simulation By Pejman Ebrahimi -->
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>PPO Reinforcement Learning Simulation</title>
    <style>
      body {
        font-family: Arial, sans-serif;
        margin: 0;
        padding: 20px;
        line-height: 1.6;
        color: #333;
        background-color: #f8f9fa;
      }
      .container {
        max-width: 1000px;
        margin: 0 auto;
        background-color: white;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
      }
      h1,
      h2,
      h3 {
        color: #2c3e50;
      }
      h1 {
        text-align: center;
        margin-bottom: 30px;
        border-bottom: 2px solid #3498db;
        padding-bottom: 10px;
      }
      .grid-container {
        display: grid;
        grid-template-columns: repeat(10, 1fr);
        gap: 2px;
        margin: 20px 0;
      }
      .cell {
        width: 100%;
        aspect-ratio: 1;
        background-color: #ecf0f1;
        display: flex;
        align-items: center;
        justify-content: center;
        cursor: pointer;
        position: relative;
        transition: all 0.3s;
      }
      .agent {
        background-color: #3498db;
        border-radius: 50%;
        width: 80%;
        height: 80%;
        position: absolute;
      }
      .goal {
        background-color: #2ecc71;
        width: 80%;
        height: 80%;
        position: absolute;
      }
      .obstacle {
        background-color: #e74c3c;
        width: 80%;
        height: 80%;
        position: absolute;
      }
      .panel {
        background-color: #f5f7f9;
        padding: 15px;
        border-radius: 5px;
        margin-bottom: 20px;
        border: 1px solid #ddd;
      }
      .controls {
        display: flex;
        gap: 10px;
        flex-wrap: wrap;
        margin: 20px 0;
      }
      button {
        padding: 8px 15px;
        background-color: #3498db;
        color: white;
        border: none;
        border-radius: 4px;
        cursor: pointer;
        transition: background-color 0.3s;
      }
      button:hover {
        background-color: #2980b9;
      }
      button:disabled {
        background-color: #95a5a6;
        cursor: not-allowed;
      }
      .sliders {
        display: flex;
        flex-direction: column;
        gap: 10px;
        margin: 15px 0;
      }
      .slider-container {
        display: flex;
        align-items: center;
      }
      .slider-container label {
        flex: 1;
        min-width: 180px;
      }
      .slider-container input {
        flex: 2;
      }
      .slider-value {
        flex: 0 0 50px;
        text-align: right;
      }
      #log-container {
        max-height: 200px;
        overflow-y: auto;
        background-color: #2c3e50;
        color: #ecf0f1;
        padding: 10px;
        border-radius: 4px;
        margin-top: 20px;
        font-family: monospace;
      }
      .log-entry {
        margin: 5px 0;
      }
      .tab-container {
        margin-top: 20px;
      }
      .tab-buttons {
        display: flex;
        border-bottom: 1px solid #ddd;
      }
      .tab-button {
        padding: 10px 20px;
        background-color: #f1f1f1;
        border: none;
        cursor: pointer;
        transition: background-color 0.3s;
      }
      .tab-button.active {
        background-color: #3498db;
        color: white;
      }
      .tab-content {
        display: none;
        padding: 15px;
        border: 1px solid #ddd;
        border-top: none;
        animation: fadeIn 0.5s;
      }
      .tab-content.active {
        display: block;
      }
      #policy-display {
        width: 100%;
        height: 300px;
        overflow: auto;
        margin-top: 10px;
      }
      .policy-grid {
        display: grid;
        grid-template-columns: repeat(10, 1fr);
        gap: 2px;
      }
      .policy-cell {
        aspect-ratio: 1;
        border: 1px solid #ddd;
        padding: 2px;
        font-size: 10px;
        display: flex;
        flex-direction: column;
        align-items: center;
        justify-content: center;
      }
      .arrow {
        width: 0;
        height: 0;
        border-style: solid;
        margin: 2px;
      }
      .arrow-up {
        border-width: 0 4px 8px 4px;
        border-color: transparent transparent #3498db transparent;
      }
      .arrow-right {
        border-width: 4px 0 4px 8px;
        border-color: transparent transparent transparent #3498db;
      }
      .arrow-down {
        border-width: 8px 4px 0 4px;
        border-color: #3498db transparent transparent transparent;
      }
      .arrow-left {
        border-width: 4px 8px 4px 0;
        border-color: transparent #3498db transparent transparent;
      }
      .progress-container {
        margin-top: 10px;
        background-color: #f1f1f1;
        border-radius: 5px;
        height: 20px;
        position: relative;
      }
      .progress-bar {
        height: 100%;
        background-color: #3498db;
        border-radius: 5px;
        width: 0%;
        transition: width 0.3s;
      }
      .chart-container {
        height: 300px;
        margin: 15px 0;
      }
      @keyframes fadeIn {
        from {
          opacity: 0;
        }
        to {
          opacity: 1;
        }
      }
      .popup {
        display: none;
        position: fixed;
        top: 50%;
        left: 50%;
        transform: translate(-50%, -50%);
        background-color: white;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
        z-index: 1000;
        max-width: 80%;
        max-height: 80%;
        overflow-y: auto;
      }
      .popup-overlay {
        display: none;
        position: fixed;
        top: 0;
        left: 0;
        width: 100%;
        height: 100%;
        background-color: rgba(0, 0, 0, 0.5);
        z-index: 999;
      }
      .reward-display {
        font-weight: bold;
        font-size: 1.2em;
        text-align: center;
        margin: 10px 0;
      }
      .explanation {
        background-color: #e8f4fc;
        padding: 15px;
        border-radius: 5px;
        margin: 10px 0;
        border-left: 4px solid #3498db;
      }
      .highlight {
        background-color: #fffacd;
        padding: 2px 4px;
        border-radius: 3px;
      }
      .concept-box {
        border: 1px solid #ddd;
        margin: 15px 0;
        border-radius: 5px;
        overflow: hidden;
      }
      .concept-title {
        background-color: #3498db;
        color: white;
        padding: 10px;
        margin: 0;
      }
      .concept-content {
        padding: 15px;
      }
    </style>
  </head>
  <body>
    <div class="container">
      <h1>Proximal Policy Optimization (PPO) Simulation</h1>
      <div class="explanation">
        <p>
          This simulation demonstrates how an agent learns to navigate to a
          goal using <strong>Proximal Policy Optimization (PPO)</strong>. PPO
          is an on-policy reinforcement learning algorithm that uses a
          "clipping" mechanism to prevent large policy updates, making training
          more stable and efficient.
        </p>
      </div>
| <div class="tab-container"> | |
| <div class="tab-buttons"> | |
| <button class="tab-button active" onclick="openTab('simulation-tab')"> | |
| Simulation | |
| </button> | |
| <button class="tab-button" onclick="openTab('concepts-tab')"> | |
| PPO Concepts | |
| </button> | |
| <button class="tab-button" onclick="openTab('metrics-tab')"> | |
| Training Metrics | |
| </button> | |
| </div> | |
| <div id="simulation-tab" class="tab-content active"> | |
| <div class="panel"> | |
| <h3>Environment</h3> | |
| <p> | |
| The agent (blue) must navigate to the goal (green) while avoiding | |
| obstacles (red). | |
| </p> | |
| <div class="grid-container" id="grid"></div> | |
| <div class="reward-display"> | |
| Total Reward: <span id="reward-value">0</span> | |
| </div> | |
| </div> | |
| <div class="controls"> | |
| <button id="start-btn" onclick="startTraining()"> | |
| Start Training | |
| </button> | |
| <button id="reset-btn" onclick="resetEnvironment()"> | |
| Reset Environment | |
| </button> | |
| <button id="step-btn" onclick="stepTraining()" disabled> | |
| Step Forward | |
| </button> | |
| <button id="place-obstacle-btn" onclick="toggleObstaclePlacement()"> | |
| Place Obstacles | |
| </button> | |
| <button id="animation-speed-btn" onclick="toggleAnimationSpeed()"> | |
| Animation Speed: Normal | |
| </button> | |
| </div> | |
| <div class="panel"> | |
| <h3>PPO Parameters</h3> | |
| <div class="sliders"> | |
| <div class="slider-container"> | |
| <label for="clip-ratio">Clip Ratio (ε):</label> | |
| <input | |
| type="range" | |
| id="clip-ratio" | |
| min="0.05" | |
| max="0.5" | |
| step="0.05" | |
| value="0.2" | |
| oninput="updateSliderValue('clip-ratio')" | |
| /> | |
| <span class="slider-value" id="clip-ratio-value">0.2</span> | |
| </div> | |
| <div class="slider-container"> | |
| <label for="learning-rate">Learning Rate:</label> | |
| <input | |
| type="range" | |
| id="learning-rate" | |
| min="0.01" | |
| max="1" | |
| step="0.01" | |
| value="0.1" | |
| oninput="updateSliderValue('learning-rate')" | |
| /> | |
| <span class="slider-value" id="learning-rate-value">0.1</span> | |
| </div> | |
| <div class="slider-container"> | |
| <label for="epochs">PPO Epochs per Update:</label> | |
| <input | |
| type="range" | |
| id="epochs" | |
| min="1" | |
| max="10" | |
| step="1" | |
| value="4" | |
| oninput="updateSliderValue('epochs')" | |
| /> | |
| <span class="slider-value" id="epochs-value">4</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="panel"> | |
| <h3>Policy Visualization</h3> | |
| <p> | |
| This shows the current policy of the agent (arrows indicate | |
| preferred actions in each state). | |
| </p> | |
| <div id="policy-display"> | |
| <div class="policy-grid" id="policy-grid"></div> | |
| </div> | |
| </div> | |
| <div id="log-container"></div> | |
| </div> | |
| <div id="concepts-tab" class="tab-content"> | |
| <div class="concept-box"> | |
| <h3 class="concept-title">What is PPO?</h3> | |
| <div class="concept-content"> | |
| <p> | |
| Proximal Policy Optimization (PPO) is a policy gradient method | |
| for reinforcement learning developed by OpenAI in 2017. It has | |
| become one of the most popular RL algorithms due to its | |
| simplicity and effectiveness. | |
| </p> | |
| <p>PPO aims to balance two objectives:</p> | |
| <ul> | |
| <li>Improving the agent's policy to maximize rewards</li> | |
| <li> | |
| Preventing large policy updates that could destabilize | |
| training | |
| </li> | |
| </ul> | |
| </div> | |
| </div> | |
| <div class="concept-box"> | |
| <h3 class="concept-title">Key Innovations in PPO</h3> | |
| <div class="concept-content"> | |
| <p> | |
| The central innovation in PPO is the | |
| <strong>clipped surrogate objective function</strong>: | |
| </p> | |
| <p style="text-align: center"> | |
| L<sup>CLIP</sup>(θ) = E[min(r<sub>t</sub>(θ)A<sub>t</sub>, | |
| clip(r<sub>t</sub>(θ), 1-ε, 1+ε)A<sub>t</sub>)] | |
| </p> | |
| <p>where:</p> | |
| <ul> | |
| <li> | |
| <strong>r<sub>t</sub>(θ)</strong> is the ratio of | |
| probabilities under new and old policies | |
| </li> | |
| <li> | |
| <strong>A<sub>t</sub></strong> is the advantage estimate | |
| </li> | |
| <li> | |
| <strong>ε</strong> is the clipping parameter (usually 0.1 or | |
| 0.2) | |
| </li> | |
| </ul> | |
| <p> | |
| The clipping mechanism ensures that the policy update stays | |
| within a "trust region" by limiting how much the new policy can | |
| deviate from the old one. | |
| </p> | |
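              <p>
                For example, with ε = 0.2 the ratio r<sub>t</sub>(θ) is clipped
                to the range [0.8, 1.2], so the objective gives the policy no
                incentive to change an action's probability by more than
                roughly 20% relative to the old policy in a single update.
              </p>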
            </div>
          </div>
          <div class="concept-box">
            <h3 class="concept-title">How PPO Works in This Simulation</h3>
            <div class="concept-content">
              <ol>
                <li>
                  The agent collects experience by interacting with the
                  environment using its current policy
                </li>
                <li>Advantages are computed for each state-action pair</li>
                <li>
                  The policy is updated using the clipped surrogate objective
                </li>
                <li>
                  Multiple optimization epochs are performed on the same batch
                  of data
                </li>
                <li>The process repeats with the new policy</li>
              </ol>
              <p>
                You can observe these steps in action in the simulation tab by
                watching the policy visualization and training metrics.
              </p>
            </div>
          </div>
          <div class="concept-box">
            <h3 class="concept-title">PPO vs. Other RL Algorithms</h3>
            <div class="concept-content">
              <p>PPO improves upon earlier algorithms in several ways:</p>
              <ul>
                <li>
                  <strong>vs. REINFORCE:</strong> More stable training due to
                  advantage estimation and clipping
                </li>
                <li>
                  <strong>vs. TRPO:</strong> Simpler implementation while
                  maintaining similar performance
                </li>
                <li>
                  <strong>vs. A2C/A3C:</strong> Better sample efficiency and
                  more stable policy updates
                </li>
                <li>
                  <strong>vs. Off-policy algorithms (DQN, DDPG):</strong> Less
                  sensitive to hyperparameters and often more stable
                </li>
              </ul>
            </div>
          </div>
        </div>
| <div id="metrics-tab" class="tab-content"> | |
| <div class="panel"> | |
| <h3>Training Progress</h3> | |
| <div class="progress-container"> | |
| <div class="progress-bar" id="training-progress"></div> | |
| </div> | |
| <p id="episode-counter">Episodes: 0 / 100</p> | |
| </div> | |
| <div class="panel"> | |
| <h3>Reward Over Time</h3> | |
| <div class="chart-container" id="reward-chart"></div> | |
| </div> | |
| <div class="panel"> | |
| <h3>Policy Loss</h3> | |
| <div class="chart-container" id="policy-loss-chart"></div> | |
| </div> | |
| <div class="panel"> | |
| <h3>Value Loss</h3> | |
| <div class="chart-container" id="value-loss-chart"></div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="popup-overlay" id="popup-overlay"></div> | |
| <div class="popup" id="popup"> | |
| <h2 id="popup-title">Title</h2> | |
| <div id="popup-content">Content</div> | |
| <button onclick="closePopup()">Close</button> | |
| </div> | |
    <script>
      // Environment configuration
      const GRID_SIZE = 10;
      let grid = [];
      let agentPos = { x: 0, y: 0 };
      let goalPos = { x: 9, y: 9 };
      let obstacles = [];
      let placingObstacles = false;
      // Agent and PPO parameters
      let policyNetwork = {};
      let valueNetwork = {};
      let clipRatio = 0.2;
      let learningRate = 0.1; // Default learning rate (0-1 range)
      let ppoEpochs = 4;
      let gamma = 0.99; // Discount factor
      let lambda = 0.95; // GAE parameter
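      // For reference, a sketch of the quantities that gamma and lambda feed into
      // (these match the loops in updatePPO() further down; the notation here is
      // illustrative, not part of the original code):
      //   discounted return:  G_t     = r_t + gamma * G_{t+1}                 (0 after a terminal step)
      //   TD error:           delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
      //   GAE advantage:      A_t     = delta_t + gamma * lambda * A_{t+1}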
      // Training state
      let isTraining = false;
      let episode = 0;
      let maxEpisodes = 100;
      let episodeSteps = 0;
      let maxStepsPerEpisode = 100; // Increased max steps to allow more exploration
      let totalReward = 0;
      let episodeRewards = [];
      let policyLosses = [];
      let valueLosses = [];
      // Tracking for visualization
      let trajectories = [];
      let oldPolicy = {};
      // Exploration parameters
      let explorationRate = 0.2; // Probability of taking a random action (exploration)
      // Initialize the environment
      function initializeEnvironment() {
        grid = [];
        obstacles = [];
        // Create the grid UI
        const gridContainer = document.getElementById("grid");
        gridContainer.innerHTML = "";
        for (let y = 0; y < GRID_SIZE; y++) {
          for (let x = 0; x < GRID_SIZE; x++) {
            const cell = document.createElement("div");
            cell.classList.add("cell");
            cell.dataset.x = x;
            cell.dataset.y = y;
            cell.addEventListener("click", handleCellClick);
            gridContainer.appendChild(cell);
          }
        }
        // Place agent and goal
        agentPos = { x: 0, y: 0 };
        goalPos = { x: 9, y: 9 };
        renderGrid();
        // Initialize policy and value networks
        initializeNetworks();
        renderPolicy();
        updateReward(0);
      }
      // Initialize policy and value networks
      function initializeNetworks() {
        policyNetwork = {};
        valueNetwork = {};
        // Initialize learning rate
        learningRate = parseFloat(
          document.getElementById("learning-rate").value
        );
        // Initialize policy and value for each state (cell)
        for (let y = 0; y < GRID_SIZE; y++) {
          for (let x = 0; x < GRID_SIZE; x++) {
            const stateKey = `${x},${y}`;
            // Initialize the policy with a uniform distribution over actions
            policyNetwork[stateKey] = {
              up: 0.25,
              right: 0.25,
              down: 0.25,
              left: 0.25,
            };
            // Initialize value to zero
            valueNetwork[stateKey] = 0;
          }
        }
      }
      function renderGrid() {
        // Clear all cells
        const cells = document.querySelectorAll(".cell");
        cells.forEach((cell) => {
          cell.innerHTML = "";
        });
        // Place agent
        const agentCell = document.querySelector(
          `.cell[data-x="${agentPos.x}"][data-y="${agentPos.y}"]`
        );
        const agentElement = document.createElement("div");
        agentElement.classList.add("agent");
        agentCell.appendChild(agentElement);
        // Place goal
        const goalCell = document.querySelector(
          `.cell[data-x="${goalPos.x}"][data-y="${goalPos.y}"]`
        );
        const goalElement = document.createElement("div");
        goalElement.classList.add("goal");
        goalCell.appendChild(goalElement);
        // Place obstacles
        obstacles.forEach((obstacle) => {
          const obstacleCell = document.querySelector(
            `.cell[data-x="${obstacle.x}"][data-y="${obstacle.y}"]`
          );
          const obstacleElement = document.createElement("div");
          obstacleElement.classList.add("obstacle");
          obstacleCell.appendChild(obstacleElement);
        });
      }
      function renderPolicy() {
        const policyGrid = document.getElementById("policy-grid");
        policyGrid.innerHTML = "";
        for (let y = 0; y < GRID_SIZE; y++) {
          for (let x = 0; x < GRID_SIZE; x++) {
            const cell = document.createElement("div");
            cell.classList.add("policy-cell");
            const stateKey = `${x},${y}`;
            const policy = policyNetwork[stateKey];
            // Skip rendering policy for obstacles
            if (isObstacle(x, y)) {
              cell.style.backgroundColor = "#e74c3c";
              policyGrid.appendChild(cell);
              continue;
            }
            // If it's the goal, mark it green
            if (x === goalPos.x && y === goalPos.y) {
              cell.style.backgroundColor = "#2ecc71";
              policyGrid.appendChild(cell);
              continue;
            }
            // Create arrows for each action probability
            for (const [action, prob] of Object.entries(policy)) {
              if (prob > 0.2) {
                // Only show significant probabilities
                const arrow = document.createElement("div");
                arrow.classList.add("arrow", `arrow-${action}`);
                arrow.style.opacity = Math.min(1, prob * 2); // Scale opacity with probability
                cell.appendChild(arrow);
              }
            }
            // Indicate the state value via background opacity
            const value = valueNetwork[stateKey];
            const normalizedValue = (value + 10) / 20; // Normalize to [0,1] range assuming values between -10 and 10
            cell.style.backgroundColor = `rgba(236, 240, 241, ${normalizedValue})`;
            policyGrid.appendChild(cell);
          }
        }
      }
      function handleCellClick(event) {
        const x = parseInt(event.currentTarget.dataset.x);
        const y = parseInt(event.currentTarget.dataset.y);
        if (placingObstacles) {
          // Don't allow obstacles on agent or goal
          if (
            (x === agentPos.x && y === agentPos.y) ||
            (x === goalPos.x && y === goalPos.y)
          ) {
            return;
          }
          const obstacleIndex = obstacles.findIndex(
            (o) => o.x === x && o.y === y
          );
          if (obstacleIndex === -1) {
            obstacles.push({ x, y });
          } else {
            obstacles.splice(obstacleIndex, 1);
          }
          renderGrid();
          renderPolicy();
        }
      }
      function toggleObstaclePlacement() {
        placingObstacles = !placingObstacles;
        const btn = document.getElementById("place-obstacle-btn");
        btn.textContent = placingObstacles ? "Done Placing" : "Place Obstacles";
        btn.style.backgroundColor = placingObstacles ? "#e74c3c" : "#3498db";
      }
      function isObstacle(x, y) {
        return obstacles.some((o) => o.x === x && o.y === y);
      }
      function resetEnvironment() {
        initializeEnvironment();
        episodeRewards = [];
        policyLosses = [];
        valueLosses = [];
        episode = 0;
        updateEpisodeCounter();
        updateReward(0);
        // Reset training state
        isTraining = false;
        document.getElementById("start-btn").textContent = "Start Training";
        document.getElementById("step-btn").disabled = true;
        // Clear charts
        // In a real implementation, you would update the charts here
        logMessage("Environment reset. Ready for training!");
      }
      function startTraining() {
        if (isTraining) {
          // Stop training
          isTraining = false;
          document.getElementById("start-btn").textContent = "Start Training";
          document.getElementById("step-btn").disabled = true;
        } else {
          // Start training
          isTraining = true;
          document.getElementById("start-btn").textContent = "Stop Training";
          document.getElementById("step-btn").disabled = false;
          // If we're at the end of training, reset first
          if (episode >= maxEpisodes) {
            resetEnvironment();
          }
          runTrainingLoop();
        }
      }
      function stepTraining() {
        if (episode < maxEpisodes) {
          runEpisode();
          updateTrainingProgress();
        } else {
          logMessage("Training complete! Reset to train again.");
        }
      }
      async function runEpisode() {
        // Reset agent position and episodic variables
        agentPos = { x: 0, y: 0 };
        episodeSteps = 0;
        totalReward = 0;
        trajectories = [];
        // Decay exploration rate over time (important for improving policy)
        explorationRate = Math.max(0.05, 0.2 * Math.pow(0.99, episode));
        renderGrid();
        updateReward(totalReward);
        // Save old policy for PPO ratio calculation
        oldPolicy = JSON.parse(JSON.stringify(policyNetwork));
        // Run episode until termination
        let done = false;
        while (!done && episodeSteps < maxStepsPerEpisode) {
          done = await executeStep();
          episodeSteps++;
          // Small delay for visualization
          await new Promise((resolve) =>
            setTimeout(resolve, animationSpeeds[animationSpeed] / 2)
          );
        }
        // Add episode reward to history
        episodeRewards.push(totalReward);
        // Run PPO update if we have enough steps
        if (trajectories.length > 0) {
          const [policyLoss, valueLoss] = updatePPO();
          policyLosses.push(policyLoss);
          valueLosses.push(valueLoss);
        }
        // Update UI
        renderPolicy();
        episode++;
        updateEpisodeCounter();
        logMessage(
          `Episode ${episode}: Reward=${totalReward.toFixed(
            2
          )}, Steps=${episodeSteps}, Exploration=${explorationRate.toFixed(2)}`
        );
        return new Promise((resolve) => setTimeout(resolve, 10));
      }
      function sampleAction(policy) {
        // Use exploration rate to decide whether to take random action or follow policy
        if (Math.random() < explorationRate) {
          // Take random action with exploration probability
          const actions = Object.keys(policy);
          const randomIndex = Math.floor(Math.random() * actions.length);
          return actions[randomIndex];
        }
        // Otherwise sample from policy distribution
        const actions = Object.keys(policy);
        const probs = actions.map((a) => policy[a]);
        const rand = Math.random();
        let cumProb = 0;
        for (let i = 0; i < actions.length; i++) {
          cumProb += probs[i];
          if (rand < cumProb) {
            return actions[i];
          }
        }
        return actions[actions.length - 1];
      }
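      // Illustrative example of the cumulative sampling above (hypothetical
      // numbers, not executed by the simulation): with policy
      // {up: 0.1, right: 0.6, down: 0.2, left: 0.1} and rand = 0.65, cumProb
      // reaches 0.1 after "up" and 0.7 after "right", so "right" is returned.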
      function moveAgent(action) {
        // Save previous position
        const prevPos = { ...agentPos };
        // Attempt to move agent
        switch (action) {
          case "up":
            agentPos.y = Math.max(0, agentPos.y - 1);
            break;
          case "right":
            agentPos.x = Math.min(GRID_SIZE - 1, agentPos.x + 1);
            break;
          case "down":
            agentPos.y = Math.min(GRID_SIZE - 1, agentPos.y + 1);
            break;
          case "left":
            agentPos.x = Math.max(0, agentPos.x - 1);
            break;
        }
        // Check if new position is an obstacle
        if (isObstacle(agentPos.x, agentPos.y)) {
          // Revert to previous position if it hit an obstacle
          agentPos.x = prevPos.x;
          agentPos.y = prevPos.y;
          return false; // Indicate movement was blocked
        }
        return true; // Movement successful
      }
      function calculateReward(oldPos, movementSuccessful) {
        // Reward for reaching goal
        if (agentPos.x === goalPos.x && agentPos.y === goalPos.y) {
          return 10;
        }
        // Penalty for attempting to move into an obstacle (but not actually moving into it)
        if (!movementSuccessful) {
          return -1; // Reduced penalty to avoid too much negative learning
        }
        // Small penalty for each step to encourage efficiency
        let stepPenalty = -0.1;
        // Small reward for getting closer to goal (using Manhattan distance)
        const oldDistance =
          Math.abs(oldPos.x - goalPos.x) + Math.abs(oldPos.y - goalPos.y);
        const newDistance =
          Math.abs(agentPos.x - goalPos.x) + Math.abs(agentPos.y - goalPos.y);
        const proximityReward = oldDistance > newDistance ? 0.3 : -0.1; // Stronger reward for progress
        return stepPenalty + proximityReward;
      }
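      // Worked example of the shaping above (illustrative numbers): with the
      // goal at (9, 9), an agent at (2, 3) has Manhattan distance 13; stepping
      // right to (3, 3) reduces it to 12, so the reward is -0.1 + 0.3 = 0.2.
      // Stepping away instead would give -0.1 + -0.1 = -0.2.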
      function updatePPO() {
        // Get parameters from sliders
        clipRatio = parseFloat(document.getElementById("clip-ratio").value);
        learningRate = parseFloat(
          document.getElementById("learning-rate").value
        );
        ppoEpochs = parseInt(document.getElementById("epochs").value);
        // Compute returns and advantages
        const returns = [];
        const advantages = [];
        // Compute returns (discounted sum of future rewards)
        let discountedReturn = 0;
        for (let i = trajectories.length - 1; i >= 0; i--) {
          const transition = trajectories[i];
          discountedReturn =
            transition.reward +
            gamma * (transition.done ? 0 : discountedReturn);
          returns.unshift(discountedReturn);
        }
        // Compute advantages using Generalized Advantage Estimation (GAE)
        let lastGaeAdvantage = 0;
        for (let i = trajectories.length - 1; i >= 0; i--) {
          const transition = trajectories[i];
          const stateKey = transition.state;
          const nextStateKey = transition.nextState;
          const currentValue = valueNetwork[stateKey];
          const nextValue = transition.done ? 0 : valueNetwork[nextStateKey];
          // TD error
          const delta = transition.reward + gamma * nextValue - currentValue;
          // GAE
          lastGaeAdvantage = delta + gamma * lambda * lastGaeAdvantage;
          advantages.unshift(lastGaeAdvantage);
        }
        // Normalize advantages for more stable learning
        const meanAdvantage =
          advantages.reduce((a, b) => a + b, 0) / advantages.length;
        const stdAdvantage =
          Math.sqrt(
            advantages.reduce((a, b) => a + Math.pow(b - meanAdvantage, 2), 0) /
              advantages.length
          ) || 1; // Avoid division by zero
        for (let i = 0; i < advantages.length; i++) {
          advantages[i] =
            (advantages[i] - meanAdvantage) / (stdAdvantage + 1e-8);
        }
        // Store losses for metrics
        let totalPolicyLoss = 0;
        let totalValueLoss = 0;
        // Multiple epochs of optimization on the same data (key PPO feature)
        for (let epoch = 0; epoch < ppoEpochs; epoch++) {
          // Update policy and value networks for each step in the trajectory
          for (let i = 0; i < trajectories.length; i++) {
            const transition = trajectories[i];
            const stateKey = transition.state;
            const action = transition.action;
            // Get old action probability (from the policy that collected the data)
            const oldActionProb = oldPolicy[stateKey][action];
            // Get current action probability
            const currentActionProb = policyNetwork[stateKey][action];
            // Compute probability ratio (crucial for PPO)
            const ratio = currentActionProb / Math.max(oldActionProb, 1e-8);
            // Get advantage for this action
            const advantage = advantages[i];
            // Compute unclipped and clipped surrogate objectives
            const unclippedObjective = ratio * advantage;
            const clippedRatio = Math.max(
              Math.min(ratio, 1 + clipRatio),
              1 - clipRatio
            );
            const clippedObjective = clippedRatio * advantage;
            // PPO's clipped surrogate objective (core of PPO)
            const surrogateObjective = Math.min(
              unclippedObjective,
              clippedObjective
            );
            // Compute policy gradient
            // Note: In PPO, we maximize the objective, so negative for gradient ascent
            const policyLoss = -surrogateObjective;
            totalPolicyLoss += policyLoss;
            // Value loss (using returns as targets)
            const valueTarget = returns[i];
            const valuePrediction = valueNetwork[stateKey];
            const valueLoss = 0.5 * Math.pow(valueTarget - valuePrediction, 2);
            totalValueLoss += valueLoss;
            // Update value network with gradient descent
            valueNetwork[stateKey] +=
              learningRate * (valueTarget - valuePrediction);
            // When the clipped term is the active (smaller) one, the surrogate's
            // gradient with respect to the policy is zero, so skip the update
            const useClippedObjective = unclippedObjective > clippedObjective;
            const policyGradient =
              learningRate * advantage * (useClippedObjective ? 0 : 1);
            // Apply policy gradient update
            // Increase probability of the taken action if it was good (positive advantage)
            // Decrease probability if it was bad (negative advantage)
            let newProb = policyNetwork[stateKey][action] + policyGradient;
            // Ensure probability stays positive (important for ratio calculation)
            newProb = Math.max(newProb, 0.01);
            policyNetwork[stateKey][action] = newProb;
            // Normalize probabilities to ensure they sum to 1
            const sumProb = Object.values(policyNetwork[stateKey]).reduce(
              (a, b) => a + b,
              0
            );
            for (const a in policyNetwork[stateKey]) {
              policyNetwork[stateKey][a] /= sumProb;
            }
            // Periodically nudge probabilities toward uniform (a crude stand-in
            // for an entropy bonus); this helps avoid local optima
            if (i % 5 === 0) {
              for (const a in policyNetwork[stateKey]) {
                // Slightly nudge probabilities toward uniform
                policyNetwork[stateKey][a] =
                  0.95 * policyNetwork[stateKey][a] + 0.05 * 0.25;
              }
              // Re-normalize
              const sumProb = Object.values(policyNetwork[stateKey]).reduce(
                (a, b) => a + b,
                0
              );
              for (const a in policyNetwork[stateKey]) {
                policyNetwork[stateKey][a] /= sumProb;
              }
            }
          }
        }
        // Calculate average losses
        const avgPolicyLoss =
          totalPolicyLoss / (trajectories.length * ppoEpochs);
        const avgValueLoss = totalValueLoss / (trajectories.length * ppoEpochs);
        // Log progress periodically
        if (episode % 5 === 0) {
          logMessage(
            `Episode ${episode}: Average Policy Loss = ${avgPolicyLoss.toFixed(
              4
            )}, Value Loss = ${avgValueLoss.toFixed(4)}`
          );
        }
        return [avgPolicyLoss, avgValueLoss];
      }
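      // A minimal, self-contained sketch of the clipped surrogate term used in
      // updatePPO() above, factored out purely for illustration (this helper is
      // not called by the simulation):
      function clippedSurrogateObjective(newProb, oldProb, advantage, epsilon) {
        const ratio = newProb / Math.max(oldProb, 1e-8);
        const clippedRatio = Math.min(Math.max(ratio, 1 - epsilon), 1 + epsilon);
        return Math.min(ratio * advantage, clippedRatio * advantage);
      }
      // Example: clippedSurrogateObjective(0.5, 0.25, 1, 0.2) returns 1.2 —
      // the ratio 2.0 is clipped to 1.2, so the objective stops rewarding
      // further increases in this action's probability.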
      function updateReward(reward) {
        document.getElementById("reward-value").textContent = reward.toFixed(2);
      }
      function updateEpisodeCounter() {
        document.getElementById(
          "episode-counter"
        ).textContent = `Episodes: ${episode} / ${maxEpisodes}`;
        document.getElementById("training-progress").style.width = `${
          (episode / maxEpisodes) * 100
        }%`;
      }
      function updateTrainingProgress() {
        // Update charts with the latest data
        // In a real implementation, you would update charts here
        // Show progress
        updateEpisodeCounter();
      }
      function updateSliderValue(id) {
        const slider = document.getElementById(id);
        const valueDisplay = document.getElementById(`${id}-value`);
        valueDisplay.textContent = slider.value;
        // Update corresponding variables
        if (id === "clip-ratio") clipRatio = parseFloat(slider.value);
        if (id === "learning-rate") learningRate = parseFloat(slider.value);
        if (id === "epochs") ppoEpochs = parseInt(slider.value);
      }
      function logMessage(message) {
        const logContainer = document.getElementById("log-container");
        const logEntry = document.createElement("div");
        logEntry.classList.add("log-entry");
        logEntry.textContent = message;
        logContainer.appendChild(logEntry);
        logContainer.scrollTop = logContainer.scrollHeight;
      }
      function openTab(tabId) {
        // Hide all tab contents
        const tabContents = document.getElementsByClassName("tab-content");
        for (let i = 0; i < tabContents.length; i++) {
          tabContents[i].classList.remove("active");
        }
        // Remove active class from tab buttons
        const tabButtons = document.getElementsByClassName("tab-button");
        for (let i = 0; i < tabButtons.length; i++) {
          tabButtons[i].classList.remove("active");
        }
        // Show selected tab content and mark button as active
        document.getElementById(tabId).classList.add("active");
        const activeButton = document.querySelector(
          `.tab-button[onclick="openTab('${tabId}')"]`
        );
        activeButton.classList.add("active");
      }
      function showPopup(title, content) {
        document.getElementById("popup-title").textContent = title;
        document.getElementById("popup-content").innerHTML = content;
        document.getElementById("popup-overlay").style.display = "block";
        document.getElementById("popup").style.display = "block";
      }
      function closePopup() {
        document.getElementById("popup-overlay").style.display = "none";
        document.getElementById("popup").style.display = "none";
      }
      // Initialize the environment when the page loads
      window.onload = function () {
        initializeEnvironment();
        logMessage('Environment initialized. Click "Start Training" to begin!');
        // Show concept popup with a delay
        setTimeout(() => {
          showPopup(
            "Welcome to PPO Simulation",
            `
            <p>This simulation demonstrates Proximal Policy Optimization (PPO), a reinforcement learning algorithm.</p>
            <p>In this grid world:</p>
            <ul>
              <li>The agent (blue circle) must learn to navigate to the goal (green square)</li>
              <li>You can place obstacles (red squares) by clicking the "Place Obstacles" button</li>
              <li>The agent receives rewards for approaching the goal and penalties for hitting obstacles</li>
              <li>PPO helps the agent learn efficiently by preventing large policy updates</li>
            </ul>
            <p>Try experimenting with different parameters to see how they affect learning!</p>
            `
          );
        }, 1000);
      };
      // Animation speed control
      let animationSpeed = "normal";
      const animationSpeeds = {
        slow: 300,
        normal: 100,
        fast: 20,
      };
      function toggleAnimationSpeed() {
        const speedBtn = document.getElementById("animation-speed-btn");
        if (animationSpeed === "slow") {
          animationSpeed = "normal";
          speedBtn.textContent = "Animation Speed: Normal";
        } else if (animationSpeed === "normal") {
          animationSpeed = "fast";
          speedBtn.textContent = "Animation Speed: Fast";
        } else {
          animationSpeed = "slow";
          speedBtn.textContent = "Animation Speed: Slow";
        }
      }
      // Training loop and per-step execution (delays follow the selected animation speed)
      async function runTrainingLoop() {
        while (isTraining && episode < maxEpisodes) {
          await runEpisode();
          updateTrainingProgress();
          // Use dynamic animation speed
          await new Promise((resolve) =>
            setTimeout(resolve, animationSpeeds[animationSpeed])
          );
        }
        if (episode >= maxEpisodes) {
          logMessage("Training complete!");
          isTraining = false;
          document.getElementById("start-btn").textContent = "Start Training";
        }
      }
      async function executeStep() {
        const stateKey = `${agentPos.x},${agentPos.y}`;
        const policy = policyNetwork[stateKey];
        // Choose action based on policy
        const action = sampleAction(policy);
        // Store old position
        const oldPos = { ...agentPos };
        // Move agent (returns false if the move was blocked by an obstacle)
        const movementSuccessful = moveAgent(action);
        // Calculate reward
        const reward = calculateReward(oldPos, movementSuccessful);
        totalReward += reward;
        updateReward(totalReward);
        // Check if episode is done
        const done = agentPos.x === goalPos.x && agentPos.y === goalPos.y;
        // Render the grid
        renderGrid();
        // Store trajectory
        const newStateKey = `${agentPos.x},${agentPos.y}`;
        trajectories.push({
          state: stateKey,
          action,
          reward,
          nextState: newStateKey,
          done,
        });
        // Use dynamic animation speed
        await new Promise((resolve) =>
          setTimeout(resolve, animationSpeeds[animationSpeed] / 2)
        );
        return done;
      }
    </script>
    <footer
      style="
        text-align: center;
        margin-top: 30px;
        padding: 15px;
        background-color: #f8f9fa;
        border-top: 1px solid #ddd;
      "
    >
      © 2025 Pejman Ebrahimi - All Rights Reserved
    </footer>
  </body>
</html>