jangwon-kim-cocel commited on
Commit
96170c3
·
verified ·
1 Parent(s): e9da7fa

Upload 11 files

Browse files
Files changed (12) hide show
  1. .gitattributes +1 -0
  2. BPD.py +124 -0
  3. README.md +132 -13
  4. SGVLB.py +27 -0
  5. gif_for_readme.gif +3 -0
  6. layer.py +56 -0
  7. logger.py +74 -0
  8. main.py +90 -0
  9. network.py +106 -0
  10. replay_memory.py +39 -0
  11. teacher_buffer/tmp +0 -0
  12. utils.py +142 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ gif_for_readme.gif filter=lfs diff=lfs merge=lfs -text
BPD.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from SGVLB import SGVLB
5
+ from network import Net, Critic
6
+
7
+
8
class BPDAgent(object):
    """Bayesian Policy Distillation (BPD) agent.

    Distills a teacher policy, given only its replay buffer, into a sparse
    Bayesian student actor (``Net``) using a TD3-style twin critic and the
    SGVLB loss (behavior cloning + KL sparsity regularization).
    """

    def __init__(
        self,
        env,
        args,
        env_info,
        thresholds,
        datasize,
        device,
        discount,
        tau,
        noise_clip,
        policy_freq,
        h,
        num_teacher_param,
    ):
        self.args = args
        self.env = env
        self.env_info = env_info

        # Sparse variational student actor and its frozen target copy.
        self.actor = Net(env_info['state_dim'], env_info['action_dim'], env_info['action_bound'],
                         args.student_hidden_dims, thresholds['ALPHA_THRESHOLD'], thresholds['THETA_THRESHOLD'],
                         device=device).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.sgvlb = SGVLB(self.actor, datasize, loss_type='l2', device=device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        # Twin critic (clipped double-Q) and its frozen target copy.
        self.critic = Critic(env_info['state_dim'], env_info['action_dim']).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.discount = discount        # reward discount factor (gamma)
        self.tau = tau                  # Polyak averaging rate for target nets
        self.noise_clip = noise_clip    # stored for interface parity; unused in train()
        self.policy_freq = policy_freq  # delayed policy update period
        self.datasize = datasize
        self.h = h                      # coefficient of the Q-maximization term

        self.total_it = 0
        self.kl_weight = 0

    def set_kl_weight(self, kl_weight):
        """Set the (annealed) KL coefficient passed to the SGVLB loss."""
        self.kl_weight = kl_weight
        return

    def test(self):
        """Evaluate the deterministic student policy.

        Returns:
            (avg_return, max_return, min_return) over ``args.num_test_epi``
            evaluation episodes.
        """
        self.actor.eval()
        with torch.no_grad():
            return_list = []
            # BUGFIX: `range(1, n)` evaluated only n-1 episodes; run exactly n.
            for _ in range(self.args.num_test_epi):
                episode_return = 0
                done = False
                state, _ = self.env.reset()
                while not done:
                    action = self.actor(state)
                    action = action.cpu().numpy()[0]
                    next_state, reward, terminated, truncated, _ = self.env.step(action)
                    done = terminated or truncated
                    episode_return += reward
                    state = next_state
                return_list.append(episode_return)

            avg_return = sum(return_list) / len(return_list)
            max_return = max(return_list)
            min_return = min(return_list)

        return avg_return, max_return, min_return

    def train(self, transition):
        """One distillation step on a sampled batch of teacher transitions.

        Args:
            transition: tuple (states, actions, rewards, next_states, dones)
                of batched tensors already on the training device.
        """
        self.actor.train()

        self.total_it += 1

        states, actions, rewards, next_states, dones = transition

        with torch.no_grad():
            # Target action from the frozen student, clamped to the valid range.
            next_actions = (
                self.actor_target(next_states)
            ).clamp(self.env_info['action_bound'][0], self.env_info['action_bound'][1])

            # Clipped double-Q bootstrapped target.
            target_Q1, target_Q2 = self.critic_target(next_states, next_actions)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = rewards + (1 - dones) * self.discount * target_Q

        current_Q1, current_Q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed (TD3-style) actor and target-network update.
        if self.total_it % self.policy_freq == 0:
            pi = self.actor(states)
            Q = self.critic.Q1(states, pi)
            lmbda = (self.h * self.datasize) / Q.abs().mean().detach()  # lambda = h*|D|/avg(|Q|)

            # Q-maximization + SGVLB (behavior cloning + annealed KL sparsity).
            actor_loss = -lmbda * Q.mean() + self.sgvlb(pi, actions, self.kl_weight)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models via Polyak averaging.
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def __del__(self):
        # Explicitly drop network references on agent destruction.
        del self.actor
        del self.actor_target
        del self.critic
        del self.critic_target
        return
124
+
README.md CHANGED
@@ -1,14 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- license: mit
3
- language:
4
- - en
5
- pipeline_tag: reinforcement-learning
6
- tags:
7
- - offline
8
- - policy
9
- - bayesian
10
- - distillation
11
- - offline-rl
12
- - rl
13
- - pruning
14
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <h1>Bayesian Policy Distillation</h1>
3
+ <h3>Towards Lightweight and Fast Neural Policy Networks</h3>
4
+
5
+ <a href="https://www.python.org/">
6
+ <img src="https://img.shields.io/badge/Python-3.7+-blue?logo=python&style=flat-square" alt="Python Badge"/>
7
+ </a>
8
+ &nbsp;&nbsp;
9
+ <a href="https://pytorch.org/">
10
+ <img src="https://img.shields.io/badge/PyTorch-1.8+-EE4C2C?logo=pytorch&style=flat-square" alt="PyTorch Badge"/>
11
+ </a>
12
+ &nbsp;&nbsp;
13
+ <a href="https://doi.org/10.1016/j.engappai.2025.113539">
14
+ <img src="https://img.shields.io/badge/EAAI%202026-Published-success?style=flat-square" alt="EAAI Badge"/>
15
+ </a>
16
+ &nbsp;&nbsp;
17
+ <a href="https://www.elsevier.com/">
18
+ <img src="https://img.shields.io/badge/Elsevier-Journal-orange?style=flat-square" alt="Elsevier Badge"/>
19
+ </a>
20
+ <br/><br/>
21
+ <img src="./gif_for_readme.gif" width="550px"/>
22
+
23
+ </div>
24
+
25
  ---
26
+
27
+ ## Engineering Applications of Artificial Intelligence (EAAI 2026)
28
+ ### PyTorch Implementation
29
+
30
+ This repository contains a PyTorch implementation of **Bayesian Policy Distillation (BPD)** from the paper:
31
+
32
+ > **Bayesian policy distillation: Towards lightweight and fast neural policy networks**
33
+ > Jangwon Kim, Yoonsu Jang, Jonghyeok Park, Yoonhee Gil, Soohee Han
34
+ > *Engineering Applications of Artificial Intelligence*, Volume 166, 2026
35
+
36
+ ## 📄 Paper Link
37
+ > **DOI:** https://doi.org/10.1016/j.engappai.2025.113539
38
+ > **Journal:** Engineering Applications of Artificial Intelligence
39
+
40
+ ---
41
+
42
+ ## Bayesian Policy Distillation
43
+
44
+ BPD achieves extreme policy compression through offline reinforcement learning by:
45
+ 1. **Bayesian Neural Networks**: Uncertainty-driven dynamic weight pruning
46
+ 2. **Sparse Variational Dropout**: Automatic sparsity induction via KL regularization
47
+ 3. **Offline RL Framework**: Value optimization + behavior cloning
48
+ ```math
49
+ \mathcal{L}_{BPD}(\theta, \alpha) = -\lambda Q_{\psi_1}(s, \pi_\omega(s)) + \frac{|\mathcal{D}|}{M}\sum_{m=1}^{M}(\pi_{\omega_m}(s_m) - a_m)^2 + \eta \cdot D_{KL}(q(\omega|\theta,\alpha) \| p(\omega))
50
+ ```
51
+
52
+ **Key Results:**
53
+ - **~98% compression** (1.5-2.5% sparsity) while maintaining performance
54
+ - **4.5× faster inference** on embedded systems
55
+ - Successfully deployed on a real inverted pendulum with **78% inference time reduction**
56
+
57
+ ---
58
+
59
+ ## Quick Start
60
+
61
+ ### Basic Training
62
+ ```bash
63
+ python main.py --env-name Hopper-v3 --level expert --random-seed 1
64
+ ```
65
+
66
+ ### Custom Configuration
67
+ ```bash
68
+ python main.py \
69
+ --env-name Walker2d-v3 \
70
+ --level medium \
71
+ --student-hidden-dims "(128, 128)" \
72
+ --alpha-threshold 2 \
73
+ --nu 4 \
74
+ --h 0.5
75
+ ```
76
+
77
+ ### Available Environments
78
+ - `Hopper-v3`, `Walker2d-v3`, `HalfCheetah-v3`, `Ant-v3`
79
+
80
+ ### Teacher Policy Levels
81
+ - `expert`: High-performance teacher policy
82
+ - `medium`: Moderate-performance teacher policy
83
+
84
+ ---
85
+
86
+ ## Key Hyperparameters
87
+
88
+ | Parameter | Default | Description |
89
+ |-----------|---------|-------------|
90
+ | `--student-hidden-dims` | (128, 128) | Student network hidden layer sizes |
91
+ | `--alpha-threshold` | 2 | Pruning threshold for log(α) (higher = less compression) |
92
+ | `--nu` | 4 | KL weight annealing speed |
93
+ | `--h` | 0.5 | Q-value loss coefficient |
94
+ | `--batch-size` | 256 | Mini-batch size |
95
+ | `--max-teaching-count` | 1000000 | Total training iterations |
96
+ | `--eval-freq` | 5000 | Evaluation frequency |
97
+
98
+ **Adjusting Compression:**
99
+ - `--alpha-threshold 3-4`: Conservative pruning
100
+ - `--alpha-threshold 2`: Balanced [default]
101
+ - `--alpha-threshold 1`: Aggressive pruning
102
+
103
+ ---
104
+
105
+ ## Results
106
+
107
+ ### MuJoCo Benchmark (Expert Teacher)
108
+
109
+ | Environment | Teacher | BPD (Ours) | Sparsity | Compression |
110
+ |------------|---------|------------|----------|-------------|
111
+ | Ant-v3 | 5364 | 5455 | 2.40% | **41.7×** |
112
+ | Walker2d-v3 | 5357 | 4817 | 1.68% | **59.5×** |
113
+ | Hopper-v3 | 3583 | 3134 | 1.35% | **74.1×** |
114
+ | HalfCheetah-v3 | 11432 | 10355 | 2.21% | **45.2×** |
115
+
116
+ ### Real Hardware (Inverted Pendulum)
117
+ - **Inference**: 1.36ms → 0.30ms (**4.5× faster**)
118
+ - **Memory**: 290.82KB → 4.43KB (**98.5% reduction**)
119
+ - **Parameters**: 72,705 → 1,108 (**65.6× compression**)
120
+ ---
121
+
122
+ ## Citation
123
+ ```bibtex
124
+ @article{kim2026bayesian,
125
+ title={Bayesian policy distillation: Towards lightweight and fast neural policy networks},
126
+ author={Kim, Jangwon and Jang, Yoonsu and Park, Jonghyeok and Gil, Yoonhee and Han, Soohee},
127
+ journal={Engineering Applications of Artificial Intelligence},
128
+ volume={166},
129
+ pages={113539},
130
+ year={2026},
131
+ publisher={Elsevier}
132
+ }
133
+ ```
SGVLB.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
class SGVLB(nn.Module):
    """Stochastic Gradient Variational Lower Bound loss.

    Combines a data-fit term (cross-entropy or L2) scaled by the dataset size
    with the summed KL regularizers of all child modules of ``net`` that
    expose a ``kl_reg()`` method (e.g. LinearSVDO layers).
    """

    def __init__(self, net, train_size, loss_type='cross_entropy', device='cuda'):
        super(SGVLB, self).__init__()
        self.train_size = train_size  # |D|: scales the data-fit term
        self.net = net                # network whose children provide kl_reg()
        self.loss_type = loss_type    # 'cross_entropy' or 'l2'/'L2'
        self.device = device

    def forward(self, input, target, kl_weight=1.0):
        """Return the SGVLB loss (shape (1,)) for predictions vs targets.

        Raises:
            NotImplementedError: for an unsupported ``loss_type``.
        """
        assert not target.requires_grad
        # Modernized from torch.FloatTensor([0.0]).to(device): same (1,)-shaped
        # accumulator, created directly on the target device.
        kl = torch.zeros(1, device=self.device)
        for module in self.net.children():
            if hasattr(module, 'kl_reg'):
                kl = kl + module.kl_reg()

        if self.loss_type == 'cross_entropy':
            loss = F.cross_entropy(input, target) * self.train_size + kl_weight * kl
        elif self.loss_type in ['l2', 'L2']:
            loss = ((input - target) ** 2).mean() * self.train_size + kl_weight * kl
        else:
            raise NotImplementedError
        return loss
gif_for_readme.gif ADDED

Git LFS Details

  • SHA256: 56905a7305067be4489c80f29ff08072ad62e78762069a4230c05c929f6f6c79
  • Pointer size: 132 Bytes
  • Size of remote file: 1.69 MB
layer.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from torch.nn import Parameter
7
+
8
+
9
class LinearSVDO(nn.Module):
    """Linear layer with Sparse Variational Dropout (SVDO).

    Learns a Gaussian posterior N(W, sigma^2) per weight; weights whose
    log dropout-rate log(alpha) exceeds ``alpha_threshold`` are pruned at
    evaluation time.
    """

    def __init__(self, in_features, out_features, alpha_threshold, theta_threshold, device):
        super(LinearSVDO, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Prune weights with log(alpha) >= alpha_threshold at eval time.
        self.alpha_threshold = alpha_threshold
        # Additional magnitude cutoff used only when counting kept weights.
        self.theta_threshold = theta_threshold
        self.device = device

        # W: posterior mean; log_sigma: log of posterior std (per weight).
        self.W = Parameter(torch.Tensor(out_features, in_features))
        self.log_sigma = Parameter(torch.Tensor(out_features, in_features))
        self.bias = Parameter(torch.Tensor(1, out_features))

        self.reset_parameters()

    def reset_parameters(self):
        """Zero bias, small-normal means, and very small initial sigma (e^-5)."""
        self.bias.data.zero_()
        self.W.data.normal_(0, 0.02)
        self.log_sigma.data.fill_(-5)

    def forward(self, x):
        # log(alpha) = log(sigma^2 / W^2); clamped to [-10, 10] for stability.
        self.log_alpha = self.log_sigma * 2.0 - 2.0 * torch.log(1e-16 + torch.abs(self.W))
        self.log_alpha = torch.clamp(self.log_alpha, -10, 10)

        if self.training:
            # Local reparameterization: sample the pre-activation directly.
            lrt_mean = F.linear(x, self.W) + self.bias
            # NOTE(review): the canonical sparse-VD formulation uses
            # sqrt(F.linear(x*x, exp(2*log_sigma)) + eps) for the std; here the
            # sqrt is applied to x and not to the linear output — confirm this
            # deviation is intentional before changing it.
            lrt_std = F.linear(torch.sqrt(x * x), torch.exp(2*self.log_sigma)+ 1e-8)
            eps = torch.randn_like(lrt_std)
            return lrt_mean + lrt_std * eps

        # Eval: deterministic pass through the pruned mean weights.
        out = self.W * (self.log_alpha < self.alpha_threshold).float()
        out = F.linear(x, out) + self.bias
        return out

    def get_pruned_weights(self):
        """Return the mean weights with pruned entries zeroed out."""
        W = self.W * (self.log_alpha < self.alpha_threshold).float()
        return W

    def get_num_remained_weights(self):
        """Count weights surviving both the alpha and magnitude thresholds."""
        num = ((self.log_alpha < self.alpha_threshold) * (torch.abs(self.W) > self.theta_threshold)).sum().item()
        return num

    def kl_reg(self):
        """KL divergence approximation of Molchanov et al. (constants k1-k3).

        Returns the (negated-sum) KL regularizer to be *added* to the loss.
        """
        k1, k2, k3 = torch.FloatTensor([0.63576]).to(self.device), torch.FloatTensor([1.8732]).to(self.device), torch.FloatTensor([1.48695]).to(self.device)
        KL = k1 * torch.sigmoid(k2 + k3 * self.log_alpha) - 0.5 * torch.log1p(torch.exp(-self.log_alpha))
        KL = - torch.sum(KL)
        return KL
56
+
logger.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import random
4
+ import numpy as np
5
+
6
+ from collections import OrderedDict
7
+ from tabulate import tabulate
8
+ from pandas import DataFrame
9
+ from time import gmtime, strftime
10
+
11
+
12
class Logger:
    """Lightweight experiment logger.

    Accumulates scalar metric series, pretty-prints the latest values as a
    table (via tabulate), mirrors everything printed to a ``.out`` file, and
    can dump all series joined on the step index to a ``.csv``.
    """

    def __init__(self, env_info, fmt=None):
        # True until the first table print so the header row is emitted once.
        self.handler = True
        self.scalar_metrics = OrderedDict()  # key -> list of (t, value)
        self.fmt = fmt if fmt else dict()    # optional per-key float formats

        base = './logs'
        if not os.path.exists(base):
            os.mkdir(base)
        self.path = '%s/%s-%s' % (base, env_info['name'], env_info['seed'])

        self.logs = self.path + '.csv'
        self.output = self.path + '.out'
        self.checkpoint = self.path + '.cpt'

        def prin(*args):
            # Print to stdout AND append the same line to the .out file.
            str_to_write = ' '.join(map(str, args))
            with open(self.output, 'a') as f:
                f.write(str_to_write + '\n')
                f.flush()

            print(str_to_write)
            sys.stdout.flush()

        self.print = prin

    def add_scalar(self, t, key, value):
        """Append (t, value) to the series stored under ``key``."""
        if key not in self.scalar_metrics:
            self.scalar_metrics[key] = []
        self.scalar_metrics[key] += [(t, value)]

    def add_dict(self, t, d):
        """Record every key/value pair of ``d`` at step ``t``."""
        # BUGFIX: dict.iteritems() is Python 2 only and raises AttributeError
        # on Python 3; use items().
        for key, value in d.items():
            self.add_scalar(t, key, value)

    def add(self, t, **args):
        """Record keyword-argument metrics at step ``t``."""
        for key, value in args.items():
            self.add_scalar(t, key, value)

    def iter_info(self, order=None):
        """Print the latest value of each metric as one table row.

        The header is printed only on the first call; ``order`` optionally
        restricts/reorders the columns.
        """
        names = list(self.scalar_metrics.keys())
        if order:
            names = order
        values = [self.scalar_metrics[name][-1][1] for name in names]
        t = int(np.max([self.scalar_metrics[name][-1][0] for name in names]))
        fmt = ['%s'] + [self.fmt[name] if name in self.fmt else '.1f' for name in names]

        if self.handler:
            self.handler = False
            self.print(tabulate([[t] + values], ['epoch'] + names, floatfmt=fmt))
        else:
            self.print(tabulate([[t] + values], ['epoch'] + names, tablefmt='plain', floatfmt=fmt).split('\n')[1])

    def save(self, silent=False):
        """Outer-join all metric series on ``t`` and write them to the .csv."""
        result = None
        for key in self.scalar_metrics.keys():
            if result is None:
                result = DataFrame(self.scalar_metrics[key], columns=['t', key]).set_index('t')
            else:
                df = DataFrame(self.scalar_metrics[key], columns=['t', key]).set_index('t')
                result = result.join(df, how='outer')
        result.to_csv(self.logs)
        if not silent:
            self.print('The log/output/model have been saved to: ' + self.path + ' + .csv/.out/.cpt')
main.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from BPD import BPDAgent
2
+ from utils import set_seed, get_learning_info, get_compression_ratio, load_buffer
3
+ import pickle
4
+ import inspect
5
+ import os
6
+ import argparse
7
+ import gdown
8
+ import time
9
+ import torch
10
+
11
+
12
if __name__ == "__main__":
    import ast

    def _int_tuple(text):
        """Parse CLI values like "(128, 128)" or "128, 128" into an int tuple.

        BUGFIX: the original used argparse's ``type=tuple``, which iterates
        the raw string character by character, making the README-documented
        override (e.g. --student-hidden-dims "(128, 128)") unusable.
        """
        if isinstance(text, tuple):
            return text
        parsed = ast.literal_eval(text)
        if isinstance(parsed, int):
            parsed = (parsed,)
        return tuple(int(v) for v in parsed)

    parser = argparse.ArgumentParser()
    # Experiment
    parser.add_argument("--env-name", default="Hopper-v3")  # OpenAI gym environment name
    parser.add_argument("--level", default="expert")        # expert or medium
    parser.add_argument("--random-seed", default=1, type=int)
    parser.add_argument("--eval-freq", default=5000, type=int)
    parser.add_argument("--max-teaching-count", default=1000000, type=int)
    parser.add_argument('--num-test-epi', default=10, type=int)
    parser.add_argument("--teacher-hidden-dims", default=(400, 300), type=_int_tuple)
    parser.add_argument("--student-hidden-dims", default=(128, 128), type=_int_tuple)

    parser.add_argument("--batch-size", default=256, type=int)  # Batch size for both actor and critic
    parser.add_argument("--discount", default=0.99)             # Discount factor
    parser.add_argument("--tau", default=0.005)                 # Target network update rate
    parser.add_argument("--noise-clip", default=0.5)            # Range to clip target policy noise
    parser.add_argument("--policy-freq", default=2, type=int)   # Frequency of delayed policy updates

    parser.add_argument("--h", default=0.5, type=float)
    parser.add_argument("--nu", default=4, type=float)
    parser.add_argument("--theta-threshold", default=0, type=float)
    parser.add_argument("--alpha-threshold", default=2, type=float)
    parser.add_argument("--init-kl-weight", default=0, type=float)
    parser.add_argument("--kl-max-coef", default=2, type=int)
    parser.add_argument("--datasize", default=1000000, type=int)

    args = parser.parse_args()

    # MuJoCo Environment Variable & Device Setting
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # STEP 1: Make Instances & Variables
    max_avg_return = 0
    seed = set_seed(args.random_seed)
    args.random_seed = seed
    learning_info = get_learning_info(args, seed)

    agent = BPDAgent(**learning_info)
    kl_weight = args.init_kl_weight

    # STEP 2: Load Dataset (=teacher buffer)
    buffer = load_buffer(args.env_name, args.level, args.datasize)

    # STEP 3: Training
    print(f"Distilling Start! | env_name: {args.env_name} | level: {args.level} | seed: {seed}")
    time_start = time.time()
    return_list = []
    for teaching_cnt in range(1, args.max_teaching_count + 1):
        # Linearly anneal the KL weight (slope nu) up to kl_max_coef.
        kl_weight = (args.nu / args.max_teaching_count) * teaching_cnt
        kl_weight = min(kl_weight, args.kl_max_coef)
        agent.set_kl_weight(kl_weight)
        transitions = buffer.sample(batch_size=args.batch_size)
        agent.train(transitions)

        if teaching_cnt % args.eval_freq == 0:
            avg_student_return, max_student_return, min_student_return = agent.test()
            return_list.append(avg_student_return)
            print(f"[INFO] Teaching Count: [{teaching_cnt}/{args.max_teaching_count}] | Average Student Return:"
                  f" {avg_student_return:.3f} | Max Student Return: {max_student_return:.3f} | Min Student Return:"
                  f" {min_student_return:.3f}", end='')

            # Report per-layer sparsity of the pruned student actor.
            for i, c in enumerate(agent.actor.children()):
                if hasattr(c, 'kl_reg'):
                    # BUGFIX: only query pruned weights on SVDO layers; the
                    # original computed them before the hasattr guard.
                    zero_frac = (torch.abs(c.get_pruned_weights()) == 0).float().data.cpu().numpy().mean()
                    print(f" | sp_{i}: {1-zero_frac:.3f}", end='')
            print()

    # Average the last (up to) 10 evaluations; the original fixed "last 10"
    # indexing crashed when fewer than 10 evaluations had run.
    last_returns = return_list[-10:]
    return_avg = sum(last_returns) / len(last_returns) if last_returns else float('nan')

    time_end = time.time()
    print(f"\nDistilling Finish! | Seed: {seed} | Consumed Time (sec): {time_end - time_start}")
    print("Average Return of the Last 10 Episode: {}".format(return_avg))
    cr = get_compression_ratio(learning_info["num_teacher_param"], agent)
    print('Compression ratio (kep_w/all_w)=', cr)
    print("-----------------------------------------------------------\n")
network.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from layer import LinearSVDO
5
+
6
+
7
+ # Define a simple 2 layer Network
8
class Net(nn.Module):
    """Sparse variational student policy: three LinearSVDO layers with a tanh
    head rescaled to the environment's action bounds."""

    def __init__(self, state_dim, action_dim, action_bound, hidden_dims, alpha_threshold, theta_threshold, device):
        super(Net, self).__init__()
        self.fc1 = LinearSVDO(state_dim, hidden_dims[0], alpha_threshold, theta_threshold, device)
        self.fc2 = LinearSVDO(hidden_dims[0], hidden_dims[1], alpha_threshold, theta_threshold, device)
        self.fc3 = LinearSVDO(hidden_dims[1], action_dim, alpha_threshold, theta_threshold, device)
        # Affine map from tanh's [-1, 1] onto [action_bound[0], action_bound[1]].
        self.action_rescale = torch.as_tensor((action_bound[1] - action_bound[0]) / 2., dtype=torch.float32)
        self.action_rescale_bias = torch.as_tensor((action_bound[1] + action_bound[0]) / 2., dtype=torch.float32)
        self.device = device
        self.alpha_threshold = alpha_threshold

    def _format(self, state):
        """Promote a raw (non-tensor) state to a batched float32 tensor on self.device."""
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, device=self.device, dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, x):
        x = self._format(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # MODERNIZED: F.tanh is deprecated in current PyTorch; torch.tanh is identical.
        x = torch.tanh(self.fc3(x))
        x = x * self.action_rescale + self.action_rescale_bias
        return x
33
+
34
+
35
class Actor(nn.Module):
    """Dense (teacher-style) deterministic actor: two hidden layers + tanh head.

    The ``device`` parameter generalizes the previously hard-coded 'cuda'
    (default unchanged, so existing callers keep working on GPU machines).
    """

    def __init__(self, state_dim, action_dim, student_hidden_dims, max_action, device='cuda'):
        super(Actor, self).__init__()
        self.l1 = nn.Linear(state_dim, student_hidden_dims[0])
        self.l2 = nn.Linear(student_hidden_dims[0], student_hidden_dims[1])
        self.l3 = nn.Linear(student_hidden_dims[1], action_dim)
        self.device = device  # was hard-coded 'cuda'; now configurable

        self.max_action = max_action  # scales the tanh output

    def _format(self, state):
        """Promote a raw (non-tensor) state to a batched float32 tensor on self.device."""
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, device=self.device, dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, state):
        x = self._format(state)
        a = F.relu(self.l1(x))
        a = F.relu(self.l2(a))
        return self.max_action * torch.tanh(self.l3(a))
57
+
58
+
59
class Critic(nn.Module):
    """Twin Q-network (TD3-style): two independent 256-256 Q heads.

    The ``device`` parameter generalizes the previously hard-coded 'cuda'
    (default unchanged, so existing callers keep working on GPU machines).
    """

    def __init__(self, state_dim, action_dim, device='cuda'):
        super(Critic, self).__init__()

        self.device = device  # was hard-coded 'cuda'; now configurable

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, 256)
        self.l5 = nn.Linear(256, 256)
        self.l6 = nn.Linear(256, 1)

    def _format(self, state, action):
        """Promote raw (non-tensor) state/action to batched float32 tensors."""
        x, u = state, action
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, device=self.device, dtype=torch.float32)
            x = x.unsqueeze(0)

        if not isinstance(u, torch.Tensor):
            u = torch.tensor(u, device=self.device, dtype=torch.float32)
            u = u.unsqueeze(0)

        return x, u

    def forward(self, state, action):
        """Return (Q1, Q2) for the given state-action batch."""
        x, u = self._format(state, action)
        sa = torch.cat([x, u], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)

        q2 = F.relu(self.l4(sa))
        q2 = F.relu(self.l5(q2))
        q2 = self.l6(q2)
        return q1, q2

    def Q1(self, state, action):
        """Return only the first Q head (expects tensor inputs; no _format)."""
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)
        return q1
replay_memory.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import copy
4
+
5
+
6
class ReplayMemory:
    """Fixed-capacity numpy-backed transition buffer with random sampling."""

    def __init__(self, state_dim, action_dim, device='cuda', capacity=5e6):
        self.capacity = int(capacity)
        self.size = 0      # number of valid transitions stored
        self.position = 0  # next write index
        # BUGFIX: the device argument was accepted but never stored; sample()
        # hard-coded 'cuda', which broke CPU-only machines. Default unchanged.
        self.device = device

        self.state_buffer = np.empty(shape=(self.capacity, state_dim), dtype=np.float32)
        self.action_buffer = np.empty(shape=(self.capacity, action_dim), dtype=np.float32)
        self.reward_buffer = np.empty(shape=(self.capacity, 1), dtype=np.float32)
        self.next_state_buffer = np.empty(shape=(self.capacity, state_dim), dtype=np.float32)
        self.done_buffer = np.empty(shape=(self.capacity, 1), dtype=np.float32)

    def normalize_states(self, eps=1e-3):
        """Standardize state and next-state buffers in place (mean 0, std 1).

        Statistics are computed in float64 for accuracy, then cast back.
        """
        mean = np.mean(copy.deepcopy(self.state_buffer).astype('float64'), axis=0)
        std = np.std(copy.deepcopy(self.state_buffer).astype('float64'), axis=0) + eps
        self.state_buffer = (self.state_buffer.astype('float64') - mean) / std
        self.next_state_buffer = (self.next_state_buffer.astype('float64') - mean) / std

        self.state_buffer = self.state_buffer.astype('float32')
        self.next_state_buffer = self.next_state_buffer.astype('float32')
        return

    def sample(self, batch_size):
        """Uniformly sample a batch; returns float32 tensors on self.device."""
        idxs = np.random.randint(0, self.size, size=batch_size)

        states = torch.FloatTensor(self.state_buffer[idxs]).to(self.device)
        actions = torch.FloatTensor(self.action_buffer[idxs]).to(self.device)
        rewards = torch.FloatTensor(self.reward_buffer[idxs]).to(self.device)
        next_states = torch.FloatTensor(self.next_state_buffer[idxs]).to(self.device)
        dones = torch.FloatTensor(self.done_buffer[idxs]).to(self.device)

        return states, actions, rewards, next_states, dones
38
+
39
+
teacher_buffer/tmp ADDED
File without changes
utils.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import random
3
+ import torch
4
+ import torch.nn as nn
5
+ import os
6
+ import inspect
7
+ import pickle
8
+ import gdown
9
+ from network import Actor
10
+
11
+
12
def weight_init(m):
    """Orthogonal initialization for Linear and (transposed) Conv2d modules.

    Reference: https://github.com/MishaLaskin/rad/blob/master/curl_sac.py
    """
    if isinstance(m, nn.Linear):
        nn.init.orthogonal_(m.weight.data)
        m.bias.data.fill_(0.0)
    elif isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
        # delta-orthogonal init from https://arxiv.org/pdf/1806.05393.pdf:
        # everything zero except an orthogonal center tap of the kernel.
        assert m.weight.size(2) == m.weight.size(3)
        m.weight.data.fill_(0.0)
        m.bias.data.fill_(0.0)
        center = m.weight.size(2) // 2
        nn.init.orthogonal_(m.weight.data[:, :, center, center],
                            nn.init.calculate_gain('relu'))
27
+
28
+
29
def set_seed(random_seed):
    """Seed torch, numpy and random; return the seed actually used.

    A non-positive seed requests a random one drawn from [1, 9999).
    (Removed the original's no-op ``else: random_seed = random_seed`` branch.)
    """
    if random_seed <= 0:
        random_seed = np.random.randint(1, 9999)

    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    random.seed(random_seed)

    return random_seed
40
+
41
+
42
def make_env(env_name, seed):
    """Build a gymnasium environment with a seeded action space.

    Returns the env plus a dict of its basic specs (dims, bounds, seed).
    """
    import gymnasium as gym
    # openai gym
    env = gym.make(env_name)
    env.action_space.seed(seed)

    env_info = {
        'name': env_name,
        'state_dim': env.observation_space.shape[0],
        'action_dim': env.action_space.shape[0],
        'action_bound': [env.action_space.low[0], env.action_space.high[0]],
        'seed': seed,
    }

    return env, env_info
54
+
55
+
56
def get_learning_info(args, seed):
    """Build the environment and assemble the BPDAgent constructor kwargs."""
    env, env_info = make_env(args.env_name, seed)
    # BUGFIX: was hard-coded 'cuda' while main.py selects cuda/cpu; keep the
    # two consistent so CPU-only machines work.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # The original per-env dict mapped every supported env to the same
    # args.alpha_threshold and raised KeyError for any other env name; using
    # the argument directly is identical for supported envs and generalizes.
    thresholds = {"ALPHA_THRESHOLD": args.alpha_threshold, "THETA_THRESHOLD": args.theta_threshold}
    max_action = 1

    # Teacher is a (400, 300) TD3-style actor; its parameter count is used
    # later to report the compression ratio.
    t_p = Actor(env_info['state_dim'], env_info['action_dim'], (400, 300), 1)
    num_teacher_param = sum(p2.numel() for p2 in t_p.parameters())

    kwargs = {
        "env": env,
        "args": args,
        "env_info": env_info,
        "thresholds": thresholds,
        "discount": args.discount,
        "datasize": args.datasize,
        "tau": args.tau,
        "device": device,
        "num_teacher_param": num_teacher_param,
        "noise_clip": args.noise_clip * max_action,
        "policy_freq": args.policy_freq,
        "h": args.h,
    }
    return kwargs
84
+
85
+
86
def get_compression_ratio(num_teacher_param, agent):
    """Ratio of student-actor weights surviving pruning to teacher parameters."""
    kept = sum(layer.get_num_remained_weights() for layer in agent.actor.children())
    return kept / num_teacher_param
93
+
94
+
95
def load_buffer(env_name, level, datasize):
    """Load the teacher replay buffer, downloading it from Google Drive first
    if it is not cached locally.

    Args:
        env_name: one of the supported MuJoCo env names.
        level: 'expert' or 'medium' teacher quality.
        datasize: number of transitions to expose via buffer.size.

    Raises:
        ValueError: unknown level or env name.

    Refactored from two near-identical copy-pasted branches into a lookup
    table; also removes the redundant second pickle load that the original
    performed even when the file already existed.
    """
    # SECURITY NOTE: pickle.load on a downloaded file executes arbitrary code
    # if the file is tampered with — kept for compatibility with the published
    # buffers, but only load buffers from this trusted source.
    file_ids = {
        'expert': {
            "Ant-v3": "10VBf3bM38bNw9WsniQvirpNjRFWp8HZO",
            "Walker2d-v3": "1ungLoqNKS4NIldZ9H2mswwGh-3Ipgy0D",
            "HalfCheetah-v3": "1wO0HwDi1GNf9d9SrDJrf9x8XMZDOTkzl",
            "Hopper-v3": "10pqCliJSM_Iyb05dxHZfYs9VlmCmPryE",
        },
        'medium': {
            "Ant-v3": "1-SKleNu6l-tY2awkx3tgVDUKbjkOaj_D",
            "Walker2d-v3": "1x6nkBBSWMRb3bENxUzcntHT1WlSNJmoh",
            "HalfCheetah-v3": "1OHkB6yVK3QcqbuJH0B_iNW_2cBnv96mR",
            "Hopper-v3": "1uqH2pgKKrhadsCXCwQWrvDvZ4ZyYFkM-",
        },
    }

    current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    file_path = os.path.join(current_dir, "teacher_buffer", "[" + level + "_buffer]_" + env_name + ".pickle")

    if not os.path.exists(file_path):
        if level not in file_ids:
            raise ValueError("Invalid Level. Choose from ['expert', 'medium']")
        if env_name not in file_ids[level]:
            raise ValueError("Invalid Environment Name")

        # Download the file (messages preserved from the original expert path).
        if level == 'expert':
            print("Downloading the teacher buffer...")
        url = f"https://drive.google.com/uc?id={file_ids[level][env_name]}"
        gdown.download(url, file_path, quiet=False)
        if level == 'expert':
            print("Download Complete!")

    with open(file_path, "rb") as fr:
        buffer = pickle.load(fr)
    # Cap sampling to the first `datasize` transitions of the pickled buffer.
    buffer.size = datasize

    return buffer