File size: 4,213 Bytes
96170c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from BPD import BPDAgent
from utils import set_seed, get_learning_info, get_compression_ratio, load_buffer
import pickle
import inspect
import os
import argparse
import gdown
import time
import torch


def parse_hidden_dims(text):
    """Convert a command-line string such as ``"400,300"`` into a tuple of ints.

    Used as the argparse ``type=`` callable for the hidden-dimension options.
    The previous ``type=tuple`` split the raw string into single characters.
    Commas and/or whitespace separate the sizes; one pair of surrounding
    parentheses or brackets is tolerated, so ``"(400, 300)"`` also works.

    Args:
        text: Raw option value received from the command line.

    Returns:
        Tuple of positive integers, one per layer size given.

    Raises:
        argparse.ArgumentTypeError: If no size is given, a token is not an
            integer, or a size is not positive.
    """
    tokens = text.strip().strip("()[]").replace(",", " ").split()
    if not tokens:
        raise argparse.ArgumentTypeError(
            f"expected comma-separated layer sizes, got {text!r}")
    try:
        dims = tuple(int(token) for token in tokens)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"layer sizes must be integers, got {text!r}") from None
    if any(dim <= 0 for dim in dims):
        raise argparse.ArgumentTypeError(
            f"layer sizes must be positive, got {text!r}")
    return dims


def build_arg_parser():
    """Create the command-line parser for the distillation script.

    Option names and defaults match the original script. The numeric options
    ``--discount``, ``--tau`` and ``--noise-clip`` now declare ``type=float``
    so values given on the command line are not left as strings.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser()
    # Experiment
    parser.add_argument("--env-name", default="Hopper-v3")      # OpenAI gym environment name
    parser.add_argument("--level", default="expert")            # expert or medium
    parser.add_argument("--random-seed", default=1, type=int)
    parser.add_argument("--eval-freq", default=5000, type=int)
    parser.add_argument("--max-teaching-count", default=1000000, type=int)
    parser.add_argument('--num-test-epi', default=10, type=int)
    parser.add_argument("--teacher-hidden-dims", default=(400, 300), type=parse_hidden_dims)
    parser.add_argument("--student-hidden-dims", default=(128, 128), type=parse_hidden_dims)

    parser.add_argument("--batch-size", default=256, type=int)      # Batch size for both actor and critic
    parser.add_argument("--discount", default=0.99, type=float)     # Discount factor
    parser.add_argument("--tau", default=0.005, type=float)         # Target network update rate
    parser.add_argument("--noise-clip", default=0.5, type=float)    # Range to clip target policy noise
    parser.add_argument("--policy-freq", default=2, type=int)       # Frequency of delayed policy updates

    parser.add_argument("--h", default=0.5, type=float)
    parser.add_argument("--nu", default=4, type=float)
    parser.add_argument("--theta-threshold", default=0, type=float)
    parser.add_argument("--alpha-threshold", default=2, type=float)
    # NOTE(review): --init-kl-weight is parsed but this script never applies it;
    # the KL weight is recomputed from --nu on every step. It stays in ``args``
    # in case get_learning_info reads it -- confirm.
    parser.add_argument("--init-kl-weight", default=0, type=float)
    parser.add_argument("--kl-max-coef", default=2, type=int)
    parser.add_argument("--datasize", default=1000000, type=int)
    return parser


def mean_of_last(values, count):
    """Average the last ``count`` entries of ``values`` (fewer if not available).

    Args:
        values: Sequence of numbers, oldest first.
        count: Maximum number of trailing entries to average.

    Returns:
        ``(used, mean)`` where ``used`` is how many entries were averaged.
        ``mean`` is ``None`` when there was nothing to average.
    """
    used = min(count, len(values))
    if used <= 0:
        return 0, None
    total = 0
    # Accumulate newest-to-oldest with plain additions, exactly like the
    # original loop, so the floating-point result is unchanged.
    for offset in range(used):
        total += values[-1 - offset]
    return used, total / used


def layer_density_report(actor):
    """Build the per-layer ``sp_<i>`` suffix printed after each evaluation.

    For every child of ``actor`` that has a ``kl_reg`` attribute, the reported
    value is one minus the fraction of exactly-zero entries returned by the
    child's ``get_pruned_weights()``. Children without ``kl_reg`` are skipped
    and their weights are not queried.

    Args:
        actor: Object exposing ``children()`` (presumably a ``torch.nn.Module``).

    Returns:
        The concatenated report string; empty if no child has ``kl_reg``.
    """
    pieces = []
    for index, child in enumerate(actor.children()):
        if not hasattr(child, 'kl_reg'):
            continue
        zero_fraction = (torch.abs(child.get_pruned_weights()) == 0).float().data.cpu().numpy().mean()
        pieces.append(f"  |  sp_{index}: {1-zero_fraction:.3f}")
    return "".join(pieces)


def main():
    """Run the distillation experiment configured by the command line."""
    args = build_arg_parser().parse_args()

    # STEP 1: Make Instances & Variables
    seed = set_seed(args.random_seed)
    args.random_seed = seed  # keep args consistent with the seed set_seed returned
    learning_info = get_learning_info(args, seed)

    agent = BPDAgent(**learning_info)

    # STEP 2: Load Dataset (=teacher buffer)
    buffer = load_buffer(args.env_name, args.level, args.datasize)

    # STEP 3: Training
    print(f"Distilling Start! | env_name: {args.env_name} | level: {args.level} | seed: {seed}")
    time_start = time.time()
    return_list = []
    for teaching_cnt in range(1, args.max_teaching_count + 1):
        # KL weight grows linearly with the step count and is capped at --kl-max-coef.
        kl_weight = (args.nu / args.max_teaching_count) * teaching_cnt
        kl_weight = min(kl_weight, args.kl_max_coef)
        agent.set_kl_weight(kl_weight)
        transitions = buffer.sample(batch_size=args.batch_size)
        agent.train(transitions)

        if teaching_cnt % args.eval_freq == 0:
            avg_student_return, max_student_return, min_student_return = agent.test()
            return_list.append(avg_student_return)
            print(f"[INFO] Teaching Count: [{teaching_cnt}/{args.max_teaching_count}]  |  Average Student Return:"
                  f" {avg_student_return:.3f}  |  Max Student Return: {max_student_return:.3f}  |  Min Student Return:"
                  f" {min_student_return:.3f}", end='')
            print(layer_density_report(agent.actor))

    # Average the most recent evaluations; tolerate runs with fewer than 10.
    num_recent, return_avg = mean_of_last(return_list, 10)

    time_end = time.time()
    print(f"\nDistilling Finish!  |  Seed: {seed}  |  Consumed Time (sec): {time_end - time_start}")
    if return_avg is None:
        print("No evaluation was performed; average return unavailable.")
    else:
        print(f"Average Return of the Last {num_recent} Episode: {return_avg}")
    cr = get_compression_ratio(learning_info["num_teacher_param"], agent)
    print('Compression ratio (kep_w/all_w)=', cr)
    print("-----------------------------------------------------------\n")


if __name__ == "__main__":
    main()