New Author Name committed on
Commit
4b714e2
·
1 Parent(s): 8264cee
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .editorconfig +16 -0
  2. .gitignore +18 -0
  3. .pre-commit-config.yaml +29 -0
  4. .streamlit/config.toml +34 -0
  5. README.md +8 -7
  6. app.py +606 -0
  7. apple/envs/discrete_apple.py +384 -0
  8. apple/envs/img/apple.png +0 -0
  9. apple/envs/img/elf_down.png +0 -0
  10. apple/envs/img/elf_left.png +0 -0
  11. apple/envs/img/elf_right.png +0 -0
  12. apple/envs/img/g1.png +0 -0
  13. apple/envs/img/g2.png +0 -0
  14. apple/envs/img/g3.png +0 -0
  15. apple/envs/img/grass.jpg +0 -0
  16. apple/envs/img/home.png +0 -0
  17. apple/envs/img/home00.png +0 -0
  18. apple/envs/img/home01.png +0 -0
  19. apple/envs/img/home02.png +0 -0
  20. apple/envs/img/home10.png +0 -0
  21. apple/envs/img/home11.png +0 -0
  22. apple/envs/img/home12.png +0 -0
  23. apple/envs/img/home2.png +0 -0
  24. apple/envs/img/home2_with_apples.png +0 -0
  25. apple/envs/img/home_grass.png +0 -0
  26. apple/envs/img/part_grass.png +0 -0
  27. apple/envs/img/stool.png +0 -0
  28. apple/envs/img/textures.jpg +0 -0
  29. apple/envs/img/white.png +0 -0
  30. apple/evaluation/render_episode.py +22 -0
  31. apple/logger.py +384 -0
  32. apple/models/categorical_policy.py +46 -0
  33. apple/training/reinforce_trainer.py +74 -0
  34. apple/training/trainer.py +77 -0
  35. apple/utils.py +25 -0
  36. apple/wrappers.py +35 -0
  37. assets/apple_env.png +0 -0
  38. assets/example_rollout.mp4 +0 -0
  39. assets/generate_example_rollout.py +30 -0
  40. input_args.py +17 -0
  41. mrunner_exps/behavioral_cloning.py +51 -0
  42. mrunner_exps/reinforce.py +53 -0
  43. mrunner_exps/utils.py +10 -0
  44. mrunner_run.py +18 -0
  45. mrunner_runs/local.sh +3 -0
  46. mrunner_runs/remote.sh +9 -0
  47. pyproject.toml +92 -0
  48. requirements.txt +7 -0
  49. run.py +87 -0
  50. setup.cfg +10 -0
.editorconfig ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ root = true
2
+
3
+ [*]
4
+ charset = utf-8
5
+ end_of_line = lf
6
+ insert_final_newline = true
7
+ trim_trailing_whitespace = true
8
+
9
+ [*]
10
+ indent_size = 4
11
+ indent_style = space
12
+ max_line_length = 120
13
+ tab_width = 8
14
+
15
+ [*.{yml,yaml}]
16
+ indent_size = 2
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /.venv/
2
+ /.python-version
3
+
4
+ /build/
5
+ /dist/
6
+ /site/
7
+ /test-results.xml
8
+ /.coverage
9
+ /coverage.xml
10
+
11
+ /.hypothesis/
12
+ __pycache__/
13
+ *.egg-info/
14
+
15
+ /.vscode/
16
+ wandb
17
+ logs
18
+ *.pt
.pre-commit-config.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ - id: autoflake
5
+ name: autoflake
6
+ entry: autoflake
7
+ args: [--in-place, --remove-all-unused-imports, --remove-unused-variables]
8
+ language: system
9
+ types_or: [python, pyi]
10
+
11
+ - id: isort
12
+ name: isort
13
+ entry: isort
14
+ args: [--quiet]
15
+ language: system
16
+ types_or: [python, pyi]
17
+
18
+ - id: black
19
+ name: black
20
+ entry: black
21
+ args: [--quiet]
22
+ language: system
23
+ types_or: [python, pyi]
24
+
25
+ - id: flake8
26
+ name: flake8
27
+ entry: pflake8
28
+ language: system
29
+ types_or: [python, pyi]
.streamlit/config.toml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [theme]
2
+ #theme primary
3
+ base="dark"
4
+ # Primary accent color for interactive elements.
5
+ primaryColor="f63366"
6
+
7
+ # Background color for the main content area.
8
+ #backgroundColor =
9
+
10
+ # Background color used for the sidebar and most interactive widgets.
11
+ #secondaryBackgroundColor ='grey'
12
+
13
+ # Color used for almost all text.
14
+ #textColor ='blue'
15
+
16
+ # Font family for all text in the app, except code blocks. One of "sans serif", "serif", or "monospace".
17
+ # Default: "sans serif"
18
+ font = "sans serif"
19
+
20
+ # [logger]
21
+ # level='info'
22
+ # messageFormat = "%(message)s"
23
+ #messageFormat="%(asctime)s %(message)s"
24
+
25
+ [global]
26
+
27
+ # By default, Streamlit checks if the Python watchdog module is available and, if not, prints a warning asking for you to install it. The watchdog module is not required, but highly recommended. It improves Streamlit's ability to detect changes to files in your filesystem.
28
+ # If you'd like to turn off this warning, set this to True.
29
+ # Default: false
30
+ disableWatchdogWarning = false
31
+
32
+ # If True, will show a warning when you run a Streamlit-enabled script via "python my_script.py".
33
+ # Default: true
34
+ showWarningOnDirectExecution = false
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: Apple
3
- emoji: 📚
4
- colorFrom: red
5
- colorTo: yellow
6
  sdk: streamlit
7
- sdk_version: 1.17.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Apple Retrieval
3
+ emoji: 🍎
4
+ colorFrom: yellow
5
+ colorTo: red
6
  sdk: streamlit
7
+ sdk_version: 1.15.2
8
  app_file: app.py
9
+ pinned: true
10
+ fullWidth: true
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+ from functools import partial
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import streamlit as st
8
+ import torch
9
+
10
+ from apple.envs.discrete_apple import get_apple_env
11
+ from apple.evaluation.render_episode import render_episode
12
+ from apple.logger import EpochLogger
13
+ from apple.models.categorical_policy import CategoricalPolicy
14
+ from apple.training.trainer import Trainer
15
+
16
+ QUEUE_SIZE = 1000
17
+
18
+
19
+ def init_training(
20
+ c: float = 1.0,
21
+ start_x: float = 0.0,
22
+ goal_x: float = 50.0,
23
+ time_limit: int = 200,
24
+ lr: float = 1e-3,
25
+ weight1: float = 0.0,
26
+ weight2: float = 0.0,
27
+ pretrain: str = "phase1",
28
+ finetune: str = "full",
29
+ bias_in_state: bool = True,
30
+ position_in_state: bool = False,
31
+ apple_in_state: bool = True,
32
+ ):
33
+ st.session_state.logger = EpochLogger(verbose=False)
34
+
35
+ env_kwargs = dict(
36
+ start_x=start_x,
37
+ goal_x=goal_x,
38
+ c=c,
39
+ time_limit=time_limit,
40
+ bias_in_state=bias_in_state,
41
+ position_in_state=position_in_state,
42
+ apple_in_state=apple_in_state,
43
+ )
44
+
45
+ st.session_state.env_full = get_apple_env("full", render_mode="rgb_array", **env_kwargs)
46
+ st.session_state.env_phase1 = get_apple_env(pretrain, **env_kwargs)
47
+ st.session_state.env_phase2 = get_apple_env(finetune, **env_kwargs)
48
+ st.session_state.test_envs = [get_apple_env(task, **env_kwargs) for task in ["full", "phase1", "phase2"]]
49
+
50
+ st.session_state.model = CategoricalPolicy(
51
+ st.session_state.env_phase1.observation_space.shape[0], 1, weight1, weight2
52
+ )
53
+ st.session_state.optim = torch.optim.SGD(st.session_state.model.parameters(), lr=lr)
54
+ st.session_state.trainer = Trainer(st.session_state.model, st.session_state.optim, st.session_state.logger)
55
+ st.session_state.train_it = 0
56
+ st.session_state.draw_it = 0
57
+ st.session_state.total_steps = 0
58
+ st.session_state.data = []
59
+
60
+ st.session_state.obs1 = st.session_state.env_phase1.reset()
61
+ st.session_state.obs2 = st.session_state.env_phase2.reset()
62
+
63
+
64
+ def init_reset():
65
+ st.session_state.rollout_iterator = iter(
66
+ partial(render_episode, st.session_state.env_full, st.session_state.model)()
67
+ )
68
+ st.session_state.last_image = dict(
69
+ x=0,
70
+ obs=st.session_state.env_full.reset(),
71
+ action=None,
72
+ reward=0,
73
+ done=False,
74
+ episode_len=0,
75
+ episode_return=0,
76
+ pixel_array=st.session_state.env_full.unwrapped.render(),
77
+ )
78
+
79
+
80
+ def select_preset():
81
+ if st.session_state.pick_preset == 0:
82
+ preset_finetuning_interference()
83
+ elif st.session_state.pick_preset == 1:
84
+ preset_train_full_from_scratch()
85
+ elif st.session_state.pick_preset == 2:
86
+ preset_task_interference()
87
+ elif st.session_state.pick_preset == 3:
88
+ preset_without_task_interference()
89
+
90
+
91
+ def preset_task_interference():
92
+ st.session_state.pick_c = 0.5
93
+ st.session_state.pick_goal_x = 20
94
+ st.session_state.pick_time_limit = 50
95
+ st.session_state.pick_lr = 0.05
96
+ st.session_state.pick_phase1task = "phase2"
97
+ st.session_state.pick_phase2task = "phase1"
98
+ st.session_state.pick_weight1 = 0.0
99
+ st.session_state.pick_weight2 = 0.0
100
+ st.session_state.pick_phase1steps = 500
101
+ st.session_state.pick_phase2steps = 500
102
+ need_reset()
103
+
104
+
105
+ def preset_finetuning_interference():
106
+ st.session_state.pick_c = 0.5
107
+ st.session_state.pick_goal_x = 20
108
+ st.session_state.pick_time_limit = 50
109
+ st.session_state.pick_lr = 0.05
110
+ st.session_state.pick_phase1task = "phase2"
111
+ st.session_state.pick_phase2task = "full"
112
+ st.session_state.pick_weight1 = 0.0
113
+ st.session_state.pick_weight2 = 0.0
114
+ st.session_state.pick_phase1steps = 500
115
+ st.session_state.pick_phase2steps = 2000
116
+ need_reset()
117
+
118
+
119
+ def preset_without_task_interference():
120
+ st.session_state.pick_c = 1.0
121
+ st.session_state.pick_goal_x = 20
122
+ st.session_state.pick_time_limit = 50
123
+ st.session_state.pick_lr = 0.05
124
+ st.session_state.pick_phase1task = "phase2"
125
+ st.session_state.pick_phase2task = "phase1"
126
+ st.session_state.pick_weight1 = 0.0
127
+ st.session_state.pick_weight2 = 0.0
128
+ st.session_state.pick_phase1steps = 500
129
+ st.session_state.pick_phase2steps = 500
130
+ need_reset()
131
+
132
+
133
+ def preset_train_full_from_scratch():
134
+ st.session_state.pick_c = 0.5
135
+ st.session_state.pick_goal_x = 20
136
+ st.session_state.pick_time_limit = 50
137
+ st.session_state.pick_lr = 0.05
138
+ st.session_state.pick_phase1task = "phase2"
139
+ st.session_state.pick_phase2task = "full"
140
+ st.session_state.pick_weight1 = 0.0
141
+ st.session_state.pick_weight2 = 0.0
142
+ st.session_state.pick_phase1steps = 0
143
+ st.session_state.pick_phase2steps = 2000
144
+ need_reset()
145
+
146
+
147
+ def empty_queue(q: asyncio.Queue):
148
+ for _ in range(q.qsize()):
149
+ # Depending on your program, you may want to
150
+ # catch QueueEmpty
151
+ q.get_nowait()
152
+ q.task_done()
153
+
154
+
155
+ def reset(**kwargs):
156
+ init_training(**kwargs)
157
+ init_reset()
158
+ st.session_state.play = False
159
+ st.session_state.step = False
160
+ st.session_state.render = False
161
+ st.session_state.done = False
162
+ empty_queue(st.session_state.queue)
163
+ empty_queue(st.session_state.queue_render)
164
+ st.session_state.play_pause = False
165
+ st.session_state.need_reset = False
166
+
167
+
168
+ def render_start():
169
+ st.session_state.render = True
170
+ st.session_state.done = False
171
+ init_reset()
172
+
173
+
174
+ def need_reset():
175
+ st.session_state.need_reset = True
176
+ st.session_state.play = False
177
+ st.session_state.render = False
178
+
179
+
180
+ def play_pause():
181
+ if st.session_state.play:
182
+ st.session_state.play = False
183
+ st.session_state.play_pause = False
184
+ else:
185
+ st.session_state.play = True
186
+ st.session_state.play_pause = True
187
+
188
+
189
+ def step():
190
+ st.session_state.step = True
191
+
192
+
193
+ def plot(data_placeholder):
194
+ df = pd.DataFrame(st.session_state.data)
195
+ if not df.empty:
196
+ df.set_index("total_env_steps", inplace=True)
197
+ container = data_placeholder.container()
198
+ c1, c2, c3 = container.columns(3)
199
+
200
+ def view_df(names):
201
+ rdf = df.loc[:, df.columns.isin(names)]
202
+ if rdf.empty:
203
+ return pd.DataFrame([{name: 0 for name in names}])
204
+ else:
205
+ return rdf
206
+
207
+ c1.write("phase1/success_rate")
208
+ c1.line_chart(view_df(["phase1/success"]))
209
+ c2.write("phase2/success_rate")
210
+ c2.line_chart(view_df(["phase2/success"]))
211
+ c3.write("full/success_rate")
212
+ c3.line_chart(view_df(["full/success"]))
213
+
214
+ c1.write("train/loss")
215
+ c1.line_chart(view_df(["train/loss"]))
216
+ c2.write("weight0")
217
+ c2.line_chart(view_df(["weight0"]))
218
+ c3.write("weight1")
219
+ c3.line_chart(view_df(["weight1"]))
220
+
221
+
222
+ async def draw(data_placeholder, queue, delay, steps, plotfrequency):
223
+ while (st.session_state.play or st.session_state.step) and st.session_state.draw_it < steps:
224
+ _ = await asyncio.sleep(delay)
225
+ new_data = await queue.get()
226
+ st.session_state.draw_it += 1
227
+ if st.session_state.draw_it % plotfrequency == 0:
228
+ st.session_state.data.append(new_data)
229
+ plot(data_placeholder)
230
+ st.session_state.step = False
231
+ queue.task_done()
232
+
233
+
234
+ async def train(queue, delay, steps, obs, env, num_eval_episodes, plotfrequency):
235
+ while (st.session_state.play or st.session_state.step) and st.session_state.train_it < steps:
236
+ _ = await asyncio.sleep(delay)
237
+ st.session_state.train_it += 1
238
+ st.session_state.total_steps += 1
239
+
240
+ output = st.session_state.model(obs)
241
+ action, log_prob = st.session_state.model.sample(output)
242
+ st.session_state.trainer.update(
243
+ env, output, st.session_state.model, st.session_state.optim, st.session_state.logger
244
+ )
245
+
246
+ obs, reward, done, info = env.step(action)
247
+
248
+ if done:
249
+ obs = env.reset()
250
+
251
+ if st.session_state.train_it % plotfrequency == 0:
252
+ st.session_state.trainer.test_agent(
253
+ st.session_state.model, st.session_state.logger, st.session_state.test_envs, num_eval_episodes
254
+ )
255
+ data = st.session_state.trainer.log(
256
+ st.session_state.logger, st.session_state.train_it, st.session_state.model
257
+ )
258
+ else:
259
+ data = 0
260
+
261
+ _ = await queue.put(data)
262
+
263
+
264
+ async def produce_images(queue, delay):
265
+ while st.session_state.render and not st.session_state.done:
266
+ _ = await asyncio.sleep(delay)
267
+ data = next(st.session_state.rollout_iterator)
268
+ st.session_state.done = data["done"]
269
+ _ = await queue.put(data)
270
+
271
+
272
+ def show_image(data, image_placeholder):
273
+ c = image_placeholder.container()
274
+ c.image(
275
+ data["pixel_array"],
276
+ )
277
+ c.text(
278
+ f"agent position: {data['x']} \ntimestep: {data['episode_len']} \nepisode return: {data['episode_return']} \n"
279
+ )
280
+
281
+
282
+ async def consume_images(image_placeholder, queue, delay):
283
+ while st.session_state.render and not st.session_state.done:
284
+ _ = await asyncio.sleep(delay)
285
+ data = await queue.get()
286
+ st.session_state.last_image = data
287
+ show_image(data, image_placeholder)
288
+ queue.task_done()
289
+
290
+
291
+ async def run_app(
292
+ data_placeholder,
293
+ queue,
294
+ produce_delay,
295
+ consume_delay,
296
+ phase1steps,
297
+ phase2steps,
298
+ plotfrequency,
299
+ num_eval_episodes,
300
+ image_placeholder,
301
+ queue_render,
302
+ render_produce_delay,
303
+ render_consume_delay,
304
+ ):
305
+ _ = await asyncio.gather(
306
+ produce_images(queue_render, render_produce_delay),
307
+ consume_images(image_placeholder, queue_render, render_consume_delay),
308
+ )
309
+
310
+ st.session_state.render = False
311
+ st.session_state.done = False
312
+
313
+ empty_queue(queue_render)
314
+
315
+ _ = await asyncio.gather(
316
+ train(
317
+ queue,
318
+ produce_delay,
319
+ phase1steps,
320
+ st.session_state.obs1,
321
+ st.session_state.env_phase1,
322
+ num_eval_episodes,
323
+ plotfrequency,
324
+ ),
325
+ draw(data_placeholder, queue, consume_delay, phase1steps, plotfrequency),
326
+ )
327
+
328
+ _ = await asyncio.gather(
329
+ train(
330
+ queue,
331
+ produce_delay,
332
+ phase1steps + phase2steps,
333
+ st.session_state.obs2,
334
+ st.session_state.env_phase2,
335
+ num_eval_episodes,
336
+ plotfrequency,
337
+ ),
338
+ draw(data_placeholder, queue, consume_delay, phase1steps + phase2steps, plotfrequency),
339
+ )
340
+
341
+
342
+ ##### ACTUAL APP
343
+
344
+ if __name__ == "__main__":
345
+ st.set_page_config(
346
+ layout="wide",
347
+ initial_sidebar_state="auto",
348
+ page_title="Apple Retrieval",
349
+ page_icon=None,
350
+ )
351
+ st.title("ON THE ROLE OF FORGETTING IN FINE-TUNING REINFORCEMENT LEARNING MODELS")
352
+ st.header("Toy example of forgetting: AppleRetrieval")
353
+
354
+ col1, col2, col3 = st.sidebar.columns(3)
355
+
356
+ options = (
357
+ "phase2 full interference",
358
+ "full from scratch",
359
+ "phase2 phase1 forgetting",
360
+ "phase2 phase1 optimal solution",
361
+ )
362
+ st.sidebar.selectbox(
363
+ "parameter presets",
364
+ range(len(options)),
365
+ index=0,
366
+ format_func=lambda x: options[x],
367
+ on_change=select_preset,
368
+ key="pick_preset",
369
+ )
370
+
371
+ pick_container = st.sidebar.container()
372
+ c = pick_container.number_input("c", value=0.5, on_change=need_reset, key="pick_c")
373
+ goal_x = pick_container.number_input("distance to apple", value=20, on_change=need_reset, key="pick_goal_x")
374
+ time_limit = pick_container.number_input("time limit", value=50, on_change=need_reset, key="pick_time_limit")
375
+ lr = pick_container.selectbox(
376
+ "Learning rate",
377
+ np.array([0.00001, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]),
378
+ 5,
379
+ on_change=need_reset,
380
+ key="pick_lr",
381
+ )
382
+ phase1task = pick_container.selectbox(
383
+ "Pretraining task", ("full", "phase1", "phase2"), 2, on_change=need_reset, key="pick_phase1task"
384
+ )
385
+ phase2task = pick_container.selectbox(
386
+ "Finetuning task", ("full", "phase1", "phase2"), 0, on_change=need_reset, key="pick_phase2task"
387
+ )
388
+ # weight1 = pick_container.number_input("init weight1", value=0, on_change=need_reset, key="pick_weight1")
389
+ # weight2 = pick_container.number_input("init weight2", value=0, on_change=need_reset, key="pick_weight2")
390
+
391
+ if "event_loop" not in st.session_state:
392
+ st.session_state.loop = asyncio.new_event_loop()
393
+ asyncio.set_event_loop(st.session_state.loop)
394
+
395
+ if "queue" not in st.session_state:
396
+ st.session_state.queue = asyncio.Queue(QUEUE_SIZE)
397
+ if "queue_render" not in st.session_state:
398
+ st.session_state.queue_render = asyncio.Queue(QUEUE_SIZE)
399
+ if "play" not in st.session_state:
400
+ st.session_state.play = False
401
+ if "step" not in st.session_state:
402
+ st.session_state.step = False
403
+ if "render" not in st.session_state:
404
+ st.session_state.render = False
405
+
406
+ reset_button = partial(
407
+ reset,
408
+ c=c,
409
+ start_x=0,
410
+ goal_x=goal_x,
411
+ time_limit=time_limit,
412
+ lr=lr,
413
+ # weight1=weight1,
414
+ # weight2=weight2,
415
+ # weight1=0,
416
+ # weight2=10, # solves the environment
417
+ pretrain=phase1task,
418
+ finetune=phase2task,
419
+ )
420
+ col1.button("Reset", on_click=reset_button, type="primary")
421
+
422
+ if "logger" not in st.session_state or st.session_state.need_reset:
423
+ reset_button()
424
+
425
+ myKey = "play_pause"
426
+ if myKey not in st.session_state:
427
+ st.session_state[myKey] = False
428
+
429
+ if st.session_state[myKey]:
430
+ myBtn = col2.button("Pause", on_click=play_pause, type="primary")
431
+ else:
432
+ myBtn = col2.button("Play", on_click=play_pause, type="primary")
433
+
434
+ col3.button("Step", on_click=step, type="primary")
435
+
436
+ st.header("Summary")
437
+ st.write(
438
+ """
439
+ Run training on the "phase2 full interference" setting to see an example of forgetting in fine-tuning RL models.
440
+ A model is pre-trained on a part of the environment called Phase 2 for 500 steps,
441
+ and then it is fine-tuned on the whole environment for another 2000 steps.
442
+ However, it forgets how to perform on Phase 2 during fine-tuning before it even gets there.
443
+ We highlight this as an important problem in fine-tuning RL models.
444
+ We invite you to play around with the hyperparameters and find out more about the forgetting phenomenon.
445
+ """
446
+ )
447
+
448
+ st.markdown(
449
+ """
450
+ By comparing the results of these four presets, you can gain a deeper understanding of how different training configurations can impact the final performance of a neural network.
451
+ * `phase2 full interference` - already mentioned setting where we pretrain on phase 2 and finetune on whole environment.
452
+ * `full from scratch` - "standard" train on whole environment for 2000 steps for comparison.
453
+ * `phase2 phase1 forgetting` - a setting where we pretrain on phase 2 and finetune on phase 1. Here gradient interference between tasks causes network to forget how to solve phase 2.
454
+ * `phase2 phase1 optimal solution` - a setting where we pretrain on phase 2 and finetune on phase 1. Here gradient interference between tasks "moves" network weights to the optimal solution for both tasks.
455
+ """
456
+ )
457
+
458
+ data_placeholder = st.empty()
459
+ render_placeholder = st.empty()
460
+
461
+ phase1steps = st.sidebar.slider("Pretraining Steps", 0, 2000, 500, 10, key="pick_phase1steps")
462
+ phase2steps = st.sidebar.slider("Finetuning Steps", 0, 2000, 2000, 10, key="pick_phase2steps")
463
+ plotfrequency = st.sidebar.number_input("Log frequency ", min_value=1, value=10, step=1)
464
+ num_eval_episodes = st.sidebar.number_input("Number of evaluation episodes", min_value=1, value=10, step=1)
465
+
466
+ if not st.session_state.play and len(st.session_state.data) > 0:
467
+ plot(data_placeholder)
468
+
469
+ st.write(
470
+ """
471
+ Figure 1: Plots (a), (b), (c) show the performance of the agent on three different variants of the environment.
472
+ Phase1 (a) the agent's goal is to reach the apple.
473
+ Phase2 (b) the agent's has to go back home by returning to x = 0.
474
+ Full (c) combination of both phases of the environment.
475
+ (d) shows the training loss.
476
+ The state vector has only two values, so we can view the network's weights (e), (f).
477
+ """
478
+ )
479
+
480
+ st.header("Visualize agent behavior")
481
+ st.button("Rollout one episode", on_click=render_start, type="primary")
482
+
483
+ c1, c2 = st.columns(2)
484
+ image_placeholder = c1.empty()
485
+
486
+ render_produce_delay = 1 / 3
487
+ render_consume_delay = 1 / 3
488
+
489
+ st.header("About the environment")
490
+
491
+ st.write(
492
+ """This study forms a preliminary investigation into the problem of forgetting in fine-tuning RL models.
493
+ We show that fine-tuning a pre-trained model on compositional RL problems might result in a rapid
494
+ deterioration of the performance of the pre-trained model if the relevant data is not available at the
495
+ beginning of the training.
496
+ This phenomenon is known as catastrophic forgetting.
497
+ In this demo we show how it can occur in simple toyish situations, but it might also occur in more realistic problems (e.g., SAC with MLPs on a compositional robotic environment).
498
+ In our [WIP] paper we showed that applying CL methods significantly limits forgetting and allows for efficient transfer.
499
+ """
500
+ )
501
+
502
+ st.write(
503
+ """The AppleRetrieval environment is a toy example to demonstrate the issue of interference in reinforcement learning.
504
+ The goal of the agent is to retrieve an apple from position x = M and return home to x = 0 within a set number of steps T.
505
+ The state of the agent is represented by a vector s, which has two elements: s = [1, -c] in phase 1 and s = [1, c] in phase 2.
506
+ The first element is a constant and the second element represents the information about the current phase.
507
+ The optimal policy is to go right in phase 1 and go left in phase 2.
508
+ The cause of interference can be identified by checking the reliance of the policy on either s1 or s2.
509
+ If the model mostly relies on s2, interference will be limited, but if it relies on s1,
510
+ interference will occur as its value is the same in both phases.
511
+ The magnitude of s1 and s2 can be adjusted by changing the c parameter to guide the model towards focusing on either one.
512
+ This simple toy environment shows that the issue of interference can be fundamental to reinforcement learning."""
513
+ )
514
+
515
+ cc1, cc2 = st.columns([1, 2])
516
+ cc1.image("assets/apple_env.png")
517
+
518
+ st.header("Training algorithm")
519
+ st.write(
520
+ """For practical reasons (time and stability) we don't use REINFORCE in this DEMO to illustrate the training dynamics of the environment.
521
+ Instead, we train the model by minimizing the negative log likelihood of target actions (move right or left).
522
+ We train the model in each step of the environment and sample actions from Categorical distribution taken from model's output.
523
+ """
524
+ )
525
+
526
+ st.markdown(
527
+ """
528
+ Pseudocode of the train loop:
529
+ ```python
530
+ obs = env.reset()
531
+ for timestep in range(steps):
532
+ probs = model(obs)
533
+ dist = Categorical(probs)
534
+ action = dist.sample()
535
+ target_action = env.get_target_action()
536
+ loss = -dist.log_prob(target_action)
537
+
538
+ optim.zero_grad()
539
+ loss.backward()
540
+ optim.step()
541
+
542
+ obs, reward, done, info = env.step(action)
543
+ if done:
544
+ obs = env.reset()
545
+ """
546
+ )
547
+
548
+ st.header("What do all the hyperparameters mean?")
549
+
550
+ st.markdown(
551
+ """
552
+ * **parameter presets** - few sets of pre-defined hyperparameters that can be used as a starting point for a specific experiment.
553
+ * **c** - second element of the state vector, decreasing this value will result in stronger forgetting.
554
+ * **distance to apple** - refers to how far the agent needs to travel to the right before it encounters the apple.
555
+ * **time limit** - maximum amount of timesteps that the agent is allowed to interact with the environment before episode ends.
556
+ * **learning rate** - hyperparameter that determines the step size taken by the learning algorithm in each iteration.
557
+ * **pretraining and finetuning task** - define the environment the model will be trained on each stage.
558
+ * **pretraining and finetuning steps** - define how long model will be trained on each stage.
559
+ * **init weights** - initial values assigned to the model's parameters before training begins.
560
+ * **log frequency** - refers to frequency of logging the metrics and evaluating the agent.
561
+ * **number of evaluation episodes** - number of rollouts during testing of the agent.
562
+ """
563
+ )
564
+
565
+ st.header("Limitations & Conclusions")
566
+
567
+ st.write(
568
+ """
569
+ At the same time, this study, due to its preliminary nature, has numerous limitations which we
570
+ hope to address in future work. We only considered a fairly strict formulation of the forgetting
571
+ scenario where we assumed that the pre-trained model works perfectly on tasks that appear later in
572
+ the fine-tuning. In practice, one should also consider the case when even though there are differences
573
+ between the pre-training and fine-tuning tasks, transfer is still possible.
574
+ """
575
+ )
576
+
577
+ st.write(
578
+ """
579
+ At the same time, even given
580
+ these limitations, we see forgetting as an important problem to be solved and hope that addressing
581
+ these issues in the future might help with building and fine-tuning better foundation models in RL.
582
+ """
583
+ )
584
+
585
+ produce_delay = 1 / 1000
586
+ consume_delay = 1 / 1000
587
+
588
+ plot(data_placeholder)
589
+ show_image(st.session_state.last_image, image_placeholder)
590
+
591
+ asyncio.run(
592
+ run_app(
593
+ data_placeholder,
594
+ st.session_state.queue,
595
+ produce_delay,
596
+ consume_delay,
597
+ phase1steps,
598
+ phase2steps,
599
+ plotfrequency,
600
+ num_eval_episodes,
601
+ image_placeholder,
602
+ st.session_state.queue_render,
603
+ render_produce_delay,
604
+ render_consume_delay,
605
+ )
606
+ )
apple/envs/discrete_apple.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import closing
2
+ from io import StringIO
3
+ from os import path
4
+ from typing import Optional
5
+
6
+ import gym
7
+ import gym.spaces
8
+ import numpy as np
9
+
10
+ from gym.error import DependencyNotInstalled
11
+ from gym.spaces import Box
12
+ from gym.utils import colorize
13
+ from gym.wrappers import TimeLimit
14
+
15
+ from apple.wrappers import SuccessCounter
16
+
17
+
18
def get_apple_env(task, time_limit=20, **kwargs):
    """Build a fully wrapped Apple environment for the requested task variant.

    Args:
        task: One of ``"full"`` (both phases), ``"phase1"`` (walk to the
            apple) or ``"phase2"`` (walk back home). The single-phase tasks
            receive half the step budget, since they cover half the journey.
        time_limit: Episode step budget for the ``"full"`` task.
        **kwargs: Forwarded verbatim to the environment constructor.

    Returns:
        The environment wrapped in ``TimeLimit`` and ``SuccessCounter``,
        with a ``name`` attribute set to the task string.

    Raises:
        NotImplementedError: If ``task`` is not one of the known variants.
    """
    if task == "full":
        base_env = AppleEnv(**kwargs)
        budget = time_limit
    elif task == "phase1":
        base_env = ApplePhase0Env(**kwargs)
        budget = time_limit // 2
    elif task == "phase2":
        base_env = ApplePhase1Env(**kwargs)
        budget = time_limit // 2
    else:
        raise NotImplementedError
    wrapped = SuccessCounter(TimeLimit(base_env, budget))
    wrapped.name = task
    return wrapped
33
+
34
+
35
class AppleEnv(gym.Env):
    """One-dimensional "apple retrieval" environment.

    An episode has two phases: in phase 0 the agent walks from its home at
    ``start_x`` towards the apple at ``goal_x``; once the apple is reached
    the environment switches to phase 1 and the agent has to walk back home.
    Actions are ``Discrete(2)``: 1 moves right by ``delta``, 0 moves left.
    Each step yields ``+reward_value`` when the action matches the phase's
    target direction and ``-reward_value`` otherwise; finishing the
    configured terminal phase yields ``success_value`` and ends the episode.
    """

    metadata = {
        "render_modes": ["human", "ansi", "rgb_array"],
        "render_fps": 4,
    }

    def __init__(
        self,
        start_x: int,
        goal_x: int,
        c: float,
        reward_value: float = 1.0,
        success_value: float = 1.0,
        bias_in_state: bool = True,
        position_in_state: bool = False,
        apple_in_state: bool = True,
        render_mode: Optional[str] = None,
    ):
        """Create the environment.

        Args:
            start_x: Home position (episode start and phase-1 target).
            goal_x: Apple position (phase-0 target).
            c: Magnitude of the phase-indicator observation feature.
            reward_value: Per-step reward magnitude (sign depends on whether
                the action matches the phase's target direction).
            success_value: Reward granted when the episode succeeds.
            bias_in_state: Include a constant 1 in the observation.
            position_in_state: Include the agent's x position in the observation.
            apple_in_state: Include the signed phase feature (-c / +c).
            render_mode: One of ``metadata["render_modes"]`` or None.
        """
        self.start_x = start_x
        self.goal_x = goal_x
        self.c = c
        self.reward_value = reward_value
        self.success_value = success_value

        # Mutable episode state; subclasses override the phase/terminal
        # configuration after calling this constructor.
        self.x = start_x
        self.phase = 0
        self.delta = 1
        self.change_phase = True
        self.init_pos = self.start_x
        self.success_when_finish_phase = 1

        self.bias_in_state = bias_in_state
        self.position_in_state = position_in_state
        self.apple_in_state = apple_in_state

        # Observation bounds are taken from an example phase-0 observation;
        # the apple feature is the only entry that changes sign across
        # phases, so its bound is mirrored to span [-c, c].
        example_state = self.state()
        mult = np.ones_like(example_state)
        if self.apple_in_state:
            mult[-1] *= -1
        self.observation_space = Box(low=example_state, high=example_state * mult)
        self.action_space = gym.spaces.Discrete(2)
        self.timestep = 0

        self.render_mode = render_mode
        # Number of grid cells visible around the agent when rendering.
        self.scope_size = 15

        # GUI canvas is (rows, cols); each cell is rendered as a 64x64 tile.
        self.nrow, self.ncol = nrow, ncol = self.gui_canvas().shape
        self.window_size = (64 * ncol, 64 * nrow)
        self.cell_size = (
            self.window_size[0] // self.ncol,
            self.window_size[1] // self.nrow,
        )
        self.window_surface = None

        # Lazily loaded pygame resources (filled in by _render_gui).
        self.clock = None
        self.empty_img = None
        self.ground_img = None
        self.underground_img = None
        self.elf_images = None
        self.home_images = None
        self.goal_img = None
        self.start_img = None
        self.stool_img = None

    def reset(self):
        """Reset position (and phase, unless pinned) and return the initial observation."""
        self.x = self.init_pos
        if self.change_phase:
            self.phase = 0

        self.timestep = 0
        state = self.state()
        self.lastaction = None
        return state

    def validate_action(self, action):
        """Assert that *action* belongs to the action space."""
        err_msg = f"{action!r} ({type(action)}) invalid"
        assert self.action_space.contains(action), err_msg
        # NOTE(review): self.state is a bound method and is never None, so
        # this assertion cannot fire as written — confirm the intent.
        assert self.state is not None, "Call reset before using step method."

    def move(self, action):
        """Shift the agent by +delta (action 1) or -delta (action 0)."""
        delta = self.delta if action == 1 else -self.delta
        self.x += delta

    def state(self):
        """Assemble the observation vector.

        Entries (in order, each optional): constant bias 1, position ``x``,
        and the phase feature, which is ``-c`` in phase 0 and ``+c`` in
        phase 1.
        """
        assert self.bias_in_state or self.position_in_state or self.apple_in_state

        if self.phase == 0:
            c = -self.c
        elif self.phase == 1:
            c = self.c
        else:
            raise NotImplementedError

        state = []
        if self.bias_in_state:
            state.append(1)
        if self.position_in_state:
            state.append(self.x)
        if self.apple_in_state:
            state.append(c)

        return np.array(state)

    def desc(self):
        """Return a ``scope_size``-wide byte strip centered on the agent.

        'S' marks home, 'G' the apple (phase 0), 'D' the empty stool
        (phase 1) and '.' everything else.
        """
        s = self.scope_size // 2

        desc = list("." * self.scope_size)
        desc = np.asarray(desc, dtype="c")

        start_relative_position = self.start_x - self.x + s
        if 0 <= start_relative_position <= self.scope_size - 1:
            desc[start_relative_position] = "S"

        goal_relative_position = self.goal_x - self.x + s
        if 0 <= goal_relative_position <= self.scope_size - 1 and self.phase == 0:
            desc[goal_relative_position] = "G"

        if 0 <= goal_relative_position <= self.scope_size - 1 and self.phase == 1:
            desc[goal_relative_position] = "D"

        return desc

    def text_canvas(self):
        """Two-row character canvas for ansi rendering.

        Row 0 holds the object strip (one cell every 3 columns); row 1
        holds x-axis tick labels at multiples of 5.
        """
        desc = self.desc()
        canvas = np.ones((2, len(desc) * 3 + 2), dtype="c")
        canvas[:] = "\x20"

        for i, d in zip(range(2, len(canvas[0]), 3), desc):
            canvas[0][i] = d

        # Absolute x coordinates of the visible cells, used for tick labels.
        axis = np.arange(len(desc)) - len(desc) // 2 + self.x
        for i, d in zip(range(2, len(canvas[0]), 3), axis):
            if d % 5 == 0:
                s = str(d)

                # Center multi-digit labels under their cell.
                c = len(s) // 2
                for j, char in zip(range(len(s)), reversed(s)):
                    canvas[1][i - j + c] = char
        return canvas

    def gui_canvas(self):
        """Four-row byte canvas for the GUI: sky ('~') x2, object strip, ground ('#')."""
        desc = self.desc()
        upper_canvas = np.ones(len(desc), dtype="c")
        upper_canvas[:] = "~"
        lower_canvas = np.ones(len(desc), dtype="c")
        lower_canvas[:] = "#"
        canvas = np.stack([upper_canvas, upper_canvas, desc, lower_canvas])

        return canvas

    def reward(self, action):
        """+reward_value when *action* matches the phase target, -reward_value otherwise."""
        return self.reward_value if action == self.get_target_action() else -self.reward_value

    def get_target_action(self):
        """Return 1 (move right) during phase 0 and 0 (move left) during phase 1."""
        return 1 if self.phase == 0 else 0

    def step(self, action):
        """Advance one step; return ``(state, reward, done, info)``.

        The per-step reward is computed against the *pre-move* phase; when
        the apple is reached in phase 0 (and ``change_phase`` is set) the
        phase flips to 1. The episode ends with ``success_value`` when the
        phase configured by ``success_when_finish_phase`` completes.
        """
        self.timestep += 1
        self.validate_action(action)

        done = False
        info = {"success": False}

        reward = self.reward(action)

        self.move(action)

        finish_phase0 = self.phase == 0 and self.x >= self.goal_x
        finish_phase1 = self.phase == 1 and self.x <= self.start_x

        if self.change_phase:
            if finish_phase0:
                self.phase = 1

        if (self.success_when_finish_phase == 0 and finish_phase0) or (
            self.success_when_finish_phase == 1 and finish_phase1
        ):
            done = True
            info["success"] = True
            reward = self.success_value

        state = self.state()

        self.lastaction = action
        if self.render_mode == "human":
            self.render()

        return state, reward, done, info

    def render(self):
        """Dispatch to the ansi or pygame renderer per ``render_mode``."""
        if self.render_mode is None:
            assert self.spec is not None
            gym.logger.warn(
                "You are calling render method without specifying any render mode. "
                "You can specify the render_mode at initialization, "
                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
            )
            return

        if self.render_mode == "ansi":
            return self._render_text()
        else:  # self.render_mode in {"human", "rgb_array"}:
            return self._render_gui(self.render_mode)

    def _render_gui(self, mode):
        """Draw the scene with pygame; returns an RGB array in "rgb_array" mode."""
        try:
            import pygame
        except ImportError as e:
            raise DependencyNotInstalled("pygame is not installed, run `pip install pygame`") from e

        if self.window_surface is None:
            pygame.init()

            if mode == "human":
                pygame.display.init()
                pygame.display.set_caption("Apple Retrieval")
                self.window_surface = pygame.display.set_mode(self.window_size)
            elif mode == "rgb_array":
                self.window_surface = pygame.Surface(self.window_size)

        assert self.window_surface is not None, "Something went wrong with pygame. This should never happen."

        # Load and cache tile images on first use.
        if self.clock is None:
            self.clock = pygame.time.Clock()
        if self.empty_img is None:
            file_name = path.join(path.dirname(__file__), "img/white.png")
            self.empty_img = pygame.transform.scale(pygame.image.load(file_name), self.cell_size)
        if self.ground_img is None:
            file_name = path.join(path.dirname(__file__), "img/part_grass.png")
            self.ground_img = pygame.transform.scale(pygame.image.load(file_name), self.cell_size)
        if self.underground_img is None:
            file_name = path.join(path.dirname(__file__), "img/g2.png")
            self.underground_img = pygame.transform.scale(pygame.image.load(file_name), self.cell_size)
        if self.goal_img is None:
            file_name = path.join(path.dirname(__file__), "img/apple.png")
            self.goal_img = pygame.transform.scale(pygame.image.load(file_name), self.cell_size)
        if self.stool_img is None:
            file_name = path.join(path.dirname(__file__), "img/stool.png")
            self.stool_img = pygame.transform.scale(pygame.image.load(file_name), self.cell_size)
        if self.start_img is None:
            # 2x3 grid of tiles making up the home sprite (row-major order).
            homes = [
                path.join(path.dirname(__file__), "img/home00.png"),
                path.join(path.dirname(__file__), "img/home01.png"),
                path.join(path.dirname(__file__), "img/home02.png"),
                path.join(path.dirname(__file__), "img/home10.png"),
                path.join(path.dirname(__file__), "img/home11.png"),
                path.join(path.dirname(__file__), "img/home12.png"),
            ]
            self.home_images = [pygame.transform.scale(pygame.image.load(f_name), self.cell_size) for f_name in homes]
        if self.elf_images is None:
            # Indexed by last action: 0 = left, 1 = right, 2 = idle/down.
            elfs = [
                path.join(path.dirname(__file__), "img/elf_left.png"),
                path.join(path.dirname(__file__), "img/elf_right.png"),
                path.join(path.dirname(__file__), "img/elf_down.png"),
            ]
            self.elf_images = [pygame.transform.scale(pygame.image.load(f_name), self.cell_size) for f_name in elfs]

        desc = self.gui_canvas().tolist()

        # `cache` defers the blit of the last home tile so that it is drawn
        # after the neighbouring ground tile and is not painted over.
        # TODO(review): confirm this is the intended layering behaviour.
        cache = []
        assert isinstance(desc, list), f"desc should be a list or an array, got {desc}"
        for y in range(self.nrow):
            for x in range(self.ncol):
                pos = (x * self.cell_size[0], y * self.cell_size[1])

                self.window_surface.blit(self.empty_img, pos)

                if desc[y][x] == b"~":
                    self.window_surface.blit(self.empty_img, pos)
                elif desc[y][x] == b"#":
                    self.window_surface.blit(self.underground_img, pos)
                else:
                    self.window_surface.blit(self.ground_img, pos)
                    # if y == self.nrow - 1:

                    if len(cache) > 0:
                        cache_img, cache_pos = cache.pop()
                        self.window_surface.blit(cache_img, cache_pos)

                    if desc[y][x] == b"G":
                        self.window_surface.blit(self.stool_img, pos)
                        self.window_surface.blit(self.goal_img, pos)
                    elif desc[y][x] == b"D":
                        self.window_surface.blit(self.stool_img, pos)
                    elif desc[y][x] == b"S":
                        for h in range(len(self.home_images)):
                            i = h // 3
                            j = h % 3

                            home_img = self.home_images[i * 3 + j]
                            home_pos = ((x - 1 + j) * self.cell_size[0], (y - 1 + i) * self.cell_size[1])
                            if h == len(self.home_images) - 1:
                                cache.append((home_img, home_pos))
                            else:
                                self.window_surface.blit(home_img, home_pos)

        # paint the elf
        # bot_row, bot_col = self.s // self.ncol, self.s % self.ncol
        # The agent is always drawn at the center of the strip row.
        bot_col = self.scope_size // 2
        bot_row = 2
        cell_rect = (bot_col * self.cell_size[0], bot_row * self.cell_size[1])
        last_action = self.lastaction if self.lastaction is not None else 2
        elf_img = self.elf_images[last_action]

        self.window_surface.blit(elf_img, cell_rect)

        # font = pygame.font.SysFont(None, 20)
        # img = font.render(f"agent position = {self.x}", True, "black")
        # self.window_surface.blit(img, (5, 5))
        # img = font.render(f"timestep = {self.timestep}", True, "black")
        # self.window_surface.blit(img, (5, 25))

        if mode == "human":
            pygame.event.pump()
            pygame.display.update()
            self.clock.tick(self.metadata["render_fps"])
        elif mode == "rgb_array":
            return np.transpose(np.array(pygame.surfarray.pixels3d(self.window_surface)), axes=(1, 0, 2))

    def _render_text(self):
        """Return the ansi rendering as a string, agent cell highlighted in red."""
        desc = self.text_canvas()
        outfile = StringIO()

        # Agent column in the text canvas (3 columns per cell, offset 2).
        row, col = 0, (self.scope_size // 2) * 3 + 2
        desc = [[c.decode("utf-8") for c in line] for line in desc]
        desc[row][col] = colorize(desc[row][col], "red", highlight=True)

        outfile.write("\n")
        outfile.write("\n".join("".join(line) for line in desc) + "\n")

        with closing(outfile):
            return outfile.getvalue()
+ return outfile.getvalue()
367
+
368
+
369
class ApplePhase0Env(AppleEnv):
    """Stand-alone first phase: start at home, succeed upon reaching the apple."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Pin the environment to phase 0 and make reaching the apple terminal.
        self.phase = 0
        self.change_phase = False
        self.success_when_finish_phase = 0
        self.init_pos = self.start_x
376
+
377
+
378
class ApplePhase1Env(AppleEnv):
    """Stand-alone second phase: start at the apple, succeed upon reaching home."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Pin the environment to phase 1 and spawn the agent at the apple.
        self.phase = 1
        self.change_phase = False
        self.success_when_finish_phase = 1
        self.init_pos = self.goal_x
384
+ self.success_when_finish_phase = 1
apple/envs/img/apple.png ADDED
apple/envs/img/elf_down.png ADDED
apple/envs/img/elf_left.png ADDED
apple/envs/img/elf_right.png ADDED
apple/envs/img/g1.png ADDED
apple/envs/img/g2.png ADDED
apple/envs/img/g3.png ADDED
apple/envs/img/grass.jpg ADDED
apple/envs/img/home.png ADDED
apple/envs/img/home00.png ADDED
apple/envs/img/home01.png ADDED
apple/envs/img/home02.png ADDED
apple/envs/img/home10.png ADDED
apple/envs/img/home11.png ADDED
apple/envs/img/home12.png ADDED
apple/envs/img/home2.png ADDED
apple/envs/img/home2_with_apples.png ADDED
apple/envs/img/home_grass.png ADDED
apple/envs/img/part_grass.png ADDED
apple/envs/img/stool.png ADDED
apple/envs/img/textures.jpg ADDED
apple/envs/img/white.png ADDED
apple/evaluation/render_episode.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def render_episode(env, model):
    """Roll out one episode of *model* in *env*, yielding per-step diagnostics.

    Each yielded dict contains the pre-step observation, the chosen action,
    the step reward, running episode statistics, the agent position and a
    rendered frame from the unwrapped environment.
    """
    observation = env.reset()
    finished = False
    total_return = 0
    steps = 0

    while not finished:
        chosen_action = model.get_action(observation)
        next_observation, step_reward, finished, _ = env.step(chosen_action)
        total_return += step_reward
        steps += 1

        yield dict(
            x=env.x,
            obs=observation,
            action=chosen_action,
            reward=step_reward,
            done=finished,
            episode_len=steps,
            episode_return=total_return,
            pixel_array=env.unwrapped.render(),
        )

        observation = next_observation
apple/logger.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+
3
+ Some simple logging functionality, inspired by rllab's logging.
4
+
5
+ Logs to a tab-separated-values file (path/to/output_directory/progress.txt)
6
+
7
+ """
8
+ import atexit
9
+ import os
10
+ import os.path as osp
11
+ import time
12
+ import warnings
13
+
14
+ import joblib
15
+ import numpy as np
16
+ import torch
17
+
18
+ import wandb
19
+
20
# ANSI foreground color codes (SGR 30-37 range, 38 for crimson);
# `colorize` adds 10 to select the bright/highlight variant.
color2num = dict(gray=30, red=31, green=32, yellow=33, blue=34, magenta=35, cyan=36, white=37, crimson=38)
21
+
22
+
23
def setup_logger_kwargs(exp_name, seed=None, data_dir=None, datestamp=True):
    """Compute the output directory for a logger and return its kwargs.

    Directory layout:
        no seed, no datestamp:  ``data_dir/exp_name``
        seed, no datestamp:     ``data_dir/exp_name/exp_name_s[seed]``
        datestamp:              ``data_dir/YY-MM-DD_exp_name/YY-MM-DD_HH-MM-SS-exp_name_s[seed]``

    Args:
        exp_name (string): Name for the experiment.
        seed (int): Seed for random number generators used by the experiment.
        data_dir (string): Folder where results should be saved. Defaults to
            a ``logs`` directory three levels above this file.
        datestamp (bool): Whether to include date/time stamps in the
            directory names.

    Returns:
        dict with keys ``output_dir`` and ``exp_name``.
    """
    if data_dir is None:
        data_dir = osp.join(osp.abspath(osp.dirname(osp.dirname(osp.dirname(__file__)))), "logs")

    # Optional date prefix on the experiment-level directory.
    date_prefix = time.strftime("%Y-%m-%d_") if datestamp else ""
    relpath = date_prefix + exp_name

    if seed is not None:
        # Seed-specific leaf directory inside the experiment directory.
        if datestamp:
            stamp = time.strftime("%Y-%m-%d_%H-%M-%S")
            leaf = f"{stamp}-{exp_name}_s{seed}"
        else:
            leaf = f"{exp_name}_s{seed}"
        relpath = osp.join(relpath, leaf)

    return dict(output_dir=osp.join(data_dir, relpath), exp_name=exp_name)
65
+
66
+
67
def colorize(string, color, bold=False, highlight=False):
    """Wrap *string* in ANSI SGR escape codes for colored terminal output.

    ``highlight`` shifts the base color code by 10 (bright variant) and
    ``bold`` appends the bold attribute.

    This function was originally written by John Schulman.
    """
    code = color2num[color] + (10 if highlight else 0)
    attributes = [str(code)]
    if bold:
        attributes.append("1")
    return f"\x1b[{';'.join(attributes)}m{string}\x1b[0m"
81
+
82
+
83
class Logger:
    """
    A general-purpose logger.

    Makes it easy to save diagnostics, hyperparameter configurations, the
    state of a training run, and the trained model. Metrics are collected
    with ``log_tabular`` and flushed (to stdout, a delimited file and
    optionally wandb) with ``dump_tabular``.
    """

    def __init__(
        self,
        log_to_wandb=False,
        verbose=False,
        output_dir=None,
        output_fname="progress.csv",
        delimeter=",",
        exp_name=None,
        wandbcommit=1,
    ):
        """
        Initialize a Logger.

        Args:
            log_to_wandb (bool): If True the logger will also log to wandb.

            verbose (bool): If True, print a formatted metrics table on
                every ``dump_tabular`` call.

            output_dir (string): A directory for saving results to. If
                ``None``, nothing is written to disk.

            output_fname (string): Name for the delimited file containing
                metrics logged throughout a training run. Defaults to
                ``progress.csv``.

            delimeter (string): Used to separate logged values saved in
                output_fname.

            exp_name (string): Experiment name. If you run multiple training
                runs and give them all the same ``exp_name``, the plotter
                will know to group them.

            wandbcommit (int): Commit (sync) to wandb only every this many
                ``dump_tabular`` calls.
        """
        self.verbose = verbose
        self.log_to_wandb = log_to_wandb
        self.delimeter = delimeter
        self.wandbcommit = wandbcommit
        self.log_iter = 1
        # We assume that there's no multiprocessing.
        if output_dir is not None:
            self.output_dir = output_dir or "/tmp/experiments/%i" % int(time.time())
            if osp.exists(self.output_dir):
                print("Warning: Log dir %s already exists! Storing info there anyway." % self.output_dir)
            else:
                os.makedirs(self.output_dir)
            # w+ so log_tabular can rewrite the header in place when a new
            # key appears after the first row.
            self.output_file = open(osp.join(self.output_dir, output_fname), "w+")
            atexit.register(self.output_file.close)
            print(colorize("Logging data to %s" % self.output_file.name, "green", bold=True))
        else:
            self.output_file = None

        self.first_row = True
        self.log_headers = []
        self.log_current_row = {}
        self.exp_name = exp_name

    def log(self, msg, color="green"):
        """Print a colorized message to stdout."""
        print(colorize(msg, color, bold=True))

    def log_tabular(self, key, val):
        """
        Log a value of some diagnostic.

        Call this only once for each diagnostic quantity, each iteration.
        After using ``log_tabular`` to store values for each diagnostic,
        make sure to call ``dump_tabular`` to write them out to file and
        stdout (otherwise they will not get saved anywhere).
        """
        if self.first_row:
            self.log_headers.append(key)
        else:
            if key not in self.log_headers:
                # A new key appeared after the first row: register it and
                # rewrite the file header in place so columns stay aligned.
                self.log_headers.append(key)

                if self.output_file is not None:
                    # move pointer to the beginning of the file
                    self.output_file.seek(0)
                    # skip the old header
                    self.output_file.readline()
                    # keep the rest of the file
                    logs = self.output_file.read()
                    # clear the file
                    self.output_file.truncate(0)
                    self.output_file.seek(0)
                    # write the new header
                    self.output_file.write(self.delimeter.join(self.log_headers) + "\n")
                    # write back the stored rows
                    self.output_file.write(logs)
                    # return the pointer to the end of the file
                    self.output_file.seek(0, 2)
        assert key not in self.log_current_row, (
            "You already set %s this iteration. Maybe you forgot to call dump_tabular()" % key
        )
        self.log_current_row[key] = val

    def save_state(self, state_dict, itr=None):
        """
        Saves the state of an experiment.

        To be clear: this is about saving *state*, not logging diagnostics.
        All diagnostic logging is separate from this function. This function
        will save whatever is in ``state_dict``---usually just a copy of the
        environment---and the most recent parameters for the model you
        previously set up saving for with ``setup_pytorch_saver``.

        Call with any frequency you prefer. If you only want to maintain a
        single state and overwrite it at each call with the most recent
        version, leave ``itr=None``. If you want to keep all of the states you
        save, provide unique (increasing) values for 'itr'.

        Args:
            state_dict (dict): Dictionary containing essential elements to
                describe the current state of training.

            itr: An int, or None. Current iteration of training.
        """
        # NOTE: assumes the logger was constructed with an output_dir.
        fname = "vars.pkl" if itr is None else "vars%d.pkl" % itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed here.
            self.log("Warning: could not pickle state_dict.", color="red")
        if hasattr(self, "pytorch_saver_elements"):
            self._pytorch_simple_save(itr)

    def setup_pytorch_saver(self, what_to_save):
        """
        Set up easy model saving for a single PyTorch model.

        Because PyTorch saving and loading is especially painless, this is
        very minimal; we just need references to whatever we would like to
        pickle. This is integrated into the logger because the logger
        knows where the user would like to save information about this
        training run.

        Args:
            what_to_save: Any PyTorch model or serializable object containing
                PyTorch models.
        """
        self.pytorch_saver_elements = what_to_save

    def _pytorch_simple_save(self, itr=None):
        """
        Saves the PyTorch model (or models).
        """
        assert hasattr(self, "pytorch_saver_elements"), "First have to setup saving with self.setup_pytorch_saver"
        fpath = "pyt_save"
        fpath = osp.join(self.output_dir, fpath)
        fname = "model" + ("%d" % itr if itr is not None else "") + ".pt"
        fname = osp.join(fpath, fname)
        os.makedirs(fpath, exist_ok=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # We are using a non-recommended way of saving PyTorch models,
            # by pickling whole objects (which are dependent on the exact
            # directory structure at the time of saving) as opposed to
            # just saving network weights. This works sufficiently well
            # for the purposes of Spinning Up, but you may want to do
            # something different for your personal PyTorch project.
            # We use a catch_warnings() context to avoid the warnings about
            # not being able to save the source code.
            torch.save(self.pytorch_saver_elements, fname)

    def dump_tabular(self):
        """
        Write all of the diagnostics from the current iteration.

        Writes to stdout (when verbose), to the output file, and to wandb
        (when enabled). Returns the key/value dict for this iteration.
        """
        # Collect values for every known header unconditionally. (Bug fix:
        # this list was previously only filled inside the verbose branch,
        # so with verbose=False empty rows were written to the file.)
        vals = [self.log_current_row.get(key, "") for key in self.log_headers]
        key_lens = [len(key) for key in self.log_headers]
        # default=0 guards against a dump before any key was logged.
        max_key_len = max(15, max(key_lens, default=0))
        keystr = "%" + "%d" % max_key_len
        fmt = "| " + keystr + "s | %15s |"
        n_slashes = 22 + max_key_len
        # wandb uses "total_env_steps" (when present) as the step index.
        step = self.log_current_row.get("total_env_steps")

        if self.verbose:
            print("-" * n_slashes)
            for key, val in zip(self.log_headers, vals):
                valstr = "%8.3g" % val if isinstance(val, float) else val
                print(fmt % (key, valstr))
            print("-" * n_slashes, flush=True)

        if self.output_file is not None:
            if self.first_row:
                self.output_file.write(self.delimeter.join(self.log_headers) + "\n")
            self.output_file.write(self.delimeter.join(map(str, vals)) + "\n")
            self.output_file.flush()

        key_val_dict = {key: self.log_current_row.get(key, "") for key in self.log_headers}
        if self.log_to_wandb:
            # Only sync (commit) every `wandbcommit` dumps; other dumps are
            # accumulated into the same wandb step.
            commit = self.log_iter % self.wandbcommit == 0
            wandb.log(key_val_dict, step=step, commit=commit)

        self.log_current_row.clear()
        self.first_row = False
        self.log_iter += 1

        return key_val_dict
297
+
298
+
299
class EpochLogger(Logger):
    """
    A variant of Logger tailored for tracking average values over epochs.

    Typical use case: there is some quantity which is calculated many times
    throughout an epoch, and at the end of the epoch, you would like to
    report the average / std / min / max value of that quantity.

    With an EpochLogger, each time the quantity is calculated, you would
    use

    .. code-block:: python

        epoch_logger.store({"NameOfQuantity": quantity_value})

    to load it into the EpochLogger's state. Then at the end of the epoch, you
    would use

    .. code-block:: python

        epoch_logger.log_tabular(NameOfQuantity, **options)

    to record the desired values.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Maps diagnostic name -> list of values collected this epoch.
        self.epoch_dict = dict()

    def store(self, d):
        """
        Save something into the epoch_logger's current state.

        Args:
            d (dict): Mapping of diagnostic names to numerical values to
                accumulate for the current epoch.
        """
        for k, v in d.items():
            self.epoch_dict.setdefault(k, []).append(v)

    def log_tabular(self, key, val=None, with_min_and_max=False, with_median=False, with_sum=False, average_only=False):
        """
        Log a value or possibly the mean/std/min/max values of a diagnostic.

        Args:
            key (string): The name of the diagnostic. If you are logging a
                diagnostic whose state has previously been saved with
                ``store``, the key here has to match the key you used there.

            val: A value for the diagnostic. If you have previously saved
                values for this key via ``store``, do *not* provide a ``val``
                here.

            with_min_and_max (bool): If true, log min and max values of the
                diagnostic over the epoch.

            with_median (bool): If true, log the median of the diagnostic
                over the epoch.

            with_sum (bool): If true, log the sum of the diagnostic over
                the epoch.

            average_only (bool): If true, do not log the standard deviation
                of the diagnostic over the epoch.
        """
        if val is not None:
            super().log_tabular(key, val)
        else:
            stats = self.get_stats(key)
            super().log_tabular(key if average_only else key + "/avg", stats[0])
            if not average_only:
                super().log_tabular(key + "/std", stats[1])
            if with_min_and_max:
                super().log_tabular(key + "/max", stats[3])
                super().log_tabular(key + "/min", stats[2])
            if with_median:
                super().log_tabular(key + "/med", stats[4])
            if with_sum:
                super().log_tabular(key + "/sum", stats[5])

        # Reset the buffer for the next epoch.
        self.epoch_dict[key] = []

    def get_stats(self, key):
        """
        Return ``[mean, std, min, max, median, sum]`` of the stored values.

        Bug fix: when no values were stored for ``key`` this used to return
        only four NaNs, which made ``log_tabular`` raise an IndexError at
        ``stats[4]``/``stats[5]`` when median/sum were requested; it now
        returns six NaNs to match the populated case.
        """
        v = self.epoch_dict.get(key)
        if not v:
            return [np.nan] * 6
        # Flatten a list of non-scalar arrays into one array of samples.
        vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape) > 0 else v
        return [np.mean(vals), np.std(vals), np.min(vals), np.max(vals), np.median(vals), np.sum(vals)]
apple/models/categorical_policy.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from torch.distributions import Categorical
5
+
6
+
7
class CategoricalPolicy(nn.Module):
    """Bias-free linear policy producing a two-way categorical distribution.

    A single linear layer maps the state to a logit; a sigmoid turns it into
    ``p`` and the returned distribution is ``[p, 1 - p]`` (so ``p`` is the
    probability of action 0).
    """

    def __init__(self, state_dim, act_dim, weight1=None, weight2=None):
        """
        Args:
            state_dim: Size of the observation vector.
            act_dim: Output size of the linear layer.
            weight1: Optional fixed initial value for weight[0][0].
            weight2: Optional fixed initial value for weight[0][1].
        """
        super().__init__()
        self.model = nn.Linear(state_dim, act_dim, bias=False)

        # Optionally pin the first two entries of the first weight row so
        # experiments can start from controlled initial parameters.
        if weight1 is not None:
            nn.init.constant_(self.model.weight[0][0], weight1)

        if weight2 is not None:
            nn.init.constant_(self.model.weight[0][1], weight2)

    def forward(self, state):
        """Return action probabilities ``[p, 1 - p]`` for a single numpy state."""
        x = torch.from_numpy(state).float().unsqueeze(0)
        x = self.model(x)
        # we just consider 1 dimensional probability of action
        # NOTE(review): with act_dim > 1 the concatenation below yields
        # 2 * act_dim entries; this appears intended for act_dim == 1 — confirm.
        p = torch.sigmoid(x)
        return torch.cat([p, 1 - p], dim=1)

    def act(self, state):
        """Sample an action for *state*; return (action, log-prob of that action)."""
        probs = self.forward(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def sample(self, probs):
        """Sample from precomputed *probs*; return (action, log-prob of that action)."""
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def log_prob(self, probs, target_action):
        """Return (a freshly sampled action, log-prob of *target_action*).

        NOTE(review): the sampled action is unrelated to the returned
        log-prob and consumes RNG state; the caller seen in this repo
        discards it — confirm the sampling side effect is intended.
        """
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(target_action)

    @torch.no_grad()
    def get_action(self, state):
        """Sample an action for *state* without tracking gradients."""
        probs = self.forward(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.item()
apple/training/reinforce_trainer.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from apple.training.trainer import Trainer
4
+
5
+
6
def discount_cumsum(x, gamma):
    """Discounted reward-to-go over a 1-D tensor.

    Computes ``out[t] = x[t] + gamma * out[t + 1]`` with ``out[-1] = x[-1]``.

    Args:
        x: 1-D tensor of per-step values (e.g. rewards).
        gamma: Discount factor.

    Returns:
        A tensor of the same shape as ``x`` holding the discounted suffix sums.
    """
    out = torch.zeros_like(x)
    out[-1] = x[-1]
    for t in range(x.shape[0] - 2, -1, -1):
        out[t] = x[t] + gamma * out[t + 1]
    return out
12
+
13
+
14
class ReinforceTrainer(Trainer):
    """REINFORCE (vanilla policy gradient) trainer for the Apple environments."""

    def __init__(self, *args, gamma: float = 1.0, **kwargs):
        # gamma: discount factor applied in the reward-to-go computation.
        super().__init__(*args, **kwargs)
        self.gamma = gamma

    def train(self, env, test_envs, num_episodes, log_every, update_every, num_eval_eps):
        """Run REINFORCE for ``num_episodes`` episodes on ``env``.

        Args:
            env: Training environment (gym-style reset/step API).
            test_envs: Environments evaluated every ``log_every`` episodes.
            num_episodes: Total number of training episodes.
            log_every: Evaluation/logging period, in episodes.
            update_every: Optimizer step period, in episodes.
            num_eval_eps: Episodes per test environment during evaluation.
        """
        # code base on
        # https://goodboychan.github.io/python/reinforcement_learning/pytorch/udacity/2021/05/12/REINFORCE-CartPole.html
        self.optim.zero_grad()
        for episode in range(num_episodes):
            # NOTE(review): train_it counts episodes but is logged below
            # under "total_env_steps" — confirm the naming is intended.
            self.train_it += 1

            if (episode + 1) % log_every == 0:

                self.test_agent(self.model, self.logger, test_envs, num_eval_eps)

                # Log info about epoch
                self.logger.log_tabular("total_env_steps", self.train_it)
                self.logger.log_tabular("train/return", with_min_and_max=True)
                self.logger.log_tabular("train/ep_length", average_only=True)

                # Log every entry of the (flattened) policy weight matrix.
                for e, w in enumerate(self.model.model.weight.flatten()):
                    self.logger.log_tabular(f"weights{e}", w.item())

                self.logger.log_tabular("train/policy_loss", average_only=True)
                self.logger.log_tabular("train/log_probs", average_only=True)
                self.logger.dump_tabular()

            state = env.reset()

            # Collect one full episode of log-probs and rewards.
            saved_log_probs = []
            rewards = []
            ep_len, ep_ret = 0, 0
            while True:
                # Sample the action from current policy
                action, log_prob = self.model.act(state)
                saved_log_probs.append(log_prob)
                state, reward, done, _ = env.step(action)
                ep_ret += reward
                ep_len += 1

                rewards.append(reward)

                if done:
                    self.logger.store({"train/return": ep_ret, "train/ep_length": ep_len})
                    break

            saved_log_probs, rewards = torch.cat(saved_log_probs), torch.tensor(rewards)

            # Reward-to-go (discounted suffix sums) for each timestep.
            discounted_rewards = discount_cumsum(rewards, gamma=self.gamma)
            # Note that we are using Gradient Ascent, not Descent. So we need to calculate it with negative rewards.
            policy_loss = (-discounted_rewards * saved_log_probs).sum()
            # Backpropagation
            # NOTE(review): zero_grad() and backward() are both gated on
            # update_every, so losses from intermediate episodes never
            # contribute gradients — confirm this is intended (rather than
            # accumulating gradients every episode and only stepping every
            # update_every episodes).
            if (episode + 1) % update_every == 0:
                self.optim.zero_grad()
                policy_loss.backward()
            if (episode + 1) % update_every == 0:
                self.optim.step()

            self.logger.store({"train/policy_loss": policy_loss.item()})
            self.logger.store({"train/log_probs": saved_log_probs.mean().item()})
apple/training/trainer.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
class Trainer:
    """Behavioral-cloning trainer.

    At every environment step the model is supervised toward the
    environment's oracle action (``env.get_target_action()``), and the
    agent is periodically evaluated on a list of held-out test envs.
    """

    def __init__(self, model, optim, logger):
        self.model = model
        self.optim = optim
        self.logger = logger
        # Total number of environment steps taken so far; used as the
        # x-axis ("total_env_steps") when logging.
        self.train_it = 0

    def test_agent(self, model, logger, test_envs, num_episodes):
        """Roll out `model` for `num_episodes` episodes on each test env.

        Logs per-env return/episode-length statistics plus a success rate,
        and an "average_success" aggregated over all test envs.
        """
        avg_success = []

        for seq_idx, test_env in enumerate(test_envs):
            # assumes each test env exposes a `name` attribute — TODO confirm
            key_prefix = f"{test_env.name}/"

            for j in range(num_episodes):
                obs, done, episode_return, episode_len = test_env.reset(), False, 0, 0

                while not done:
                    action = model.get_action(obs)
                    obs, reward, done, _ = test_env.step(action)
                    episode_return += reward
                    episode_len += 1
                logger.store({key_prefix + "return": episode_return, key_prefix + "ep_length": episode_len})

            logger.log_tabular(key_prefix + "return", with_min_and_max=True)
            logger.log_tabular(key_prefix + "ep_length", average_only=True)
            # `pop_successes` (SuccessCounter wrapper) returns and clears the
            # per-episode success flags accumulated since the last call.
            env_success = test_env.pop_successes()
            avg_success += env_success
            logger.log_tabular(key_prefix + "success", np.mean(env_success))

        key = "average_success"
        logger.log_tabular(key, np.mean(avg_success))

    def log(self, logger, step, model):
        # Log info about epoch
        logger.log_tabular("total_env_steps", step)

        logger.log_tabular("train/loss", average_only=True)
        logger.log_tabular("train/action", average_only=True)

        # Track every scalar weight of the (tiny) linear model individually,
        # to visualize drift/forgetting across training phases.
        for e, w in enumerate(model.model.weight.flatten()):
            logger.log_tabular(f"weight{e}", w.item())

        return logger.dump_tabular()

    def update(self, env, probs, model, optim, logger):
        """One behavioral-cloning gradient step toward the oracle action."""
        target = torch.as_tensor([env.get_target_action()], dtype=torch.float32)
        action, log_prob = model.log_prob(probs, target)

        optim.zero_grad()
        # Maximize log-likelihood of the target action (minimize its negative).
        loss = -torch.mean(log_prob)
        loss.backward()
        optim.step()

        logger.store({"train/action": action})
        logger.store({"train/loss": loss.item()})

    def train(self, env, test_envs, steps, log_every, num_eval_eps):
        """Run `steps` environment steps of behavioral cloning on `env`."""
        obs = env.reset()
        for timestep in range(steps):
            self.train_it += 1

            if (timestep + 1) % log_every == 0:
                self.test_agent(self.model, self.logger, test_envs, num_eval_eps)
                self.log(self.logger, self.train_it, self.model)

            output = self.model(obs)
            # The sampled action is only used to advance the environment;
            # `update` backpropagates through `output` (the policy logits).
            action, log_prob = self.model.sample(output)
            self.update(env, output, self.model, self.optim, self.logger)

            obs, reward, done, info = env.step(action)

            if done:
                obs = env.reset()
apple/utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import random
3
+
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+
10
+ # https://stackoverflow.com/a/43357954/6365092
11
def str2bool(v: Union[bool, str]) -> bool:
    """Convert a human-friendly truthy/falsy string (or bool) to a bool.

    Intended for use as an argparse ``type=``; raises
    ``argparse.ArgumentTypeError`` on unrecognized input.
    """
    # Already a bool (argparse may hand us the default unchanged).
    if isinstance(v, bool):
        return v
    normalized = v.lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
20
+
21
+
22
def set_seed(seed):
    """Seed every RNG the project relies on, for reproducible runs."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
apple/wrappers.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import (
2
+ Any,
3
+ Dict,
4
+ List,
5
+ Tuple,
6
+ )
7
+
8
+ import gym
9
+ import numpy as np
10
+
11
+
12
class SuccessCounter(gym.Wrapper):
    """Helper class to keep count of successes in MetaWorld environments."""

    def __init__(self, env: gym.Env) -> None:
        super().__init__(env)
        # One flag per *finished* episode: True if any step reported success.
        self.successes = []
        self.current_success = False

    def step(self, action: Any) -> Tuple[np.ndarray, float, bool, Dict]:
        obs, reward, done, info = self.env.step(action)
        # A single successful step marks the whole episode as successful.
        self.current_success = self.current_success or bool(info.get("success", False))
        if done:
            self.successes.append(self.current_success)
        return obs, reward, done, info

    def pop_successes(self) -> List[bool]:
        """Return accumulated episode successes and reset the counter."""
        res, self.successes = self.successes, []
        return res

    def reset(self, **kwargs) -> np.ndarray:
        self.current_success = False
        return self.env.reset(**kwargs)
assets/apple_env.png ADDED
assets/example_rollout.mp4 ADDED
Binary file (86.8 kB). View file
 
assets/generate_example_rollout.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Render a short scripted trajectory in the apple env and save it as a video."""

import numpy as np
import skvideo.io

from apple.envs.discrete_apple import get_apple_env

env = get_apple_env("full", time_limit=10, start_x=0, c=0.5, goal_x=8, render_mode="rgb_array")


imgs = []
env.reset()
# Capture the frame *before* each step: 8 steps of action 1, then 9 of action 0.
# NOTE(review): action semantics (left/right) come from discrete_apple — confirm there.
for i in range(8):
    imgs.append(env.unwrapped.render())
    env.step(1)

for i in range(9):
    imgs.append(env.unwrapped.render())
    env.step(0)


skvideo.io.vwrite(
    "example_rollout.mp4",
    np.stack(imgs),
    inputdict={
        # Input frame rate: 4 frames per second.
        "-r": str(int(4)),
    },
    outputdict={
        "-f": "mp4",
        "-pix_fmt": "yuv420p",  # '-pix_fmt=yuv420p' needed for osx https://github.com/scikit-video/scikit-video/issues/74
    },
)
input_args.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ from apple.utils import str2bool
4
+
5
+
6
def apple_parse_args(args=None):
    """Parse command-line arguments shared by the apple experiments.

    Args:
        args: Optional list of argument strings; ``None`` falls back to
            ``sys.argv``.

    Returns:
        ``argparse.Namespace`` containing only the known arguments
        (unknown ones are silently ignored via ``parse_known_args``).
    """
    parser = argparse.ArgumentParser()

    # Environment parameters.
    parser.add_argument("--c", type=float, default=0.25, required=False)
    parser.add_argument("--start_x", type=float, default=0.0, required=False)
    parser.add_argument("--goal_x", type=float, default=10.0, required=False)
    # Bug fix: argparse does not pass non-string defaults through `type`,
    # so `default=100.0` used to leak a float despite `type=int`.
    parser.add_argument("--time_limit", type=int, default=100, required=False)

    # Training parameters.
    parser.add_argument("--lr", type=float, default=1e-3, required=False)
    parser.add_argument("--log_to_wandb", type=str2bool, default=True, required=False)

    return parser.parse_known_args(args=args)[0]
mrunner_exps/behavioral_cloning.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""mrunner experiment specification: behavioral cloning over a grid of env params."""

import numpy as np

from mrunner.helpers.specification_helper import create_experiments_helper

from mrunner_exps.utils import combine_config_with_defaults

# mrunner injects the spec file name into globals(); strip ".py" for the experiment name.
name = globals()["script"][:-3]

# params for all exps
config = {
    "exp_tag": "behavioral_cloning",
    "run_kind": "bc",
    "log_to_wandb": True,
    "pretrain_steps": 200,
    "steps": 200,
    "log_every": 1,
    "num_eval_eps": 10,
    "verbose": False,
    "lr": 0.01,
    "c": 0.5,
    "start_x": 0.0,
    "goal_x": 50.0,
    "bias_in_state": True,
    "position_in_state": False,
    "time_limit": 100,
    "wandbcommit": 100,
    "pretrain": "phase2",
    "finetune": "full",
}
# Fill any missing keys with the argparse defaults for this run kind.
config = combine_config_with_defaults(config)

# params different between exps
params_grid = [
    {
        "seed": list(range(10)),
        "c": list(np.arange(0.1, 1.1, 0.1)),
        "goal_x": list(np.arange(5, 50, 5)),
    }
]

experiments_list = create_experiments_helper(
    experiment_name=name,
    project_name="apple",
    with_neptune=False,
    script="python3 mrunner_run.py",
    python_path=".",
    tags=[name],
    exclude=["logs", "wandb"],
    base_config=config,
    params_grid=params_grid,
)
mrunner_exps/reinforce.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""mrunner experiment specification: REINFORCE over a grid of env params."""

import numpy as np

from mrunner.helpers.specification_helper import create_experiments_helper

from mrunner_exps.utils import combine_config_with_defaults

# mrunner injects the spec file name into globals(); strip ".py" for the experiment name.
name = globals()["script"][:-3]

# params for all exps
config = {
    "exp_tag": "reinforce_goal_c3",
    "run_kind": "reinforce",
    "log_to_wandb": True,
    "pretrain_steps": 1000,
    "steps": 2000,
    "log_every": 1,
    "num_eval_eps": 10,
    "verbose": False,
    "lr": 0.001,
    "c": 1.0,
    "start_x": 0.0,
    "goal_x": 50.0,
    "bias_in_state": True,
    "position_in_state": False,
    "time_limit": 100,
    "gamma": 0.99,
    "wandbcommit": 1000,
    "pretrain": "phase2",
    "finetune": "full",
    "update_every": 10,  # for good definition on gradient
}
# Fill any missing keys with the argparse defaults for this run kind.
config = combine_config_with_defaults(config)

# params different between exps
params_grid = [
    {
        "seed": list(range(10)),
        "c": list(np.arange(0.1, 1.1, 0.1)),
        "goal_x": list(np.arange(5, 50, 5)),
    }
]

experiments_list = create_experiments_helper(
    experiment_name=name,
    project_name="apple",
    with_neptune=False,
    script="python3 mrunner_run.py",
    python_path=".",
    tags=[name],
    exclude=["logs", "wandb"],
    base_config=config,
    params_grid=params_grid,
)
mrunner_exps/utils.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from input_args import apple_parse_args
2
+
3
# Map each run kind to its argument parser (both kinds share the apple parser).
PARSE_ARGS_DICT = {"bc": apple_parse_args, "reinforce": apple_parse_args}


def combine_config_with_defaults(config):
    """Overlay `config` onto the argparse defaults of its run kind.

    Keys present in `config` win; everything else comes from the parser's
    defaults (obtained by parsing an empty argument list).
    """
    defaults = vars(PARSE_ARGS_DICT[config["run_kind"]]([]))
    defaults.update(config)
    return defaults
mrunner_run.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Entry point used by mrunner: fetch the launcher-injected config and run `main`."""

from mrunner.helpers.client_helper import get_configuration

import wandb

from run import main

if __name__ == "__main__":
    config = get_configuration(print_diagnostics=True, with_neptune=False)

    # mrunner-internal key that `main` does not accept.
    del config["experiment_id"]

    # NOTE(review): attribute access implies `config` is an attr-dict
    # (Munch-like) returned by mrunner — confirm against mrunner docs.
    if config.log_to_wandb:
        wandb.init(
            entity="gmum",
            project="apple",
            config=config,
        )
    main(**config)
mrunner_runs/local.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ python mrunner_run.py --ex mrunner_exps/baseline.py
mrunner_runs/remote.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ conda activate apple
4
+
5
+ ssh-add
6
+ export PYTHONPATH=.
7
+
8
+ mrunner --config ~/.mrunner.yaml --context eagle_transfer_mw2 run mrunner_exps/behavioral_cloning.py
9
+ # mrunner --config ~/.mrunner.yaml --context eagle_transfer_mw2 run mrunner_exps/reinforce.py
pyproject.toml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools", "setuptools_scm", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "apple"
7
+ description = "Simplest experiment for showing forgetting"
8
+ license = { text = "Proprietary" }
9
+ authors = [{name = "BartekCupial", email = "bartlomiej.cupial@student.uj.edu.pl" }]
10
+
11
+ dynamic = ["version"]
12
+
13
+ requires-python = ">= 3.8, < 3.11"
14
+
15
+ dependencies = [
16
+ "numpy ~= 1.23",
17
+ "typing-extensions ~= 4.3",
18
+ "gym == 0.23",
19
+ "torch ~= 1.12",
20
+ "wandb ~= 0.13",
21
+ "pandas ~= 1.5",
22
+ "matplotlib ~= 3.6",
23
+ "seaborn ~= 0.12",
24
+ "scipy ~= 1.9",
25
+ "joblib ~= 1.2",
26
+ "pygame ~= 2.1",
27
+ ]
28
+
29
+ [project.optional-dependencies]
30
+ build = ["build ~= 0.8"]
31
+ mrunner = ["mrunner @ git+https://gitlab.com/awarelab/mrunner.git"]
32
+ lint = [
33
+ "black ~= 22.6",
34
+ "autoflake ~= 1.4",
35
+ "flake8 ~= 4.0",
36
+ "flake8-pyi ~= 22.5",
37
+ "flake8-docstrings ~= 1.6",
38
+ "pyproject-flake8 ~= 0.0.1a4",
39
+ "isort ~= 5.10",
40
+ "pre-commit ~= 2.20",
41
+ ]
42
+ test = [
43
+ "pytest ~= 7.1",
44
+ "pytest-cases ~= 3.6",
45
+ "pytest-cov ~= 3.0",
46
+ "pytest-xdist ~= 2.5",
47
+ "pytest-sugar ~= 0.9",
48
+ "hypothesis ~= 6.54",
49
+ ]
50
+ dev = [
51
+ "apple[mrunner]",
52
+ "apple[build]",
53
+ "apple[lint]",
54
+ "apple[test]",
55
+ ]
56
+
57
+ [project.urls]
58
+ "Source" = "https://github.com/BartekCupial/apple"
59
+
60
+ [tool.black]
61
+ line_length = 120
62
+
63
+ [tool.flake8]
64
+ extend_exclude = [".venv/", "build/", "dist/", "docs/"]
65
+ per_file_ignores = ["**/_[a-z]*.py:D", "tests/*.py:D", "*.pyi:D"]
66
+ ignore = [
67
+ # Handled by black
68
+ "E", # pycodestyle
69
+ "W", # pycodestyle
70
+ "D",
71
+ ]
72
+ ignore_decorators = "property" # https://github.com/PyCQA/pydocstyle/pull/546
73
+
74
+ [tool.isort]
75
+ profile = "black"
76
+ line_length = 120
77
+ order_by_type = true
78
+ lines_between_types = 1
79
+ combine_as_imports = true
80
+ force_grid_wrap = 2
81
+
82
+ [tool.pytest.ini_options]
83
+ testpaths = "tests"
84
+ addopts = """
85
+ -n auto
86
+ -ra
87
+ --tb short
88
+ --doctest-modules
89
+ --junit-xml test-results.xml
90
+ --cov-report term-missing:skip-covered
91
+ --cov-report xml:coverage.xml
92
+ """
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ numpy
3
+ pandas
4
+ gym == 0.23
5
+ wandb
6
+ joblib
7
+ pygame
run.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ import wandb
4
+
5
+ from apple.envs.discrete_apple import get_apple_env
6
+ from apple.logger import EpochLogger
7
+ from apple.models.categorical_policy import CategoricalPolicy
8
+ from apple.training.reinforce_trainer import ReinforceTrainer
9
+ from apple.training.trainer import Trainer
10
+ from apple.utils import set_seed
11
+ from input_args import apple_parse_args
12
+
13
+
14
def main(
    run_kind: str,
    c: float = 1.0,
    start_x: float = 0.0,
    goal_x: float = 50.0,
    time_limit: int = 200,
    bias_in_state: bool = True,
    position_in_state: bool = False,
    apple_in_state: bool = True,
    lr: float = 1e-3,
    pretrain_steps: int = 0,
    steps: int = 10000,
    log_every: int = 1,
    num_eval_eps: int = 1,
    pretrain: str = "phase1",
    finetune: str = "full",
    log_to_wandb: bool = False,
    wandbcommit: int = 1,
    verbose: bool = False,
    output_dir="logs/apple",
    gamma: float = 1.0,
    update_every: int = 10,
    seed=0,
    **kwargs,
):
    """Two-phase training: pretrain on `pretrain`, then fine-tune on `finetune`.

    The agent is periodically evaluated on the "full", "phase1" and
    "phase2" variants of the apple environment.

    Args:
        run_kind: Training algorithm — "reinforce" (policy gradient)
            or "bc" (behavioral cloning).
        c, start_x, goal_x, time_limit, bias_in_state, position_in_state,
            apple_in_state: Environment construction parameters.
        lr: SGD learning rate.
        pretrain_steps: Training iterations in the pretraining phase.
        steps: Training iterations in the fine-tuning phase.
        log_every: Evaluate/log every this many iterations.
        num_eval_eps: Evaluation episodes per test environment.
        gamma: Discount factor (REINFORCE only).
        update_every: Gradient-accumulation period (REINFORCE only).
        kwargs: Ignored; lets launcher configs carry extra keys.

    Raises:
        ValueError: If `run_kind` is not a supported trainer.
    """
    set_seed(seed)

    logger = EpochLogger(
        exp_name=run_kind,
        output_dir=output_dir,
        log_to_wandb=log_to_wandb,
        wandbcommit=wandbcommit,
        verbose=verbose,
    )

    env_kwargs = dict(
        start_x=start_x,
        goal_x=goal_x,
        c=c,
        time_limit=time_limit,
        bias_in_state=bias_in_state,
        position_in_state=position_in_state,
        apple_in_state=apple_in_state,
    )

    env_phase1 = get_apple_env(pretrain, **env_kwargs)
    env_phase2 = get_apple_env(finetune, **env_kwargs)
    test_envs = [get_apple_env(task, **env_kwargs) for task in ["full", "phase1", "phase2"]]

    # Single-output categorical policy over the observation vector.
    model = CategoricalPolicy(env_phase1.observation_space.shape[0], 1)
    optim = torch.optim.SGD(model.parameters(), lr=lr)

    if run_kind == "reinforce":
        trainer = ReinforceTrainer(model, optim, logger, gamma=gamma)
        trainer.train(env_phase1, test_envs, pretrain_steps, log_every, update_every, num_eval_eps)
        trainer.train(env_phase2, test_envs, steps, log_every, update_every, num_eval_eps)
    elif run_kind == "bc":
        trainer = Trainer(model, optim, logger)
        trainer.train(env_phase1, test_envs, pretrain_steps, log_every, num_eval_eps)
        trainer.train(env_phase2, test_envs, steps, log_every, num_eval_eps)
    else:
        # Previously an unrecognized run_kind silently trained nothing.
        raise ValueError(f"Unknown run_kind: {run_kind!r} (expected 'reinforce' or 'bc')")
74
+
75
+
76
if __name__ == "__main__":
    args = apple_parse_args()

    if args.log_to_wandb:
        wandb.init(
            entity="gmum",
            project="apple",
            config=args,
            # "fork" avoids wandb hangs when child processes are spawned later.
            settings=wandb.Settings(start_method="fork"),
        )

    main(**vars(args))
setup.cfg ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [options]
2
+ packages = find_namespace:
3
+ package_dir =
4
+ = apple
5
+
6
+ [options.packages.find]
7
+ where = apple
8
+
9
+ [options.package_data]
10
+ * = py.typed, *.pyi